From: Marcin Koziej Date: Wed, 14 Dec 2011 13:01:17 +0000 (+0100) Subject: Picture support X-Git-Tag: 1.7~182^2~2 X-Git-Url: https://git.mdrn.pl/librarian.git/commitdiff_plain/a3c860f00c7d12ae5852ddab056c98f52ee43072?hp=--cc Picture support --- a3c860f00c7d12ae5852ddab056c98f52ee43072 diff --git a/librarian/__init__.py b/librarian/__init__.py index 32a3af0..42bc518 100644 --- a/librarian/__init__.py +++ b/librarian/__init__.py @@ -138,9 +138,9 @@ class DocProvider(object): """Should return a file-like object with a WL document XML.""" return self.by_slug_and_lang(slug) - def by_uri(self, uri): + def by_uri(self, uri, wluri=WLURI): """Should return a file-like object with a WL document XML.""" - wluri = WLURI(uri) + wluri = wluri(uri) return self.by_slug_and_lang(wluri.slug, wluri.language) diff --git a/librarian/dcparser.py b/librarian/dcparser.py index 992029e..f5fd42a 100644 --- a/librarian/dcparser.py +++ b/librarian/dcparser.py @@ -118,36 +118,48 @@ class Field(object): return self.validate_value(f) + def __eq__(self, other): + if isinstance(other, Field) and other.name == self.name: + return True + return False +class DCInfo(type): + def __new__(meta, classname, bases, class_dict): + fields = class_dict['FIELDS'] + + for base in bases[::-1]: + if hasattr(base, 'FIELDS'): + for field in base.FIELDS[::-1]: + try: + fields.index(field) + except ValueError: + fields = (field,) + fields + + class_dict['FIELDS'] = fields + return super(DCInfo, meta).__new__(meta, classname, bases, class_dict) + + +class WorkInfo(object): + __metaclass__ = DCInfo -class BookInfo(object): FIELDS = ( Field( DCNS('creator'), 'author', as_person), Field( DCNS('title'), 'title'), + Field( DCNS('type'), 'type', required=False, multiple=True), + Field( DCNS('subject.period'), 'epochs', salias='epoch', multiple=True), Field( DCNS('subject.type'), 'kinds', salias='kind', multiple=True), Field( DCNS('subject.genre'), 'genres', salias='genre', multiple=True), - Field( DCNS('audience'), 'audiences', salias='audience', multiple=True, - required=False), + Field( DCNS('date'), 'created_at', as_date), Field( DCNS('date.pd'), 'released_to_public_domain_at', as_date, required=False), - Field( DCNS('contributor.editor'), 'editors', \ - as_person, salias='editor', multiple=True, default=[]), - Field( DCNS('contributor.translator'), 'translators', \ - as_person, salias='translator', multiple=True, default=[]), - Field( DCNS('contributor.technical_editor'), 'technical_editors', - as_person, salias='technical_editor', multiple=True, default=[]), Field( DCNS('publisher'), 'publisher'), + Field( DCNS('source'), 'source_name', required=False), Field( DCNS('source.URL'), 'source_url', required=False), Field( DCNS('identifier.url'), 'url', WLURI), - Field( DCNS('relation.hasPart'), 'parts', WLURI, multiple=True, required=False), - Field( DCNS('rights.license'), 'license', required=False), - Field( DCNS('rights'), 'license_description'), - Field( DCNS('language'), 'language'), - Field( DCNS('description'), 'description', required=False), - ) + ) @classmethod def from_string(cls, xml): @@ -186,7 +198,7 @@ class BookInfo(object): def from_element(cls, rdf_tag): # the tree is already parsed, so we don't need to worry about Expat errors field_dict = {} - desc = rdf_tag.find(".//" + RDFNS('Description') ) + desc = rdf_tag.find(".//" + RDFNS('Description')) if desc is None: raise NoDublinCore("No DublinCore section found.") @@ -196,7 +208,7 @@ class BookInfo(object): fv.append(e.text) field_dict[e.tag] = fv - return cls( desc.attrib, field_dict ) + return cls(desc.attrib, field_dict) def __init__(self, rdf_attrs, dc_fields): """rdf_attrs should be a dictionary-like object with any attributes of the RDF:Description. @@ -283,7 +295,6 @@ class BookInfo(object): return root - def serialize(self): rdf = {} rdf['about'] = { 'uri': RDFNS('about'), 'value': self.about } @@ -321,5 +332,24 @@ class BookInfo(object): return result -def parse(file_name): - return BookInfo.from_file(file_name) + +class BookInfo(WorkInfo): + FIELDS = ( + Field( DCNS('audience'), 'audiences', salias='audience', multiple=True, + required=False), + Field( DCNS('contributor.editor'), 'editors', \ + as_person, salias='editor', multiple=True, default=[]), + Field( DCNS('contributor.translator'), 'translators', \ + as_person, salias='translator', multiple=True, default=[]), + Field( DCNS('contributor.technical_editor'), 'technical_editors', + as_person, salias='technical_editor', multiple=True, default=[]), + Field( DCNS('relation.hasPart'), 'parts', WLURI, multiple=True, required=False), + Field( DCNS('rights.license'), 'license', required=False), + Field( DCNS('rights'), 'license_description'), + Field( DCNS('language'), 'language'), + Field( DCNS('description'), 'description', required=False), + ) + + +def parse(file_name, cls=BookInfo): + return cls.from_file(file_name) diff --git a/librarian/picture.py b/librarian/picture.py new file mode 100644 index 0000000..7830528 --- /dev/null +++ b/librarian/picture.py @@ -0,0 +1,151 @@ + +from dcparser import (as_person, as_date, Field, WorkInfo, DCNS) +from librarian import (RDFNS, ValidationError, NoDublinCore, ParseError, WLURI) +from xml.parsers.expat import ExpatError +from os import path +from StringIO import StringIO +from lxml import etree +from lxml.etree import (XMLSyntaxError, XSLTApplyError) +import re + + +class WLPictureURI(WLURI): + _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/obraz/' + '(?P[-a-z0-9]+)(/(?P[a-z]{3}))?/?$') + + def __init__(self, *args, **kw): + super(WLPictureURI, self).__init__(*args, **kw) + + @classmethod + def from_slug_and_lang(cls, slug, lang): + uri = 'http://wolnelektury.pl/katalog/obraz/%s/' % slug + return cls(uri) + + def filename_stem(self): + return self.slug + + +class PictureInfo(WorkInfo): + """ + Dublin core metadata for a picture + """ + FIELDS = ( + Field(DCNS('format.dimensions.digital'), 'dimensions', required=False), + Field(DCNS('format.dimensions.original'), 'dimensions_original', required=False), + Field(DCNS('format.physical'), 'physical', required=False), + Field(DCNS('format'), 'mime_type', required=False), + Field(DCNS('identifier.url'), 'url', WLPictureURI), + ) + + def validate(self): + """ + WorkInfo has a language validation code only, which we do not need. + """ + pass + + +class ImageStore(object): + EXT = ['gif', 'jpeg', 'png', 'swf', 'psd', 'bmp' + 'tiff', 'tiff', 'jpc', 'jp2', 'jpf', 'jb2', 'swc', + 'aiff', 'wbmp', 'xbm'] + MIME = ['image/gif', 'image/jpeg', 'image/png', + 'application/x-shockwave-flash', 'image/psd', 'image/bmp', + 'image/tiff', 'image/tiff', 'application/octet-stream', + 'image/jp2', 'application/octet-stream', 'application/octet-stream', + 'application/x-shockwave-flash', 'image/iff', 'image/vnd.wap.wbmp', 'image/xbm'] + + def __init__(self, dir_): + self.dir = dir_ + return super(ImageStore, self).__init__() + + def path(self, slug, mime_type): + """ + Finds file by slug and mime type in our iamge store. + Returns a file objects (perhaps should return a filename?) + """ + try: + i = self.MIME.index(mime_type) + except ValueError: + err = ValueError("Picture %s has unknown mime type: %s" % (slug, mime_type)) + err.slug = slug + err.mime_type = mime_type + raise err + ext = self.EXT[i] + # add some common extensions tiff->tif, jpeg->jpg + return path.join(self.dir, slug + '.' + ext) + + +class WLPicture(object): + def __init__(self, edoc, parse_dublincore=True, image_store=None): + self.edoc = edoc + self.image_store = image_store + + root_elem = edoc.getroot() + + dc_path = './/' + RDFNS('RDF') + + if root_elem.tag != 'picture': + raise ValidationError("Invalid root element. Found '%s', should be 'picture'" % root_elem.tag) + + if parse_dublincore: + self.rdf_elem = root_elem.find(dc_path) + + if self.rdf_elem is None: + raise NoDublinCore('Document has no DublinCore - which is required.') + + self.picture_info = PictureInfo.from_element(self.rdf_elem) + else: + self.picture_info = None + + @classmethod + def from_string(cls, xml, *args, **kwargs): + return cls.from_file(StringIO(xml), *args, **kwargs) + + @classmethod + def from_file(cls, xmlfile, parse_dublincore=True, image_store=None): + + # first, prepare for parsing + if isinstance(xmlfile, basestring): + file = open(xmlfile, 'rb') + try: + data = file.read() + finally: + file.close() + else: + data = xmlfile.read() + + if not isinstance(data, unicode): + data = data.decode('utf-8') + + data = data.replace(u'\ufeff', '') + + # assume images are in the same directory + if image_store is None and xmlfile.name is not None: + image_store = ImageStore(path.dirname(xmlfile.name)) + + try: + parser = etree.XMLParser(remove_blank_text=False) + tree = etree.parse(StringIO(data.encode('utf-8')), parser) + + return cls(tree, parse_dublincore=parse_dublincore, image_store=image_store) + except (ExpatError, XMLSyntaxError, XSLTApplyError), e: + raise ParseError(e) + + @property + def mime_type(self): + if self.picture_info is None: + raise ValueError('DC is not loaded, hence we don\'t know the image type') + return self.picture_info.mime_type + + @property + def slug(self): + return self.picture_info.url.slug + + @property + def image_path(self): + if self.image_store is None: + raise ValueError("No image store associated with whis WLPicture.") + return self.image_store.path(self.slug, self.mime_type) + + def image_file(self, *args, **kwargs): + return open(self.image_path, *args, **kwargs) diff --git a/tests/files/picture/angelus-novus.png b/tests/files/picture/angelus-novus.png new file mode 100644 index 0000000..9925dad Binary files /dev/null and b/tests/files/picture/angelus-novus.png differ diff --git a/tests/files/picture/angelus-novus.xml b/tests/files/picture/angelus-novus.xml new file mode 100644 index 0000000..ae2afbc --- /dev/null +++ b/tests/files/picture/angelus-novus.xml @@ -0,0 +1,36 @@ + + + + Klee, Paul + Angelus Novus + Fundacja Nowoczesna Polska + Modernizm + Obraz + Akwarela + Publikacja zrealizowana w ramach projektu Wolne Lektury (http://wolnelektury.pl). Reprodukcja cyfrowa wykonana przez Bibliotekę Narodową z egzemplarza pochodzącego ze zbiorów BN. + http://wolnelektury.pl/katalog/obraz/angelus-novus + http://katilifox.files.wordpress.com/2011/04/1190051611_angelus-novus.jpg + + Domena publiczna - Paul Klee zm. 1940 + 1940 + Image + image/png + 1645 x 2000 px + 31.8 × 24.2 cm + Akwarela na papierze + 1920 + + + + +
+ + +
+ + +
+
+ + + diff --git a/tests/test_picture.py b/tests/test_picture.py new file mode 100644 index 0000000..4e3b252 --- /dev/null +++ b/tests/test_picture.py @@ -0,0 +1,40 @@ +# -*- coding: utf-8 -*- +# +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# +from librarian import picture, dcparser +from lxml import etree +from nose.tools import * +from os.path import splitext +from tests.utils import get_all_fixtures, get_fixture +import codecs +from os import path + +def test_wlpictureuri(): + uri = picture.WLPictureURI('http://wolnelektury.pl/katalog/obraz/angelus-novus') + +def check_load(xml_file): + pi = dcparser.parse(xml_file, picture.PictureInfo) + assert pi is not None + assert isinstance(pi, picture.PictureInfo) + + +def test_load(): + for fixture in get_all_fixtures('picture', '*.xml'): + yield check_load, fixture + + +def test_wlpicture(): + wlp = picture.WLPicture.from_file(open(get_fixture('picture', 'angelus-novus.xml'))) + pi = wlp.picture_info + + # from nose.tools import set_trace; set_trace() + assert pi.type[0] == u"Image" + assert pi.mime_type == u'image/png' == wlp.mime_type + assert wlp.slug == 'angelus-novus' + + assert path.exists(wlp.image_path) + + f = wlp.image_file('r') + f.close()