Epub: only attach images referenced in the text.
[librarian.git] / librarian / picture.py
index 7b98ff1..10d2ae7 100644 (file)
@@ -1,24 +1,27 @@
+# -*- coding: utf-8 -*-
+from __future__ import unicode_literals
 
 
-from dcparser import (as_person, as_date, Field, WorkInfo, DCNS)
+from operator import and_
+
+from .dcparser import Field, WorkInfo, DCNS
 from librarian import (RDFNS, ValidationError, NoDublinCore, ParseError, WLURI)
 from xml.parsers.expat import ExpatError
 from os import path
 from librarian import (RDFNS, ValidationError, NoDublinCore, ParseError, WLURI)
 from xml.parsers.expat import ExpatError
 from os import path
-from StringIO import StringIO
 from lxml import etree
 from lxml.etree import (XMLSyntaxError, XSLTApplyError, Element)
 import re
 from lxml import etree
 from lxml.etree import (XMLSyntaxError, XSLTApplyError, Element)
 import re
-from functools import *
-from operator import *
+import six
+
 
 class WLPictureURI(WLURI):
 
 class WLPictureURI(WLURI):
-    _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/obraz/'
-            '(?P<slug>[-a-z0-9]+)/?$')
+    _re_wl_uri = re.compile('http://wolnelektury.pl/katalog/obraz/(?P<slug>[-a-z0-9]+)/?$')
 
     @classmethod
     def from_slug(cls, slug):
         uri = 'http://wolnelektury.pl/katalog/obraz/%s/' % slug
         return cls(uri)
 
 
     @classmethod
     def from_slug(cls, slug):
         uri = 'http://wolnelektury.pl/katalog/obraz/%s/' % slug
         return cls(uri)
 
+
 def as_wlpictureuri_strict(text):
     return WLPictureURI.strict(text)
 
 def as_wlpictureuri_strict(text):
     return WLPictureURI.strict(text)
 
@@ -32,21 +35,21 @@ class PictureInfo(WorkInfo):
         Field(DCNS('subject.period'), 'epochs', salias='epoch', multiple=True),
         Field(DCNS('subject.type'), 'kinds', salias='kind', multiple=True),
         Field(DCNS('subject.genre'), 'genres', salias='genre', multiple=True, required=False),
         Field(DCNS('subject.period'), 'epochs', salias='epoch', multiple=True),
         Field(DCNS('subject.type'), 'kinds', salias='kind', multiple=True),
         Field(DCNS('subject.genre'), 'genres', salias='genre', multiple=True, required=False),
+        Field(DCNS('subject.style'), 'styles', salias='style', multiple=True, required=False),
 
         Field(DCNS('format.dimensions'), 'dimensions', required=False),
         Field(DCNS('format.checksum.sha1'), 'sha1', required=True),
         Field(DCNS('description.medium'), 'medium', required=False),
         Field(DCNS('description.dimensions'), 'original_dimensions', required=False),
         Field(DCNS('format'), 'mime_type', required=False),
 
         Field(DCNS('format.dimensions'), 'dimensions', required=False),
         Field(DCNS('format.checksum.sha1'), 'sha1', required=True),
         Field(DCNS('description.medium'), 'medium', required=False),
         Field(DCNS('description.dimensions'), 'original_dimensions', required=False),
         Field(DCNS('format'), 'mime_type', required=False),
-        Field(DCNS('identifier.url'), 'url', WLPictureURI,
-            strict=as_wlpictureuri_strict),
-        )
+        Field(DCNS('identifier.url'), 'url', WLPictureURI, strict=as_wlpictureuri_strict)
+    )
 
 
 class ImageStore(object):
     EXT = ['gif', 'jpeg', 'png', 'swf', 'psd', 'bmp'
 
 
 class ImageStore(object):
     EXT = ['gif', 'jpeg', 'png', 'swf', 'psd', 'bmp'
-            'tiff', 'tiff', 'jpc', 'jp2', 'jpf', 'jb2', 'swc',
-            'aiff', 'wbmp', 'xbm']
+           'tiff', 'tiff', 'jpc', 'jp2', 'jpf', 'jb2', 'swc',
+           'aiff', 'wbmp', 'xbm']
     MIME = ['image/gif', 'image/jpeg', 'image/png',
             'application/x-shockwave-flash', 'image/psd', 'image/bmp',
             'image/tiff', 'image/tiff', 'application/octet-stream',
     MIME = ['image/gif', 'image/jpeg', 'image/png',
             'application/x-shockwave-flash', 'image/psd', 'image/bmp',
             'image/tiff', 'image/tiff', 'application/octet-stream',
@@ -54,8 +57,8 @@ class ImageStore(object):
             'application/x-shockwave-flash', 'image/iff', 'image/vnd.wap.wbmp', 'image/xbm']
 
     def __init__(self, dir_):
             'application/x-shockwave-flash', 'image/iff', 'image/vnd.wap.wbmp', 'image/xbm']
 
     def __init__(self, dir_):
+        super(ImageStore, self).__init__()
         self.dir = dir_
         self.dir = dir_
-        return super(ImageStore, self).__init__()
 
     def path(self, slug, mime_type):
         """
 
     def path(self, slug, mime_type):
         """
@@ -95,16 +98,17 @@ class WLPicture(object):
             self.picture_info = PictureInfo.from_element(self.rdf_elem)
         else:
             self.picture_info = None
             self.picture_info = PictureInfo.from_element(self.rdf_elem)
         else:
             self.picture_info = None
+        self.frame = None
 
     @classmethod
 
     @classmethod
-    def from_string(cls, xml, *args, **kwargs):
-        return cls.from_file(StringIO(xml), *args, **kwargs)
+    def from_bytes(cls, xml, *args, **kwargs):
+        return cls.from_file(six.BytesIO(xml), *args, **kwargs)
 
     @classmethod
     def from_file(cls, xmlfile, parse_dublincore=True, image_store=None):
 
         # first, prepare for parsing
 
     @classmethod
     def from_file(cls, xmlfile, parse_dublincore=True, image_store=None):
 
         # first, prepare for parsing
-        if isinstance(xmlfile, basestring):
+        if isinstance(xmlfile, six.text_type):
             file = open(xmlfile, 'rb')
             try:
                 data = file.read()
             file = open(xmlfile, 'rb')
             try:
                 data = file.read()
@@ -113,23 +117,23 @@ class WLPicture(object):
         else:
             data = xmlfile.read()
 
         else:
             data = xmlfile.read()
 
-        if not isinstance(data, unicode):
+        if not isinstance(data, six.text_type):
             data = data.decode('utf-8')
 
         data = data.replace(u'\ufeff', '')
 
         # assume images are in the same directory
             data = data.decode('utf-8')
 
         data = data.replace(u'\ufeff', '')
 
         # assume images are in the same directory
-        if image_store is None and xmlfile.name is not None:
+        if image_store is None and getattr(xmlfile, 'name', None):
             image_store = ImageStore(path.dirname(xmlfile.name))
 
         try:
             parser = etree.XMLParser(remove_blank_text=False)
             image_store = ImageStore(path.dirname(xmlfile.name))
 
         try:
             parser = etree.XMLParser(remove_blank_text=False)
-            tree = etree.parse(StringIO(data.encode('utf-8')), parser)
+            tree = etree.parse(six.BytesIO(data.encode('utf-8')), parser)
 
 
-            me =  cls(tree, parse_dublincore=parse_dublincore, image_store=image_store)
+            me = cls(tree, parse_dublincore=parse_dublincore, image_store=image_store)
             me.load_frame_info()
             return me
             me.load_frame_info()
             return me
-        except (ExpatError, XMLSyntaxError, XSLTApplyError), e:
+        except (ExpatError, XMLSyntaxError, XSLTApplyError) as e:
             raise ParseError(e)
 
     @property
             raise ParseError(e)
 
     @property
@@ -150,40 +154,42 @@ class WLPicture(object):
         return self.image_store.path(self.slug, self.mime_type)
 
     def image_file(self, *args, **kwargs):
         return self.image_store.path(self.slug, self.mime_type)
 
     def image_file(self, *args, **kwargs):
-        return open(self.image_path, *args, **kwargs)
+        return open(self.image_path, 'rb', *args, **kwargs)
 
     def get_sem_coords(self, sem):
         area = sem.find("div[@type='rect']")
         if area is None:
             area = sem.find("div[@type='whole']")
 
     def get_sem_coords(self, sem):
         area = sem.find("div[@type='rect']")
         if area is None:
             area = sem.find("div[@type='whole']")
-            return ((0, 0), (-1, -1))
+            return [[0, 0], [-1, -1]]
 
         def has_all_props(node, props):
             return reduce(and_, map(lambda prop: prop in node.attrib, props))
 
 
         def has_all_props(node, props):
             return reduce(and_, map(lambda prop: prop in node.attrib, props))
 
-        if has_all_props(area,  ['x1', 'x2', 'y1', 'y2']) == False:
+        if not has_all_props(area, ['x1', 'x2', 'y1', 'y2']):
             return None
             return None
-            
+
         def n(prop): return int(area.get(prop))
         def n(prop): return int(area.get(prop))
-        return ((n('x1'), n('y1')), (n('x2'), n('y2')))
-        
+        return [[n('x1'), n('y1')], [n('x2'), n('y2')]]
 
     def partiter(self):
         """
         Iterates the parts of this picture and returns them and their metadata
         """
         # omg no support for //sem[(@type='theme') or (@type='object')] ?
 
     def partiter(self):
         """
         Iterates the parts of this picture and returns them and their metadata
         """
         # omg no support for //sem[(@type='theme') or (@type='object')] ?
-        for part in list(self.edoc.iterfind("//sem[@type='theme']")) + list(self.edoc.iterfind("//sem[@type='object']")):
-            pd = {}
-            pd['type'] = part.get('type')
+        for part in list(self.edoc.iterfind("//sem[@type='theme']")) +\
+                list(self.edoc.iterfind("//sem[@type='object']")):
+            pd = {'type': part.get('type')}
 
             coords = self.get_sem_coords(part)
 
             coords = self.get_sem_coords(part)
-            if coords is None: continue
+            if coords is None:
+                continue
             pd['coords'] = coords
 
             def want_unicode(x):
             pd['coords'] = coords
 
             def want_unicode(x):
-                if not isinstance(x, unicode): return x.decode('utf-8')
-                else: return x
+                if not isinstance(x, six.text_type):
+                    return x.decode('utf-8')
+                else:
+                    return x
             pd['object'] = part.attrib['type'] == 'object' and want_unicode(part.attrib.get('object', u'')) or None
             pd['themes'] = part.attrib['type'] == 'theme' and [part.attrib.get('theme', u'')] or []
             yield pd
             pd['object'] = part.attrib['type'] == 'object' and want_unicode(part.attrib.get('object', u'')) or None
             pd['themes'] = part.attrib['type'] == 'theme' and [part.attrib.get('theme', u'')] or []
             yield pd