New EPUB builder, other minor changes.
[librarian.git] / src / librarian / epub.py
index fc5ee16..0fb91e5 100644 (file)
@@ -15,6 +15,7 @@ from mimetypes import guess_type
 
 from ebooklib import epub
 from lxml import etree
+from PIL import Image
 from tempfile import mkdtemp, NamedTemporaryFile
 from shutil import rmtree
 
@@ -29,6 +30,7 @@ functions.reg_person_name()
 
 
 def squeeze_whitespace(s):
+    return s  # NOTE(review): returns s unchanged, so the re.sub below never runs; confirm the no-op is intended
     return re.sub(b'\\s+', b' ', s)
 
 
@@ -61,33 +63,6 @@ def hyphenate_and_fix_conjunctions(source_tree, hyph):
             parent.tail = newt
 
 
-def inner_xml(node):
-    """ returns node's text and children as a string
-
-    >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
-    x<b>y</b>z
-    """
-
-    nt = node.text if node.text is not None else ''
-    return ''.join(
-        [nt] + [etree.tostring(child, encoding='unicode') for child in node]
-    )
-
-
-def set_inner_xml(node, text):
-    """ sets node's text and children from a string
-
-    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
-    >>> set_inner_xml(e, 'x<b>y</b>z')
-    >>> print(etree.tostring(e, encoding='unicode'))
-    <a>x<b>y</b>z</a>
-    """
-
-    p = etree.fromstring('<x>%s</x>' % text)
-    node.text = p.text
-    node[:] = p[:]
-
-
 def node_name(node):
     """ Find out a node's name
 
@@ -376,17 +351,8 @@ def remove_empty_lists_from_toc(toc):
                 toc[i] = e[0]
 
 
-def transform(wldoc, verbose=False, style=None,
-              sample=None, cover=None, flags=None, hyphenate=False,
-              ilustr_path='', output_type='epub'):
-    """ produces a EPUB file
-
-    sample=n: generate sample e-book (with at least n paragraphs)
-    cover: a cover.Cover factory or True for default
-    flags: less-advertising, without-fonts, working-copy
-    """
 
-    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
+def transform_file(wldoc, chunk_counter=1, first=True, sample=None, hyphenate=False, output_type='epub', spine=None, output=None, annotations=None):
         """ processes one input file and proceeds to its children """
 
         replace_characters(wldoc.edoc.getroot())
@@ -447,6 +413,14 @@ def transform(wldoc, verbose=False, style=None,
             output.add_item(item)
             spine.append(item)
 
+            toc[-1][1].append(
+                epub.Link(
+                    "part1.xhtml",
+                    "Początek utworu",
+                    "part1"
+                )
+            )
+
         elif wldoc.book_info.parts:
             # write title page for every parent
             if sample is not None and sample <= 0:
@@ -509,12 +483,27 @@ def transform(wldoc, verbose=False, style=None,
 
         for child in wldoc.parts():
             child_toc, chunk_counter, chunk_chars, sample = transform_file(
-                child, chunk_counter, first=False, sample=sample)
+                child, chunk_counter, first=False, sample=sample,
+                hyphenate=hyphenate, output_type=output_type,
+                spine=spine, output=output, annotations=annotations,
+            )
             toc[-1][1].extend(child_toc)
             chars = chars.union(chunk_chars)
 
         return toc, chunk_counter, chars, sample
 
+                
+def transform(wldoc, verbose=False, style=None,
+              sample=None, cover=None, flags=None, hyphenate=False,
+              base_url='file://./', output_type='epub'):
+    """ produces an EPUB file
+
+    sample=n: generate sample e-book (with at least n paragraphs)
+    cover: a cover.Cover factory or True for default
+    flags: less-advertising, without-fonts, working-copy
+    base_url: base against which each <ilustr> src is resolved (urljoin)
+        before the image is fetched with urlopen and embedded
+    """
     document = deepcopy(wldoc)
     del wldoc
 
@@ -540,16 +529,18 @@ def transform(wldoc, verbose=False, style=None,
     output.set_identifier(six.text_type(document.book_info.url))
     output.set_language(functions.lang_code_3to2(document.book_info.language))
     output.set_title(document.book_info.title)
-    for author in document.book_info.authors:
+    for i, author in enumerate(document.book_info.authors):
         output.add_author(
             author.readable(),
-            file_as=six.text_type(author)
+            file_as=six.text_type(author),
+            uid='creator{}'.format(i)
         )
-    for translator in document.book_info.translators:
+    for i, translator in enumerate(document.book_info.translators):  # own counter: the uid below must not reuse the authors' loop index
         output.add_author(
             translator.readable(),
             file_as=six.text_type(translator),
-            role='translator'
+            role='trl',
+            uid='translator{}'.format(i)  # unique per translator
         )
     for publisher in document.book_info.publisher:
         output.add_metadata("DC", "publisher", publisher)
@@ -567,23 +558,42 @@ def transform(wldoc, verbose=False, style=None,
 
     functions.reg_mathml_epub(output)
 
-    if os.path.isdir(ilustr_path):
-        ilustr_elements = set(ilustr.get('src')
-                              for ilustr in document.edoc.findall('//ilustr'))
-        for i, filename in enumerate(os.listdir(ilustr_path)):
-            if filename not in ilustr_elements:
-                continue
-            file_path = os.path.join(ilustr_path, filename)
-            with open(file_path, 'rb') as f:
-                output.add_item(
-                    epub.EpubItem(
-                        uid='image%s' % i,
-                        file_name=filename,
-                        media_type=guess_type(file_path)[0],
-                        content=f.read()
-                    )
-                )
+    # FIXME (left unexplained): embeds each <ilustr> image, resolved against base_url; fetch and decode errors are not handled here
+    for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
+        url = six.moves.urllib.parse.urljoin(
+            base_url,
+            ilustr.get('src')
+        )
+        imgfile = six.moves.urllib.request.urlopen(url)
+        img = Image.open(imgfile)
 
+        th_format, ext, media_type = {
+            'GIF': ('GIF', 'gif', 'image/gif'),
+            'PNG': ('PNG', 'png', 'image/png'),
+        }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))  # any other source format is re-saved as JPEG
+
+        width = 1200  # width cap for embedded images
+        if img.size[0] < width:
+            th = img  # already narrower than the cap: keep the original dimensions
+        else:
+            th = img.resize((width, round(width * img.size[1] / img.size[0])))  # height scaled by the same ratio
+
+        imgfile.close()
+            
+        buffer = six.BytesIO()
+        th.save(buffer, format=th_format)
+
+        file_name = 'image%d.%s' % (i, ext)
+        ilustr.set('src', file_name)  # point the document at the embedded copy
+        output.add_item(
+            epub.EpubItem(
+                uid='image%s' % i,
+                file_name=file_name,
+                media_type=media_type,
+                content=buffer.getvalue()
+            )
+        )
+            
     # write static elements
 
     with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
@@ -649,18 +659,13 @@ def transform(wldoc, verbose=False, style=None,
 
     annotations = etree.Element('annotations')
 
-    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
+    toc, chunk_counter, chars, sample = transform_file(
+        document, sample=sample,
+        hyphenate=hyphenate, output_type=output_type,
+        spine=spine, output=output, annotations=annotations
+    )
     output.toc = toc[0][1]
 
-    if len(toc) < 2:
-        output.toc.append(
-            epub.Link(
-                "part1.xhtml",
-                "Początek utworu",
-                "part1"
-            )
-        )
-
     # Last modifications in container files and EPUB creation
     if len(annotations) > 0:
         output.toc.append(
@@ -767,6 +772,7 @@ def transform(wldoc, verbose=False, style=None,
             os.chdir(cwd)
 
     remove_empty_lists_from_toc(output.toc)
+    print(output.toc)  # NOTE(review): looks like leftover debug output written to stdout; confirm before merging
 
     output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
                                      delete=False)