Epub: only attach images referenced in the text.
[librarian.git] / librarian / epub.py
index 831639b..be9488a 100644 (file)
@@ -3,21 +3,23 @@
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
-from __future__ import with_statement
+from __future__ import print_function, unicode_literals
 
 import os
 import os.path
 import re
 import subprocess
 
 import os
 import os.path
 import re
 import subprocess
-from StringIO import StringIO
+from six import BytesIO
 from copy import deepcopy
 from copy import deepcopy
+from mimetypes import guess_type
+
 from lxml import etree
 import zipfile
 from tempfile import mkdtemp, NamedTemporaryFile
 from shutil import rmtree
 
 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
 from lxml import etree
 import zipfile
 from tempfile import mkdtemp, NamedTemporaryFile
 from shutil import rmtree
 
 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
-from librarian.cover import DefaultEbookCover
+from librarian.cover import make_cover
 
 from librarian import functions, get_resource
 
 
 from librarian import functions, get_resource
 
@@ -27,12 +29,16 @@ functions.reg_person_name()
 functions.reg_lang_code_3to2()
 
 
 functions.reg_lang_code_3to2()
 
 
+def squeeze_whitespace(s):
+    return re.sub(b'\\s+', b' ', s)
+
+
 def set_hyph_language(source_tree):
     def get_short_lng_code(text):
         result = ''
         text = ''.join(text)
         with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
 def set_hyph_language(source_tree):
     def get_short_lng_code(text):
         result = ''
         text = ''.join(text)
         with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
-            for line in f:
+            for line in f.read().decode('latin1').split('\n'):
                 list = line.strip().split('|')
                 if list[0] == text:
                     result = list[2]
                 list = line.strip().split('|')
                 if list[0] == text:
                     result = list[2]
@@ -51,30 +57,32 @@ def set_hyph_language(source_tree):
 
 
 def hyphenate_and_fix_conjunctions(source_tree, hyph):
 
 
 def hyphenate_and_fix_conjunctions(source_tree, hyph):
-    if hyph is not None:
-        texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
-        for t in texts:
-            parent = t.getparent()
+    texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
+    for t in texts:
+        parent = t.getparent()
+        if hyph is not None:
             newt = ''
             wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
             for w in wlist:
                 newt += hyph.inserted(w, u'\u00AD')
             newt = ''
             wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
             for w in wlist:
                 newt += hyph.inserted(w, u'\u00AD')
-            newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
-            if t.is_text:
-                parent.text = newt
-            elif t.is_tail:
-                parent.tail = newt
+        else:
+            newt = t
+        newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
+        if t.is_text:
+            parent.text = newt
+        elif t.is_tail:
+            parent.tail = newt
 
 
 def inner_xml(node):
     """ returns node's text and children as a string
 
 
 
 def inner_xml(node):
     """ returns node's text and children as a string
 
-    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
+    >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
     x<b>y</b>z
     """
 
     nt = node.text if node.text is not None else ''
     x<b>y</b>z
     """
 
     nt = node.text if node.text is not None else ''
-    return ''.join([nt] + [etree.tostring(child) for child in node])
+    return ''.join([nt] + [etree.tostring(child, encoding='unicode') for child in node])
 
 
 def set_inner_xml(node, text):
 
 
 def set_inner_xml(node, text):
@@ -82,7 +90,7 @@ def set_inner_xml(node, text):
 
     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
     >>> set_inner_xml(e, 'x<b>y</b>z')
 
     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
     >>> set_inner_xml(e, 'x<b>y</b>z')
-    >>> print etree.tostring(e)
+    >>> print(etree.tostring(e, encoding='unicode'))
     <a>x<b>y</b>z</a>
     """
 
     <a>x<b>y</b>z</a>
     """
 
@@ -94,7 +102,7 @@ def set_inner_xml(node, text):
 def node_name(node):
     """ Find out a node's name
 
 def node_name(node):
     """ Find out a node's name
 
-    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
+    >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
     XYZ
     """
 
     XYZ
     """
 
@@ -109,11 +117,13 @@ def node_name(node):
     return tempnode.text
 
 
     return tempnode.text
 
 
-def xslt(xml, sheet):
+def xslt(xml, sheet, **kwargs):
     if isinstance(xml, etree._Element):
         xml = etree.ElementTree(xml)
     with open(sheet) as xsltf:
     if isinstance(xml, etree._Element):
         xml = etree.ElementTree(xml)
     with open(sheet) as xsltf:
-        return xml.xslt(etree.parse(xsltf))
+        transform = etree.XSLT(etree.parse(xsltf))
+        params = dict((key, transform.strparam(value)) for key, value in kwargs.items())
+        return transform(xml, **params)
 
 
 def replace_characters(node):
 
 
 def replace_characters(node):
@@ -162,8 +172,8 @@ class Stanza(object):
 
     >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
     >>> Stanza(s).versify()
 
     >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
     >>> Stanza(s).versify()
-    >>> print etree.tostring(s)
-    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
+    >>> print(etree.tostring(s, encoding='unicode'))
+    <strofa><wers_normalny>a <b>c</b><b>c</b></wers_normalny><wers_normalny>b<x>x/
     y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
 
     """
     y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
 
     """
@@ -180,7 +190,7 @@ class Stanza(object):
         tail = self.stanza.tail
         self.stanza.clear()
         self.stanza.tail = tail
         tail = self.stanza.tail
         self.stanza.clear()
         self.stanza.tail = tail
-        self.stanza.extend(self.verses)
+        self.stanza.extend(verse for verse in self.verses if verse.text or len(verse) > 0)
 
     def open_normal_verse(self):
         self.open_verse = self.stanza.makeelement("wers_normalny")
 
     def open_normal_verse(self):
         self.open_verse = self.stanza.makeelement("wers_normalny")
@@ -197,6 +207,8 @@ class Stanza(object):
         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
             if i:
                 self.open_normal_verse()
         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
             if i:
                 self.open_normal_verse()
+            if not verse_text.strip():
+                continue
             verse = self.get_open_verse()
             if len(verse):
                 verse[-1].tail = (verse[-1].tail or "") + verse_text
             verse = self.get_open_verse()
             if len(verse):
                 verse[-1].tail = (verse[-1].tail or "") + verse_text
@@ -313,8 +325,8 @@ class TOC(object):
         return "\n".join(texts)
 
     def html(self):
         return "\n".join(texts)
 
     def html(self):
-        with open(get_resource('epub/toc.html')) as f:
-            t = unicode(f.read(), 'utf-8')
+        with open(get_resource('epub/toc.html'), 'rb') as f:
+            t = f.read().decode('utf-8')
         return t % self.html_part()
 
 
         return t % self.html_part()
 
 
@@ -404,9 +416,8 @@ def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_s
     return output_html, toc, chars
 
 
     return output_html, toc, chars
 
 
-def transform(wldoc, verbose=False,
-              style=None, html_toc=False,
-              sample=None, cover=None, flags=None, hyphenate=False):
+def transform(wldoc, verbose=False, style=None, html_toc=False,
+              sample=None, cover=None, flags=None, hyphenate=False, ilustr_path='', output_type='epub'):
     """ produces a EPUB file
 
     sample=n: generate sample e-book (with at least n paragraphs)
     """ produces a EPUB file
 
     sample=n: generate sample e-book (with at least n paragraphs)
@@ -419,9 +430,8 @@ def transform(wldoc, verbose=False,
 
         replace_characters(wldoc.edoc.getroot())
 
 
         replace_characters(wldoc.edoc.getroot())
 
-        if hyphenate:
-            hyphenator = set_hyph_language(wldoc.edoc.getroot())
-            hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
+        hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
+        hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
 
         # every input file will have a TOC entry,
         # pointing to starting chunk
 
         # every input file will have a TOC entry,
         # pointing to starting chunk
@@ -429,17 +439,15 @@ def transform(wldoc, verbose=False,
         chars = set()
         if first:
             # write book title page
         chars = set()
         if first:
             # write book title page
-            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
+            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'), outputtype=output_type)
             chars = used_chars(html_tree.getroot())
             chars = used_chars(html_tree.getroot())
-            zip.writestr(
-                'OPS/title.html',
-                etree.tostring(
-                    html_tree, pretty_print=True, xml_declaration=True,
-                    encoding="utf-8",
-                    doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
-                            ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
-                )
+            html_string = etree.tostring(
+                html_tree, pretty_print=True, xml_declaration=True,
+                encoding="utf-8",
+                doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
+                        ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
             )
             )
+            zip.writestr('OPS/title.html', squeeze_whitespace(html_string))
             # add a title page TOC entry
             toc.add(u"Strona tytułowa", "title.html")
         elif wldoc.book_info.parts:
             # add a title page TOC entry
             toc.add(u"Strona tytułowa", "title.html")
         elif wldoc.book_info.parts:
@@ -456,7 +464,7 @@ def transform(wldoc, verbose=False,
                     doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                             ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
                 )
                     doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                             ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
                 )
-            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
+            zip.writestr('OPS/part%d.html' % chunk_counter, squeeze_whitespace(html_string))
             add_to_manifest(manifest, chunk_counter)
             add_to_spine(spine, chunk_counter)
             chunk_counter += 1
             add_to_manifest(manifest, chunk_counter)
             add_to_spine(spine, chunk_counter)
             chunk_counter += 1
@@ -482,7 +490,7 @@ def transform(wldoc, verbose=False,
 
                 toc.extend(chunk_toc)
                 chars = chars.union(chunk_chars)
 
                 toc.extend(chunk_toc)
                 chars = chars.union(chunk_chars)
-                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
+                zip.writestr('OPS/part%d.html' % chunk_counter, squeeze_whitespace(chunk_html))
                 add_to_manifest(manifest, chunk_counter)
                 add_to_spine(spine, chunk_counter)
                 chunk_counter += 1
                 add_to_manifest(manifest, chunk_counter)
                 add_to_spine(spine, chunk_counter)
                 chunk_counter += 1
@@ -526,20 +534,31 @@ def transform(wldoc, verbose=False,
 
     functions.reg_mathml_epub(zip)
 
 
     functions.reg_mathml_epub(zip)
 
+    if os.path.isdir(ilustr_path):
+        ilustr_elements = set(ilustr.get('src') for ilustr in document.edoc.findall('//ilustr'))
+        for i, filename in enumerate(os.listdir(ilustr_path)):
+            if filename not in ilustr_elements:
+                continue
+            file_path = os.path.join(ilustr_path, filename)
+            zip.write(file_path, os.path.join('OPS', filename))
+            image_id = 'image%s' % i
+            manifest.append(etree.fromstring(
+                '<item id="%s" href="%s" media-type="%s" />' % (image_id, filename, guess_type(file_path)[0])))
+
     # write static elements
     mime = zipfile.ZipInfo()
     mime.filename = 'mimetype'
     mime.compress_type = zipfile.ZIP_STORED
     # write static elements
     mime = zipfile.ZipInfo()
     mime.filename = 'mimetype'
     mime.compress_type = zipfile.ZIP_STORED
-    mime.extra = ''
-    zip.writestr(mime, 'application/epub+zip')
+    mime.extra = b''
+    zip.writestr(mime, b'application/epub+zip')
     zip.writestr(
         'META-INF/container.xml',
     zip.writestr(
         'META-INF/container.xml',
-        '<?xml version="1.0" ?>'
-        '<container version="1.0" '
-        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
-        '<rootfiles><rootfile full-path="OPS/content.opf" '
-        'media-type="application/oebps-package+xml" />'
-        '</rootfiles></container>'
+        b'<?xml version="1.0" ?>'
+        b'<container version="1.0" '
+        b'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
+        b'<rootfiles><rootfile full-path="OPS/content.opf" '
+        b'media-type="application/oebps-package+xml" />'
+        b'</rootfiles></container>'
     )
     zip.write(get_resource('res/wl-logo-small.png'),
               os.path.join('OPS', 'logo_wolnelektury.png'))
     )
     zip.write(get_resource('res/wl-logo-small.png'),
               os.path.join('OPS', 'logo_wolnelektury.png'))
@@ -551,9 +570,9 @@ def transform(wldoc, verbose=False,
 
     if cover:
         if cover is True:
 
     if cover:
         if cover is True:
-            cover = DefaultEbookCover
+            cover = make_cover
 
 
-        cover_file = StringIO()
+        cover_file = BytesIO()
         bound_cover = cover(document.book_info)
         bound_cover.save(cover_file)
         cover_name = 'cover.%s' % bound_cover.ext()
         bound_cover = cover(document.book_info)
         bound_cover.save(cover_file)
         cover_name = 'cover.%s' % bound_cover.ext()
@@ -586,12 +605,12 @@ def transform(wldoc, verbose=False,
     annotations = etree.Element('annotations')
 
     toc_file = etree.fromstring(
     annotations = etree.Element('annotations')
 
     toc_file = etree.fromstring(
-        '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
-        '"-//NISO//DTD ncx 2005-1//EN" '
-        '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
-        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
-        'version="2005-1"><head></head><docTitle></docTitle><navMap>'
-        '</navMap></ncx>'
+        b'<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
+        b'"-//NISO//DTD ncx 2005-1//EN" '
+        b'"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
+        b'<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
+        b'version="2005-1"><head></head><docTitle></docTitle><navMap>'
+        b'</navMap></ncx>'
     )
     nav_map = toc_file[-1]
 
     )
     nav_map = toc_file[-1]
 
@@ -629,23 +648,23 @@ def transform(wldoc, verbose=False,
         '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
     spine.append(etree.fromstring(
         '<itemref idref="support" />'))
         '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
     spine.append(etree.fromstring(
         '<itemref idref="support" />'))
-    html_string = open(get_resource('epub/support.html')).read()
+    html_string = open(get_resource('epub/support.html'), 'rb').read()
     chars.update(used_chars(etree.fromstring(html_string)))
     chars.update(used_chars(etree.fromstring(html_string)))
-    zip.writestr('OPS/support.html', html_string)
+    zip.writestr('OPS/support.html', squeeze_whitespace(html_string))
 
     toc.add("Strona redakcyjna", "last.html")
     manifest.append(etree.fromstring(
         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
     spine.append(etree.fromstring(
         '<itemref idref="last" />'))
 
     toc.add("Strona redakcyjna", "last.html")
     manifest.append(etree.fromstring(
         '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
     spine.append(etree.fromstring(
         '<itemref idref="last" />'))
-    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
+    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'), outputtype=output_type)
     chars.update(used_chars(html_tree.getroot()))
     chars.update(used_chars(html_tree.getroot()))
-    zip.writestr('OPS/last.html', etree.tostring(
+    zip.writestr('OPS/last.html', squeeze_whitespace(etree.tostring(
         html_tree, pretty_print=True, xml_declaration=True,
         encoding="utf-8",
         doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
         html_tree, pretty_print=True, xml_declaration=True,
         encoding="utf-8",
         doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
-    ))
+    )))
 
     if not flags or 'without-fonts' not in flags:
         # strip fonts
 
     if not flags or 'without-fonts' not in flags:
         # strip fonts
@@ -661,11 +680,13 @@ def transform(wldoc, verbose=False,
                               ''.join(chars).encode('utf-8'),
                               get_resource('fonts/' + fname),
                               os.path.join(tmpdir, fname)]
                               ''.join(chars).encode('utf-8'),
                               get_resource('fonts/' + fname),
                               os.path.join(tmpdir, fname)]
+            env = {"PERL_USE_UNSAFE_INC": "1"}
             if verbose:
             if verbose:
-                print "Running font-optimizer"
-                subprocess.check_call(optimizer_call)
+                print("Running font-optimizer")
+                subprocess.check_call(optimizer_call, env=env)
             else:
             else:
-                subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                dev_null = open(os.devnull, 'w')
+                subprocess.check_call(optimizer_call, stdout=dev_null, stderr=dev_null, env=env)
             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
             manifest.append(etree.fromstring(
                 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
             manifest.append(etree.fromstring(
                 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))