Fixes #3952.
[librarian.git] / librarian / epub.py
index 95e65f1..e9670d5 100644 (file)
@@ -3,13 +3,13 @@
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
-from __future__ import with_statement
+from __future__ import print_function, unicode_literals
 
 import os
 import os.path
 import re
 import subprocess
 
 import os
 import os.path
 import re
 import subprocess
-from StringIO import StringIO
+from six import BytesIO
 from copy import deepcopy
 from mimetypes import guess_type
 
 from copy import deepcopy
 from mimetypes import guess_type
 
@@ -19,7 +19,7 @@ from tempfile import mkdtemp, NamedTemporaryFile
 from shutil import rmtree
 
 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
 from shutil import rmtree
 
 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
-from librarian.cover import DefaultEbookCover
+from librarian.cover import make_cover
 
 from librarian import functions, get_resource
 
 
 from librarian import functions, get_resource
 
@@ -29,12 +29,16 @@ functions.reg_person_name()
 functions.reg_lang_code_3to2()
 
 
 functions.reg_lang_code_3to2()
 
 
+def squeeze_whitespace(s):
+    return re.sub(b'\\s+', b' ', s)
+
+
 def set_hyph_language(source_tree):
     def get_short_lng_code(text):
         result = ''
         text = ''.join(text)
         with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
 def set_hyph_language(source_tree):
     def get_short_lng_code(text):
         result = ''
         text = ''.join(text)
         with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
-            for line in f:
+            for line in f.read().decode('latin1').split('\n'):
                 list = line.strip().split('|')
                 if list[0] == text:
                     result = list[2]
                 list = line.strip().split('|')
                 if list[0] == text:
                     result = list[2]
@@ -73,12 +77,12 @@ def hyphenate_and_fix_conjunctions(source_tree, hyph):
 def inner_xml(node):
     """ returns node's text and children as a string
 
 def inner_xml(node):
     """ returns node's text and children as a string
 
-    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
+    >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
     x<b>y</b>z
     """
 
     nt = node.text if node.text is not None else ''
     x<b>y</b>z
     """
 
     nt = node.text if node.text is not None else ''
-    return ''.join([nt] + [etree.tostring(child) for child in node])
+    return ''.join([nt] + [etree.tostring(child, encoding='unicode') for child in node])
 
 
 def set_inner_xml(node, text):
 
 
 def set_inner_xml(node, text):
@@ -86,7 +90,7 @@ def set_inner_xml(node, text):
 
     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
     >>> set_inner_xml(e, 'x<b>y</b>z')
 
     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
     >>> set_inner_xml(e, 'x<b>y</b>z')
-    >>> print etree.tostring(e)
+    >>> print(etree.tostring(e, encoding='unicode'))
     <a>x<b>y</b>z</a>
     """
 
     <a>x<b>y</b>z</a>
     """
 
@@ -98,7 +102,7 @@ def set_inner_xml(node, text):
 def node_name(node):
     """ Find out a node's name
 
 def node_name(node):
     """ Find out a node's name
 
-    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
+    >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
     XYZ
     """
 
     XYZ
     """
 
@@ -118,7 +122,7 @@ def xslt(xml, sheet, **kwargs):
         xml = etree.ElementTree(xml)
     with open(sheet) as xsltf:
         transform = etree.XSLT(etree.parse(xsltf))
         xml = etree.ElementTree(xml)
     with open(sheet) as xsltf:
         transform = etree.XSLT(etree.parse(xsltf))
-        params = dict((key, transform.strparam(value)) for key, value in kwargs.iteritems())
+        params = dict((key, transform.strparam(value)) for key, value in kwargs.items())
         return transform(xml, **params)
 
 
         return transform(xml, **params)
 
 
@@ -168,8 +172,8 @@ class Stanza(object):
 
     >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
     >>> Stanza(s).versify()
 
     >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
     >>> Stanza(s).versify()
-    >>> print etree.tostring(s)
-    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
+    >>> print(etree.tostring(s, encoding='unicode'))
+    <strofa><wers_normalny>a <b>c</b><b>c</b></wers_normalny><wers_normalny>b<x>x/
     y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
 
     """
     y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
 
     """
@@ -186,7 +190,7 @@ class Stanza(object):
         tail = self.stanza.tail
         self.stanza.clear()
         self.stanza.tail = tail
         tail = self.stanza.tail
         self.stanza.clear()
         self.stanza.tail = tail
-        self.stanza.extend(self.verses)
+        self.stanza.extend(verse for verse in self.verses if verse.text or len(verse) > 0)
 
     def open_normal_verse(self):
         self.open_verse = self.stanza.makeelement("wers_normalny")
 
     def open_normal_verse(self):
         self.open_verse = self.stanza.makeelement("wers_normalny")
@@ -201,10 +205,10 @@ class Stanza(object):
         if not text:
             return
         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
         if not text:
             return
         for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
-            if not verse_text.strip():
-                continue
             if i:
                 self.open_normal_verse()
             if i:
                 self.open_normal_verse()
+            if not verse_text.strip():
+                continue
             verse = self.get_open_verse()
             if len(verse):
                 verse[-1].tail = (verse[-1].tail or "") + verse_text
             verse = self.get_open_verse()
             if len(verse):
                 verse[-1].tail = (verse[-1].tail or "") + verse_text
@@ -321,8 +325,8 @@ class TOC(object):
         return "\n".join(texts)
 
     def html(self):
         return "\n".join(texts)
 
     def html(self):
-        with open(get_resource('epub/toc.html')) as f:
-            t = unicode(f.read(), 'utf-8')
+        with open(get_resource('epub/toc.html'), 'rb') as f:
+            t = f.read().decode('utf-8')
         return t % self.html_part()
 
 
         return t % self.html_part()
 
 
@@ -437,15 +441,13 @@ def transform(wldoc, verbose=False, style=None, html_toc=False,
             # write book title page
             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'), outputtype=output_type)
             chars = used_chars(html_tree.getroot())
             # write book title page
             html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'), outputtype=output_type)
             chars = used_chars(html_tree.getroot())
-            zip.writestr(
-                'OPS/title.html',
-                etree.tostring(
-                    html_tree, pretty_print=True, xml_declaration=True,
-                    encoding="utf-8",
-                    doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
-                            ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
-                )
+            html_string = etree.tostring(
+                html_tree, pretty_print=True, xml_declaration=True,
+                encoding="utf-8",
+                doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
+                        ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
             )
             )
+            zip.writestr('OPS/title.html', squeeze_whitespace(html_string))
             # add a title page TOC entry
             toc.add(u"Strona tytułowa", "title.html")
         elif wldoc.book_info.parts:
             # add a title page TOC entry
             toc.add(u"Strona tytułowa", "title.html")
         elif wldoc.book_info.parts:
@@ -462,7 +464,7 @@ def transform(wldoc, verbose=False, style=None, html_toc=False,
                     doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                             ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
                 )
                     doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                             ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
                 )
-            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
+            zip.writestr('OPS/part%d.html' % chunk_counter, squeeze_whitespace(html_string))
             add_to_manifest(manifest, chunk_counter)
             add_to_spine(spine, chunk_counter)
             chunk_counter += 1
             add_to_manifest(manifest, chunk_counter)
             add_to_spine(spine, chunk_counter)
             chunk_counter += 1
@@ -488,7 +490,7 @@ def transform(wldoc, verbose=False, style=None, html_toc=False,
 
                 toc.extend(chunk_toc)
                 chars = chars.union(chunk_chars)
 
                 toc.extend(chunk_toc)
                 chars = chars.union(chunk_chars)
-                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
+                zip.writestr('OPS/part%d.html' % chunk_counter, squeeze_whitespace(chunk_html))
                 add_to_manifest(manifest, chunk_counter)
                 add_to_spine(spine, chunk_counter)
                 chunk_counter += 1
                 add_to_manifest(manifest, chunk_counter)
                 add_to_spine(spine, chunk_counter)
                 chunk_counter += 1
@@ -544,16 +546,16 @@ def transform(wldoc, verbose=False, style=None, html_toc=False,
     mime = zipfile.ZipInfo()
     mime.filename = 'mimetype'
     mime.compress_type = zipfile.ZIP_STORED
     mime = zipfile.ZipInfo()
     mime.filename = 'mimetype'
     mime.compress_type = zipfile.ZIP_STORED
-    mime.extra = ''
-    zip.writestr(mime, 'application/epub+zip')
+    mime.extra = b''
+    zip.writestr(mime, b'application/epub+zip')
     zip.writestr(
         'META-INF/container.xml',
     zip.writestr(
         'META-INF/container.xml',
-        '<?xml version="1.0" ?>'
-        '<container version="1.0" '
-        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
-        '<rootfiles><rootfile full-path="OPS/content.opf" '
-        'media-type="application/oebps-package+xml" />'
-        '</rootfiles></container>'
+        b'<?xml version="1.0" ?>'
+        b'<container version="1.0" '
+        b'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
+        b'<rootfiles><rootfile full-path="OPS/content.opf" '
+        b'media-type="application/oebps-package+xml" />'
+        b'</rootfiles></container>'
     )
     zip.write(get_resource('res/wl-logo-small.png'),
               os.path.join('OPS', 'logo_wolnelektury.png'))
     )
     zip.write(get_resource('res/wl-logo-small.png'),
               os.path.join('OPS', 'logo_wolnelektury.png'))
@@ -565,9 +567,9 @@ def transform(wldoc, verbose=False, style=None, html_toc=False,
 
     if cover:
         if cover is True:
 
     if cover:
         if cover is True:
-            cover = DefaultEbookCover
+            cover = make_cover
 
 
-        cover_file = StringIO()
+        cover_file = BytesIO()
         bound_cover = cover(document.book_info)
         bound_cover.save(cover_file)
         cover_name = 'cover.%s' % bound_cover.ext()
         bound_cover = cover(document.book_info)
         bound_cover.save(cover_file)
         cover_name = 'cover.%s' % bound_cover.ext()
@@ -600,12 +602,12 @@ def transform(wldoc, verbose=False, style=None, html_toc=False,
     annotations = etree.Element('annotations')
 
     toc_file = etree.fromstring(
     annotations = etree.Element('annotations')
 
     toc_file = etree.fromstring(
-        '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
-        '"-//NISO//DTD ncx 2005-1//EN" '
-        '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
-        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
-        'version="2005-1"><head></head><docTitle></docTitle><navMap>'
-        '</navMap></ncx>'
+        b'<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
+        b'"-//NISO//DTD ncx 2005-1//EN" '
+        b'"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
+        b'<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
+        b'version="2005-1"><head></head><docTitle></docTitle><navMap>'
+        b'</navMap></ncx>'
     )
     nav_map = toc_file[-1]
 
     )
     nav_map = toc_file[-1]
 
@@ -643,9 +645,9 @@ def transform(wldoc, verbose=False, style=None, html_toc=False,
         '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
     spine.append(etree.fromstring(
         '<itemref idref="support" />'))
         '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
     spine.append(etree.fromstring(
         '<itemref idref="support" />'))
-    html_string = open(get_resource('epub/support.html')).read()
+    html_string = open(get_resource('epub/support.html'), 'rb').read()
     chars.update(used_chars(etree.fromstring(html_string)))
     chars.update(used_chars(etree.fromstring(html_string)))
-    zip.writestr('OPS/support.html', html_string)
+    zip.writestr('OPS/support.html', squeeze_whitespace(html_string))
 
     toc.add("Strona redakcyjna", "last.html")
     manifest.append(etree.fromstring(
 
     toc.add("Strona redakcyjna", "last.html")
     manifest.append(etree.fromstring(
@@ -654,12 +656,12 @@ def transform(wldoc, verbose=False, style=None, html_toc=False,
         '<itemref idref="last" />'))
     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'), outputtype=output_type)
     chars.update(used_chars(html_tree.getroot()))
         '<itemref idref="last" />'))
     html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'), outputtype=output_type)
     chars.update(used_chars(html_tree.getroot()))
-    zip.writestr('OPS/last.html', etree.tostring(
+    zip.writestr('OPS/last.html', squeeze_whitespace(etree.tostring(
         html_tree, pretty_print=True, xml_declaration=True,
         encoding="utf-8",
         doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
         html_tree, pretty_print=True, xml_declaration=True,
         encoding="utf-8",
         doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
-    ))
+    )))
 
     if not flags or 'without-fonts' not in flags:
         # strip fonts
 
     if not flags or 'without-fonts' not in flags:
         # strip fonts
@@ -675,12 +677,13 @@ def transform(wldoc, verbose=False, style=None, html_toc=False,
                               ''.join(chars).encode('utf-8'),
                               get_resource('fonts/' + fname),
                               os.path.join(tmpdir, fname)]
                               ''.join(chars).encode('utf-8'),
                               get_resource('fonts/' + fname),
                               os.path.join(tmpdir, fname)]
+            env = {"PERL_USE_UNSAFE_INC": "1"}
             if verbose:
             if verbose:
-                print "Running font-optimizer"
-                subprocess.check_call(optimizer_call)
+                print("Running font-optimizer")
+                subprocess.check_call(optimizer_call, env=env)
             else:
                 dev_null = open(os.devnull, 'w')
             else:
                 dev_null = open(os.devnull, 'w')
-                subprocess.check_call(optimizer_call, stdout=dev_null, stderr=dev_null)
+                subprocess.check_call(optimizer_call, stdout=dev_null, stderr=dev_null, env=env)
             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
             manifest.append(etree.fromstring(
                 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
             zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
             manifest.append(etree.fromstring(
                 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))