fix for newlines in epub

[librarian.git] / librarian / epub.py
diff --git a/librarian/epub.py b/librarian/epub.py

index bf2d4d9..5f017d4 100644 (file)
--- a/librarian/epub.py
+++ b/librarian/epub.py
@@ -11,13 +11,15 @@ import re
  import subprocess
  from StringIO import StringIO
  from copy import deepcopy
  import subprocess
  from StringIO import StringIO
  from copy import deepcopy
+from mimetypes import guess_type
+
  from lxml import etree
  import zipfile
  from tempfile import mkdtemp, NamedTemporaryFile
  from shutil import rmtree
  
  from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
  from lxml import etree
  import zipfile
  from tempfile import mkdtemp, NamedTemporaryFile
  from shutil import rmtree
  
  from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
-from librarian.cover import DefaultEbookCover
+from librarian.cover import make_cover
  
  from librarian import functions, get_resource
  
  
  from librarian import functions, get_resource
  
@@ -27,6 +29,10 @@ functions.reg_person_name()
  functions.reg_lang_code_3to2()
  
  
  functions.reg_lang_code_3to2()
  
  
+def squeeze_whitespace(s):
+    return re.sub(r'\s+', ' ', s)
+
+
  def set_hyph_language(source_tree):
      def get_short_lng_code(text):
          result = ''
  def set_hyph_language(source_tree):
      def get_short_lng_code(text):
          result = ''
@@ -51,19 +57,21 @@ def set_hyph_language(source_tree):
  
  
  def hyphenate_and_fix_conjunctions(source_tree, hyph):
  
  
  def hyphenate_and_fix_conjunctions(source_tree, hyph):
-    if hyph is not None:
-        texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
-        for t in texts:
-            parent = t.getparent()
+    texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
+    for t in texts:
+        parent = t.getparent()
+        if hyph is not None:
              newt = ''
              wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
              for w in wlist:
                  newt += hyph.inserted(w, u'\u00AD')
              newt = ''
              wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
              for w in wlist:
                  newt += hyph.inserted(w, u'\u00AD')
-            newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
-            if t.is_text:
-                parent.text = newt
-            elif t.is_tail:
-                parent.tail = newt
+        else:
+            newt = t
+        newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
+        if t.is_text:
+            parent.text = newt
+        elif t.is_tail:
+            parent.tail = newt
  
  
  def inner_xml(node):
  
  
  def inner_xml(node):
@@ -109,11 +117,13 @@ def node_name(node):
      return tempnode.text
  
  
      return tempnode.text
  
  
-def xslt(xml, sheet):
+def xslt(xml, sheet, **kwargs):
      if isinstance(xml, etree._Element):
          xml = etree.ElementTree(xml)
      with open(sheet) as xsltf:
      if isinstance(xml, etree._Element):
          xml = etree.ElementTree(xml)
      with open(sheet) as xsltf:
-        return xml.xslt(etree.parse(xsltf))
+        transform = etree.XSLT(etree.parse(xsltf))
+        params = dict((key, transform.strparam(value)) for key, value in kwargs.iteritems())
+        return transform(xml, **params)
  
  
  def replace_characters(node):
  
  
  def replace_characters(node):
@@ -197,6 +207,8 @@ class Stanza(object):
          for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
              if i:
                  self.open_normal_verse()
          for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
              if i:
                  self.open_normal_verse()
+            if not verse_text.strip():
+                continue
              verse = self.get_open_verse()
              if len(verse):
                  verse[-1].tail = (verse[-1].tail or "") + verse_text
              verse = self.get_open_verse()
              if len(verse):
                  verse[-1].tail = (verse[-1].tail or "") + verse_text
@@ -404,9 +416,8 @@ def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_s
      return output_html, toc, chars
  
  
      return output_html, toc, chars
  
  
-def transform(wldoc, verbose=False,
-              style=None, html_toc=False,
-              sample=None, cover=None, flags=None):
+def transform(wldoc, verbose=False, style=None, html_toc=False,
+              sample=None, cover=None, flags=None, hyphenate=False, ilustr_path='', output_type='epub'):
      """ produces a EPUB file
  
      sample=n: generate sample e-book (with at least n paragraphs)
      """ produces a EPUB file
  
      sample=n: generate sample e-book (with at least n paragraphs)
@@ -419,7 +430,7 @@ def transform(wldoc, verbose=False,
  
          replace_characters(wldoc.edoc.getroot())
  
  
          replace_characters(wldoc.edoc.getroot())
  
-        hyphenator = set_hyph_language(wldoc.edoc.getroot())
+        hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
          hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
  
          # every input file will have a TOC entry,
          hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
  
          # every input file will have a TOC entry,
@@ -428,17 +439,15 @@ def transform(wldoc, verbose=False,
          chars = set()
          if first:
              # write book title page
          chars = set()
          if first:
              # write book title page
-            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
+            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'), outputtype=output_type)
              chars = used_chars(html_tree.getroot())
              chars = used_chars(html_tree.getroot())
-            zip.writestr(
-                'OPS/title.html',
-                etree.tostring(
-                    html_tree, pretty_print=True, xml_declaration=True,
-                    encoding="utf-8",
-                    doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
-                            ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
-                )
+            html_string = etree.tostring(
+                html_tree, pretty_print=True, xml_declaration=True,
+                encoding="utf-8",
+                doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
+                        ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
              )
              )
+            zip.writestr('OPS/title.html', squeeze_whitespace(html_string))
              # add a title page TOC entry
              toc.add(u"Strona tytułowa", "title.html")
          elif wldoc.book_info.parts:
              # add a title page TOC entry
              toc.add(u"Strona tytułowa", "title.html")
          elif wldoc.book_info.parts:
@@ -455,7 +464,7 @@ def transform(wldoc, verbose=False,
                      doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                              ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
                  )
                      doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                              ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
                  )
-            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
+            zip.writestr('OPS/part%d.html' % chunk_counter, squeeze_whitespace(html_string))
              add_to_manifest(manifest, chunk_counter)
              add_to_spine(spine, chunk_counter)
              chunk_counter += 1
              add_to_manifest(manifest, chunk_counter)
              add_to_spine(spine, chunk_counter)
              chunk_counter += 1
@@ -481,7 +490,7 @@ def transform(wldoc, verbose=False,
  
                  toc.extend(chunk_toc)
                  chars = chars.union(chunk_chars)
  
                  toc.extend(chunk_toc)
                  chars = chars.union(chunk_chars)
-                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
+                zip.writestr('OPS/part%d.html' % chunk_counter, squeeze_whitespace(chunk_html))
                  add_to_manifest(manifest, chunk_counter)
                  add_to_spine(spine, chunk_counter)
                  chunk_counter += 1
                  add_to_manifest(manifest, chunk_counter)
                  add_to_spine(spine, chunk_counter)
                  chunk_counter += 1
@@ -525,6 +534,14 @@ def transform(wldoc, verbose=False,
  
      functions.reg_mathml_epub(zip)
  
  
      functions.reg_mathml_epub(zip)
  
+    if os.path.isdir(ilustr_path):
+        for i, filename in enumerate(os.listdir(ilustr_path)):
+            file_path = os.path.join(ilustr_path, filename)
+            zip.write(file_path, os.path.join('OPS', filename))
+            image_id = 'image%s' % i
+            manifest.append(etree.fromstring(
+                '<item id="%s" href="%s" media-type="%s" />' % (image_id, filename, guess_type(file_path)[0])))
+
      # write static elements
      mime = zipfile.ZipInfo()
      mime.filename = 'mimetype'
      # write static elements
      mime = zipfile.ZipInfo()
      mime.filename = 'mimetype'
@@ -550,7 +567,7 @@ def transform(wldoc, verbose=False,
  
      if cover:
          if cover is True:
  
      if cover:
          if cover is True:
-            cover = DefaultEbookCover
+            cover = make_cover
  
          cover_file = StringIO()
          bound_cover = cover(document.book_info)
  
          cover_file = StringIO()
          bound_cover = cover(document.book_info)
@@ -630,21 +647,21 @@ def transform(wldoc, verbose=False,
          '<itemref idref="support" />'))
      html_string = open(get_resource('epub/support.html')).read()
      chars.update(used_chars(etree.fromstring(html_string)))
          '<itemref idref="support" />'))
      html_string = open(get_resource('epub/support.html')).read()
      chars.update(used_chars(etree.fromstring(html_string)))
-    zip.writestr('OPS/support.html', html_string)
+    zip.writestr('OPS/support.html', squeeze_whitespace(html_string))
  
      toc.add("Strona redakcyjna", "last.html")
      manifest.append(etree.fromstring(
          '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
      spine.append(etree.fromstring(
          '<itemref idref="last" />'))
  
      toc.add("Strona redakcyjna", "last.html")
      manifest.append(etree.fromstring(
          '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
      spine.append(etree.fromstring(
          '<itemref idref="last" />'))
-    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
+    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'), outputtype=output_type)
      chars.update(used_chars(html_tree.getroot()))
      chars.update(used_chars(html_tree.getroot()))
-    zip.writestr('OPS/last.html', etree.tostring(
+    zip.writestr('OPS/last.html', squeeze_whitespace(etree.tostring(
          html_tree, pretty_print=True, xml_declaration=True,
          encoding="utf-8",
          doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                  '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
          html_tree, pretty_print=True, xml_declaration=True,
          encoding="utf-8",
          doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                  '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
-    ))
+    )))
  
      if not flags or 'without-fonts' not in flags:
          # strip fonts
  
      if not flags or 'without-fonts' not in flags:
          # strip fonts
@@ -660,11 +677,13 @@ def transform(wldoc, verbose=False,
                                ''.join(chars).encode('utf-8'),
                                get_resource('fonts/' + fname),
                                os.path.join(tmpdir, fname)]
                                ''.join(chars).encode('utf-8'),
                                get_resource('fonts/' + fname),
                                os.path.join(tmpdir, fname)]
+            env = {"PERL_USE_UNSAFE_INC": "1"}
              if verbose:
                  print "Running font-optimizer"
              if verbose:
                  print "Running font-optimizer"
-                subprocess.check_call(optimizer_call)
+                subprocess.check_call(optimizer_call, env=env)
              else:
              else:
-                subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+                dev_null = open(os.devnull, 'w')
+                subprocess.check_call(optimizer_call, stdout=dev_null, stderr=dev_null, env=env)
              zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
              manifest.append(etree.fromstring(
                  '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
              zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
              manifest.append(etree.fromstring(
                  '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))