Fixes #2570: Text spilling into fragments from outside.

[librarian.git] / librarian / epub.py
diff --git a/librarian/epub.py b/librarian/epub.py

index c5942a2..2da6b31 100644 (file)
--- a/librarian/epub.py
+++ b/librarian/epub.py
@@ -7,18 +7,17 @@ from __future__ import with_statement
  
  import os
  import os.path
  
  import os
  import os.path
+import re
  import subprocess
  from StringIO import StringIO
  from copy import deepcopy
  from lxml import etree
  import zipfile
  import subprocess
  from StringIO import StringIO
  from copy import deepcopy
  from lxml import etree
  import zipfile
-from tempfile import mkdtemp
+from tempfile import mkdtemp, NamedTemporaryFile
  from shutil import rmtree
  
  from shutil import rmtree
  
-import sys
-
-from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
-from librarian.dcparser import BookInfo
+from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
+from librarian.cover import DefaultEbookCover
  
  from librarian import functions, get_resource
  
  
  from librarian import functions, get_resource
  
@@ -111,31 +110,74 @@ def find_annotations(annotations, source, part_no):
              find_annotations(annotations, child, part_no)
  
  
              find_annotations(annotations, child, part_no)
  
  
+class Stanza(object):
+    """
+    Converts "/"-terminated verse endings into verse elements in a stanza.
+
+    Only slashes directly in the stanza's own text split verses. Slashes inside
+    subelements are ignored: "wers*" children become verses, others are nested.
+
+    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
+    >>> Stanza(s).versify()
+    >>> print etree.tostring(s)
+    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
+    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
+    
+    """
+    def __init__(self, stanza_elem):
+        self.stanza = stanza_elem  # element rewritten in place by versify()
+        self.verses = []  # verse elements built so far, in document order
+        self.open_verse = None  # verse currently receiving content, if any
+
+    def versify(self):
+        self.push_text(self.stanza.text)  # text before the first child
+        for elem in self.stanza:
+            self.push_elem(elem)
+            self.push_text(elem.tail)  # text that follows this child
+        tail = self.stanza.tail  # preserved across the clear() below
+        self.stanza.clear()
+        self.stanza.tail = tail
+        self.stanza.extend(self.verses)  # stanza now holds only the collected verses
+
+    def open_normal_verse(self):
+        self.open_verse = self.stanza.makeelement("wers_normalny")  # new empty verse
+        self.verses.append(self.open_verse)
+
+    def get_open_verse(self):
+        if self.open_verse is None:
+            self.open_normal_verse()  # lazily start a verse when none is open
+        return self.open_verse
+
+    def push_text(self, text):
+        if not text:
+            return
+        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):  # "/" + optional whitespace + newline ends a verse
+            if i:  # every fragment after the first starts a new verse
+                self.open_normal_verse()
+            verse = self.get_open_verse()
+            if len(verse):  # verse has children: text continues after the last child
+                verse[-1].tail = (verse[-1].tail or "") + verse_text
+            else:
+                verse.text = (verse.text or "") + verse_text
+
+    def push_elem(self, elem):
+        if elem.tag.startswith("wers"):  # explicit verse element: becomes a verse of its own
+            verse = deepcopy(elem)
+            verse.tail = None  # the tail is pushed separately by versify()
+            self.verses.append(verse)
+            self.open_verse = verse  # following text/elements are added to this verse
+        else:
+            appended = deepcopy(elem)
+            appended.tail = None  # the tail is pushed separately by versify()
+            self.get_open_verse().append(appended)
+
+
  def replace_by_verse(tree):
      """ Find stanzas and create new verses in place of a '/' character """
  
      stanzas = tree.findall('.//' + WLNS('strofa'))
  def replace_by_verse(tree):
      """ Find stanzas and create new verses in place of a '/' character """
  
      stanzas = tree.findall('.//' + WLNS('strofa'))
-    for node in stanzas:
-        for child_node in node:
-            if child_node.tag in ('slowo_obce', 'wyroznienie'):
-                foreign_verses = inner_xml(child_node).split('/\n')
-                if len(foreign_verses) > 1:
-                    new_foreign = ''
-                    for foreign_verse in foreign_verses:
-                        if foreign_verse.startswith('<wers'):
-                            new_foreign += foreign_verse
-                        else:
-                            new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
-                    set_inner_xml(child_node, new_foreign)
-        verses = inner_xml(node).split('/\n')
-        if len(verses) > 1:
-            modified_inner_xml = ''
-            for verse in verses:
-                if verse.startswith('<wers') or verse.startswith('<extra'):
-                    modified_inner_xml += verse
-                else:
-                    modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
-            set_inner_xml(node, modified_inner_xml)
+    for stanza in stanzas:
+        Stanza(stanza).versify()
  
  
  def add_to_manifest(manifest, partno):
  
  
  def add_to_manifest(manifest, partno):
@@ -167,7 +209,7 @@ class TOC(object):
      def add(self, name, part_href, level=0, is_part=True, index=None):
          assert level == 0 or index is None
          if level > 0 and self.children:
      def add(self, name, part_href, level=0, is_part=True, index=None):
          assert level == 0 or index is None
          if level > 0 and self.children:
-            return self.children[-1].add(name, "part%d.html" % part_href, level-1, is_part)
+            return self.children[-1].add(name, part_href, level-1, is_part)
          else:
              t = TOC(name)
              t.part_href = part_href
          else:
              t = TOC(name)
              t.part_href = part_href
@@ -287,47 +329,40 @@ def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_s
      return output_html, toc, chars
  
  
      return output_html, toc, chars
  
  
-def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
+def transform(wldoc, verbose=False,
                style=None, html_toc=False,
                sample=None, cover=None, flags=None):
      """ produces a EPUB file
  
                style=None, html_toc=False,
                sample=None, cover=None, flags=None):
      """ produces a EPUB file
  
-    provider: a DocProvider
-    slug: slug of file to process, available by provider
-    output_file: file-like object or path to output file
-    output_dir: path to directory to save output file to; either this or output_file must be present
-    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
      sample=n: generate sample e-book (with at least n paragraphs)
      sample=n: generate sample e-book (with at least n paragraphs)
-    cover: a cover.Cover object
-    flags: less-advertising, without-fonts
+    cover: a cover.Cover factory, or True to use DefaultEbookCover
+    flags: less-advertising, without-fonts, working-copy
      """
  
      """
  
-    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
+    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
          """ processes one input file and proceeds to its children """
  
          """ processes one input file and proceeds to its children """
  
-        replace_characters(input_xml.getroot())
-
-        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
+        replace_characters(wldoc.edoc.getroot())
  
          # every input file will have a TOC entry,
          # pointing to starting chunk
  
          # every input file will have a TOC entry,
          # pointing to starting chunk
-        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
+        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
          chars = set()
          if first:
              # write book title page
          chars = set()
          if first:
              # write book title page
-            html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
+            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
              chars = used_chars(html_tree.getroot())
              zip.writestr('OPS/title.html',
                   etree.tostring(html_tree, method="html", pretty_print=True))
              # add a title page TOC entry
              toc.add(u"Strona tytułowa", "title.html")
              chars = used_chars(html_tree.getroot())
              zip.writestr('OPS/title.html',
                   etree.tostring(html_tree, method="html", pretty_print=True))
              # add a title page TOC entry
              toc.add(u"Strona tytułowa", "title.html")
-        elif children:
+        elif wldoc.book_info.parts:
              # write title page for every parent
              if sample is not None and sample <= 0:
                  chars = set()
                  html_string = open(get_resource('epub/emptyChunk.html')).read()
              else:
              # write title page for every parent
              if sample is not None and sample <= 0:
                  chars = set()
                  html_string = open(get_resource('epub/emptyChunk.html')).read()
              else:
-                html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
+                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                  chars = used_chars(html_tree.getroot())
                  html_string = etree.tostring(html_tree, method="html", pretty_print=True)
              zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
                  chars = used_chars(html_tree.getroot())
                  html_string = etree.tostring(html_tree, method="html", pretty_print=True)
              zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
@@ -335,12 +370,12 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
              add_to_spine(spine, chunk_counter)
              chunk_counter += 1
  
              add_to_spine(spine, chunk_counter)
              chunk_counter += 1
  
-        if len(input_xml.getroot()) > 1:
+        if len(wldoc.edoc.getroot()) > 1:
              # rdf before style master
              # rdf before style master
-            main_text = input_xml.getroot()[1]
+            main_text = wldoc.edoc.getroot()[1]
          else:
              # rdf in style master
          else:
              # rdf in style master
-            main_text = input_xml.getroot()[0]
+            main_text = wldoc.edoc.getroot()[0]
              if main_text.tag == RDFNS('RDF'):
                  main_text = None
  
              if main_text.tag == RDFNS('RDF'):
                  main_text = None
  
@@ -361,51 +396,37 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
                  add_to_spine(spine, chunk_counter)
                  chunk_counter += 1
  
                  add_to_spine(spine, chunk_counter)
                  chunk_counter += 1
  
-        if children:
-            for child in children:
-                child_xml = etree.parse(provider.by_uri(child))
-                child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
-                toc.append(child_toc)
-                chars = chars.union(chunk_chars)
+        for child in wldoc.parts():
+            child_toc, chunk_counter, chunk_chars, sample = transform_file(
+                child, chunk_counter, first=False, sample=sample)
+            toc.append(child_toc)
+            chars = chars.union(chunk_chars)
  
          return toc, chunk_counter, chars, sample
  
  
          return toc, chunk_counter, chars, sample
  
-    # read metadata from the first file
-    if file_path:
-        if slug:
-            raise ValueError('slug or file_path should be specified, not both')
-        f = open(file_path, 'r')
-        input_xml = etree.parse(f)
-        f.close()
-    else:
-        if not slug:
-            raise ValueError('either slug or file_path should be specified')
-        input_xml = etree.parse(provider[slug])
+
+    document = deepcopy(wldoc)
+    del wldoc
  
      if flags:
          for flag in flags:
  
      if flags:
          for flag in flags:
-            input_xml.getroot().set(flag, 'yes')
-
-    metadata = input_xml.find('.//'+RDFNS('Description'))
-    if metadata is None:
-        raise NoDublinCore('Document has no DublinCore - which is required.')
-    book_info = BookInfo.from_element(input_xml)
-    metadata = etree.ElementTree(metadata)
-
-    # if output to dir, create the file
-    if output_dir is not None:
-        if make_dir:
-            author = unicode(book_info.author)
-            output_dir = os.path.join(output_dir, author)
-            try:
-                os.makedirs(output_dir)
-            except OSError:
-                pass
-        if slug:
-            output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
-        else:
-            output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
+            document.edoc.getroot().set(flag, 'yes')
+
+    # add editors, funders and thanks info
+    document.edoc.getroot().set('editors', u', '.join(sorted(
+        editor.readable() for editor in document.editors())))
+    if document.book_info.funders:
+        document.edoc.getroot().set('funders', u', '.join(
+            document.book_info.funders))
+    if document.book_info.thanks:
+        document.edoc.getroot().set('thanks', document.book_info.thanks)
+
+    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
+    manifest = opf.find('.//' + OPFNS('manifest'))
+    guide = opf.find('.//' + OPFNS('guide'))
+    spine = opf.find('.//' + OPFNS('spine'))
  
  
+    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
      zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
  
      # write static elements
      zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
  
      # write static elements
@@ -425,28 +446,32 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
          style = get_resource('epub/style.css')
      zip.write(style, os.path.join('OPS', 'style.css'))
  
          style = get_resource('epub/style.css')
      zip.write(style, os.path.join('OPS', 'style.css'))
  
-    opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
-    manifest = opf.find('.//' + OPFNS('manifest'))
-    guide = opf.find('.//' + OPFNS('guide'))
-    spine = opf.find('.//' + OPFNS('spine'))
-
      if cover:
      if cover:
+        if cover is True:
+            cover = DefaultEbookCover
+
          cover_file = StringIO()
          cover_file = StringIO()
-        c = cover(book_info.author.readable(), book_info.title)
-        c.save(cover_file)
-        c_name = 'cover.%s' % c.ext()
-        zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
+        bound_cover = cover(document.book_info)
+        bound_cover.save(cover_file)
+        cover_name = 'cover.%s' % bound_cover.ext()
+        zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
          del cover_file
  
          cover_tree = etree.parse(get_resource('epub/cover.html'))
          del cover_file
  
          cover_tree = etree.parse(get_resource('epub/cover.html'))
-        cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
+        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
          zip.writestr('OPS/cover.html', etree.tostring(
                          cover_tree, method="html", pretty_print=True))
  
          zip.writestr('OPS/cover.html', etree.tostring(
                          cover_tree, method="html", pretty_print=True))
  
+        if bound_cover.uses_dc_cover:
+            if document.book_info.cover_by:
+                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
+            if document.book_info.cover_source:
+                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
+
          manifest.append(etree.fromstring(
              '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
          manifest.append(etree.fromstring(
          manifest.append(etree.fromstring(
              '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
          manifest.append(etree.fromstring(
-            '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
+            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
          spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
          opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
          guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
          spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
          opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
          guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
@@ -468,7 +493,7 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
              '<itemref idref="html_toc" />'))
          guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
  
              '<itemref idref="html_toc" />'))
          guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
  
-    toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
+    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
  
      if len(toc.children) < 2:
          toc.add(u"Początek utworu", "part1.html")
  
      if len(toc.children) < 2:
          toc.add(u"Początek utworu", "part1.html")
@@ -486,12 +511,21 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
          zip.writestr('OPS/annotations.html', etree.tostring(
                              html_tree, method="html", pretty_print=True))
  
          zip.writestr('OPS/annotations.html', etree.tostring(
                              html_tree, method="html", pretty_print=True))
  
+    toc.add("Wesprzyj Wolne Lektury", "support.html")
+    manifest.append(etree.fromstring(
+        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
+    spine.append(etree.fromstring(
+        '<itemref idref="support" />'))
+    html_string = open(get_resource('epub/support.html')).read()
+    chars.update(used_chars(etree.fromstring(html_string)))
+    zip.writestr('OPS/support.html', html_string)
+
      toc.add("Strona redakcyjna", "last.html")
      manifest.append(etree.fromstring(
          '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
      spine.append(etree.fromstring(
          '<itemref idref="last" />'))
      toc.add("Strona redakcyjna", "last.html")
      manifest.append(etree.fromstring(
          '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
      spine.append(etree.fromstring(
          '<itemref idref="last" />'))
-    html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
+    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
      chars.update(used_chars(html_tree.getroot()))
      zip.writestr('OPS/last.html', etree.tostring(
                          html_tree, method="html", pretty_print=True))
      chars.update(used_chars(html_tree.getroot()))
      zip.writestr('OPS/last.html', etree.tostring(
                          html_tree, method="html", pretty_print=True))
@@ -499,7 +533,10 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
      if not flags or not 'without-fonts' in flags:
          # strip fonts
          tmpdir = mkdtemp('-librarian-epub')
      if not flags or not 'without-fonts' in flags:
          # strip fonts
          tmpdir = mkdtemp('-librarian-epub')
-        cwd = os.getcwd()
+        try:
+            cwd = os.getcwd()
+        except OSError:
+            cwd = None
  
          os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
          for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
  
          os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
          for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
@@ -514,11 +551,11 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
              manifest.append(etree.fromstring(
                  '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
          rmtree(tmpdir)
              manifest.append(etree.fromstring(
                  '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
          rmtree(tmpdir)
-        os.chdir(cwd)
+        if cwd is not None:
+            os.chdir(cwd)
  
      zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
  
      zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
-    contents = []
-    title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
+    title = document.book_info.title
      attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
      for st in attributes:
          meta = toc_file.makeelement(NCXNS('meta'))
      attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
      for st in attributes:
          meta = toc_file.makeelement(NCXNS('meta'))
@@ -536,3 +573,5 @@ def transform(provider, slug=None, file_path=None, output_file=None, output_dir=
      toc.write_to_xml(nav_map)
      zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
      zip.close()
      toc.write_to_xml(nav_map)
      zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
      zip.close()
+
+    return OutputFile.from_filename(output_file.name)