epub: subtitle fix
[librarian.git] / librarian / epub.py
index 2584029..ad84ab0 100644
--- a/librarian/epub.py
+++ b/librarian/epub.py
@@ -8,6 +8,7 @@ from __future__ import with_statement
 import os
 import os.path
 import subprocess
+from StringIO import StringIO
 from copy import deepcopy
 from lxml import etree
 import zipfile
@@ -15,11 +16,14 @@ from tempfile import mkdtemp
 from shutil import rmtree
 
 import sys
-sys.path.append('..') # for running from working copy
 
 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
 from librarian.dcparser import BookInfo
 
+from librarian import functions, get_resource
+
+functions.reg_person_name()
+
 
 def inner_xml(node):
     """ returns node's text and children as a string
@@ -70,16 +74,12 @@ def xslt(xml, sheet):
         return xml.xslt(etree.parse(xsltf))
 
 
-_resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
-def res(fname):
-    return os.path.join(_resdir, fname)
-
-
 def replace_characters(node):
     def replace_chars(text):
         if text is None:
             return None
-        return text.replace("---", u"\u2014")\
+        return text.replace(u"\ufeff", u"")\
+                   .replace("---", u"\u2014")\
                    .replace("--", u"\u2013")\
                    .replace(",,", u"\u201E")\
                    .replace('"', u"\u201D")\
                    .replace("--", u"\u2013")\
                    .replace(",,", u"\u201E")\
                    .replace('"', u"\u201D")\
@@ -106,7 +106,7 @@ def find_annotations(annotations, source, part_no):
             child.clear()
             child.tail = tail
             child.text = number
-        if child.tag not in ('extra', 'podtytul'):
+        if child.tag not in ('extra',):
             find_annotations(annotations, child, part_no)
 
 
@@ -211,7 +211,6 @@ class TOC(object):
 
 def used_chars(element):
     """ Lists characters used in an ETree Element """
-    print (element.text or '') + (element.tail or '')
     chars = set((element.text or '') + (element.tail or ''))
     for child in element:
         chars = chars.union(used_chars(child))
@@ -242,7 +241,7 @@ def chop(main_text):
     yield part_xml
 
 
-def transform_chunk(chunk_xml, chunk_no, annotations):
+def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 
     toc = TOC()
     """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 
     toc = TOC()
@@ -252,25 +251,37 @@ def transform_chunk(chunk_xml, chunk_no, annotations):
         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
             element.set('sub', str(subnumber))
-    find_annotations(annotations, chunk_xml, chunk_no)
-    replace_by_verse(chunk_xml)
-    html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
-    chars = used_chars(html_tree.getroot())
-    output_html = etree.tostring(html_tree, pretty_print=True)
+    if empty:
+        if not _empty_html_static:
+            _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
+        chars = set()
+        output_html = _empty_html_static[0]
+    else:
+        find_annotations(annotations, chunk_xml, chunk_no)
+        replace_by_verse(chunk_xml)
+        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
+        chars = used_chars(html_tree.getroot())
+        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
     return output_html, toc, chars
 
 
-def transform(provider, slug, output_file=None, output_dir=None):
-    """ produces an epub
+def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, sample=None, cover_fn=None):
+    """ produces a EPUB file
 
-    provider is a DocProvider
-    either output_file (a file-like object) or output_dir (path to file/dir) should be specified
-    if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.epub
+    provider: a DocProvider
+    slug: slug of file to process, available by provider
+    output_file: file-like object or path to output file
+    output_dir: path to directory to save output file to; either this or output_file must be present
+    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
+    sample=n: generate sample e-book (with at least n paragraphs)
+    cover_fn: function(author, title) -> cover image
     """
 
     """
 
-    def transform_file(input_xml, chunk_counter=1, first=True):
+    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
         """ processes one input file and proceeds to its children """
 
         """ processes one input file and proceeds to its children """
 
+        replace_characters(input_xml.getroot())
+
         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
 
         # every input file will have a TOC entry,
@@ -279,16 +290,20 @@ def transform(provider, slug, output_file=None, output_dir=None):
         chars = set()
         if first:
             # write book title page
-            html_tree = xslt(input_xml, res('xsltTitle.xsl'))
+            html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
             chars = used_chars(html_tree.getroot())
             zip.writestr('OPS/title.html',
-                 etree.tostring(html_tree, pretty_print=True))
+                 etree.tostring(html_tree, method="html", pretty_print=True))
         elif children:
             # write title page for every parent
-            html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
-            chars = used_chars(html_tree.getroot())
-            zip.writestr('OPS/part%d.html' % chunk_counter, 
-                etree.tostring(html_tree, pretty_print=True))
+            if sample is not None and sample <= 0:
+                chars = set()
+                html_string = open(get_resource('epub/emptyChunk.html')).read()
+            else:
+                html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
+                chars = used_chars(html_tree.getroot())
+                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
+            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
             add_to_manifest(manifest, chunk_counter)
             add_to_spine(spine, chunk_counter)
             chunk_counter += 1
@@ -303,10 +318,15 @@ def transform(provider, slug, output_file=None, output_dir=None):
                 main_text = None
 
         if main_text is not None:
-            replace_characters(main_text)
-
             for chunk_xml in chop(main_text):
-                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
+                empty = False
+                if sample is not None:
+                    if sample <= 0:
+                        empty = True
+                    else:
+                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
+                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
+
                 toc.extend(chunk_toc)
                 chars = chars.union(chunk_chars)
                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
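
The sample bookkeeping above works as a simple budget: each chunk subtracts its //strofa|//akap|//akap_cd|//akap_dialog count from sample, and once the budget reaches zero or below, every later chunk is emitted from the static emptyChunk.html. A rough model of that branch (an illustrative sketch only, not code from this commit; units_in_chunk stands in for the XPath count):

    def take_chunk(sample, units_in_chunk):
        # Exhausted budget -> the chunk is replaced by epub/emptyChunk.html.
        empty = False
        if sample is not None:
            if sample <= 0:
                empty = True
            else:
                sample -= units_in_chunk
        return empty, sample

    # With sample=3 and chunks holding 2, 5 and 1 units:
    #   take_chunk(3, 2)  -> (False, 1)
    #   take_chunk(1, 5)  -> (False, -4)
    #   take_chunk(-4, 1) -> (True, -4)
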
@@ -317,14 +337,24 @@ def transform(provider, slug, output_file=None, output_dir=None):
         if children:
             for child in children:
                 child_xml = etree.parse(provider.by_uri(child))
-                child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
+                child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
                 toc.append(child_toc)
                 chars = chars.union(chunk_chars)
 
-        return toc, chunk_counter, chars
+        return toc, chunk_counter, chars, sample
 
     # read metadata from the first file
-    input_xml = etree.parse(provider[slug])
+    if file_path:
+        if slug:
+            raise ValueError('slug or file_path should be specified, not both')
+        f = open(file_path, 'r')
+        input_xml = etree.parse(f)
+        f.close()
+    else:
+        if not slug:
+            raise ValueError('either slug or file_path should be specified')
+        input_xml = etree.parse(provider[slug])
+
     metadata = input_xml.find('.//'+RDFNS('Description'))
     if metadata is None:
         raise NoDublinCore('Document has no DublinCore - which is required.')
@@ -333,14 +363,17 @@ def transform(provider, slug, output_file=None, output_dir=None):
 
     # if output to dir, create the file
     if output_dir is not None:
-        author = unicode(book_info.author)
-        author_dir = os.path.join(output_dir, author)
-        try:
-            os.makedirs(author_dir)
-        except OSError:
-            pass
-        output_file = open(os.path.join(author_dir, '%s.epub' % slug), 'w')
-
+        if make_dir:
+            author = unicode(book_info.author)
+            output_dir = os.path.join(output_dir, author)
+            try:
+                os.makedirs(output_dir)
+            except OSError:
+                pass
+        if slug:
+            output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
+        else:
+            output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
 
     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 
@@ -355,13 +388,28 @@ def transform(provider, slug, output_file=None, output_dir=None):
                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                        'media-type="application/oebps-package+xml" />' \
                        '</rootfiles></container>')
                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                        'media-type="application/oebps-package+xml" />' \
                        '</rootfiles></container>')
-    for fname in 'style.css', 'logo_wolnelektury.png':
-        zip.write(res(fname), os.path.join('OPS', fname))
+    zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
+    zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
 
-    opf = xslt(metadata, res('xsltContent.xsl'))
+    opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
     manifest = opf.find('.//' + OPFNS('manifest'))
     spine = opf.find('.//' + OPFNS('spine'))
 
+    if cover_fn:
+        cover = StringIO()
+        cover_fn(book_info.author.readable(), book_info.title).save(cover, format='JPEG')
+        zip.writestr(os.path.join('OPS', 'cover.jpg'), cover.getvalue())
+        del cover
+        zip.writestr('OPS/cover.html', open(get_resource('epub/cover.html')).read())
+        manifest.append(etree.fromstring(
+            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
+        manifest.append(etree.fromstring(
+            '<item id="cover-image" href="cover.jpg" media-type="image/jpeg" />'))
+        spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
+        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
+        opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
+
+
     annotations = etree.Element('annotations')
 
     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
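
The new cover_fn hook is called as cover_fn(author, title) and its return value must offer a PIL-style save(file, format='JPEG') method, since the code above saves it straight into OPS/cover.jpg. A minimal sketch of such a callback (illustration only, assuming the PIL/Pillow library is available; not part of this commit):

    from PIL import Image, ImageDraw

    def plain_cover(author, title):
        # Solid-colour cover with author and title text; a stand-in for a real generator.
        img = Image.new('RGB', (600, 800), '#202020')
        draw = ImageDraw.Draw(img)
        draw.text((40, 60), author, fill='#ffffff')
        draw.text((40, 120), title, fill='#ffffff')
        return img

Passing cover_fn=plain_cover to transform() then adds cover.jpg and cover.html to the manifest, spine and guide exactly as wired up above.
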
@@ -373,7 +421,7 @@ def transform(provider, slug, output_file=None, output_dir=None):
                                '</navPoint></navMap></ncx>')
     nav_map = toc_file[-1]
 
                                '</navPoint></navMap></ncx>')
     nav_map = toc_file[-1]
 
-    toc, chunk_counter, chars = transform_file(input_xml)
+    toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
 
     if not toc.children:
         toc.add(u"Początek utworu", 1)
@@ -384,23 +432,42 @@ def transform(provider, slug, output_file=None, output_dir=None):
         nav_map.append(etree.fromstring(
             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
+        toc_counter += 1
         manifest.append(etree.fromstring(
             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
         spine.append(etree.fromstring(
             '<itemref idref="annotations" />'))
         replace_by_verse(annotations)
-        html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
+        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
         chars = chars.union(used_chars(html_tree.getroot()))
         zip.writestr('OPS/annotations.html', etree.tostring(
-                            html_tree, pretty_print=True))
+                            html_tree, method="html", pretty_print=True))
+
+    nav_map.append(etree.fromstring(
+        '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
+        '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
+    manifest.append(etree.fromstring(
+        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
+    spine.append(etree.fromstring(
+        '<itemref idref="last" />'))
+    html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
+    chars.update(used_chars(html_tree.getroot()))
+    zip.writestr('OPS/last.html', etree.tostring(
+                        html_tree, method="html", pretty_print=True))
 
     # strip fonts
     tmpdir = mkdtemp('-librarian-epub')
     cwd = os.getcwd()
 
-    os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../font-optimizer'))
+    os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
     for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
-        subprocess.check_call(['./subset.pl', '--chars', ''.join(chars), res('../fonts/' + fname), os.path.join(tmpdir, fname)])
+        optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), 
+                          get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
+        if verbose:
+            print "Running font-optimizer"
+            subprocess.check_call(optimizer_call)
+        else:
+            subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
         zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
     rmtree(tmpdir)
     os.chdir(cwd)
@@ -419,18 +486,3 @@ def transform(provider, slug, output_file=None, output_dir=None):
     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
     zip.close()
-
-
-if __name__ == '__main__':
-    from librarian import DirDocProvider
-
-    if len(sys.argv) < 2:
-        print >> sys.stderr, 'Usage: python epub.py <input file>'
-        sys.exit(1)
-
-    main_input = sys.argv[1]
-    basepath, ext = os.path.splitext(main_input)
-    path, slug = os.path.realpath(basepath).rsplit('/', 1)
-    provider = DirDocProvider(path)
-    transform(provider, slug, output_dir=path)
-
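
With the standalone __main__ block removed above, the module is now driven entirely through the extended transform() signature. A usage sketch along those lines (paths are hypothetical, not part of this commit; DirDocProvider is the provider class the removed block used):

    from librarian import DirDocProvider, epub

    # Writes /path/to/output/<author>/some-book.epub, limited to a ~10-paragraph sample;
    # verbose=True lets the font-optimizer subprocess print its own output.
    provider = DirDocProvider('/path/to/books')
    epub.transform(provider,
                   file_path='/path/to/books/some-book.xml',
                   output_dir='/path/to/output',
                   make_dir=True,
                   sample=10,
                   verbose=True)
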