updated text test

[librarian.git] / librarian / epub.py
diff --git a/librarian/epub.py b/librarian/epub.py

index 84a745f..3ee487a 100644 (file)
--- a/librarian/epub.py
+++ b/librarian/epub.py
@@ -7,16 +7,22 @@ from __future__ import with_statement
  
  import os
  import os.path
  
  import os
  import os.path
+import subprocess
  from copy import deepcopy
  from lxml import etree
  import zipfile
  from copy import deepcopy
  from lxml import etree
  import zipfile
+from tempfile import mkdtemp
+from shutil import rmtree
  
  import sys
  
  import sys
-sys.path.append('..') # for running from working copy
  
  from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  from librarian.dcparser import BookInfo
  
  
  from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  from librarian.dcparser import BookInfo
  
+from librarian import functions
+
+functions.reg_person_name()
+
  
  def inner_xml(node):
      """ returns node's text and children as a string
  
  def inner_xml(node):
      """ returns node's text and children as a string
@@ -206,6 +212,14 @@ class TOC(object):
          return counter
  
  
          return counter
  
  
+def used_chars(element):
+    """ Returns the set of characters found in the text and tail of element and of every node below it """
+    chars = set()
+    for node in element.iter():  # flat walk: no recursion limit, no set copy per level
+        chars.update((node.text or '') + (node.tail or ''))
+    return chars
+
+
  def chop(main_text):
      """ divide main content of the XML file into chunks """
  
  def chop(main_text):
      """ divide main content of the XML file into chunks """
  
@@ -231,7 +245,7 @@ def chop(main_text):
  
  
  def transform_chunk(chunk_xml, chunk_no, annotations):
  
  
  def transform_chunk(chunk_xml, chunk_no, annotations):
-    """ transforms one chunk, returns a HTML string and a TOC object """
+    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
  
      toc = TOC()
      for element in chunk_xml[0]:
  
      toc = TOC()
      for element in chunk_xml[0]:
@@ -242,16 +256,20 @@ def transform_chunk(chunk_xml, chunk_no, annotations):
              element.set('sub', str(subnumber))
      find_annotations(annotations, chunk_xml, chunk_no)
      replace_by_verse(chunk_xml)
              element.set('sub', str(subnumber))
      find_annotations(annotations, chunk_xml, chunk_no)
      replace_by_verse(chunk_xml)
-    output_html = etree.tostring(xslt(chunk_xml, res('xsltScheme.xsl')), pretty_print=True)
-    return output_html, toc
+    html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
+    chars = used_chars(html_tree.getroot())
+    output_html = etree.tostring(html_tree, pretty_print=True)
+    return output_html, toc, chars
  
  
  
  
-def transform(provider, slug, output_file=None, output_dir=None):
-    """ produces an epub
+def transform(provider, slug, output_file=None, output_dir=None, make_dir=False):
+    """ produces a EPUB file
  
  
-    provider is a DocProvider
-    either output_file (a file-like object) or output_dir (path to file/dir) should be specified
-    if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.epub
+    provider: a DocProvider
+    slug: slug of file to process, available by provider
+    output_file: file-like object or path to output file
+    output_dir: path to directory to save output file to; either this or output_file must be present
+    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
      """
  
      def transform_file(input_xml, chunk_counter=1, first=True):
      """
  
      def transform_file(input_xml, chunk_counter=1, first=True):
@@ -262,14 +280,19 @@ def transform(provider, slug, output_file=None, output_dir=None):
          # every input file will have a TOC entry,
          # pointing to starting chunk
          toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
          # every input file will have a TOC entry,
          # pointing to starting chunk
          toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
+        chars = set()
          if first:
              # write book title page
          if first:
              # write book title page
+            html_tree = xslt(input_xml, res('xsltTitle.xsl'))
+            chars = used_chars(html_tree.getroot())
              zip.writestr('OPS/title.html',
              zip.writestr('OPS/title.html',
-                 etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))
+                 etree.tostring(html_tree, pretty_print=True))
          elif children:
              # write title page for every parent
          elif children:
              # write title page for every parent
+            html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
+            chars = used_chars(html_tree.getroot())
              zip.writestr('OPS/part%d.html' % chunk_counter, 
              zip.writestr('OPS/part%d.html' % chunk_counter, 
-                etree.tostring(xslt(input_xml, res('xsltChunkTitle.xsl')), pretty_print=True))
+                etree.tostring(html_tree, pretty_print=True))
              add_to_manifest(manifest, chunk_counter)
              add_to_spine(spine, chunk_counter)
              chunk_counter += 1
              add_to_manifest(manifest, chunk_counter)
              add_to_spine(spine, chunk_counter)
              chunk_counter += 1
@@ -287,8 +310,9 @@ def transform(provider, slug, output_file=None, output_dir=None):
              replace_characters(main_text)
  
              for chunk_xml in chop(main_text):
              replace_characters(main_text)
  
              for chunk_xml in chop(main_text):
-                chunk_html, chunk_toc = transform_chunk(chunk_xml, chunk_counter, annotations)
+                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
                  toc.extend(chunk_toc)
                  toc.extend(chunk_toc)
+                chars = chars.union(chunk_chars)
                  zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                  add_to_manifest(manifest, chunk_counter)
                  add_to_spine(spine, chunk_counter)
                  zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                  add_to_manifest(manifest, chunk_counter)
                  add_to_spine(spine, chunk_counter)
@@ -297,10 +321,11 @@ def transform(provider, slug, output_file=None, output_dir=None):
          if children:
              for child in children:
                  child_xml = etree.parse(provider.by_uri(child))
          if children:
              for child in children:
                  child_xml = etree.parse(provider.by_uri(child))
-                child_toc, chunk_counter = transform_file(child_xml, chunk_counter, first=False)
+                child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
                  toc.append(child_toc)
                  toc.append(child_toc)
+                chars = chars.union(chunk_chars)
  
  
-        return toc, chunk_counter
+        return toc, chunk_counter, chars
  
      # read metadata from the first file
      input_xml = etree.parse(provider[slug])
  
      # read metadata from the first file
      input_xml = etree.parse(provider[slug])
@@ -312,14 +337,14 @@ def transform(provider, slug, output_file=None, output_dir=None):
  
      # if output to dir, create the file
      if output_dir is not None:
  
      # if output to dir, create the file
      if output_dir is not None:
-        author = unicode(book_info.author)
-        author_dir = os.path.join(output_dir, author)
-        try:
-            os.makedirs(author_dir)
-        except OSError:
-            pass
-        output_file = open(os.path.join(author_dir, '%s.epub' % slug), 'w')
-
+        if make_dir:
+            author = unicode(book_info.author)
+            output_dir = os.path.join(output_dir, author)
+            try:
+                os.makedirs(output_dir)
+            except OSError:
+                pass
+        output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
  
      zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
  
  
      zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
  
@@ -352,7 +377,8 @@ def transform(provider, slug, output_file=None, output_dir=None):
                                 '</navPoint></navMap></ncx>')
      nav_map = toc_file[-1]
  
                                 '</navPoint></navMap></ncx>')
      nav_map = toc_file[-1]
  
-    toc, chunk_counter = transform_file(input_xml)
+    toc, chunk_counter, chars = transform_file(input_xml)
+
      if not toc.children:
          toc.add(u"Początek utworu", 1)
      toc_counter = toc.write_to_xml(nav_map, 2)
      if not toc.children:
          toc.add(u"Początek utworu", 1)
      toc_counter = toc.write_to_xml(nav_map, 2)
@@ -367,8 +393,21 @@ def transform(provider, slug, output_file=None, output_dir=None):
          spine.append(etree.fromstring(
              '<itemref idref="annotations" />'))
          replace_by_verse(annotations)
          spine.append(etree.fromstring(
              '<itemref idref="annotations" />'))
          replace_by_verse(annotations)
+        html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
+        chars = chars.union(used_chars(html_tree.getroot()))
          zip.writestr('OPS/annotations.html', etree.tostring(
          zip.writestr('OPS/annotations.html', etree.tostring(
-                            xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))
+                            html_tree, pretty_print=True))
+
+    # strip fonts
+    tmpdir = mkdtemp('-librarian-epub')
+    cwd = os.getcwd()
+
+    os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
+    for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
+        subprocess.check_call(['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), res('../fonts/' + fname), os.path.join(tmpdir, fname)])
+        zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
+    rmtree(tmpdir)
+    os.chdir(cwd)
  
      zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
      contents = []
  
      zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
      contents = []
@@ -384,18 +423,3 @@ def transform(provider, slug, output_file=None, output_dir=None):
      set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
      zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
      zip.close()
      set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
      zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
      zip.close()
-
-
-if __name__ == '__main__':
-    from librarian import DirDocProvider
-
-    if len(sys.argv) < 2:
-        print >> sys.stderr, 'Usage: python epub.py <input file>'
-        sys.exit(1)
-
-    main_input = sys.argv[1]
-    basepath, ext = os.path.splitext(main_input)
-    path, slug = os.path.realpath(basepath).rsplit('/', 1)
-    provider = DirDocProvider(path)
-    transform(provider, slug, output_dir=path)
-