updated text test
[librarian.git] / librarian / epub.py
index 84a745f..3ee487a 100644 (file)
@@ -7,16 +7,22 @@ from __future__ import with_statement
 
 import os
 import os.path
 
 import os
 import os.path
+import subprocess
 from copy import deepcopy
 from lxml import etree
 import zipfile
 from copy import deepcopy
 from lxml import etree
 import zipfile
+from tempfile import mkdtemp
+from shutil import rmtree
 
 import sys
 
 import sys
-sys.path.append('..') # for running from working copy
 
 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
 from librarian.dcparser import BookInfo
 
 
 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
 from librarian.dcparser import BookInfo
 
+from librarian import functions
+
+functions.reg_person_name()
+
 
 def inner_xml(node):
     """ returns node's text and children as a string
 
 def inner_xml(node):
     """ returns node's text and children as a string
@@ -206,6 +212,14 @@ class TOC(object):
         return counter
 
 
         return counter
 
 
+def used_chars(element):
+    """ Lists characters used in an ETree Element """
+    chars = set((element.text or '') + (element.tail or ''))
+    for child in element:
+        chars = chars.union(used_chars(child))
+    return chars
+
+
 def chop(main_text):
     """ divide main content of the XML file into chunks """
 
 def chop(main_text):
     """ divide main content of the XML file into chunks """
 
@@ -231,7 +245,7 @@ def chop(main_text):
 
 
 def transform_chunk(chunk_xml, chunk_no, annotations):
 
 
 def transform_chunk(chunk_xml, chunk_no, annotations):
-    """ transforms one chunk, returns a HTML string and a TOC object """
+    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
 
     toc = TOC()
     for element in chunk_xml[0]:
 
     toc = TOC()
     for element in chunk_xml[0]:
@@ -242,16 +256,20 @@ def transform_chunk(chunk_xml, chunk_no, annotations):
             element.set('sub', str(subnumber))
     find_annotations(annotations, chunk_xml, chunk_no)
     replace_by_verse(chunk_xml)
             element.set('sub', str(subnumber))
     find_annotations(annotations, chunk_xml, chunk_no)
     replace_by_verse(chunk_xml)
-    output_html = etree.tostring(xslt(chunk_xml, res('xsltScheme.xsl')), pretty_print=True)
-    return output_html, toc
+    html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
+    chars = used_chars(html_tree.getroot())
+    output_html = etree.tostring(html_tree, pretty_print=True)
+    return output_html, toc, chars
 
 
 
 
-def transform(provider, slug, output_file=None, output_dir=None):
-    """ produces an epub
+def transform(provider, slug, output_file=None, output_dir=None, make_dir=False):
+    """ produces a EPUB file
 
 
-    provider is a DocProvider
-    either output_file (a file-like object) or output_dir (path to file/dir) should be specified
-    if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.epub
+    provider: a DocProvider
+    slug: slug of file to process, available by provider
+    output_file: file-like object or path to output file
+    output_dir: path to directory to save output file to; either this or output_file must be present
+    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
     """
 
     def transform_file(input_xml, chunk_counter=1, first=True):
     """
 
     def transform_file(input_xml, chunk_counter=1, first=True):
@@ -262,14 +280,19 @@ def transform(provider, slug, output_file=None, output_dir=None):
         # every input file will have a TOC entry,
         # pointing to starting chunk
         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
         # every input file will have a TOC entry,
         # pointing to starting chunk
         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
+        chars = set()
         if first:
             # write book title page
         if first:
             # write book title page
+            html_tree = xslt(input_xml, res('xsltTitle.xsl'))
+            chars = used_chars(html_tree.getroot())
             zip.writestr('OPS/title.html',
             zip.writestr('OPS/title.html',
-                 etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))
+                 etree.tostring(html_tree, pretty_print=True))
         elif children:
             # write title page for every parent
         elif children:
             # write title page for every parent
+            html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
+            chars = used_chars(html_tree.getroot())
             zip.writestr('OPS/part%d.html' % chunk_counter, 
             zip.writestr('OPS/part%d.html' % chunk_counter, 
-                etree.tostring(xslt(input_xml, res('xsltChunkTitle.xsl')), pretty_print=True))
+                etree.tostring(html_tree, pretty_print=True))
             add_to_manifest(manifest, chunk_counter)
             add_to_spine(spine, chunk_counter)
             chunk_counter += 1
             add_to_manifest(manifest, chunk_counter)
             add_to_spine(spine, chunk_counter)
             chunk_counter += 1
@@ -287,8 +310,9 @@ def transform(provider, slug, output_file=None, output_dir=None):
             replace_characters(main_text)
 
             for chunk_xml in chop(main_text):
             replace_characters(main_text)
 
             for chunk_xml in chop(main_text):
-                chunk_html, chunk_toc = transform_chunk(chunk_xml, chunk_counter, annotations)
+                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
                 toc.extend(chunk_toc)
                 toc.extend(chunk_toc)
+                chars = chars.union(chunk_chars)
                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                 add_to_manifest(manifest, chunk_counter)
                 add_to_spine(spine, chunk_counter)
                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                 add_to_manifest(manifest, chunk_counter)
                 add_to_spine(spine, chunk_counter)
@@ -297,10 +321,11 @@ def transform(provider, slug, output_file=None, output_dir=None):
         if children:
             for child in children:
                 child_xml = etree.parse(provider.by_uri(child))
         if children:
             for child in children:
                 child_xml = etree.parse(provider.by_uri(child))
-                child_toc, chunk_counter = transform_file(child_xml, chunk_counter, first=False)
+                child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
                 toc.append(child_toc)
                 toc.append(child_toc)
+                chars = chars.union(chunk_chars)
 
 
-        return toc, chunk_counter
+        return toc, chunk_counter, chars
 
     # read metadata from the first file
     input_xml = etree.parse(provider[slug])
 
     # read metadata from the first file
     input_xml = etree.parse(provider[slug])
@@ -312,14 +337,14 @@ def transform(provider, slug, output_file=None, output_dir=None):
 
     # if output to dir, create the file
     if output_dir is not None:
 
     # if output to dir, create the file
     if output_dir is not None:
-        author = unicode(book_info.author)
-        author_dir = os.path.join(output_dir, author)
-        try:
-            os.makedirs(author_dir)
-        except OSError:
-            pass
-        output_file = open(os.path.join(author_dir, '%s.epub' % slug), 'w')
-
+        if make_dir:
+            author = unicode(book_info.author)
+            output_dir = os.path.join(output_dir, author)
+            try:
+                os.makedirs(output_dir)
+            except OSError:
+                pass
+        output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
 
     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 
 
     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
 
@@ -352,7 +377,8 @@ def transform(provider, slug, output_file=None, output_dir=None):
                                '</navPoint></navMap></ncx>')
     nav_map = toc_file[-1]
 
                                '</navPoint></navMap></ncx>')
     nav_map = toc_file[-1]
 
-    toc, chunk_counter = transform_file(input_xml)
+    toc, chunk_counter, chars = transform_file(input_xml)
+
     if not toc.children:
         toc.add(u"Początek utworu", 1)
     toc_counter = toc.write_to_xml(nav_map, 2)
     if not toc.children:
         toc.add(u"Początek utworu", 1)
     toc_counter = toc.write_to_xml(nav_map, 2)
@@ -367,8 +393,21 @@ def transform(provider, slug, output_file=None, output_dir=None):
         spine.append(etree.fromstring(
             '<itemref idref="annotations" />'))
         replace_by_verse(annotations)
         spine.append(etree.fromstring(
             '<itemref idref="annotations" />'))
         replace_by_verse(annotations)
+        html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
+        chars = chars.union(used_chars(html_tree.getroot()))
         zip.writestr('OPS/annotations.html', etree.tostring(
         zip.writestr('OPS/annotations.html', etree.tostring(
-                            xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))
+                            html_tree, pretty_print=True))
+
+    # strip fonts
+    tmpdir = mkdtemp('-librarian-epub')
+    cwd = os.getcwd()
+
+    os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
+    for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
+        subprocess.check_call(['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), res('../fonts/' + fname), os.path.join(tmpdir, fname)])
+        zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
+    rmtree(tmpdir)
+    os.chdir(cwd)
 
     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
     contents = []
 
     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
     contents = []
@@ -384,18 +423,3 @@ def transform(provider, slug, output_file=None, output_dir=None):
     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
     zip.close()
     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
     zip.close()
-
-
-if __name__ == '__main__':
-    from librarian import DirDocProvider
-
-    if len(sys.argv) < 2:
-        print >> sys.stderr, 'Usage: python epub.py <input file>'
-        sys.exit(1)
-
-    main_input = sys.argv[1]
-    basepath, ext = os.path.splitext(main_input)
-    path, slug = os.path.realpath(basepath).rsplit('/', 1)
-    provider = DirDocProvider(path)
-    transform(provider, slug, output_dir=path)
-