fixes #1060: sorted and unique editors' names

[librarian.git] / librarian / epub.py
diff --git a/librarian/epub.py b/librarian/epub.py

index 84a745f..527d050 100644 (file)
--- a/librarian/epub.py
+++ b/librarian/epub.py
@@ -7,16 +7,23 @@ from __future__ import with_statement
  
  import os
  import os.path
  
  import os
  import os.path
+import subprocess
+from StringIO import StringIO
  from copy import deepcopy
  from lxml import etree
  import zipfile
  from copy import deepcopy
  from lxml import etree
  import zipfile
+from tempfile import mkdtemp
+from shutil import rmtree
  
  import sys
  
  import sys
-sys.path.append('..') # for running from working copy
  
  from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  from librarian.dcparser import BookInfo
  
  
  from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  from librarian.dcparser import BookInfo
  
+from librarian import functions, get_resource
+
+functions.reg_person_name()
+
  
  def inner_xml(node):
      """ returns node's text and children as a string
  
  def inner_xml(node):
      """ returns node's text and children as a string
@@ -67,16 +74,12 @@ def xslt(xml, sheet):
          return xml.xslt(etree.parse(xsltf))
  
  
          return xml.xslt(etree.parse(xsltf))
  
  
-_resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
-def res(fname):
-    return os.path.join(_resdir, fname)
-
-
  def replace_characters(node):
      def replace_chars(text):
          if text is None:
              return None
  def replace_characters(node):
      def replace_chars(text):
          if text is None:
              return None
-        return text.replace("---", u"\u2014")\
+        return text.replace(u"\ufeff", u"")\
+                   .replace("---", u"\u2014")\
                     .replace("--", u"\u2013")\
                     .replace(",,", u"\u201E")\
                     .replace('"', u"\u201D")\
                     .replace("--", u"\u2013")\
                     .replace(",,", u"\u201E")\
                     .replace('"', u"\u201D")\
@@ -103,7 +106,7 @@ def find_annotations(annotations, source, part_no):
              child.clear()
              child.tail = tail
              child.text = number
              child.clear()
              child.tail = tail
              child.text = number
-        if child.tag not in ('extra', 'podtytul'):
+        if child.tag not in ('extra',):
              find_annotations(annotations, child, part_no)
  
  
              find_annotations(annotations, child, part_no)
  
  
@@ -206,6 +209,14 @@ class TOC(object):
          return counter
  
  
          return counter
  
  
+def used_chars(element):
+    """ Lists characters used in an ETree Element """
+    chars = set((element.text or '') + (element.tail or ''))
+    for child in element:
+        chars = chars.union(used_chars(child))
+    return chars
+
+
  def chop(main_text):
      """ divide main content of the XML file into chunks """
  
  def chop(main_text):
      """ divide main content of the XML file into chunks """
  
@@ -230,8 +241,8 @@ def chop(main_text):
      yield part_xml
  
  
      yield part_xml
  
  
-def transform_chunk(chunk_xml, chunk_no, annotations):
-    """ transforms one chunk, returns a HTML string and a TOC object """
+def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
+    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
  
      toc = TOC()
      for element in chunk_xml[0]:
  
      toc = TOC()
      for element in chunk_xml[0]:
@@ -240,36 +251,61 @@ def transform_chunk(chunk_xml, chunk_no, annotations):
          elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
              subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
              element.set('sub', str(subnumber))
          elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
              subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
              element.set('sub', str(subnumber))
-    find_annotations(annotations, chunk_xml, chunk_no)
-    replace_by_verse(chunk_xml)
-    output_html = etree.tostring(xslt(chunk_xml, res('xsltScheme.xsl')), pretty_print=True)
-    return output_html, toc
-
-
-def transform(provider, slug, output_file=None, output_dir=None):
-    """ produces an epub
-
-    provider is a DocProvider
-    either output_file (a file-like object) or output_dir (path to file/dir) should be specified
-    if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.epub
+    if empty:
+        if not _empty_html_static:
+            _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
+        chars = set()
+        output_html = _empty_html_static[0]
+    else:
+        find_annotations(annotations, chunk_xml, chunk_no)
+        replace_by_verse(chunk_xml)
+        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
+        chars = used_chars(html_tree.getroot())
+        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
+    return output_html, toc, chars
+
+
+def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
+              sample=None, cover_fn=None, flags=None):
+    """ produces a EPUB file
+
+    provider: a DocProvider
+    slug: slug of file to process, available by provider
+    output_file: file-like object or path to output file
+    output_dir: path to directory to save output file to; either this or output_file must be present
+    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
+    sample=n: generate sample e-book (with at least n paragraphs)
+    cover_fn: function(author, title) -> cover image
+    flags: less-advertising,
      """
  
      """
  
-    def transform_file(input_xml, chunk_counter=1, first=True):
+    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
          """ processes one input file and proceeds to its children """
  
          """ processes one input file and proceeds to its children """
  
+        replace_characters(input_xml.getroot())
+
          children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
  
          # every input file will have a TOC entry,
          # pointing to starting chunk
          toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
          children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
  
          # every input file will have a TOC entry,
          # pointing to starting chunk
          toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
+        chars = set()
          if first:
              # write book title page
          if first:
              # write book title page
+            html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
+            chars = used_chars(html_tree.getroot())
              zip.writestr('OPS/title.html',
              zip.writestr('OPS/title.html',
-                 etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))
+                 etree.tostring(html_tree, method="html", pretty_print=True))
          elif children:
              # write title page for every parent
          elif children:
              # write title page for every parent
-            zip.writestr('OPS/part%d.html' % chunk_counter, 
-                etree.tostring(xslt(input_xml, res('xsltChunkTitle.xsl')), pretty_print=True))
+            if sample is not None and sample <= 0:
+                chars = set()
+                html_string = open(get_resource('epub/emptyChunk.html')).read()
+            else:
+                html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
+                chars = used_chars(html_tree.getroot())
+                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
+            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
              add_to_manifest(manifest, chunk_counter)
              add_to_spine(spine, chunk_counter)
              chunk_counter += 1
              add_to_manifest(manifest, chunk_counter)
              add_to_spine(spine, chunk_counter)
              chunk_counter += 1
@@ -284,11 +320,17 @@ def transform(provider, slug, output_file=None, output_dir=None):
                  main_text = None
  
          if main_text is not None:
                  main_text = None
  
          if main_text is not None:
-            replace_characters(main_text)
-
              for chunk_xml in chop(main_text):
              for chunk_xml in chop(main_text):
-                chunk_html, chunk_toc = transform_chunk(chunk_xml, chunk_counter, annotations)
+                empty = False
+                if sample is not None:
+                    if sample <= 0:
+                        empty = True
+                    else:
+                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
+                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
+
                  toc.extend(chunk_toc)
                  toc.extend(chunk_toc)
+                chars = chars.union(chunk_chars)
                  zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                  add_to_manifest(manifest, chunk_counter)
                  add_to_spine(spine, chunk_counter)
                  zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                  add_to_manifest(manifest, chunk_counter)
                  add_to_spine(spine, chunk_counter)
@@ -297,13 +339,28 @@ def transform(provider, slug, output_file=None, output_dir=None):
          if children:
              for child in children:
                  child_xml = etree.parse(provider.by_uri(child))
          if children:
              for child in children:
                  child_xml = etree.parse(provider.by_uri(child))
-                child_toc, chunk_counter = transform_file(child_xml, chunk_counter, first=False)
+                child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
                  toc.append(child_toc)
                  toc.append(child_toc)
+                chars = chars.union(chunk_chars)
  
  
-        return toc, chunk_counter
+        return toc, chunk_counter, chars, sample
  
      # read metadata from the first file
  
      # read metadata from the first file
-    input_xml = etree.parse(provider[slug])
+    if file_path:
+        if slug:
+            raise ValueError('slug or file_path should be specified, not both')
+        f = open(file_path, 'r')
+        input_xml = etree.parse(f)
+        f.close()
+    else:
+        if not slug:
+            raise ValueError('either slug or file_path should be specified')
+        input_xml = etree.parse(provider[slug])
+
+    if flags:
+        for flag in flags:
+            input_xml.getroot().set(flag, 'yes')
+
      metadata = input_xml.find('.//'+RDFNS('Description'))
      if metadata is None:
          raise NoDublinCore('Document has no DublinCore - which is required.')
      metadata = input_xml.find('.//'+RDFNS('Description'))
      if metadata is None:
          raise NoDublinCore('Document has no DublinCore - which is required.')
@@ -312,14 +369,17 @@ def transform(provider, slug, output_file=None, output_dir=None):
  
      # if output to dir, create the file
      if output_dir is not None:
  
      # if output to dir, create the file
      if output_dir is not None:
-        author = unicode(book_info.author)
-        author_dir = os.path.join(output_dir, author)
-        try:
-            os.makedirs(author_dir)
-        except OSError:
-            pass
-        output_file = open(os.path.join(author_dir, '%s.epub' % slug), 'w')
-
+        if make_dir:
+            author = unicode(book_info.author)
+            output_dir = os.path.join(output_dir, author)
+            try:
+                os.makedirs(output_dir)
+            except OSError:
+                pass
+        if slug:
+            output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
+        else:
+            output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
  
      zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
  
  
      zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
  
@@ -334,13 +394,28 @@ def transform(provider, slug, output_file=None, output_dir=None):
                         '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                         'media-type="application/oebps-package+xml" />' \
                         '</rootfiles></container>')
                         '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                         'media-type="application/oebps-package+xml" />' \
                         '</rootfiles></container>')
-    for fname in 'style.css', 'logo_wolnelektury.png':
-        zip.write(res(fname), os.path.join('OPS', fname))
+    zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
+    zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
  
  
-    opf = xslt(metadata, res('xsltContent.xsl'))
+    opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
      manifest = opf.find('.//' + OPFNS('manifest'))
      spine = opf.find('.//' + OPFNS('spine'))
  
      manifest = opf.find('.//' + OPFNS('manifest'))
      spine = opf.find('.//' + OPFNS('spine'))
  
+    if cover_fn:
+        cover = StringIO()
+        cover_fn(book_info.author.readable(), book_info.title).save(cover, format='JPEG')
+        zip.writestr(os.path.join('OPS', 'cover.jpg'), cover.getvalue())
+        del cover
+        zip.writestr('OPS/cover.html', open(get_resource('epub/cover.html')).read())
+        manifest.append(etree.fromstring(
+            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
+        manifest.append(etree.fromstring(
+            '<item id="cover-image" href="cover.jpg" media-type="image/jpeg" />'))
+        spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
+        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
+        opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
+
+
      annotations = etree.Element('annotations')
  
      toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
      annotations = etree.Element('annotations')
  
      toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
@@ -352,7 +427,8 @@ def transform(provider, slug, output_file=None, output_dir=None):
                                 '</navPoint></navMap></ncx>')
      nav_map = toc_file[-1]
  
                                 '</navPoint></navMap></ncx>')
      nav_map = toc_file[-1]
  
-    toc, chunk_counter = transform_file(input_xml)
+    toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
+
      if not toc.children:
          toc.add(u"Początek utworu", 1)
      toc_counter = toc.write_to_xml(nav_map, 2)
      if not toc.children:
          toc.add(u"Początek utworu", 1)
      toc_counter = toc.write_to_xml(nav_map, 2)
@@ -362,13 +438,45 @@ def transform(provider, slug, output_file=None, output_dir=None):
          nav_map.append(etree.fromstring(
              '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
              '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
          nav_map.append(etree.fromstring(
              '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
              '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
+        toc_counter += 1
          manifest.append(etree.fromstring(
              '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
          spine.append(etree.fromstring(
              '<itemref idref="annotations" />'))
          replace_by_verse(annotations)
          manifest.append(etree.fromstring(
              '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
          spine.append(etree.fromstring(
              '<itemref idref="annotations" />'))
          replace_by_verse(annotations)
+        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
+        chars = chars.union(used_chars(html_tree.getroot()))
          zip.writestr('OPS/annotations.html', etree.tostring(
          zip.writestr('OPS/annotations.html', etree.tostring(
-                            xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))
+                            html_tree, method="html", pretty_print=True))
+
+    nav_map.append(etree.fromstring(
+        '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
+        '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
+    manifest.append(etree.fromstring(
+        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
+    spine.append(etree.fromstring(
+        '<itemref idref="last" />'))
+    html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
+    chars.update(used_chars(html_tree.getroot()))
+    zip.writestr('OPS/last.html', etree.tostring(
+                        html_tree, method="html", pretty_print=True))
+
+    # strip fonts
+    tmpdir = mkdtemp('-librarian-epub')
+    cwd = os.getcwd()
+
+    os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
+    for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
+        optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), 
+                          get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
+        if verbose:
+            print "Running font-optimizer"
+            subprocess.check_call(optimizer_call)
+        else:
+            subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
+        zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
+    rmtree(tmpdir)
+    os.chdir(cwd)
  
      zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
      contents = []
  
      zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
      contents = []
@@ -384,18 +492,3 @@ def transform(provider, slug, output_file=None, output_dir=None):
      set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
      zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
      zip.close()
      set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
      zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
      zip.close()
-
-
-if __name__ == '__main__':
-    from librarian import DirDocProvider
-
-    if len(sys.argv) < 2:
-        print >> sys.stderr, 'Usage: python epub.py <input file>'
-        sys.exit(1)
-
-    main_input = sys.argv[1]
-    basepath, ext = os.path.splitext(main_input)
-    path, slug = os.path.realpath(basepath).rsplit('/', 1)
-    provider = DirDocProvider(path)
-    transform(provider, slug, output_dir=path)
-