minor fix

[librarian.git] / librarian / epub.py
diff --git a/librarian/epub.py b/librarian/epub.py

index 2584029..a5607f7 100644 (file)
--- a/librarian/epub.py
+++ b/librarian/epub.py
@@ -15,11 +15,14 @@ from tempfile import mkdtemp
  from shutil import rmtree
  
  import sys
  from shutil import rmtree
  
  import sys
-sys.path.append('..') # for running from working copy
  
  from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  from librarian.dcparser import BookInfo
  
  
  from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
  from librarian.dcparser import BookInfo
  
+from librarian import functions
+
+functions.reg_person_name()
+
  
  def inner_xml(node):
      """ returns node's text and children as a string
  
  def inner_xml(node):
      """ returns node's text and children as a string
@@ -79,7 +82,8 @@ def replace_characters(node):
      def replace_chars(text):
          if text is None:
              return None
      def replace_chars(text):
          if text is None:
              return None
-        return text.replace("---", u"\u2014")\
+        return text.replace(u"\ufeff", u"")\
+                   .replace("---", u"\u2014")\
                     .replace("--", u"\u2013")\
                     .replace(",,", u"\u201E")\
                     .replace('"', u"\u201D")\
                     .replace("--", u"\u2013")\
                     .replace(",,", u"\u201E")\
                     .replace('"', u"\u201D")\
@@ -211,7 +215,6 @@ class TOC(object):
  
  def used_chars(element):
      """ Lists characters used in an ETree Element """
  
  def used_chars(element):
      """ Lists characters used in an ETree Element """
-    print (element.text or '') + (element.tail or '')
      chars = set((element.text or '') + (element.tail or ''))
      for child in element:
          chars = chars.union(used_chars(child))
      chars = set((element.text or '') + (element.tail or ''))
      for child in element:
          chars = chars.union(used_chars(child))
@@ -256,21 +259,25 @@ def transform_chunk(chunk_xml, chunk_no, annotations):
      replace_by_verse(chunk_xml)
      html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
      chars = used_chars(html_tree.getroot())
      replace_by_verse(chunk_xml)
      html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
      chars = used_chars(html_tree.getroot())
-    output_html = etree.tostring(html_tree, pretty_print=True)
+    output_html = etree.tostring(html_tree, method="html", pretty_print=True)
      return output_html, toc, chars
  
  
      return output_html, toc, chars
  
  
-def transform(provider, slug, output_file=None, output_dir=None):
-    """ produces an epub
+def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False):
+    """ produces an EPUB file
  
  
-    provider is a DocProvider
-    either output_file (a file-like object) or output_dir (path to file/dir) should be specified
-    if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.epub
+    provider: a DocProvider
+    slug: slug of the file to process, as made available by the provider; specify either this or file_path, not both
+    output_file: file-like object or path to output file
+    output_dir: path to directory to save output file to; either this or output_file must be present
+    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
      """
  
      def transform_file(input_xml, chunk_counter=1, first=True):
          """ processes one input file and proceeds to its children """
  
      """
  
      def transform_file(input_xml, chunk_counter=1, first=True):
          """ processes one input file and proceeds to its children """
  
+        replace_characters(input_xml.getroot())
+
          children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
  
          # every input file will have a TOC entry,
          children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
  
          # every input file will have a TOC entry,
@@ -282,13 +289,13 @@ def transform(provider, slug, output_file=None, output_dir=None):
              html_tree = xslt(input_xml, res('xsltTitle.xsl'))
              chars = used_chars(html_tree.getroot())
              zip.writestr('OPS/title.html',
              html_tree = xslt(input_xml, res('xsltTitle.xsl'))
              chars = used_chars(html_tree.getroot())
              zip.writestr('OPS/title.html',
-                 etree.tostring(html_tree, pretty_print=True))
+                 etree.tostring(html_tree, method="html", pretty_print=True))
          elif children:
              # write title page for every parent
              html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
              chars = used_chars(html_tree.getroot())
              zip.writestr('OPS/part%d.html' % chunk_counter, 
          elif children:
              # write title page for every parent
              html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
              chars = used_chars(html_tree.getroot())
              zip.writestr('OPS/part%d.html' % chunk_counter, 
-                etree.tostring(html_tree, pretty_print=True))
+                etree.tostring(html_tree, method="html", pretty_print=True))
              add_to_manifest(manifest, chunk_counter)
              add_to_spine(spine, chunk_counter)
              chunk_counter += 1
              add_to_manifest(manifest, chunk_counter)
              add_to_spine(spine, chunk_counter)
              chunk_counter += 1
@@ -303,8 +310,6 @@ def transform(provider, slug, output_file=None, output_dir=None):
                  main_text = None
  
          if main_text is not None:
                  main_text = None
  
          if main_text is not None:
-            replace_characters(main_text)
-
              for chunk_xml in chop(main_text):
                  chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
                  toc.extend(chunk_toc)
              for chunk_xml in chop(main_text):
                  chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
                  toc.extend(chunk_toc)
@@ -324,7 +329,17 @@ def transform(provider, slug, output_file=None, output_dir=None):
          return toc, chunk_counter, chars
  
      # read metadata from the first file
          return toc, chunk_counter, chars
  
      # read metadata from the first file
-    input_xml = etree.parse(provider[slug])
+    if file_path:
+        if slug:
+            raise ValueError('slug or file_path should be specified, not both')
+        f = open(file_path, 'r')
+        input_xml = etree.parse(f)
+        f.close()
+    else:
+        if not slug:
+            raise ValueError('either slug or file_path should be specified')
+        input_xml = etree.parse(provider[slug])
+
      metadata = input_xml.find('.//'+RDFNS('Description'))
      if metadata is None:
          raise NoDublinCore('Document has no DublinCore - which is required.')
      metadata = input_xml.find('.//'+RDFNS('Description'))
      if metadata is None:
          raise NoDublinCore('Document has no DublinCore - which is required.')
@@ -333,14 +348,17 @@ def transform(provider, slug, output_file=None, output_dir=None):
  
      # if output to dir, create the file
      if output_dir is not None:
  
      # if output to dir, create the file
      if output_dir is not None:
-        author = unicode(book_info.author)
-        author_dir = os.path.join(output_dir, author)
-        try:
-            os.makedirs(author_dir)
-        except OSError:
-            pass
-        output_file = open(os.path.join(author_dir, '%s.epub' % slug), 'w')
-
+        if make_dir:
+            author = unicode(book_info.author)
+            output_dir = os.path.join(output_dir, author)
+            try:
+                os.makedirs(output_dir)
+            except OSError:
+                pass
+        if slug:
+            output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
+        else:
+            output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
  
      zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
  
  
      zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
  
@@ -392,15 +410,20 @@ def transform(provider, slug, output_file=None, output_dir=None):
          html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
          chars = chars.union(used_chars(html_tree.getroot()))
          zip.writestr('OPS/annotations.html', etree.tostring(
          html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
          chars = chars.union(used_chars(html_tree.getroot()))
          zip.writestr('OPS/annotations.html', etree.tostring(
-                            html_tree, pretty_print=True))
+                            html_tree, method="html", pretty_print=True))
  
      # strip fonts
      tmpdir = mkdtemp('-librarian-epub')
      cwd = os.getcwd()
  
  
      # strip fonts
      tmpdir = mkdtemp('-librarian-epub')
      cwd = os.getcwd()
  
-    os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../font-optimizer'))
+    os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
      for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
      for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
-        subprocess.check_call(['./subset.pl', '--chars', ''.join(chars), res('../fonts/' + fname), os.path.join(tmpdir, fname)])
+        optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), res('../fonts/' + fname), os.path.join(tmpdir, fname)]
+        if verbose:
+            print "Running font-optimizer"
+            subprocess.check_call(optimizer_call)
+        else:
+            subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
          zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
      rmtree(tmpdir)
      os.chdir(cwd)
          zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
      rmtree(tmpdir)
      os.chdir(cwd)
@@ -419,18 +442,3 @@ def transform(provider, slug, output_file=None, output_dir=None):
      set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
      zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
      zip.close()
      set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
      zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
      zip.close()
-
-
-if __name__ == '__main__':
-    from librarian import DirDocProvider
-
-    if len(sys.argv) < 2:
-        print >> sys.stderr, 'Usage: python epub.py <input file>'
-        sys.exit(1)
-
-    main_input = sys.argv[1]
-    basepath, ext = os.path.splitext(main_input)
-    path, slug = os.path.realpath(basepath).rsplit('/', 1)
-    provider = DirDocProvider(path)
-    transform(provider, slug, output_dir=path)
-