updated tests

[librarian.git] / librarian / pdf.py
diff --git a/librarian/pdf.py b/librarian/pdf.py

index 94441db..a0a0899 100644 (file)
--- a/librarian/pdf.py
+++ b/librarian/pdf.py
@@ -19,14 +19,13 @@ from Texml.processor import process
  from lxml import etree
  from lxml.etree import XMLSyntaxError, XSLTApplyError
  
  from lxml import etree
  from lxml.etree import XMLSyntaxError, XSLTApplyError
  
+from librarian.dcparser import Person
  from librarian.parser import WLDocument
  from librarian.parser import WLDocument
-from librarian import ParseError
+from librarian import ParseError, DCNS
  from librarian import functions
  
  
  from librarian import functions
  
  
-
  functions.reg_substitute_entities()
  functions.reg_substitute_entities()
-functions.reg_person_name()
  functions.reg_strip()
  functions.reg_starts_white()
  functions.reg_ends_white()
  functions.reg_strip()
  functions.reg_starts_white()
  functions.reg_ends_white()
@@ -45,27 +44,23 @@ def insert_tags(doc, split_re, tagname):
      <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
      """
  
      <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
      """
  
-    for elem in doc.iter():
-        try:
-            if elem.text:
-                chunks = split_re.split(elem.text)
-                while len(chunks) > 1:
-                    ins = etree.Element(tagname)
-                    ins.tail = chunks.pop()
-                    elem.insert(0, ins)
-                elem.text = chunks.pop(0)
-            if elem.tail:
-                chunks = split_re.split(elem.tail)
-                parent = elem.getparent()
-                ins_index = parent.index(elem) + 1
-                while len(chunks) > 1:
-                    ins = etree.Element(tagname)
-                    ins.tail = chunks.pop()
-                    parent.insert(ins_index, ins)
-                elem.tail = chunks.pop(0)
-        except TypeError, e:
-            # element with no children, like comment
-            pass
+    for elem in doc.iter(tag=etree.Element):
+        if elem.text:
+            chunks = split_re.split(elem.text)
+            while len(chunks) > 1:
+                ins = etree.Element(tagname)
+                ins.tail = chunks.pop()
+                elem.insert(0, ins)
+            elem.text = chunks.pop(0)
+        if elem.tail:
+            chunks = split_re.split(elem.tail)
+            parent = elem.getparent()
+            ins_index = parent.index(elem) + 1
+            while len(chunks) > 1:
+                ins = etree.Element(tagname)
+                ins.tail = chunks.pop()
+                parent.insert(ins_index, ins)
+            elem.tail = chunks.pop(0)
  
  
  def substitute_hyphens(doc):
  
  
  def substitute_hyphens(doc):
@@ -126,6 +121,16 @@ def hack_motifs(doc):
                  break
  
  
                  break
  
  
+def parse_creator(doc):
+    """ find all dc:creator tags and add dc:creator_parsed with forenames first """
+    for creator in doc.findall('//'+DCNS('creator')):
+        p = Person.from_text(creator.text)
+        creator_parsed = deepcopy(creator)
+        creator_parsed.tag = DCNS('creator_parsed')
+        creator_parsed.text = ' '.join(p.first_names + (p.last_name,))
+        creator.getparent().insert(0, creator_parsed)
+
+
  def get_resource(path):
      return os.path.join(os.path.dirname(__file__), path)
  
  def get_resource(path):
      return os.path.join(os.path.dirname(__file__), path)
  
@@ -153,11 +158,12 @@ def package_available(package, args='', verbose=False):
      return p == 0
  
  
      return p == 0
  
  
-def transform(provider, slug, output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None):
+def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None):
      """ produces a PDF file with XeLaTeX
  
      provider: a DocProvider
      slug: slug of file to process, available by provider
      """ produces a PDF file with XeLaTeX
  
      provider: a DocProvider
      slug: slug of file to process, available by provider
+    file_path can be provided instead of a slug
      output_file: file-like object or path to output file
      output_dir: path to directory to save output file to; either this or output_file must be present
      make_dir: writes output to <output_dir>/<author>/<slug>.pdf instead of <output_dir>/<slug>.pdf
      output_file: file-like object or path to output file
      output_dir: path to directory to save output file to; either this or output_file must be present
      make_dir: writes output to <output_dir>/<author>/<slug>.pdf instead of <output_dir>/<slug>.pdf
@@ -167,20 +173,24 @@ def transform(provider, slug, output_file=None, output_dir=None, make_dir=False,
  
      # Parse XSLT
      try:
  
      # Parse XSLT
      try:
-        # check for latex packages
-        if not package_available('morefloats', 'maxfloats=19', verbose=verbose):
-            document.edoc.getroot().set('old-morefloats', 'yes')
-            print >> sys.stderr, """
-==============================================================================
-LaTeX `morefloats' package is older than v.1.0c or not available at all.
-Some documents with many motifs in long stanzas or paragraphs may not compile.
-=============================================================================="""
+        if file_path:
+            if slug:
+                raise ValueError('slug or file_path should be specified, not both')
+            document = load_including_children(provider, file_path=file_path)
+        else:
+            if not slug:
+                raise ValueError('either slug or file_path should be specified')
+            document = load_including_children(provider, slug=slug)
  
  
-        document = load_including_children(provider, slug)
+        # check for LaTeX packages
+        if not package_available('morefloats', 'maxfloats=19'):
+            # using old morefloats or none at all
+            document.edoc.getroot().set('old-morefloats', 'yes')
  
          # hack the tree
          move_motifs_inside(document.edoc)
          hack_motifs(document.edoc)
  
          # hack the tree
          move_motifs_inside(document.edoc)
          hack_motifs(document.edoc)
+        parse_creator(document.edoc)
          substitute_hyphens(document.edoc)
          fix_hanging(document.edoc)
  
          substitute_hyphens(document.edoc)
          fix_hanging(document.edoc)
  
@@ -229,7 +239,10 @@ Some documents with many motifs in long stanzas or paragraphs may not compile.
                  os.makedirs(output_dir)
              except OSError:
                  pass
                  os.makedirs(output_dir)
              except OSError:
                  pass
-            output_path = os.path.join(output_dir, '%s.pdf' % slug)
+            if slug:
+                output_path = os.path.join(output_dir, '%s.pdf' % slug)
+            else:
+                output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
              shutil.move(pdf_path, output_path)
          else:
              if hasattr(output_file, 'write'):
              shutil.move(pdf_path, output_path)
          else:
              if hasattr(output_file, 'write'):
@@ -246,7 +259,7 @@ Some documents with many motifs in long stanzas or paragraphs may not compile.
          raise ParseError(e)
  
  
          raise ParseError(e)
  
  
-def load_including_children(provider, slug=None, uri=None):
+def load_including_children(provider, slug=None, uri=None, file_path=None):
      """ makes one big xml file with children inserted at end 
      either slug, uri or file_path must be provided
      """
      """ makes one big xml file with children inserted at end 
      either slug, uri or file_path must be provided
      """
@@ -255,13 +268,17 @@ def load_including_children(provider, slug=None, uri=None):
          f = provider.by_uri(uri)
      elif slug:
          f = provider[slug]
          f = provider.by_uri(uri)
      elif slug:
          f = provider[slug]
+    elif file_path:
+        f = open(file_path, 'r')
      else:
      else:
-        raise ValueError('Neither slug nor URI provided for a book.')
+        raise ValueError('Neither slug, URI nor file path provided for a book.')
  
      document = WLDocument.from_file(f, True,
          parse_dublincore=True,
          preserve_lines=False)
  
  
      document = WLDocument.from_file(f, True,
          parse_dublincore=True,
          preserve_lines=False)
  
+    f.close()
+
      for child_uri in document.book_info.parts:
          child = load_including_children(provider, uri=child_uri)
          document.edoc.getroot().append(child.edoc.getroot())
      for child_uri in document.book_info.parts:
          child = load_including_children(provider, uri=child_uri)
          document.edoc.getroot().append(child.edoc.getroot())