kind, genre, epoch not required

[librarian.git] / librarian / pdf.py
diff --git a/librarian/pdf.py b/librarian/pdf.py

index 5debb68..bcf8d9a 100644 (file)
--- a/librarian/pdf.py
+++ b/librarian/pdf.py
@@ -8,21 +8,20 @@ import os
  import os.path
  import shutil
  from StringIO import StringIO
  import os.path
  import shutil
  from StringIO import StringIO
-from tempfile import mkdtemp
+from tempfile import mkdtemp, NamedTemporaryFile
  import re
  from copy import deepcopy
  from subprocess import call, PIPE
  
  import re
  from copy import deepcopy
  from subprocess import call, PIPE
  
-import sys
-
  from Texml.processor import process
  from lxml import etree
  from lxml.etree import XMLSyntaxError, XSLTApplyError
  
  from librarian.dcparser import Person
  from librarian.parser import WLDocument
  from Texml.processor import process
  from lxml import etree
  from lxml.etree import XMLSyntaxError, XSLTApplyError
  
  from librarian.dcparser import Person
  from librarian.parser import WLDocument
-from librarian import ParseError, DCNS
+from librarian import ParseError, DCNS, get_resource, OutputFile
  from librarian import functions
  from librarian import functions
+from librarian.cover import WLCover
  
  
  functions.reg_substitute_entities()
  
  
  functions.reg_substitute_entities()
@@ -32,12 +31,19 @@ functions.reg_ends_white()
  functions.reg_texcommand()
  
  STYLESHEETS = {
  functions.reg_texcommand()
  
  STYLESHEETS = {
-    'wl2tex': 'xslt/wl2tex.xslt',
+    'wl2tex': 'pdf/wl2tex.xslt',
  }
  
  }
  
+CUSTOMIZATIONS = [
+    'nofootnotes',
+    'nothemes',
+    'onehalfleading',
+    'doubleleading',
+    'nowlfont',
+    ]
  
  
-def insert_tags(doc, split_re, tagname):
-    """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree 
+def insert_tags(doc, split_re, tagname, exclude=None):
+    """ inserts <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree
  
      >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
      >>> insert_tags(t, re.compile('-'), 'd');
  
      >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
      >>> insert_tags(t, re.compile('-'), 'd');
@@ -46,6 +52,8 @@ def insert_tags(doc, split_re, tagname):
      """
  
      for elem in doc.iter(tag=etree.Element):
      """
  
      for elem in doc.iter(tag=etree.Element):
+        if exclude and elem.tag in exclude:
+            continue
          if elem.text:
              chunks = split_re.split(elem.text)
              while len(chunks) > 1:
          if elem.text:
              chunks = split_re.split(elem.text)
              while len(chunks) > 1:
@@ -65,15 +73,19 @@ def insert_tags(doc, split_re, tagname):
  
  
  def substitute_hyphens(doc):
  
  
  def substitute_hyphens(doc):
-    insert_tags(doc, 
+    insert_tags(doc,
                  re.compile("(?<=[^-\s])-(?=[^-\s])"),
                  re.compile("(?<=[^-\s])-(?=[^-\s])"),
-                "dywiz")
+                "dywiz",
+                exclude=[DCNS("identifier.url"), DCNS("rights.license")]
+                )
  
  
  def fix_hanging(doc):
  
  
  def fix_hanging(doc):
-    insert_tags(doc, 
+    insert_tags(doc,
                  re.compile("(?<=\s\w)\s+"),
                  re.compile("(?<=\s\w)\s+"),
-                "nbsp")
+                "nbsp",
+                exclude=[DCNS("identifier.url"), DCNS("rights.license")]
+                )
  
  
  def move_motifs_inside(doc):
  
  
  def move_motifs_inside(doc):
@@ -122,29 +134,31 @@ def hack_motifs(doc):
  
  
  def parse_creator(doc):
  
  
  def parse_creator(doc):
-    """ find all dc:creator tags and add dc:creator_parsed with forenames first """
-    for creator in doc.findall('//'+DCNS('creator')):
-        p = Person.from_text(creator.text)
-        creator_parsed = deepcopy(creator)
-        creator_parsed.tag = DCNS('creator_parsed')
-        creator_parsed.text = ' '.join(p.first_names + (p.last_name,))
-        creator.getparent().insert(0, creator_parsed)
-
+    """ find all dc:creator and dc:contributor.* tags and add *_parsed versions with forenames first """
+    for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
+                    'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
+                    namespaces = {'dc': str(DCNS)})[::-1]:
+        if not person.text:
+            continue
+        p = Person.from_text(person.text)
+        person_parsed = deepcopy(person)
+        person_parsed.tag = person.tag + '_parsed'
+        person_parsed.set('sortkey', person.text)
+        person_parsed.text = p.readable()
+        person.getparent().insert(0, person_parsed)
  
  
-def get_resource(path):
-    return os.path.join(os.path.dirname(__file__), path)
  
  def get_stylesheet(name):
      return get_resource(STYLESHEETS[name])
  
  
  def package_available(package, args='', verbose=False):
  
  def get_stylesheet(name):
      return get_resource(STYLESHEETS[name])
  
  
  def package_available(package, args='', verbose=False):
-    """ check if a verion of a latex package accepting given args is available """  
+    """ check if a version of a latex package accepting given args is available """
      tempdir = mkdtemp('-wl2pdf-test')
      fpath = os.path.join(tempdir, 'test.tex')
      f = open(fpath, 'w')
      f.write(r"""
      tempdir = mkdtemp('-wl2pdf-test')
      fpath = os.path.join(tempdir, 'test.tex')
      f = open(fpath, 'w')
      f.write(r"""
-        \documentclass{book}
+        \documentclass{wl}
          \usepackage[%s]{%s}
          \begin{document}
          \end{document}
          \usepackage[%s]{%s}
          \begin{document}
          \end{document}
@@ -158,31 +172,36 @@ def package_available(package, args='', verbose=False):
      return p == 0
  
  
      return p == 0
  
  
-def transform(provider, slug=None, file_path=None, 
-              output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None):
+def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
+              cover=None, flags=None, customizations=None):
      """ produces a PDF file with XeLaTeX
  
      """ produces a PDF file with XeLaTeX
  
-    provider: a DocProvider
-    slug: slug of file to process, available by provider
-    file_path can be provided instead of a slug
-    output_file: file-like object or path to output file
-    output_dir: path to directory to save output file to; either this or output_file must be present
-    make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
+    wldoc: a WLDocument
      verbose: prints all output from LaTeX
      save_tex: path to save the intermediary LaTeX file to
      morefloats (old/new/none): force specific morefloats
      verbose: prints all output from LaTeX
      save_tex: path to save the intermediary LaTeX file to
      morefloats (old/new/none): force specific morefloats
+    cover: a cover.Cover object
+    flags: iterable of flag names (e.g. less-advertising); each is set as a flag-<name>="yes" attribute on the document root
+    customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
      """
  
      # Parse XSLT
      try:
      """
  
      # Parse XSLT
      try:
-        if file_path:
-            if slug:
-                raise ValueError('slug or file_path should be specified, not both')
-            document = load_including_children(provider, file_path=file_path)
-        else:
-            if not slug:
-                raise ValueError('either slug or file_path should be specified')
-            document = load_including_children(provider, slug=slug)
+        document = load_including_children(wldoc)
+
+        if cover:
+            if cover is True:
+                cover = WLCover
+            document.edoc.getroot().set('data-cover-width', str(cover.width))
+            document.edoc.getroot().set('data-cover-height', str(cover.height))
+            if cover.uses_dc_cover:
+                if document.book_info.cover_by:
+                    document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
+                if document.book_info.cover_source:
+                    document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
+        if flags:
+            for flag in flags:
+                document.edoc.getroot().set('flag-' + flag, 'yes')
  
          # check for LaTeX packages
          if morefloats:
  
          # check for LaTeX packages
          if morefloats:
@@ -190,6 +209,10 @@ def transform(provider, slug=None, file_path=None,
          elif package_available('morefloats', 'maxfloats=19'):
              document.edoc.getroot().set('morefloats', 'new')
  
          elif package_available('morefloats', 'maxfloats=19'):
              document.edoc.getroot().set('morefloats', 'new')
  
+        # add customizations
+        if customizations is not None:
+            document.edoc.getroot().set('customizations', u','.join(customizations))
+
          # hack the tree
          move_motifs_inside(document.edoc)
          hack_motifs(document.edoc)
          # hack the tree
          move_motifs_inside(document.edoc)
          hack_motifs(document.edoc)
@@ -197,19 +220,22 @@ def transform(provider, slug=None, file_path=None,
          substitute_hyphens(document.edoc)
          fix_hanging(document.edoc)
  
          substitute_hyphens(document.edoc)
          fix_hanging(document.edoc)
  
-        # find output dir
-        if make_dir and output_dir is not None:
-            author = unicode(document.book_info.author)
-            output_dir = os.path.join(output_dir, author)
-
          # wl -> TeXML
          style_filename = get_stylesheet("wl2tex")
          style = etree.parse(style_filename)
          # wl -> TeXML
          style_filename = get_stylesheet("wl2tex")
          style = etree.parse(style_filename)
+
          texml = document.transform(style)
          texml = document.transform(style)
-        del document # no longer needed large object :)
  
          # TeXML -> LaTeX
          temp = mkdtemp('-wl2pdf')
  
          # TeXML -> LaTeX
          temp = mkdtemp('-wl2pdf')
+
+        if cover:
+            c = cover(document.book_info)
+            with open(os.path.join(temp, 'cover.png'), 'w') as f:
+                c.save(f)
+
+        del document # no longer needed large object :)
+
          tex_path = os.path.join(temp, 'doc.tex')
          fout = open(tex_path, 'w')
          process(StringIO(texml), fout, 'utf-8')
          tex_path = os.path.join(temp, 'doc.tex')
          fout = open(tex_path, 'w')
          process(StringIO(texml), fout, 'utf-8')
@@ -220,8 +246,8 @@ def transform(provider, slug=None, file_path=None,
              shutil.copy(tex_path, save_tex)
  
          # LaTeX -> PDF
              shutil.copy(tex_path, save_tex)
  
          # LaTeX -> PDF
-        shutil.copy(get_resource('pdf/wl.sty'), temp)
-        shutil.copy(get_resource('pdf/wl-logo.png'), temp)
+        shutil.copy(get_resource('pdf/wl.cls'), temp)
+        shutil.copy(get_resource('res/wl-logo.png'), temp)
  
          cwd = os.getcwd()
          os.chdir(temp)
  
          cwd = os.getcwd()
          os.chdir(temp)
@@ -235,55 +261,38 @@ def transform(provider, slug=None, file_path=None,
  
          os.chdir(cwd)
  
  
          os.chdir(cwd)
  
-        # save the PDF
+        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
          pdf_path = os.path.join(temp, 'doc.pdf')
          pdf_path = os.path.join(temp, 'doc.pdf')
-        if output_dir is not None:
-            try:
-                os.makedirs(output_dir)
-            except OSError:
-                pass
-            if slug:
-                output_path = os.path.join(output_dir, '%s.pdf' % slug)
-            else:
-                output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
-            shutil.move(pdf_path, output_path)
-        else:
-            if hasattr(output_file, 'write'):
-                # file-like object
-                with open(pdf_path) as f:
-                    output_file.write(f.read())
-                output_file.close()
-            else:
-                # path to output file
-                shutil.copy(pdf_path, output_file)
+        shutil.move(pdf_path, output_file.name)
          shutil.rmtree(temp)
          shutil.rmtree(temp)
+        return OutputFile.from_filename(output_file.name)
  
      except (XMLSyntaxError, XSLTApplyError), e:
          raise ParseError(e)
  
  
  
      except (XMLSyntaxError, XSLTApplyError), e:
          raise ParseError(e)
  
  
-def load_including_children(provider, slug=None, uri=None, file_path=None):
-    """ makes one big xml file with children inserted at end 
-    either slug or uri must be provided
+def load_including_children(wldoc=None, provider=None, uri=None):
+    """ Makes one big xml file with children inserted at end.
+    
+    Either wldoc, or both provider and uri, must be provided.
      """
  
      """
  
-    if uri:
+    if uri and provider:
          f = provider.by_uri(uri)
          f = provider.by_uri(uri)
-    elif slug:
-        f = provider[slug]
-    elif file_path:
-        f = open(file_path, 'r')
+        text = f.read().decode('utf-8')
+        f.close()
+    elif wldoc is not None:
+        text = etree.tostring(wldoc.edoc, encoding=unicode)
+        provider = wldoc.provider
      else:
      else:
-        raise ValueError('Neither slug, URI nor file path provided for a book.')
+        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
  
  
-    document = WLDocument.from_file(f, True,
-        parse_dublincore=True,
-        preserve_lines=False)
+    text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
  
  
-    f.close()
+    document = WLDocument.from_string(text, parse_dublincore=True)
+    document.swap_endlines()
  
      for child_uri in document.book_info.parts:
  
      for child_uri in document.book_info.parts:
-        child = load_including_children(provider, uri=child_uri)
+        child = load_including_children(provider=provider, uri=child_uri)
          document.edoc.getroot().append(child.edoc.getroot())
          document.edoc.getroot().append(child.edoc.getroot())
-
      return document
      return document