these handlers are only used in sub-generation
[librarian.git] / librarian / pdf.py
index 1bfd949..9308704 100644 (file)
@@ -3,25 +3,29 @@
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
+"""PDF creation library.
+
+Creates one big XML from the book and its children, converts it to LaTeX
+with TeXML, then runs it through XeLaTeX.
+
+"""
 from __future__ import with_statement
 import os
 import os.path
 import shutil
 from StringIO import StringIO
 from __future__ import with_statement
 import os
 import os.path
 import shutil
 from StringIO import StringIO
-from tempfile import mkdtemp
+from tempfile import mkdtemp, NamedTemporaryFile
 import re
 from copy import deepcopy
 from subprocess import call, PIPE
 
 import re
 from copy import deepcopy
 from subprocess import call, PIPE
 
-import sys
-
 from Texml.processor import process
 from lxml import etree
 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
 from librarian.dcparser import Person
 from librarian.parser import WLDocument
 from Texml.processor import process
 from lxml import etree
 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
 from librarian.dcparser import Person
 from librarian.parser import WLDocument
-from librarian import ParseError, DCNS, get_resource
+from librarian import ParseError, DCNS, get_resource, IOFile, Format
 from librarian import functions
 
 
 from librarian import functions
 
 
@@ -35,14 +39,6 @@ STYLESHEETS = {
     'wl2tex': 'pdf/wl2tex.xslt',
 }
 
     'wl2tex': 'pdf/wl2tex.xslt',
 }
 
-CUSTOMIZATIONS = [
-    'nofootnotes',
-    'nothemes',
-    'onehalfleading',
-    'doubleleading',
-    'nowlfont',
-    ]
-
 def insert_tags(doc, split_re, tagname, exclude=None):
     """ inserts <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree
 
 def insert_tags(doc, split_re, tagname, exclude=None):
     """ inserts <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree
 
@@ -77,7 +73,7 @@ def substitute_hyphens(doc):
     insert_tags(doc,
                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
                 "dywiz",
     insert_tags(doc,
                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
                 "dywiz",
-                exclude=[DCNS("identifier.url"), DCNS("rights.license")]
+                exclude=[DCNS("identifier.url"), DCNS("rights.license"), 'www']
                 )
 
 
                 )
 
 
@@ -135,9 +131,13 @@ def hack_motifs(doc):
 
 
 def parse_creator(doc):
 
 
 def parse_creator(doc):
-    """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first """
+    """Generates readable versions of creator and translator tags.
+
+    Finds all dc:creator and dc:contributor.translator tags
+    and adds *_parsed versions with forenames first.
+    """
     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
-                    'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
+                    'creator', 'contributor.translator')),
                     namespaces = {'dc': str(DCNS)})[::-1]:
         if not person.text:
             continue
                     namespaces = {'dc': str(DCNS)})[::-1]:
         if not person.text:
             continue
@@ -173,52 +173,67 @@ def package_available(package, args='', verbose=False):
     return p == 0
 
 
     return p == 0
 
 
-def transform(provider, slug=None, file_path=None,
-              output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None,
-              cover=None, flags=None, customizations=None):
-    """ produces a PDF file with XeLaTeX
-
-    provider: a DocProvider
-    slug: slug of file to process, available by provider
-    file_path can be provided instead of a slug
-    output_file: file-like object or path to output file
-    output_dir: path to directory to save output file to; either this or output_file must be present
-    make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
-    verbose: prints all output from LaTeX
-    save_tex: path to save the intermediary LaTeX file to
-    morefloats (old/new/none): force specific morefloats
-    cover: a cover.Cover object
-    flags: less-advertising,
-    customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
+def load_including_children(wldoc=None, provider=None, uri=None):
+    """ Makes one big xml file with children inserted at end.
+    
+    Either wldoc, or both provider and uri, must be provided.
     """
 
     """
 
-    # Parse XSLT
-    try:
-        if file_path:
-            if slug:
-                raise ValueError('slug or file_path should be specified, not both')
-            document = load_including_children(provider, file_path=file_path)
-        else:
-            if not slug:
-                raise ValueError('either slug or file_path should be specified')
-            document = load_including_children(provider, slug=slug)
-
-        if cover:
-            document.edoc.getroot().set('data-cover-width', str(cover.width))
-            document.edoc.getroot().set('data-cover-height', str(cover.height))
-        if flags:
-            for flag in flags:
-                document.edoc.getroot().set('flag-' + flag, 'yes')
-
-        # check for LaTeX packages
-        if morefloats:
-            document.edoc.getroot().set('morefloats', morefloats.lower())
-        elif package_available('morefloats', 'maxfloats=19'):
-            document.edoc.getroot().set('morefloats', 'new')
-
-        # add customizations
-        if customizations is not None:
-            document.edoc.getroot().set('customizations', u','.join(customizations))
+    if uri and provider:
+        f = provider.by_uri(uri)
+        text = f.read().decode('utf-8')
+        f.close()
+    elif wldoc is not None:
+        text = etree.tostring(wldoc.edoc, encoding=unicode)
+        provider = wldoc.provider
+    else:
+        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
+
+    text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
+
+    document = WLDocument.from_string(text,
+                parse_dublincore=True, provider=provider)
+    document.swap_endlines()
+
+    for child_uri in document.book_info.parts:
+        child = load_including_children(provider=provider, uri=child_uri)
+        document.edoc.getroot().append(child.edoc.getroot())
+    return document
+
+
+class PDFFormat(Format):
+    """ Base PDF format.
+    
+    Available customizations:
+        nofootnotes: Doesn't do footnotes.
+        nothemes: Doesn't do themes.
+        defaultleading: Default leading.
+        onehalfleading: Bigger leading.
+        doubleleading: Big leading.
+        nowlfont: Uses standard TeX font instead of JUnicodeWL.
+
+    """
+
+    cover_class = None
+    tex_passes = 1
+    style = get_resource('pdf/default.sty')
+    cover = None
+
+    @property
+    def has_cover(self):
+        """ For use in XSLT. """
+        return self.cover is not None
+
+    @property
+    def customization_str(self):
+        """ For use in XSLT. """
+        return u','.join(k for k, v in self.customization.items() if v)
+
+    def get_document(self):
+        document = load_including_children(self.wldoc)
+        root = document.edoc.getroot()
+        root.set('editors', u', '.join(sorted(
+            editor.readable() for editor in document.editors())))
 
         # hack the tree
         move_motifs_inside(document.edoc)
 
         # hack the tree
         move_motifs_inside(document.edoc)
@@ -226,103 +241,75 @@ def transform(provider, slug=None, file_path=None,
         parse_creator(document.edoc)
         substitute_hyphens(document.edoc)
         fix_hanging(document.edoc)
         parse_creator(document.edoc)
         substitute_hyphens(document.edoc)
         fix_hanging(document.edoc)
+        return document
 
 
-        # find output dir
-        if make_dir and output_dir is not None:
-            author = unicode(document.book_info.author)
-            output_dir = os.path.join(output_dir, author)
-
-        # wl -> TeXML
+    def get_texml(self):
         style_filename = get_stylesheet("wl2tex")
         style_filename = get_stylesheet("wl2tex")
-        style = etree.parse(style_filename)
-
-        texml = document.transform(style)
-
-        # TeXML -> LaTeX
+        functions.reg_get(self)
+        try:
+            style = etree.parse(style_filename)
+            texml = self.get_document().transform(style)
+            return texml
+        except (XMLSyntaxError, XSLTApplyError), e:
+            raise ParseError(e)
+
+    def get_tex_dir(self):
+        texml = self.get_texml()
         temp = mkdtemp('-wl2pdf')
         temp = mkdtemp('-wl2pdf')
-
-        if cover:
-            c = cover(document.book_info.author.readable(), document.book_info.title)
-            with open(os.path.join(temp, 'cover.png'), 'w') as f:
-                c.save(f)
-
-        del document # no longer needed large object :)
-
+        # Save TeX file
         tex_path = os.path.join(temp, 'doc.tex')
         tex_path = os.path.join(temp, 'doc.tex')
-        fout = open(tex_path, 'w')
-        process(StringIO(texml), fout, 'utf-8')
-        fout.close()
-        del texml
-
-        if save_tex:
-            shutil.copy(tex_path, save_tex)
-
-        # LaTeX -> PDF
+        with open(tex_path, 'w') as fout:
+            process(StringIO(texml), fout, 'utf-8')
+        if self.save_tex:
+            shutil.copy(tex_path, self.save_tex)
+        # Copy style
         shutil.copy(get_resource('pdf/wl.cls'), temp)
         shutil.copy(get_resource('pdf/wl.cls'), temp)
-        shutil.copy(get_resource('res/wl-logo.png'), temp)
-
-        cwd = os.getcwd()
+        shutil.copy(self.style, os.path.join(temp, 'style.sty'))
+        # Save attachments
+        if self.cover:
+            self.cover.for_pdf().dump_to(os.path.join(temp, 'makecover.sty'))
+        return temp
+
+    def get_pdf(self):
+        temp = self.get_tex_dir()
+        tex_path = os.path.join(temp, 'doc.tex')
+        try:
+            cwd = os.getcwd()
+        except OSError:
+            cwd = None
         os.chdir(temp)
 
         os.chdir(temp)
 
-        if verbose:
-            p = call(['xelatex', tex_path])
+        if self.verbose:
+            for i in range(self.tex_passes):
+                p = call(['xelatex', tex_path])
         else:
         else:
-            p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
+            for i in range(self.tex_passes):
+                p = call(['xelatex', '-interaction=batchmode', tex_path],
+                            stdout=PIPE, stderr=PIPE)
         if p:
             raise ParseError("Error parsing .tex file")
 
         if p:
             raise ParseError("Error parsing .tex file")
 
-        os.chdir(cwd)
+        if cwd is not None:
+            os.chdir(cwd)
 
 
-        # save the PDF
+        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
         pdf_path = os.path.join(temp, 'doc.pdf')
         pdf_path = os.path.join(temp, 'doc.pdf')
-        if output_dir is not None:
-            try:
-                os.makedirs(output_dir)
-            except OSError:
-                pass
-            if slug:
-                output_path = os.path.join(output_dir, '%s.pdf' % slug)
-            else:
-                output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
-            shutil.move(pdf_path, output_path)
-        else:
-            if hasattr(output_file, 'write'):
-                # file-like object
-                with open(pdf_path) as f:
-                    output_file.write(f.read())
-                output_file.close()
-            else:
-                # path to output file
-                shutil.copy(pdf_path, output_file)
+        shutil.move(pdf_path, output_file.name)
         shutil.rmtree(temp)
         shutil.rmtree(temp)
-
-    except (XMLSyntaxError, XSLTApplyError), e:
-        raise ParseError(e)
-
-
-def load_including_children(provider, slug=None, uri=None, file_path=None):
-    """ makes one big xml file with children inserted at end
-    either slug or uri must be provided
-    """
-
-    if uri:
-        f = provider.by_uri(uri)
-    elif slug:
-        f = provider[slug]
-    elif file_path:
-        f = open(file_path, 'r')
-    else:
-        raise ValueError('Neither slug, URI nor file path provided for a book.')
-
-    text = f.read().decode('utf-8')
-    text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
-
-    document = WLDocument.from_string(text, True,
-        parse_dublincore=True)
-
-    f.close()
-    for child_uri in document.book_info.parts:
-        print child_uri
-        child = load_including_children(provider, uri=child_uri)
-        document.edoc.getroot().append(child.edoc.getroot())
-    return document
+        return IOFile.from_filename(output_file.name)
+
+    def build(self, verbose=False, save_tex=None, morefloats=None):
+        """ morefloats: new/old/none
+        """
+        self.verbose = verbose
+        self.save_tex = save_tex
+        
+        if morefloats is None and package_available('morefloats', 'maxfloats=19'):
+            morefloats = 'new'
+        self.morefloats = morefloats
+
+        book_info = self.wldoc.book_info
+        if self.cover_class:
+            self.cover = self.cover_class(book_info)
+
+        return self.get_pdf()