More fixes
[librarian.git] / librarian / pdf.py
index 169d661..9308704 100644 (file)
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 #
+"""PDF creation library.
+
+Creates one big XML from the book and its children, converts it to LaTeX
+with TeXML, then runs it through XeLaTeX.
+
+"""
+from __future__ import with_statement
 import os
 import os.path
 import shutil
 from StringIO import StringIO
 import os
 import os.path
 import shutil
 from StringIO import StringIO
-from tempfile import mkdtemp
+from tempfile import mkdtemp, NamedTemporaryFile
 import re
 import re
+from copy import deepcopy
+from subprocess import call, PIPE
 
 from Texml.processor import process
 from lxml import etree
 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
 
 from Texml.processor import process
 from lxml import etree
 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
+from librarian.dcparser import Person
 from librarian.parser import WLDocument
 from librarian.parser import WLDocument
-from librarian import ParseError
+from librarian import ParseError, DCNS, get_resource, IOFile, Format
 from librarian import functions
 
 from librarian import functions
 
+
 functions.reg_substitute_entities()
 functions.reg_substitute_entities()
+functions.reg_strip()
+functions.reg_starts_white()
+functions.reg_ends_white()
+functions.reg_texcommand()
 
 STYLESHEETS = {
 
 STYLESHEETS = {
-    'wl2tex': 'xslt/wl2tex.xslt',
+    'wl2tex': 'pdf/wl2tex.xslt',
 }
 
 }
 
+def insert_tags(doc, split_re, tagname, exclude=None):
+    """ inserts <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree
 
 
-def insert_tags(doc, split_re, tagname):
-    print tagname
-    for elem in doc.iter():
+    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
+    >>> insert_tags(t, re.compile('-'), 'd');
+    >>> print etree.tostring(t)
+    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
+    """
+
+    for elem in doc.iter(tag=etree.Element):
+        if exclude and elem.tag in exclude:
+            continue
         if elem.text:
             chunks = split_re.split(elem.text)
         if elem.text:
             chunks = split_re.split(elem.text)
-            elem.text = chunks.pop(0)
-            while chunks:
+            while len(chunks) > 1:
                 ins = etree.Element(tagname)
                 ins.tail = chunks.pop()
                 elem.insert(0, ins)
                 ins = etree.Element(tagname)
                 ins.tail = chunks.pop()
                 elem.insert(0, ins)
+            elem.text = chunks.pop(0)
         if elem.tail:
             chunks = split_re.split(elem.tail)
             parent = elem.getparent()
             ins_index = parent.index(elem) + 1
         if elem.tail:
             chunks = split_re.split(elem.tail)
             parent = elem.getparent()
             ins_index = parent.index(elem) + 1
-            elem.tail = chunks.pop(0)
-            while chunks:
+            while len(chunks) > 1:
                 ins = etree.Element(tagname)
                 ins = etree.Element(tagname)
-                ins.tail = chunks.pop(0)
+                ins.tail = chunks.pop()
                 parent.insert(ins_index, ins)
                 parent.insert(ins_index, ins)
+            elem.tail = chunks.pop(0)
 
 
 def substitute_hyphens(doc):
 
 
 def substitute_hyphens(doc):
-    insert_tags(doc, 
+    insert_tags(doc,
                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
-                "dywiz")
+                "dywiz",
+                exclude=[DCNS("identifier.url"), DCNS("rights.license"), 'www']
+                )
 
 
 def fix_hanging(doc):
 
 
 def fix_hanging(doc):
-    insert_tags(doc, 
+    insert_tags(doc,
                 re.compile("(?<=\s\w)\s+"),
                 re.compile("(?<=\s\w)\s+"),
-                "nbsp")
+                "nbsp",
+                exclude=[DCNS("identifier.url"), DCNS("rights.license")]
+                )
+
+
+def move_motifs_inside(doc):
+    """ moves motifs into block elements """
+    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
+        for motif in master.xpath('motyw'):
+            for sib in motif.itersiblings():
+                if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
+                    # motif shouldn't have a tail - it would be untagged text
+                    motif.tail = None
+                    motif.getparent().remove(motif)
+                    sib.insert(0, motif)
+                    break
+
+
+def hack_motifs(doc):
+    """ dirty hack for the marginpar-creates-orphans LaTeX problem
+    see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
+
+    moves motifs in stanzas from first verse to second
+    and from next to last to last, then inserts negative vspace before them
+    """
+    for motif in doc.findall('//strofa//motyw'):
+        # find relevant verse-level tag
+        verse, stanza = motif, motif.getparent()
+        while stanza is not None and stanza.tag != 'strofa':
+            verse, stanza = stanza, stanza.getparent()
+        breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
+        breaks_after = sum(1 for i in verse.itersiblings('br'))
+        if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
+            move_by = 1
+            if breaks_after == 2:
+                move_by += 1
+            moved_motif = deepcopy(motif)
+            motif.tag = 'span'
+            motif.text = None
+            moved_motif.tail = None
+            moved_motif.set('moved', str(move_by))
+
+            for br in verse.itersiblings('br'):
+                if move_by > 1:
+                    move_by -= 1
+                    continue
+                br.addnext(moved_motif)
+                break
+
+
+def parse_creator(doc):
+    """Generates readable versions of creator and translator tags.
+
+    Finds all dc:creator and dc:contributor.translator tags
+    and adds *_parsed versions with forenames first.
+    """
+    for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
+                    'creator', 'contributor.translator')),
+                    namespaces = {'dc': str(DCNS)})[::-1]:
+        if not person.text:
+            continue
+        p = Person.from_text(person.text)
+        person_parsed = deepcopy(person)
+        person_parsed.tag = person.tag + '_parsed'
+        person_parsed.set('sortkey', person.text)
+        person_parsed.text = p.readable()
+        person.getparent().insert(0, person_parsed)
 
 
 def get_stylesheet(name):
 
 
 def get_stylesheet(name):
-    return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
+    return get_resource(STYLESHEETS[name])
+
+
+def package_available(package, args='', verbose=False):
+    """ check if a version of a latex package accepting given args is available """
+    tempdir = mkdtemp('-wl2pdf-test')
+    fpath = os.path.join(tempdir, 'test.tex')
+    f = open(fpath, 'w')
+    f.write(r"""
+        \documentclass{wl}
+        \usepackage[%s]{%s}
+        \begin{document}
+        \end{document}
+        """ % (args, package))
+    f.close()
+    if verbose:
+        p = call(['xelatex', '-output-directory', tempdir, fpath])
+    else:
+        p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
+    shutil.rmtree(tempdir)
+    return p == 0
 
 
-def transform(provider, slug, output_file=None, output_dir=None):
-    """ produces a pdf file
 
 
-    provider is a DocProvider
-    either output_file (a file-like object) or output_dir (path to file/dir) should be specified
-    if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.pdf
+def load_including_children(wldoc=None, provider=None, uri=None):
+    """ Makes one big xml file with children inserted at end.
+    
+    Either wldoc, or both provider and URI, must be provided.
     """
 
     """
 
-    # Parse XSLT
-    try:
-        style_filename = get_stylesheet("wl2tex")
-        style = etree.parse(style_filename)
-
-        document = load_including_children(provider, slug)
-
-        substitute_hyphens(document.edoc)
-        fix_hanging(document.edoc)
-        
-        print etree.tostring(document.edoc)
-
-        # if output to dir, create the file
-        if output_dir is not None:
-            author = unicode(document.book_info.author)
-            output_dir = os.path.join(output_dir, author)
+    if uri and provider:
+        f = provider.by_uri(uri)
+        text = f.read().decode('utf-8')
+        f.close()
+    elif wldoc is not None:
+        text = etree.tostring(wldoc.edoc, encoding=unicode)
+        provider = wldoc.provider
+    else:
+        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 
 
-        texml = document.transform(style)
-        del document # no longer needed large object :)
+    text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 
 
-        temp = mkdtemp('wl2pdf-')
-        tex_path = os.path.join(temp, 'doc.tex')
-        fout = open(tex_path, 'w')
-        process(StringIO(texml), fout, 'utf8', 255, 0, 0)
-        fout.close()
-        del texml
+    document = WLDocument.from_string(text,
+                parse_dublincore=True, provider=provider)
+    document.swap_endlines()
 
 
-        print "pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))
-        if os.system("pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))):
-            raise ParseError("Error parsing .tex file")
-
-        pdf_path = os.path.join(temp, 'doc.pdf')
-        if output_dir is not None:
-            try:
-                os.makedirs(output_dir)
-            except OSError:
-                pass
-            output_path = os.path.join(output_dir, '%s.pdf' % slug)
-            shutil.move(pdf_path, output_path)
-        else:
-            with open(pdf_path) as f:
-                output_file.write(f.read())
-            output_file.close()
+    for child_uri in document.book_info.parts:
+        child = load_including_children(provider=provider, uri=child_uri)
+        document.edoc.getroot().append(child.edoc.getroot())
+    return document
 
 
-        return True
-    except (XMLSyntaxError, XSLTApplyError), e:
-        raise ParseError(e)
 
 
+class PDFFormat(Format):
+    """ Base PDF format.
+    
+    Available customization:
+        nofootnotes: Doesn't do footnotes.
+        nothemes: Doesn't do themes.
+        defaultleading: Default leading.
+        onehalfleading: Bigger leading.
+        doubleleading: Big leading.
+        nowlfont: Uses standard TeX font instead of JUnicodeWL.
 
 
-def load_including_children(provider, slug=None, uri=None):
-    """ makes one big xml file with children inserted at end 
-    either slug or uri must be provided
     """
 
     """
 
-    if uri:
-        f = provider.by_uri(uri)
-    elif slug:
-        f = provider[slug]
-    else:
-        raise ValueError('Neither slug nor URI provided for a book.')
-
-    document = WLDocument.from_file(f, True,
-        parse_dublincore=True,
-        preserve_lines=False)
-
-    for child_uri in document.book_info.parts:
-        child = load_including_children(provider, uri=child_uri)
-        document.edoc.getroot().append(child.edoc.getroot())
-
-    return document
+    cover_class = None
+    tex_passes = 1
+    style = get_resource('pdf/default.sty')
+    cover = None
+
+    @property
+    def has_cover(self):
+        """ For use in XSLT. """
+        return self.cover is not None
+
+    @property
+    def customization_str(self):
+        """ For use in XSLT. """
+        return u','.join(k for k, v in self.customization.items() if v)
+
+    def get_document(self):
+        document = load_including_children(self.wldoc)
+        root = document.edoc.getroot()
+        root.set('editors', u', '.join(sorted(
+            editor.readable() for editor in document.editors())))
+
+        # hack the tree
+        move_motifs_inside(document.edoc)
+        hack_motifs(document.edoc)
+        parse_creator(document.edoc)
+        substitute_hyphens(document.edoc)
+        fix_hanging(document.edoc)
+        return document
 
 
+    def get_texml(self):
+        style_filename = get_stylesheet("wl2tex")
+        functions.reg_get(self)
+        try:
+            style = etree.parse(style_filename)
+            texml = self.get_document().transform(style)
+            return texml
+        except (XMLSyntaxError, XSLTApplyError), e:
+            raise ParseError(e)
+
+    def get_tex_dir(self):
+        texml = self.get_texml()
+        temp = mkdtemp('-wl2pdf')
+        # Save TeX file
+        tex_path = os.path.join(temp, 'doc.tex')
+        with open(tex_path, 'w') as fout:
+            process(StringIO(texml), fout, 'utf-8')
+        if self.save_tex:
+            shutil.copy(tex_path, self.save_tex)
+        # Copy style
+        shutil.copy(get_resource('pdf/wl.cls'), temp)
+        shutil.copy(self.style, os.path.join(temp, 'style.sty'))
+        # Save attachments
+        if self.cover:
+            self.cover.for_pdf().dump_to(os.path.join(temp, 'makecover.sty'))
+        return temp
+
+    def get_pdf(self):
+        temp = self.get_tex_dir()
+        tex_path = os.path.join(temp, 'doc.tex')
+        try:
+            cwd = os.getcwd()
+        except OSError:
+            cwd = None
+        os.chdir(temp)
+
+        if self.verbose:
+            for i in range(self.tex_passes):
+                p = call(['xelatex', tex_path])
+        else:
+            for i in range(self.tex_passes):
+                p = call(['xelatex', '-interaction=batchmode', tex_path],
+                            stdout=PIPE, stderr=PIPE)
+        if p:
+            raise ParseError("Error parsing .tex file")
 
 
-if __name__ == '__main__':
-    import sys
-    from librarian import DirDocProvider
+        if cwd is not None:
+            os.chdir(cwd)
 
 
-    if len(sys.argv) < 2:
-        print >> sys.stderr, 'Usage: python pdf.py <input file>'
-        sys.exit(1)
+        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
+        pdf_path = os.path.join(temp, 'doc.pdf')
+        shutil.move(pdf_path, output_file.name)
+        shutil.rmtree(temp)
+        return IOFile.from_filename(output_file.name)
+
+    def build(self, verbose=False, save_tex=None, morefloats=None):
+        """ morefloats: new/old/none
+        """
+        self.verbose = verbose
+        self.save_tex = save_tex
+        
+        if morefloats is None and package_available('morefloats', 'maxfloats=19'):
+            morefloats = 'new'
+        self.morefloats = morefloats
 
 
-    main_input = sys.argv[1]
-    basepath, ext = os.path.splitext(main_input)
-    path, slug = os.path.realpath(basepath).rsplit('/', 1)
-    provider = DirDocProvider(path)
-    transform(provider, slug, output_dir=path)
+        book_info = self.wldoc.book_info
+        if self.cover_class:
+            self.cover = self.cover_class(book_info)
 
 
+        return self.get_pdf()