librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 import sys
  17
  18 from Texml.processor import process
  19 from lxml import etree
  20 from lxml.etree import XMLSyntaxError, XSLTApplyError
  21
  22 from librarian.parser import WLDocument
  23 from librarian import ParseError
  24 from librarian import functions
  25
  26
  27
  28 functions.reg_substitute_entities()
  29 functions.reg_person_name()
  30 functions.reg_strip()
  31 functions.reg_starts_white()
  32 functions.reg_ends_white()
  33
  34 STYLESHEETS = {
  35     'wl2tex': 'xslt/wl2tex.xslt',
  36 }
  37
  38
  39 def insert_tags(doc, split_re, tagname):
  40     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  41
  42     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  43     >>> insert_tags(t, re.compile('-'), 'd');
  44     >>> print etree.tostring(t)
  45     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  46     """
  47
  48     for elem in doc.iter():
  49         try:
  50             if elem.text:
  51                 chunks = split_re.split(elem.text)
  52                 while len(chunks) > 1:
  53                     ins = etree.Element(tagname)
  54                     ins.tail = chunks.pop()
  55                     elem.insert(0, ins)
  56                 elem.text = chunks.pop(0)
  57             if elem.tail:
  58                 chunks = split_re.split(elem.tail)
  59                 parent = elem.getparent()
  60                 ins_index = parent.index(elem) + 1
  61                 while len(chunks) > 1:
  62                     ins = etree.Element(tagname)
  63                     ins.tail = chunks.pop()
  64                     parent.insert(ins_index, ins)
  65                 elem.tail = chunks.pop(0)
  66         except TypeError, e:
  67             # element with no children, like comment
  68             pass
  69
  70
  71 def substitute_hyphens(doc):
  72     insert_tags(doc,
  73                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  74                 "dywiz")
  75
  76
  77 def fix_hanging(doc):
  78     insert_tags(doc,
  79                 re.compile("(?<=\s\w)\s+"),
  80                 "nbsp")
  81
  82
  83 def move_motifs_inside(doc):
  84     """ moves motifs to be into block elements """
  85     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  86         for motif in master.xpath('motyw'):
  87             print motif.text
  88             for sib in motif.itersiblings():
  89                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  90                     # motif shouldn't have a tail - it would be untagged text
  91                     motif.tail = None
  92                     motif.getparent().remove(motif)
  93                     sib.insert(0, motif)
  94                     break
  95
  96
  97 def hack_motifs(doc):
  98     """ dirty hack for the marginpar-creates-orphans LaTeX problem
  99     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 100
 101     moves motifs in stanzas from first verse to second
 102     and from next to last to last, then inserts negative vspace before them
 103     """
 104     for motif in doc.findall('//strofa//motyw'):
 105         # find relevant verse-level tag
 106         verse, stanza = motif, motif.getparent()
 107         while stanza is not None and stanza.tag != 'strofa':
 108             verse, stanza = stanza, stanza.getparent()
 109         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 110         breaks_after = sum(1 for i in verse.itersiblings('br'))
 111         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 112             move_by = 1
 113             if breaks_after == 2:
 114                 move_by += 1
 115             moved_motif = deepcopy(motif)
 116             motif.tag = 'span'
 117             motif.text = None
 118             moved_motif.tail = None
 119             moved_motif.set('moved', str(move_by))
 120
 121             for br in verse.itersiblings('br'):
 122                 if move_by > 1:
 123                     move_by -= 1
 124                     continue
 125                 br.addnext(moved_motif)
 126                 break
 127
 128
 129 def get_resource(path):
 130     return os.path.join(os.path.dirname(__file__), path)
 131
 132 def get_stylesheet(name):
 133     return get_resource(STYLESHEETS[name])
 134
 135
 136 def package_available(package, args='', verbose=False):
 137     """ check if a verion of a latex package accepting given args is available """
 138     tempdir = mkdtemp('-wl2pdf-test')
 139     fpath = os.path.join(tempdir, 'test.tex')
 140     f = open(fpath, 'w')
 141     f.write(r"""
 142         \documentclass{book}
 143         \usepackage[%s]{%s}
 144         \begin{document}
 145         \end{document}
 146         """ % (args, package))
 147     f.close()
 148     if verbose:
 149         p = call(['xelatex', '-output-directory', tempdir, fpath])
 150     else:
 151         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 152     shutil.rmtree(tempdir)
 153     return p == 0
 154
 155
 156 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None):
 157     """ produces a PDF file with XeLaTeX
 158
 159     provider: a DocProvider
 160     slug: slug of file to process, available by provider
 161     file_path can be provided instead of a slug
 162     output_file: file-like object or path to output file
 163     output_dir: path to directory to save output file to; either this or output_file must be present
 164     make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
 165     verbose: prints all output from LaTeX
 166     save_tex: path to save the intermediary LaTeX file to
 167     """
 168
 169     # Parse XSLT
 170     try:
 171         if file_path:
 172             if slug:
 173                 raise ValueError('slug or file_path should be specified, not both')
 174             document = load_including_children(provider, file_path=file_path)
 175         else:
 176             if not slug:
 177                 raise ValueError('either slug or file_path should be specified')
 178             document = load_including_children(provider, slug=slug)
 179
 180         # check for LaTeX packages
 181         if not package_available('morefloats', 'maxfloats=19'):
 182             # using old morefloats or none at all
 183             document.edoc.getroot().set('old-morefloats', 'yes')
 184
 185         # hack the tree
 186         move_motifs_inside(document.edoc)
 187         hack_motifs(document.edoc)
 188         substitute_hyphens(document.edoc)
 189         fix_hanging(document.edoc)
 190
 191         # find output dir
 192         if make_dir and output_dir is not None:
 193             author = unicode(document.book_info.author)
 194             output_dir = os.path.join(output_dir, author)
 195
 196         # wl -> TeXML
 197         style_filename = get_stylesheet("wl2tex")
 198         style = etree.parse(style_filename)
 199         texml = document.transform(style)
 200         del document # no longer needed large object :)
 201
 202         # TeXML -> LaTeX
 203         temp = mkdtemp('-wl2pdf')
 204         tex_path = os.path.join(temp, 'doc.tex')
 205         fout = open(tex_path, 'w')
 206         process(StringIO(texml), fout, 'utf-8')
 207         fout.close()
 208         del texml
 209
 210         if save_tex:
 211             shutil.copy(tex_path, save_tex)
 212
 213         # LaTeX -> PDF
 214         shutil.copy(get_resource('pdf/wl.sty'), temp)
 215         shutil.copy(get_resource('pdf/wl-logo.png'), temp)
 216
 217         cwd = os.getcwd()
 218         os.chdir(temp)
 219
 220         if verbose:
 221             p = call(['xelatex', tex_path])
 222         else:
 223             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 224         if p:
 225             raise ParseError("Error parsing .tex file")
 226
 227         os.chdir(cwd)
 228
 229         # save the PDF
 230         pdf_path = os.path.join(temp, 'doc.pdf')
 231         if output_dir is not None:
 232             try:
 233                 os.makedirs(output_dir)
 234             except OSError:
 235                 pass
 236             if slug:
 237                 output_path = os.path.join(output_dir, '%s.pdf' % slug)
 238             else:
 239                 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
 240             shutil.move(pdf_path, output_path)
 241         else:
 242             if hasattr(output_file, 'write'):
 243                 # file-like object
 244                 with open(pdf_path) as f:
 245                     output_file.write(f.read())
 246                 output_file.close()
 247             else:
 248                 # path to output file
 249                 shutil.copy(pdf_path, output_file)
 250         shutil.rmtree(temp)
 251
 252     except (XMLSyntaxError, XSLTApplyError), e:
 253         raise ParseError(e)
 254
 255
 256 def load_including_children(provider, slug=None, uri=None, file_path=None):
 257     """ makes one big xml file with children inserted at end
 258     either slug or uri must be provided
 259     """
 260
 261     if uri:
 262         f = provider.by_uri(uri)
 263     elif slug:
 264         f = provider[slug]
 265     elif file_path:
 266         f = open(file_path, 'r')
 267     else:
 268         raise ValueError('Neither slug, URI nor file path provided for a book.')
 269
 270     document = WLDocument.from_file(f, True,
 271         parse_dublincore=True,
 272         preserve_lines=False)
 273
 274     f.close()
 275
 276     for child_uri in document.book_info.parts:
 277         child = load_including_children(provider, uri=child_uri)
 278         document.edoc.getroot().append(child.edoc.getroot())
 279
 280     return document