librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 import sys
  17
  18 from Texml.processor import process
  19 from lxml import etree
  20 from lxml.etree import XMLSyntaxError, XSLTApplyError
  21
  22 from librarian.dcparser import Person
  23 from librarian.parser import WLDocument
  24 from librarian import ParseError, DCNS
  25 from librarian import functions
  26
  27
# Run the registration helpers from librarian.functions once, at import time.
# NOTE(review): presumably these register the extension functions that the
# XSLT stylesheet listed below relies on -- confirm in librarian/functions.py.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()

# Stylesheet name -> path relative to this module's directory; looked up by
# get_stylesheet() and resolved to a full path by get_resource().
STYLESHEETS = {
    'wl2tex': 'xslt/wl2tex.xslt',
}
  36
  37
  38 def insert_tags(doc, split_re, tagname):
  39     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  40
  41     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  42     >>> insert_tags(t, re.compile('-'), 'd');
  43     >>> print etree.tostring(t)
  44     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  45     """
  46
  47     for elem in doc.iter(tag=etree.Element):
  48         if elem.text:
  49             chunks = split_re.split(elem.text)
  50             while len(chunks) > 1:
  51                 ins = etree.Element(tagname)
  52                 ins.tail = chunks.pop()
  53                 elem.insert(0, ins)
  54             elem.text = chunks.pop(0)
  55         if elem.tail:
  56             chunks = split_re.split(elem.tail)
  57             parent = elem.getparent()
  58             ins_index = parent.index(elem) + 1
  59             while len(chunks) > 1:
  60                 ins = etree.Element(tagname)
  61                 ins.tail = chunks.pop()
  62                 parent.insert(ins_index, ins)
  63             elem.tail = chunks.pop(0)
  64
  65
  66 def substitute_hyphens(doc):
  67     insert_tags(doc,
  68                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  69                 "dywiz")
  70
  71
  72 def fix_hanging(doc):
  73     insert_tags(doc,
  74                 re.compile("(?<=\s\w)\s+"),
  75                 "nbsp")
  76
  77
  78 def move_motifs_inside(doc):
  79     """ moves motifs to be into block elements """
  80     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  81         for motif in master.xpath('motyw'):
  82             for sib in motif.itersiblings():
  83                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  84                     # motif shouldn't have a tail - it would be untagged text
  85                     motif.tail = None
  86                     motif.getparent().remove(motif)
  87                     sib.insert(0, motif)
  88                     break
  89
  90
  91 def hack_motifs(doc):
  92     """ dirty hack for the marginpar-creates-orphans LaTeX problem
  93     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
  94
  95     moves motifs in stanzas from first verse to second
  96     and from next to last to last, then inserts negative vspace before them
  97     """
  98     for motif in doc.findall('//strofa//motyw'):
  99         # find relevant verse-level tag
 100         verse, stanza = motif, motif.getparent()
 101         while stanza is not None and stanza.tag != 'strofa':
 102             verse, stanza = stanza, stanza.getparent()
 103         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 104         breaks_after = sum(1 for i in verse.itersiblings('br'))
 105         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 106             move_by = 1
 107             if breaks_after == 2:
 108                 move_by += 1
 109             moved_motif = deepcopy(motif)
 110             motif.tag = 'span'
 111             motif.text = None
 112             moved_motif.tail = None
 113             moved_motif.set('moved', str(move_by))
 114
 115             for br in verse.itersiblings('br'):
 116                 if move_by > 1:
 117                     move_by -= 1
 118                     continue
 119                 br.addnext(moved_motif)
 120                 break
 121
 122
 123 def parse_creator(doc):
 124     """ find all dc:creator tags and add dc:creator_parsed with forenames first """
 125     for creator in doc.findall('//'+DCNS('creator')):
 126         p = Person.from_text(creator.text)
 127         creator_parsed = deepcopy(creator)
 128         creator_parsed.tag = DCNS('creator_parsed')
 129         creator_parsed.text = ' '.join(p.first_names + (p.last_name,))
 130         creator.getparent().insert(0, creator_parsed)
 131
 132
 133 def get_resource(path):
 134     return os.path.join(os.path.dirname(__file__), path)
 135
 136 def get_stylesheet(name):
 137     return get_resource(STYLESHEETS[name])
 138
 139
 140 def package_available(package, args='', verbose=False):
 141     """ check if a verion of a latex package accepting given args is available """
 142     tempdir = mkdtemp('-wl2pdf-test')
 143     fpath = os.path.join(tempdir, 'test.tex')
 144     f = open(fpath, 'w')
 145     f.write(r"""
 146         \documentclass{book}
 147         \usepackage[%s]{%s}
 148         \begin{document}
 149         \end{document}
 150         """ % (args, package))
 151     f.close()
 152     if verbose:
 153         p = call(['xelatex', '-output-directory', tempdir, fpath])
 154     else:
 155         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 156     shutil.rmtree(tempdir)
 157     return p == 0
 158
 159
 160 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None):
 161     """ produces a PDF file with XeLaTeX
 162
 163     provider: a DocProvider
 164     slug: slug of file to process, available by provider
 165     file_path can be provided instead of a slug
 166     output_file: file-like object or path to output file
 167     output_dir: path to directory to save output file to; either this or output_file must be present
 168     make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
 169     verbose: prints all output from LaTeX
 170     save_tex: path to save the intermediary LaTeX file to
 171     """
 172
 173     # Parse XSLT
 174     try:
 175         if file_path:
 176             if slug:
 177                 raise ValueError('slug or file_path should be specified, not both')
 178             document = load_including_children(provider, file_path=file_path)
 179         else:
 180             if not slug:
 181                 raise ValueError('either slug or file_path should be specified')
 182             document = load_including_children(provider, slug=slug)
 183
 184         # check for LaTeX packages
 185         if not package_available('morefloats', 'maxfloats=19'):
 186             # using old morefloats or none at all
 187             document.edoc.getroot().set('old-morefloats', 'yes')
 188
 189         # hack the tree
 190         move_motifs_inside(document.edoc)
 191         hack_motifs(document.edoc)
 192         parse_creator(document.edoc)
 193         substitute_hyphens(document.edoc)
 194         fix_hanging(document.edoc)
 195
 196         # find output dir
 197         if make_dir and output_dir is not None:
 198             author = unicode(document.book_info.author)
 199             output_dir = os.path.join(output_dir, author)
 200
 201         # wl -> TeXML
 202         style_filename = get_stylesheet("wl2tex")
 203         style = etree.parse(style_filename)
 204         texml = document.transform(style)
 205         del document # no longer needed large object :)
 206
 207         # TeXML -> LaTeX
 208         temp = mkdtemp('-wl2pdf')
 209         tex_path = os.path.join(temp, 'doc.tex')
 210         fout = open(tex_path, 'w')
 211         process(StringIO(texml), fout, 'utf-8')
 212         fout.close()
 213         del texml
 214
 215         if save_tex:
 216             shutil.copy(tex_path, save_tex)
 217
 218         # LaTeX -> PDF
 219         shutil.copy(get_resource('pdf/wl.sty'), temp)
 220         shutil.copy(get_resource('pdf/wl-logo.png'), temp)
 221
 222         cwd = os.getcwd()
 223         os.chdir(temp)
 224
 225         if verbose:
 226             p = call(['xelatex', tex_path])
 227         else:
 228             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 229         if p:
 230             raise ParseError("Error parsing .tex file")
 231
 232         os.chdir(cwd)
 233
 234         # save the PDF
 235         pdf_path = os.path.join(temp, 'doc.pdf')
 236         if output_dir is not None:
 237             try:
 238                 os.makedirs(output_dir)
 239             except OSError:
 240                 pass
 241             if slug:
 242                 output_path = os.path.join(output_dir, '%s.pdf' % slug)
 243             else:
 244                 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
 245             shutil.move(pdf_path, output_path)
 246         else:
 247             if hasattr(output_file, 'write'):
 248                 # file-like object
 249                 with open(pdf_path) as f:
 250                     output_file.write(f.read())
 251                 output_file.close()
 252             else:
 253                 # path to output file
 254                 shutil.copy(pdf_path, output_file)
 255         shutil.rmtree(temp)
 256
 257     except (XMLSyntaxError, XSLTApplyError), e:
 258         raise ParseError(e)
 259
 260
 261 def load_including_children(provider, slug=None, uri=None, file_path=None):
 262     """ makes one big xml file with children inserted at end
 263     either slug or uri must be provided
 264     """
 265
 266     if uri:
 267         f = provider.by_uri(uri)
 268     elif slug:
 269         f = provider[slug]
 270     elif file_path:
 271         f = open(file_path, 'r')
 272     else:
 273         raise ValueError('Neither slug, URI nor file path provided for a book.')
 274
 275     document = WLDocument.from_file(f, True,
 276         parse_dublincore=True,
 277         preserve_lines=False)
 278
 279     f.close()
 280
 281     for child_uri in document.book_info.parts:
 282         child = load_including_children(provider, uri=child_uri)
 283         document.edoc.getroot().append(child.edoc.getroot())
 284
 285     return document