librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 import sys
  17
  18 from Texml.processor import process
  19 from lxml import etree
  20 from lxml.etree import XMLSyntaxError, XSLTApplyError
  21
  22 from librarian.dcparser import Person
  23 from librarian.parser import WLDocument
  24 from librarian import ParseError, DCNS
  25 from librarian import functions
  26
  27
# Import-time side effect: run the registration hooks from librarian.functions.
# NOTE(review): presumably these register the extension functions that the
# XSLT stylesheet below relies on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Logical stylesheet name -> path relative to this module's directory
# (looked up by get_stylesheet(), resolved by get_resource()).
STYLESHEETS = {
    'wl2tex': 'xslt/wl2tex.xslt',
}
  37
  38
  39 def insert_tags(doc, split_re, tagname):
  40     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  41
  42     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  43     >>> insert_tags(t, re.compile('-'), 'd');
  44     >>> print etree.tostring(t)
  45     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  46     """
  47
  48     for elem in doc.iter(tag=etree.Element):
  49         if elem.text:
  50             chunks = split_re.split(elem.text)
  51             while len(chunks) > 1:
  52                 ins = etree.Element(tagname)
  53                 ins.tail = chunks.pop()
  54                 elem.insert(0, ins)
  55             elem.text = chunks.pop(0)
  56         if elem.tail:
  57             chunks = split_re.split(elem.tail)
  58             parent = elem.getparent()
  59             ins_index = parent.index(elem) + 1
  60             while len(chunks) > 1:
  61                 ins = etree.Element(tagname)
  62                 ins.tail = chunks.pop()
  63                 parent.insert(ins_index, ins)
  64             elem.tail = chunks.pop(0)
  65
  66
  67 def substitute_hyphens(doc):
  68     insert_tags(doc,
  69                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  70                 "dywiz")
  71
  72
  73 def fix_hanging(doc):
  74     insert_tags(doc,
  75                 re.compile("(?<=\s\w)\s+"),
  76                 "nbsp")
  77
  78
  79 def move_motifs_inside(doc):
  80     """ moves motifs to be into block elements """
  81     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  82         for motif in master.xpath('motyw'):
  83             for sib in motif.itersiblings():
  84                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  85                     # motif shouldn't have a tail - it would be untagged text
  86                     motif.tail = None
  87                     motif.getparent().remove(motif)
  88                     sib.insert(0, motif)
  89                     break
  90
  91
  92 def hack_motifs(doc):
  93     """ dirty hack for the marginpar-creates-orphans LaTeX problem
  94     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
  95
  96     moves motifs in stanzas from first verse to second
  97     and from next to last to last, then inserts negative vspace before them
  98     """
  99     for motif in doc.findall('//strofa//motyw'):
 100         # find relevant verse-level tag
 101         verse, stanza = motif, motif.getparent()
 102         while stanza is not None and stanza.tag != 'strofa':
 103             verse, stanza = stanza, stanza.getparent()
 104         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 105         breaks_after = sum(1 for i in verse.itersiblings('br'))
 106         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 107             move_by = 1
 108             if breaks_after == 2:
 109                 move_by += 1
 110             moved_motif = deepcopy(motif)
 111             motif.tag = 'span'
 112             motif.text = None
 113             moved_motif.tail = None
 114             moved_motif.set('moved', str(move_by))
 115
 116             for br in verse.itersiblings('br'):
 117                 if move_by > 1:
 118                     move_by -= 1
 119                     continue
 120                 br.addnext(moved_motif)
 121                 break
 122
 123
 124 def parse_creator(doc):
 125     """ find all dc:creator tags and add dc:creator_parsed with forenames first """
 126     for creator in doc.findall('//'+DCNS('creator')):
 127         p = Person.from_text(creator.text)
 128         creator_parsed = deepcopy(creator)
 129         creator_parsed.tag = DCNS('creator_parsed')
 130         creator_parsed.text = ' '.join(p.first_names + (p.last_name,))
 131         creator.getparent().insert(0, creator_parsed)
 132
 133
 134 def get_resource(path):
 135     return os.path.join(os.path.dirname(__file__), path)
 136
 137 def get_stylesheet(name):
 138     return get_resource(STYLESHEETS[name])
 139
 140
 141 def package_available(package, args='', verbose=False):
 142     """ check if a verion of a latex package accepting given args is available """
 143     tempdir = mkdtemp('-wl2pdf-test')
 144     fpath = os.path.join(tempdir, 'test.tex')
 145     f = open(fpath, 'w')
 146     f.write(r"""
 147         \documentclass{book}
 148         \usepackage[%s]{%s}
 149         \begin{document}
 150         \end{document}
 151         """ % (args, package))
 152     f.close()
 153     if verbose:
 154         p = call(['xelatex', '-output-directory', tempdir, fpath])
 155     else:
 156         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 157     shutil.rmtree(tempdir)
 158     return p == 0
 159
 160
 161 def transform(provider, slug=None, file_path=None,
 162               output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None):
 163     """ produces a PDF file with XeLaTeX
 164
 165     provider: a DocProvider
 166     slug: slug of file to process, available by provider
 167     file_path can be provided instead of a slug
 168     output_file: file-like object or path to output file
 169     output_dir: path to directory to save output file to; either this or output_file must be present
 170     make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
 171     verbose: prints all output from LaTeX
 172     save_tex: path to save the intermediary LaTeX file to
 173     morefloats (old/new/none): force specific morefloats
 174     """
 175
 176     # Parse XSLT
 177     try:
 178         if file_path:
 179             if slug:
 180                 raise ValueError('slug or file_path should be specified, not both')
 181             document = load_including_children(provider, file_path=file_path)
 182         else:
 183             if not slug:
 184                 raise ValueError('either slug or file_path should be specified')
 185             document = load_including_children(provider, slug=slug)
 186
 187         # check for LaTeX packages
 188         if morefloats:
 189             document.edoc.getroot().set('morefloats', morefloats.lower())
 190         elif package_available('morefloats', 'maxfloats=19'):
 191             document.edoc.getroot().set('morefloats', 'new')
 192
 193         # hack the tree
 194         move_motifs_inside(document.edoc)
 195         hack_motifs(document.edoc)
 196         parse_creator(document.edoc)
 197         substitute_hyphens(document.edoc)
 198         fix_hanging(document.edoc)
 199
 200         # find output dir
 201         if make_dir and output_dir is not None:
 202             author = unicode(document.book_info.author)
 203             output_dir = os.path.join(output_dir, author)
 204
 205         # wl -> TeXML
 206         style_filename = get_stylesheet("wl2tex")
 207         style = etree.parse(style_filename)
 208         texml = document.transform(style)
 209         del document # no longer needed large object :)
 210
 211         # TeXML -> LaTeX
 212         temp = mkdtemp('-wl2pdf')
 213         tex_path = os.path.join(temp, 'doc.tex')
 214         fout = open(tex_path, 'w')
 215         process(StringIO(texml), fout, 'utf-8')
 216         fout.close()
 217         del texml
 218
 219         if save_tex:
 220             shutil.copy(tex_path, save_tex)
 221
 222         # LaTeX -> PDF
 223         shutil.copy(get_resource('pdf/wl.sty'), temp)
 224         shutil.copy(get_resource('pdf/wl-logo.png'), temp)
 225
 226         cwd = os.getcwd()
 227         os.chdir(temp)
 228
 229         if verbose:
 230             p = call(['xelatex', tex_path])
 231         else:
 232             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 233         if p:
 234             raise ParseError("Error parsing .tex file")
 235
 236         os.chdir(cwd)
 237
 238         # save the PDF
 239         pdf_path = os.path.join(temp, 'doc.pdf')
 240         if output_dir is not None:
 241             try:
 242                 os.makedirs(output_dir)
 243             except OSError:
 244                 pass
 245             if slug:
 246                 output_path = os.path.join(output_dir, '%s.pdf' % slug)
 247             else:
 248                 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
 249             shutil.move(pdf_path, output_path)
 250         else:
 251             if hasattr(output_file, 'write'):
 252                 # file-like object
 253                 with open(pdf_path) as f:
 254                     output_file.write(f.read())
 255                 output_file.close()
 256             else:
 257                 # path to output file
 258                 shutil.copy(pdf_path, output_file)
 259         shutil.rmtree(temp)
 260
 261     except (XMLSyntaxError, XSLTApplyError), e:
 262         raise ParseError(e)
 263
 264
 265 def load_including_children(provider, slug=None, uri=None, file_path=None):
 266     """ makes one big xml file with children inserted at end
 267     either slug or uri must be provided
 268     """
 269
 270     if uri:
 271         f = provider.by_uri(uri)
 272     elif slug:
 273         f = provider[slug]
 274     elif file_path:
 275         f = open(file_path, 'r')
 276     else:
 277         raise ValueError('Neither slug, URI nor file path provided for a book.')
 278
 279     document = WLDocument.from_file(f, True,
 280         parse_dublincore=True,
 281         preserve_lines=False)
 282
 283     f.close()
 284
 285     for child_uri in document.book_info.parts:
 286         child = load_including_children(provider, uri=child_uri)
 287         document.edoc.getroot().append(child.edoc.getroot())
 288
 289     return document