librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from distutils.dir_util import copy_tree # shutil.copytree is so uncapable.
  17 from StringIO import StringIO
  18 from tempfile import mkdtemp, NamedTemporaryFile
  19 import re
  20 from copy import deepcopy
  21 from subprocess import call, PIPE
  22
  23 from Texml.processor import process
  24 from lxml import etree
  25 from lxml.etree import XMLSyntaxError, XSLTApplyError
  26
  27 from librarian.dcparser import Person
  28 from librarian.parser import WLDocument
  29 from librarian import ParseError, DCNS, get_resource, OutputFile
  30 from librarian import functions
  31 from librarian.cover import WLCover
  32
  33
# Call each registration hook from librarian.functions once, at import time.
# NOTE(review): presumably these install the XPath/XSLT extension functions
# that the wl2tex stylesheet relies on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a stylesheet name to its resource path; get_stylesheet() looks the name
# up here and hands the path to get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  43
  44 #CUSTOMIZATIONS = [
  45 #    'nofootnotes',
  46 #    'nothemes',
  47 #    'defaultleading',
  48 #    'onehalfleading',
  49 #    'doubleleading',
  50 #    'nowlfont',
  51 #    ]
  52
  53 def insert_tags(doc, split_re, tagname, exclude=None):
  54     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  55
  56     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  57     >>> insert_tags(t, re.compile('-'), 'd');
  58     >>> print etree.tostring(t)
  59     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  60     """
  61
  62     for elem in doc.iter(tag=etree.Element):
  63         if exclude and elem.tag in exclude:
  64             continue
  65         if elem.text:
  66             chunks = split_re.split(elem.text)
  67             while len(chunks) > 1:
  68                 ins = etree.Element(tagname)
  69                 ins.tail = chunks.pop()
  70                 elem.insert(0, ins)
  71             elem.text = chunks.pop(0)
  72         if elem.tail:
  73             chunks = split_re.split(elem.tail)
  74             parent = elem.getparent()
  75             ins_index = parent.index(elem) + 1
  76             while len(chunks) > 1:
  77                 ins = etree.Element(tagname)
  78                 ins.tail = chunks.pop()
  79                 parent.insert(ins_index, ins)
  80             elem.tail = chunks.pop(0)
  81
  82
  83 def substitute_hyphens(doc):
  84     insert_tags(doc,
  85                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  86                 "dywiz",
  87                 exclude=[DCNS("identifier.url"), DCNS("rights.license"), "www"]
  88                 )
  89
  90
  91 def fix_hanging(doc):
  92     insert_tags(doc,
  93                 re.compile("(?<=\s\w)\s+"),
  94                 "nbsp",
  95                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  96                 )
  97
  98
  99 def move_motifs_inside(doc):
 100     """ moves motifs to be into block elements """
 101     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 102         for motif in master.xpath('motyw'):
 103             for sib in motif.itersiblings():
 104                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
 105                     # motif shouldn't have a tail - it would be untagged text
 106                     motif.tail = None
 107                     motif.getparent().remove(motif)
 108                     sib.insert(0, motif)
 109                     break
 110
 111
 112 def hack_motifs(doc):
 113     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 114     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 115
 116     moves motifs in stanzas from first verse to second
 117     and from next to last to last, then inserts negative vspace before them
 118     """
 119     for motif in doc.findall('//strofa//motyw'):
 120         # find relevant verse-level tag
 121         verse, stanza = motif, motif.getparent()
 122         while stanza is not None and stanza.tag != 'strofa':
 123             verse, stanza = stanza, stanza.getparent()
 124         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 125         breaks_after = sum(1 for i in verse.itersiblings('br'))
 126         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 127             move_by = 1
 128             if breaks_after == 2:
 129                 move_by += 1
 130             moved_motif = deepcopy(motif)
 131             motif.tag = 'span'
 132             motif.text = None
 133             moved_motif.tail = None
 134             moved_motif.set('moved', str(move_by))
 135
 136             for br in verse.itersiblings('br'):
 137                 if move_by > 1:
 138                     move_by -= 1
 139                     continue
 140                 br.addnext(moved_motif)
 141                 break
 142
 143
 144 def parse_creator(doc):
 145     """Generates readable versions of creator and translator tags.
 146
 147     Finds all dc:creator and dc.contributor.translator tags
 148     and adds *_parsed versions with forenames first.
 149     """
 150     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 151                     'creator', 'contributor.translator')),
 152                     namespaces = {'dc': str(DCNS)})[::-1]:
 153         if not person.text:
 154             continue
 155         p = Person.from_text(person.text)
 156         person_parsed = deepcopy(person)
 157         person_parsed.tag = person.tag + '_parsed'
 158         person_parsed.set('sortkey', person.text)
 159         person_parsed.text = p.readable()
 160         person.getparent().insert(0, person_parsed)
 161
 162
 163 def get_stylesheet(name):
 164     return get_resource(STYLESHEETS[name])
 165
 166
 167 def package_available(package, args='', verbose=False):
 168     """ check if a verion of a latex package accepting given args is available """
 169     tempdir = mkdtemp('-wl2pdf-test')
 170     fpath = os.path.join(tempdir, 'test.tex')
 171     f = open(fpath, 'w')
 172     f.write(r"""
 173         \documentclass{wl}
 174         \usepackage[%s]{%s}
 175         \begin{document}
 176         \end{document}
 177         """ % (args, package))
 178     f.close()
 179     if verbose:
 180         p = call(['xelatex', '-output-directory', tempdir, fpath])
 181     else:
 182         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 183     shutil.rmtree(tempdir)
 184     return p == 0
 185
 186
 187 def transform(wldoc, verbose=False, save_tex=None, save_texml=None, morefloats=None,
 188               cover=None, flags=None, customizations=None, documentclass='wl', resources=None):
 189     """ produces a PDF file with XeLaTeX
 190
 191     wldoc: a WLDocument
 192     verbose: prints all output from LaTeX
 193     save_tex: path to save the intermediary LaTeX file to
 194     save_texml: path to save the intermediary TeXML file to
 195     morefloats (old/new/none): force specific morefloats
 196     cover: a cover.Cover factory or True for default
 197     flags: less-advertising,
 198     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 199     documentclass: LaTeX document class, defaults to wl
 200     resources: a directory with resources, copied to place where LaTeX compilation is made
 201     """
 202
 203     # Parse XSLT
 204     try:
 205         book_info = wldoc.book_info
 206         document = load_including_children(wldoc)
 207         root = document.edoc.getroot()
 208
 209         if cover:
 210             if cover is True:
 211                 cover = WLCover
 212             bound_cover = cover(book_info)
 213             root.set('data-cover-width', str(bound_cover.width))
 214             root.set('data-cover-height', str(bound_cover.height))
 215             if bound_cover.uses_dc_cover:
 216                 if book_info.cover_by:
 217                     root.set('data-cover-by', book_info.cover_by)
 218                 if book_info.cover_source:
 219                     root.set('data-cover-source',
 220                             book_info.cover_source)
 221         if flags:
 222             for flag in flags:
 223                 root.set('flag-' + flag, 'yes')
 224
 225         # check for LaTeX packages
 226         if morefloats:
 227             root.set('morefloats', morefloats.lower())
 228         elif package_available('morefloats', 'maxfloats=19'):
 229             root.set('morefloats', 'new')
 230
 231         # add customizations
 232         if customizations is not None:
 233             root.set('customizations', u','.join(customizations))
 234
 235         root.set('documentclass', documentclass or 'wl')
 236
 237         # add editors info
 238         root.set('editors', u', '.join(sorted(
 239             editor.readable() for editor in document.editors())))
 240
 241         # hack the tree
 242         move_motifs_inside(document.edoc)
 243         hack_motifs(document.edoc)
 244         parse_creator(document.edoc)
 245         substitute_hyphens(document.edoc)
 246         fix_hanging(document.edoc)
 247
 248         # wl -> TeXML
 249         style_filename = get_stylesheet("wl2tex")
 250         style = etree.parse(style_filename)
 251
 252         texml = document.transform(style)
 253
 254         if save_texml:
 255             texml.write(save_texml)
 256
 257         # TeXML -> LaTeX
 258         temp = mkdtemp('-wl2pdf')
 259
 260         if cover:
 261             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 262                 bound_cover.save(f)
 263
 264         del document # no longer needed large object :)
 265
 266         tex_path = os.path.join(temp, 'doc.tex')
 267         fout = open(tex_path, 'w')
 268         process(StringIO(texml), fout, 'utf-8')
 269         fout.close()
 270         del texml
 271
 272         if save_tex:
 273             shutil.copy(tex_path, save_tex)
 274
 275         # LaTeX -> PDF
 276         shutil.copy(get_resource('pdf/wl.cls'), temp)
 277         shutil.copy(get_resource('pdf/wlpub.cls'), temp)
 278         shutil.copy(get_resource('res/wl-logo.png'), temp)
 279         if resources:
 280             copy_tree(resources, temp)
 281
 282         try:
 283             cwd = os.getcwd()
 284         except OSError:
 285             cwd = None
 286         os.chdir(temp)
 287
 288         if verbose:
 289             p = call(['xelatex', tex_path])
 290         else:
 291             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 292         if p:
 293             raise ParseError("Error parsing .tex file")
 294
 295         if cwd is not None:
 296             os.chdir(cwd)
 297
 298         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 299         pdf_path = os.path.join(temp, 'doc.pdf')
 300         shutil.move(pdf_path, output_file.name)
 301         shutil.rmtree(temp)
 302         return OutputFile.from_filename(output_file.name)
 303
 304     except (XMLSyntaxError, XSLTApplyError), e:
 305         raise ParseError(e)
 306
 307
 308 def load_including_children(wldoc=None, provider=None, uri=None):
 309     """ Makes one big xml file with children inserted at end.
 310
 311     Either wldoc or provider and URI must be provided.
 312     """
 313
 314     if uri and provider:
 315         f = provider.by_uri(uri)
 316         text = f.read().decode('utf-8')
 317         f.close()
 318     elif wldoc is not None:
 319         text = etree.tostring(wldoc.edoc, encoding=unicode)
 320         provider = wldoc.provider
 321     else:
 322         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 323
 324     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 325
 326     document = WLDocument.from_string(text,
 327                 parse_dublincore=True, provider=provider)
 328     document.swap_endlines()
 329
 330     for child_uri in document.book_info.parts:
 331         child = load_including_children(provider=provider, uri=child_uri)
 332         document.edoc.getroot().append(child.edoc.getroot())
 333     return document