librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24 from lxml.etree import XMLSyntaxError, XSLTApplyError
  25
  26 from librarian.dcparser import Person
  27 from librarian.parser import WLDocument
  28 from librarian import ParseError, DCNS, get_resource, OutputFile
  29 from librarian import functions
  30 from librarian.cover import WLCover
  31
  32
# Import-time side effect: run librarian's reg_* registration helpers.
# NOTE(review): presumably these register the extension functions used by
# the wl2tex stylesheet -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a stylesheet name to its resource path; looked up by get_stylesheet().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  42
  43 #CUSTOMIZATIONS = [
  44 #    'nofootnotes',
  45 #    'nothemes',
  46 #    'defaultleading',
  47 #    'onehalfleading',
  48 #    'doubleleading',
  49 #    'nowlfont',
  50 #    ]
  51
  52 def insert_tags(doc, split_re, tagname, exclude=None):
  53     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  54
  55     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  56     >>> insert_tags(t, re.compile('-'), 'd');
  57     >>> print etree.tostring(t)
  58     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  59     """
  60
  61     for elem in doc.iter(tag=etree.Element):
  62         if exclude and elem.tag in exclude:
  63             continue
  64         if elem.text:
  65             chunks = split_re.split(elem.text)
  66             while len(chunks) > 1:
  67                 ins = etree.Element(tagname)
  68                 ins.tail = chunks.pop()
  69                 elem.insert(0, ins)
  70             elem.text = chunks.pop(0)
  71         if elem.tail:
  72             chunks = split_re.split(elem.tail)
  73             parent = elem.getparent()
  74             ins_index = parent.index(elem) + 1
  75             while len(chunks) > 1:
  76                 ins = etree.Element(tagname)
  77                 ins.tail = chunks.pop()
  78                 parent.insert(ins_index, ins)
  79             elem.tail = chunks.pop(0)
  80
  81
  82 def substitute_hyphens(doc):
  83     insert_tags(doc,
  84                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  85                 "dywiz",
  86                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  87                 )
  88
  89
  90 def fix_hanging(doc):
  91     insert_tags(doc,
  92                 re.compile("(?<=\s\w)\s+"),
  93                 "nbsp",
  94                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  95                 )
  96
  97
  98 def move_motifs_inside(doc):
  99     """ moves motifs to be into block elements """
 100     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 101         for motif in master.xpath('motyw'):
 102             for sib in motif.itersiblings():
 103                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
 104                     # motif shouldn't have a tail - it would be untagged text
 105                     motif.tail = None
 106                     motif.getparent().remove(motif)
 107                     sib.insert(0, motif)
 108                     break
 109
 110
 111 def hack_motifs(doc):
 112     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 113     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 114
 115     moves motifs in stanzas from first verse to second
 116     and from next to last to last, then inserts negative vspace before them
 117     """
 118     for motif in doc.findall('//strofa//motyw'):
 119         # find relevant verse-level tag
 120         verse, stanza = motif, motif.getparent()
 121         while stanza is not None and stanza.tag != 'strofa':
 122             verse, stanza = stanza, stanza.getparent()
 123         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 124         breaks_after = sum(1 for i in verse.itersiblings('br'))
 125         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 126             move_by = 1
 127             if breaks_after == 2:
 128                 move_by += 1
 129             moved_motif = deepcopy(motif)
 130             motif.tag = 'span'
 131             motif.text = None
 132             moved_motif.tail = None
 133             moved_motif.set('moved', str(move_by))
 134
 135             for br in verse.itersiblings('br'):
 136                 if move_by > 1:
 137                     move_by -= 1
 138                     continue
 139                 br.addnext(moved_motif)
 140                 break
 141
 142
 143 def parse_creator(doc):
 144     """Generates readable versions of creator and translator tags.
 145
 146     Finds all dc:creator and dc.contributor.translator tags
 147     and adds *_parsed versions with forenames first.
 148     """
 149     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 150                     'creator', 'contributor.translator')),
 151                     namespaces = {'dc': str(DCNS)})[::-1]:
 152         if not person.text:
 153             continue
 154         p = Person.from_text(person.text)
 155         person_parsed = deepcopy(person)
 156         person_parsed.tag = person.tag + '_parsed'
 157         person_parsed.set('sortkey', person.text)
 158         person_parsed.text = p.readable()
 159         person.getparent().insert(0, person_parsed)
 160
 161
 162 def get_stylesheet(name):
 163     return get_resource(STYLESHEETS[name])
 164
 165
 166 def package_available(package, args='', verbose=False):
 167     """ check if a verion of a latex package accepting given args is available """
 168     tempdir = mkdtemp('-wl2pdf-test')
 169     fpath = os.path.join(tempdir, 'test.tex')
 170     f = open(fpath, 'w')
 171     f.write(r"""
 172         \documentclass{wl}
 173         \usepackage[%s]{%s}
 174         \begin{document}
 175         \end{document}
 176         """ % (args, package))
 177     f.close()
 178     if verbose:
 179         p = call(['xelatex', '-output-directory', tempdir, fpath])
 180     else:
 181         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 182     shutil.rmtree(tempdir)
 183     return p == 0
 184
 185
 186 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 187               cover=None, flags=None, customizations=None):
 188     """ produces a PDF file with XeLaTeX
 189
 190     wldoc: a WLDocument
 191     verbose: prints all output from LaTeX
 192     save_tex: path to save the intermediary LaTeX file to
 193     morefloats (old/new/none): force specific morefloats
 194     cover: a cover.Cover factory or True for default
 195     flags: less-advertising,
 196     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 197     """
 198
 199     # Parse XSLT
 200     try:
 201         book_info = wldoc.book_info
 202         document = load_including_children(wldoc)
 203         root = document.edoc.getroot()
 204
 205         if cover:
 206             if cover is True:
 207                 cover = WLCover
 208             bound_cover = cover(book_info)
 209             root.set('data-cover-width', str(bound_cover.width))
 210             root.set('data-cover-height', str(bound_cover.height))
 211             if bound_cover.uses_dc_cover:
 212                 if book_info.cover_by:
 213                     root.set('data-cover-by', book_info.cover_by)
 214                 if book_info.cover_source:
 215                     root.set('data-cover-source',
 216                             book_info.cover_source)
 217         if flags:
 218             for flag in flags:
 219                 root.set('flag-' + flag, 'yes')
 220
 221         # check for LaTeX packages
 222         if morefloats:
 223             root.set('morefloats', morefloats.lower())
 224         elif package_available('morefloats', 'maxfloats=19'):
 225             root.set('morefloats', 'new')
 226
 227         # add customizations
 228         if customizations is not None:
 229             root.set('customizations', u','.join(customizations))
 230
 231         # add editors info
 232         root.set('editors', u', '.join(sorted(
 233             editor.readable() for editor in document.editors())))
 234         if document.book_info.funders:
 235             root.set('funders', u', '.join(document.book_info.funders))
 236         if document.book_info.thanks:
 237             root.set('thanks', document.book_info.thanks)
 238
 239         # hack the tree
 240         move_motifs_inside(document.edoc)
 241         hack_motifs(document.edoc)
 242         parse_creator(document.edoc)
 243         substitute_hyphens(document.edoc)
 244         fix_hanging(document.edoc)
 245
 246         # wl -> TeXML
 247         style_filename = get_stylesheet("wl2tex")
 248         style = etree.parse(style_filename)
 249
 250         texml = document.transform(style)
 251
 252         # TeXML -> LaTeX
 253         temp = mkdtemp('-wl2pdf')
 254
 255         if cover:
 256             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 257                 bound_cover.save(f)
 258
 259         del document # no longer needed large object :)
 260
 261         tex_path = os.path.join(temp, 'doc.tex')
 262         fout = open(tex_path, 'w')
 263         process(StringIO(texml), fout, 'utf-8')
 264         fout.close()
 265         del texml
 266
 267         if save_tex:
 268             shutil.copy(tex_path, save_tex)
 269
 270         # LaTeX -> PDF
 271         shutil.copy(get_resource('pdf/wl.cls'), temp)
 272         shutil.copy(get_resource('res/wl-logo.png'), temp)
 273
 274         try:
 275             cwd = os.getcwd()
 276         except OSError:
 277             cwd = None
 278         os.chdir(temp)
 279
 280         if verbose:
 281             p = call(['xelatex', tex_path])
 282         else:
 283             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 284         if p:
 285             raise ParseError("Error parsing .tex file")
 286
 287         if cwd is not None:
 288             os.chdir(cwd)
 289
 290         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 291         pdf_path = os.path.join(temp, 'doc.pdf')
 292         shutil.move(pdf_path, output_file.name)
 293         shutil.rmtree(temp)
 294         return OutputFile.from_filename(output_file.name)
 295
 296     except (XMLSyntaxError, XSLTApplyError), e:
 297         raise ParseError(e)
 298
 299
 300 def load_including_children(wldoc=None, provider=None, uri=None):
 301     """ Makes one big xml file with children inserted at end.
 302
 303     Either wldoc or provider and URI must be provided.
 304     """
 305
 306     if uri and provider:
 307         f = provider.by_uri(uri)
 308         text = f.read().decode('utf-8')
 309         f.close()
 310     elif wldoc is not None:
 311         text = etree.tostring(wldoc.edoc, encoding=unicode)
 312         provider = wldoc.provider
 313     else:
 314         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 315
 316     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 317
 318     document = WLDocument.from_string(text,
 319                 parse_dublincore=True, provider=provider)
 320     document.swap_endlines()
 321
 322     for child_uri in document.book_info.parts:
 323         child = load_including_children(provider=provider, uri=child_uri)
 324         document.edoc.getroot().append(child.edoc.getroot())
 325     return document