librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24 from lxml.etree import XMLSyntaxError, XSLTApplyError
  25
  26 from librarian.dcparser import Person
  27 from librarian.parser import WLDocument
  28 from librarian import ParseError, DCNS, get_resource, OutputFile
  29 from librarian import functions
  30 from librarian.cover import WLCover
  31
  32
# Run librarian's registration hooks once, at import time.
# NOTE(review): presumably these install the extension functions that the
# wl2tex stylesheet calls -- confirm against librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()
functions.reg_urlquote()
functions.reg_breakurl()

# Stylesheet name -> resource path; looked up by get_stylesheet(), which
# hands the path to get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  44
  45 #CUSTOMIZATIONS = [
  46 #    'nofootnotes',
  47 #    'nothemes',
  48 #    'defaultleading',
  49 #    'onehalfleading',
  50 #    'doubleleading',
  51 #    'nowlfont',
  52 #    ]
  53
  54 def insert_tags(doc, split_re, tagname, exclude=None):
  55     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  56
  57     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  58     >>> insert_tags(t, re.compile('-'), 'd');
  59     >>> print etree.tostring(t)
  60     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  61     """
  62
  63     for elem in doc.iter(tag=etree.Element):
  64         if exclude and elem.tag in exclude:
  65             continue
  66         if elem.text:
  67             chunks = split_re.split(elem.text)
  68             while len(chunks) > 1:
  69                 ins = etree.Element(tagname)
  70                 ins.tail = chunks.pop()
  71                 elem.insert(0, ins)
  72             elem.text = chunks.pop(0)
  73         if elem.tail:
  74             chunks = split_re.split(elem.tail)
  75             parent = elem.getparent()
  76             ins_index = parent.index(elem) + 1
  77             while len(chunks) > 1:
  78                 ins = etree.Element(tagname)
  79                 ins.tail = chunks.pop()
  80                 parent.insert(ins_index, ins)
  81             elem.tail = chunks.pop(0)
  82
  83
  84 def substitute_hyphens(doc):
  85     insert_tags(doc,
  86                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  87                 "dywiz",
  88                 exclude=[DCNS("identifier.url"), DCNS("rights.license"), 'www']
  89                 )
  90
  91
  92 def fix_hanging(doc):
  93     insert_tags(doc,
  94                 re.compile("(?<=\s\w)\s+"),
  95                 "nbsp",
  96                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  97                 )
  98
  99
 100 def move_motifs_inside(doc):
 101     """ moves motifs to be into block elements """
 102     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 103         for motif in master.xpath('motyw'):
 104             for sib in motif.itersiblings():
 105                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
 106                     # motif shouldn't have a tail - it would be untagged text
 107                     motif.tail = None
 108                     motif.getparent().remove(motif)
 109                     sib.insert(0, motif)
 110                     break
 111
 112
 113 def hack_motifs(doc):
 114     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 115     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 116
 117     moves motifs in stanzas from first verse to second
 118     and from next to last to last, then inserts negative vspace before them
 119     """
 120     for motif in doc.findall('//strofa//motyw'):
 121         # find relevant verse-level tag
 122         verse, stanza = motif, motif.getparent()
 123         while stanza is not None and stanza.tag != 'strofa':
 124             verse, stanza = stanza, stanza.getparent()
 125         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 126         breaks_after = sum(1 for i in verse.itersiblings('br'))
 127         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 128             move_by = 1
 129             if breaks_after == 2:
 130                 move_by += 1
 131             moved_motif = deepcopy(motif)
 132             motif.tag = 'span'
 133             motif.text = None
 134             moved_motif.tail = None
 135             moved_motif.set('moved', str(move_by))
 136
 137             for br in verse.itersiblings('br'):
 138                 if move_by > 1:
 139                     move_by -= 1
 140                     continue
 141                 br.addnext(moved_motif)
 142                 break
 143
 144
 145 def parse_creator(doc):
 146     """Generates readable versions of creator and translator tags.
 147
 148     Finds all dc:creator and dc.contributor.translator tags
 149     and adds *_parsed versions with forenames first.
 150     """
 151     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 152                     'creator', 'contributor.translator',
 153                     'contributor.editor', 'contributor.technical_editor')),
 154                     namespaces = {'dc': str(DCNS)})[::-1]:
 155         if not person.text:
 156             continue
 157         p = Person.from_text(person.text)
 158         person_parsed = deepcopy(person)
 159         person_parsed.tag = person.tag + '_parsed'
 160         person_parsed.set('sortkey', person.text)
 161         person_parsed.text = p.readable()
 162         person.getparent().insert(0, person_parsed)
 163
 164
 165 def get_stylesheet(name):
 166     return get_resource(STYLESHEETS[name])
 167
 168
 169 def package_available(package, args='', verbose=False):
 170     """ check if a verion of a latex package accepting given args is available """
 171     tempdir = mkdtemp('-wl2pdf-test')
 172     fpath = os.path.join(tempdir, 'test.tex')
 173     f = open(fpath, 'w')
 174     f.write(r"""
 175         \documentclass{wl}
 176         \usepackage[%s]{%s}
 177         \begin{document}
 178         \end{document}
 179         """ % (args, package))
 180     f.close()
 181     if verbose:
 182         p = call(['xelatex', '-output-directory', tempdir, fpath])
 183     else:
 184         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 185     shutil.rmtree(tempdir)
 186     return p == 0
 187
 188
 189 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 190               cover=None, flags=None, customizations=None):
 191     """ produces a PDF file with XeLaTeX
 192
 193     wldoc: a WLDocument
 194     verbose: prints all output from LaTeX
 195     save_tex: path to save the intermediary LaTeX file to
 196     morefloats (old/new/none): force specific morefloats
 197     cover: a cover.Cover factory or True for default
 198     flags: less-advertising,
 199     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 200     """
 201
 202     # Parse XSLT
 203     try:
 204         book_info = wldoc.book_info
 205         document = load_including_children(wldoc)
 206         root = document.edoc.getroot()
 207
 208         if cover:
 209             if cover is True:
 210                 cover = WLCover
 211             bound_cover = cover(book_info)
 212             root.set('data-cover-width', str(bound_cover.width))
 213             root.set('data-cover-height', str(bound_cover.height))
 214             if bound_cover.uses_dc_cover:
 215                 if book_info.cover_by:
 216                     root.set('data-cover-by', book_info.cover_by)
 217                 if book_info.cover_source:
 218                     root.set('data-cover-source',
 219                             book_info.cover_source)
 220         if flags:
 221             for flag in flags:
 222                 root.set('flag-' + flag, 'yes')
 223
 224         # check for LaTeX packages
 225         if morefloats:
 226             root.set('morefloats', morefloats.lower())
 227         elif package_available('morefloats', 'maxfloats=19'):
 228             root.set('morefloats', 'new')
 229
 230         # add customizations
 231         if customizations is not None:
 232             root.set('customizations', u','.join(customizations))
 233
 234         # add editors info
 235         root.set('editors', u', '.join(sorted(
 236             editor.readable() for editor in document.editors())))
 237
 238         # hack the tree
 239         move_motifs_inside(document.edoc)
 240         hack_motifs(document.edoc)
 241         parse_creator(document.edoc)
 242         substitute_hyphens(document.edoc)
 243         fix_hanging(document.edoc)
 244
 245         # wl -> TeXML
 246         style_filename = get_stylesheet("wl2tex")
 247         style = etree.parse(style_filename)
 248
 249         texml = document.transform(style)
 250
 251         # TeXML -> LaTeX
 252         temp = mkdtemp('-wl2pdf')
 253
 254         for ilustr in document.edoc.findall("//ilustr"):
 255             shutil.copy(ilustr.get("src"), temp)
 256
 257         if cover:
 258             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 259                 bound_cover.save(f)
 260
 261         del document # no longer needed large object :)
 262
 263         tex_path = os.path.join(temp, 'doc.tex')
 264         fout = open(tex_path, 'w')
 265         process(StringIO(texml), fout, 'utf-8')
 266         fout.close()
 267         del texml
 268
 269         if save_tex:
 270             shutil.copy(tex_path, save_tex)
 271
 272         # LaTeX -> PDF
 273         shutil.copy(get_resource('pdf/wl.cls'), temp)
 274         shutil.copy(get_resource('res/wl-logo.png'), temp)
 275         #shutil.copy(get_resource('res/prawokultury-logo.png'), temp)
 276         #shutil.copy(get_resource('res/trust-logo.eps'), temp)
 277         shutil.copy(get_resource('res/nowoczesnapolska.org.pl.png'), temp)
 278         shutil.copy(get_resource('res/koedlogo.png'), temp)
 279
 280         try:
 281             cwd = os.getcwd()
 282         except OSError:
 283             cwd = None
 284         os.chdir(temp)
 285
 286         if verbose:
 287             p = call(['xelatex', tex_path])
 288         else:
 289             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 290         if p:
 291             raise ParseError("Error parsing .tex file")
 292
 293         if cwd is not None:
 294             os.chdir(cwd)
 295
 296         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 297         pdf_path = os.path.join(temp, 'doc.pdf')
 298         shutil.move(pdf_path, output_file.name)
 299         shutil.rmtree(temp)
 300         return OutputFile.from_filename(output_file.name)
 301
 302     except (XMLSyntaxError, XSLTApplyError), e:
 303         raise ParseError(e)
 304
 305
 306 def load_including_children(wldoc=None, provider=None, uri=None):
 307     """ Makes one big xml file with children inserted at end.
 308
 309     Either wldoc or provider and URI must be provided.
 310     """
 311
 312     if uri and provider:
 313         f = provider.by_uri(uri)
 314         text = f.read().decode('utf-8')
 315         f.close()
 316     elif wldoc is not None:
 317         text = etree.tostring(wldoc.edoc, encoding=unicode)
 318         provider = wldoc.provider
 319     else:
 320         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 321
 322     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 323
 324     document = WLDocument.from_string(text,
 325                 parse_dublincore=True, provider=provider)
 326     document.swap_endlines()
 327
 328     for child_uri in document.book_info.parts:
 329         child = load_including_children(provider=provider, uri=child_uri)
 330         document.edoc.getroot().append(child.edoc.getroot())
 331     return document