librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
Creates one big XML document from the book and its children, converts it to
LaTeX with TeXML, then runs the result through XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from distutils.dir_util import copy_tree # shutil.copytree is so uncapable.
  17 from StringIO import StringIO
  18 from tempfile import mkdtemp, NamedTemporaryFile
  19 import re
  20 from copy import deepcopy
  21 from subprocess import call, PIPE
  22
  23 from Texml.processor import process
  24 from lxml import etree
  25 from lxml.etree import XMLSyntaxError, XSLTApplyError
  26
  27 from librarian.dcparser import Person
  28 from librarian.parser import WLDocument
  29 from librarian import ParseError, DCNS, get_resource, OutputFile
  30 from librarian import functions
  31 from librarian.cover import WLCover
  32
  33 import itertools, operator
  34
# Module-level setup: the registration hooks from librarian.functions are
# called once, when this module is imported.
# NOTE(review): presumably these register the extension functions that the
# wl2tex stylesheet relies on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a stylesheet name (the argument of get_stylesheet) to its resource
# path, which get_stylesheet resolves through get_resource.
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  44
  45 #CUSTOMIZATIONS = [
  46 #    'nofootnotes',
  47 #    'nothemes',
  48 #    'defaultleading',
  49 #    'onehalfleading',
  50 #    'doubleleading',
  51 #    'nowlfont',
  52 #    ]
  53
  54 def insert_tags(doc, split_re, tagname, exclude=None):
  55     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  56
  57     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  58     >>> insert_tags(t, re.compile('-'), 'd');
  59     >>> print etree.tostring(t)
  60     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  61     """
  62
  63     for elem in doc.iter(tag=etree.Element):
  64         if exclude and elem.tag in exclude:
  65             continue
  66         if elem.text:
  67             chunks = split_re.split(elem.text)
  68             while len(chunks) > 1:
  69                 ins = etree.Element(tagname)
  70                 ins.tail = chunks.pop()
  71                 elem.insert(0, ins)
  72             elem.text = chunks.pop(0)
  73         if elem.tail:
  74             chunks = split_re.split(elem.tail)
  75             parent = elem.getparent()
  76             ins_index = parent.index(elem) + 1
  77             while len(chunks) > 1:
  78                 ins = etree.Element(tagname)
  79                 ins.tail = chunks.pop()
  80                 parent.insert(ins_index, ins)
  81             elem.tail = chunks.pop(0)
  82
  83
  84 def substitute_hyphens(doc):
  85     insert_tags(doc,
  86                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  87                 "dywiz",
  88                 exclude=[DCNS("identifier.url"), DCNS("rights.license"), "www"]
  89                 )
  90
  91
  92 def fix_hanging(doc):
  93     insert_tags(doc,
  94                 re.compile("(?<=\s\w)\s+"),
  95                 "nbsp",
  96                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  97                 )
  98
  99 def fake_tables(doc):
 100     for tabela in doc.findall("//tabela"):
 101         # are we dealing with a table of proper structure?
 102         # two levels of same tags, and all tags on second level
 103         # must be of same count.
 104         def tag_count(m, k):
 105             m[k.tag] = m.get(k.tag, 0) + 1
 106             return m
 107
 108         child_tags = reduce(tag_count, list(tabela), {})
 109         if len(child_tags) != 1:
 110             return
 111         grandchild_tags = reduce(tag_count, itertools.chain(*[list(c) for c in tabela]), {})
 112         if len(grandchild_tags) != 1:
 113             return
 114         if len(set(grandchild_tags.values())) != 1:
 115             return
 116
 117         for row in tabela:
 118             row.tag = 'r'
 119             for col in row:
 120                 col.tag = 'c'
 121     return
 122
 123 def move_motifs_inside(doc):
 124     """ moves motifs to be into block elements """
 125     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 126         for motif in master.xpath('motyw'):
 127             for sib in motif.itersiblings():
 128                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
 129                     # motif shouldn't have a tail - it would be untagged text
 130                     motif.tail = None
 131                     motif.getparent().remove(motif)
 132                     sib.insert(0, motif)
 133                     break
 134
 135
 136 def hack_motifs(doc):
 137     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 138     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 139
 140     moves motifs in stanzas from first verse to second
 141     and from next to last to last, then inserts negative vspace before them
 142     """
 143     for motif in doc.findall('//strofa//motyw'):
 144         # find relevant verse-level tag
 145         verse, stanza = motif, motif.getparent()
 146         while stanza is not None and stanza.tag != 'strofa':
 147             verse, stanza = stanza, stanza.getparent()
 148         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 149         breaks_after = sum(1 for i in verse.itersiblings('br'))
 150         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 151             move_by = 1
 152             if breaks_after == 2:
 153                 move_by += 1
 154             moved_motif = deepcopy(motif)
 155             motif.tag = 'span'
 156             motif.text = None
 157             moved_motif.tail = None
 158             moved_motif.set('moved', str(move_by))
 159
 160             for br in verse.itersiblings('br'):
 161                 if move_by > 1:
 162                     move_by -= 1
 163                     continue
 164                 br.addnext(moved_motif)
 165                 break
 166
 167
 168 def parse_creator(doc):
 169     """Generates readable versions of creator and translator tags.
 170
 171     Finds all dc:creator and dc.contributor.translator tags
 172     and adds *_parsed versions with forenames first.
 173     """
 174     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 175                     'creator', 'contributor.translator')),
 176                     namespaces = {'dc': str(DCNS)})[::-1]:
 177         if not person.text:
 178             continue
 179         p = Person.from_text(person.text)
 180         person_parsed = deepcopy(person)
 181         person_parsed.tag = person.tag + '_parsed'
 182         person_parsed.set('sortkey', person.text)
 183         person_parsed.text = p.readable()
 184         person.getparent().insert(0, person_parsed)
 185
 186
 187 def get_stylesheet(name):
 188     return get_resource(STYLESHEETS[name])
 189
 190
 191 def package_available(package, args='', verbose=False):
 192     """ check if a verion of a latex package accepting given args is available """
 193     tempdir = mkdtemp('-wl2pdf-test')
 194     fpath = os.path.join(tempdir, 'test.tex')
 195     f = open(fpath, 'w')
 196     f.write(r"""
 197         \documentclass{wl}
 198         \usepackage[%s]{%s}
 199         \begin{document}
 200         \end{document}
 201         """ % (args, package))
 202     f.close()
 203     if verbose:
 204         p = call(['xelatex', '-output-directory', tempdir, fpath])
 205     else:
 206         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 207     shutil.rmtree(tempdir)
 208     return p == 0
 209
 210
 211 def transform(wldoc, verbose=False, save_tex=None, save_texml=None, morefloats=None,
 212               cover=None, cover_file=None, flags=None, customizations=None, documentclass='wl', resources=None):
 213     """ produces a PDF file with XeLaTeX
 214
 215     wldoc: a WLDocument
 216     verbose: prints all output from LaTeX
 217     save_tex: path to save the intermediary LaTeX file to
 218     save_texml: path to save the intermediary TeXML file to
 219     morefloats (old/new/none): force specific morefloats
 220     cover: a cover.Cover factory or True for default
 221     flags: less-advertising,
 222     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 223     documentclass: LaTeX document class, defaults to wl
 224     resources: a directory with resources, copied to place where LaTeX compilation is made
 225     """
 226
 227     # Parse XSLT
 228     try:
 229         book_info = wldoc.book_info
 230         document = load_including_children(wldoc)
 231         root = document.edoc.getroot()
 232
 233         if cover:
 234             if cover is True:
 235                 cover = WLCover
 236             bound_cover = cover(book_info)
 237             root.set('data-cover-width', str(bound_cover.width))
 238             root.set('data-cover-height', str(bound_cover.height))
 239             if bound_cover.uses_dc_cover:
 240                 if book_info.cover_by:
 241                     root.set('data-cover-by', book_info.cover_by)
 242                 if book_info.cover_source:
 243                     root.set('data-cover-source',
 244                             book_info.cover_source)
 245         if flags:
 246             for flag in flags:
 247                 root.set('flag-' + flag, 'yes')
 248
 249         # check for LaTeX packages
 250         if morefloats:
 251             root.set('morefloats', morefloats.lower())
 252         elif package_available('morefloats', 'maxfloats=19'):
 253             root.set('morefloats', 'new')
 254
 255         # add customizations
 256         if customizations is not None:
 257             root.set('customizations', u','.join(customizations))
 258
 259         root.set('documentclass', documentclass or 'wl')
 260
 261         # add editors info
 262         root.set('editors', u', '.join(sorted(
 263             editor.readable() for editor in document.editors())))
 264
 265         # hack the tree
 266         move_motifs_inside(document.edoc)
 267         hack_motifs(document.edoc)
 268         fake_tables(document.edoc)
 269         parse_creator(document.edoc)
 270         substitute_hyphens(document.edoc)
 271         fix_hanging(document.edoc)
 272
 273         # wl -> TeXML
 274         style_filename = get_stylesheet("wl2tex")
 275         style = etree.parse(style_filename)
 276
 277         texml = document.transform(style)
 278
 279         if save_texml:
 280             texml.write(save_texml)
 281
 282         # TeXML -> LaTeX
 283         temp = mkdtemp('-wl2pdf')
 284
 285         if cover:
 286             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 287                 bound_cover.save(f)
 288
 289         del document # no longer needed large object :)
 290
 291         tex_path = os.path.join(temp, 'doc.tex')
 292         fout = open(tex_path, 'w')
 293         process(StringIO(texml), fout, 'utf-8')
 294         fout.close()
 295         del texml
 296
 297         if save_tex:
 298             shutil.copy(tex_path, save_tex)
 299
 300         # LaTeX -> PDF
 301         shutil.copy(get_resource('pdf/wl.cls'), temp)
 302         shutil.copy(get_resource('pdf/wlpub.cls'), temp)
 303         shutil.copy(get_resource('pdf/fnprep.cls'), temp)
 304         shutil.copy(get_resource('res/wl-logo.png'), temp)
 305         shutil.copy(get_resource('res/cover.jpg'), temp)
 306         if resources:
 307             copy_tree(resources, temp)
 308
 309         try:
 310             cwd = os.getcwd()
 311         except OSError:
 312             cwd = None
 313         os.chdir(temp)
 314
 315         if resources:
 316             os.putenv("TEXINPUTS", "::.:%s" % resources)
 317
 318         if verbose:
 319             p = call(['xelatex', tex_path])
 320         else:
 321             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 322         if p:
 323             raise ParseError("Error parsing .tex file")
 324
 325         if cwd is not None:
 326             os.chdir(cwd)
 327
 328         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 329         pdf_path = os.path.join(temp, 'doc.pdf')
 330         shutil.move(pdf_path, output_file.name)
 331         shutil.rmtree(temp)
 332         return OutputFile.from_filename(output_file.name)
 333
 334     except (XMLSyntaxError, XSLTApplyError), e:
 335         raise ParseError(e)
 336
 337
 338 def load_including_children(wldoc=None, provider=None, uri=None):
 339     """ Makes one big xml file with children inserted at end.
 340
 341     Either wldoc or provider and URI must be provided.
 342     """
 343
 344     if uri and provider:
 345         f = provider.by_uri(uri)
 346         text = f.read().decode('utf-8')
 347         f.close()
 348     elif wldoc is not None:
 349         text = etree.tostring(wldoc.edoc, encoding=unicode)
 350         provider = wldoc.provider
 351     else:
 352         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 353
 354     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 355
 356     document = WLDocument.from_string(text,
 357                 parse_dublincore=True, provider=provider)
 358     document.swap_endlines()
 359
 360     for child_uri in document.book_info.parts:
 361         child = load_including_children(provider=provider, uri=child_uri)
 362         document.edoc.getroot().append(child.edoc.getroot())
 363     return document