librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp, NamedTemporaryFile
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 from Texml.processor import process
  17 from lxml import etree
  18 from lxml.etree import XMLSyntaxError, XSLTApplyError
  19
  20 from librarian.dcparser import Person
  21 from librarian.parser import WLDocument
  22 from librarian import ParseError, DCNS, get_resource, OutputFile
  23 from librarian import functions
  24
  25
# Run the registration hooks from librarian.functions once, at import time.
# NOTE(review): presumably these register the extension functions that the
# wl2tex stylesheet calls -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()
  31
  32 STYLESHEETS = {
  33     'wl2tex': 'pdf/wl2tex.xslt',
  34 }
  35
  36 CUSTOMIZATIONS = [
  37     'nofootnotes',
  38     'nothemes',
  39     'onehalfleading',
  40     'doubleleading',
  41     'nowlfont',
  42     ]
  43
  44 def insert_tags(doc, split_re, tagname, exclude=None):
  45     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  46
  47     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  48     >>> insert_tags(t, re.compile('-'), 'd');
  49     >>> print etree.tostring(t)
  50     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  51     """
  52
  53     for elem in doc.iter(tag=etree.Element):
  54         if exclude and elem.tag in exclude:
  55             continue
  56         if elem.text:
  57             chunks = split_re.split(elem.text)
  58             while len(chunks) > 1:
  59                 ins = etree.Element(tagname)
  60                 ins.tail = chunks.pop()
  61                 elem.insert(0, ins)
  62             elem.text = chunks.pop(0)
  63         if elem.tail:
  64             chunks = split_re.split(elem.tail)
  65             parent = elem.getparent()
  66             ins_index = parent.index(elem) + 1
  67             while len(chunks) > 1:
  68                 ins = etree.Element(tagname)
  69                 ins.tail = chunks.pop()
  70                 parent.insert(ins_index, ins)
  71             elem.tail = chunks.pop(0)
  72
  73
  74 def substitute_hyphens(doc):
  75     insert_tags(doc,
  76                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  77                 "dywiz",
  78                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  79                 )
  80
  81
  82 def fix_hanging(doc):
  83     insert_tags(doc,
  84                 re.compile("(?<=\s\w)\s+"),
  85                 "nbsp",
  86                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  87                 )
  88
  89
  90 def move_motifs_inside(doc):
  91     """ moves motifs to be into block elements """
  92     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  93         for motif in master.xpath('motyw'):
  94             for sib in motif.itersiblings():
  95                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  96                     # motif shouldn't have a tail - it would be untagged text
  97                     motif.tail = None
  98                     motif.getparent().remove(motif)
  99                     sib.insert(0, motif)
 100                     break
 101
 102
 103 def hack_motifs(doc):
 104     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 105     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 106
 107     moves motifs in stanzas from first verse to second
 108     and from next to last to last, then inserts negative vspace before them
 109     """
 110     for motif in doc.findall('//strofa//motyw'):
 111         # find relevant verse-level tag
 112         verse, stanza = motif, motif.getparent()
 113         while stanza is not None and stanza.tag != 'strofa':
 114             verse, stanza = stanza, stanza.getparent()
 115         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 116         breaks_after = sum(1 for i in verse.itersiblings('br'))
 117         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 118             move_by = 1
 119             if breaks_after == 2:
 120                 move_by += 1
 121             moved_motif = deepcopy(motif)
 122             motif.tag = 'span'
 123             motif.text = None
 124             moved_motif.tail = None
 125             moved_motif.set('moved', str(move_by))
 126
 127             for br in verse.itersiblings('br'):
 128                 if move_by > 1:
 129                     move_by -= 1
 130                     continue
 131                 br.addnext(moved_motif)
 132                 break
 133
 134
 135 def parse_creator(doc):
 136     """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first """
 137     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 138                     'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
 139                     namespaces = {'dc': str(DCNS)})[::-1]:
 140         if not person.text:
 141             continue
 142         p = Person.from_text(person.text)
 143         person_parsed = deepcopy(person)
 144         person_parsed.tag = person.tag + '_parsed'
 145         person_parsed.set('sortkey', person.text)
 146         person_parsed.text = p.readable()
 147         person.getparent().insert(0, person_parsed)
 148
 149
 150 def get_stylesheet(name):
 151     return get_resource(STYLESHEETS[name])
 152
 153
 154 def package_available(package, args='', verbose=False):
 155     """ check if a verion of a latex package accepting given args is available """
 156     tempdir = mkdtemp('-wl2pdf-test')
 157     fpath = os.path.join(tempdir, 'test.tex')
 158     f = open(fpath, 'w')
 159     f.write(r"""
 160         \documentclass{wl}
 161         \usepackage[%s]{%s}
 162         \begin{document}
 163         \end{document}
 164         """ % (args, package))
 165     f.close()
 166     if verbose:
 167         p = call(['xelatex', '-output-directory', tempdir, fpath])
 168     else:
 169         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 170     shutil.rmtree(tempdir)
 171     return p == 0
 172
 173
 174 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 175               cover=None, flags=None, customizations=None):
 176     """ produces a PDF file with XeLaTeX
 177
 178     wldoc: a WLDocument
 179     verbose: prints all output from LaTeX
 180     save_tex: path to save the intermediary LaTeX file to
 181     morefloats (old/new/none): force specific morefloats
 182     cover: a cover.Cover object
 183     flags: less-advertising,
 184     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 185     """
 186
 187     # Parse XSLT
 188     try:
 189         document = load_including_children(wldoc)
 190
 191         if cover:
 192             document.edoc.getroot().set('data-cover-width', str(cover.width))
 193             document.edoc.getroot().set('data-cover-height', str(cover.height))
 194         if flags:
 195             for flag in flags:
 196                 document.edoc.getroot().set('flag-' + flag, 'yes')
 197
 198         # check for LaTeX packages
 199         if morefloats:
 200             document.edoc.getroot().set('morefloats', morefloats.lower())
 201         elif package_available('morefloats', 'maxfloats=19'):
 202             document.edoc.getroot().set('morefloats', 'new')
 203
 204         # add customizations
 205         if customizations is not None:
 206             document.edoc.getroot().set('customizations', u','.join(customizations))
 207
 208         # hack the tree
 209         move_motifs_inside(document.edoc)
 210         hack_motifs(document.edoc)
 211         parse_creator(document.edoc)
 212         substitute_hyphens(document.edoc)
 213         fix_hanging(document.edoc)
 214
 215         # wl -> TeXML
 216         style_filename = get_stylesheet("wl2tex")
 217         style = etree.parse(style_filename)
 218
 219         texml = document.transform(style)
 220
 221         # TeXML -> LaTeX
 222         temp = mkdtemp('-wl2pdf')
 223
 224         if cover:
 225             c = cover(document.book_info.author.readable(), document.book_info.title)
 226             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 227                 c.save(f)
 228
 229         del document # no longer needed large object :)
 230
 231         tex_path = os.path.join(temp, 'doc.tex')
 232         fout = open(tex_path, 'w')
 233         process(StringIO(texml), fout, 'utf-8')
 234         fout.close()
 235         del texml
 236
 237         if save_tex:
 238             shutil.copy(tex_path, save_tex)
 239
 240         # LaTeX -> PDF
 241         shutil.copy(get_resource('pdf/wl.cls'), temp)
 242         shutil.copy(get_resource('res/wl-logo.png'), temp)
 243
 244         cwd = os.getcwd()
 245         os.chdir(temp)
 246
 247         if verbose:
 248             p = call(['xelatex', tex_path])
 249         else:
 250             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 251         if p:
 252             raise ParseError("Error parsing .tex file")
 253
 254         os.chdir(cwd)
 255
 256         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 257         pdf_path = os.path.join(temp, 'doc.pdf')
 258         shutil.move(pdf_path, output_file.name)
 259         shutil.rmtree(temp)
 260         return OutputFile.from_filename(output_file.name)
 261
 262     except (XMLSyntaxError, XSLTApplyError), e:
 263         raise ParseError(e)
 264
 265
 266 def load_including_children(wldoc=None, provider=None, uri=None):
 267     """ Makes one big xml file with children inserted at end.
 268
 269     Either wldoc or provider and URI must be provided.
 270     """
 271
 272     if uri and provider:
 273         f = provider.by_uri(uri)
 274         text = f.read().decode('utf-8')
 275         f.close()
 276     elif wldoc is not None:
 277         text = etree.tostring(wldoc.edoc, encoding=unicode)
 278         provider = wldoc.provider
 279     else:
 280         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 281
 282     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 283
 284     document = WLDocument.from_string(text, parse_dublincore=True)
 285     document.swap_endlines()
 286
 287     for child_uri in document.book_info.parts:
 288         child = load_including_children(provider=provider, uri=child_uri)
 289         document.edoc.getroot().append(child.edoc.getroot())
 290     return document