librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp, NamedTemporaryFile
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 from Texml.processor import process
  17 from lxml import etree
  18 from lxml.etree import XMLSyntaxError, XSLTApplyError
  19
  20 from librarian.dcparser import Person
  21 from librarian.parser import WLDocument
  22 from librarian import ParseError, DCNS, get_resource, OutputFile
  23 from librarian import functions
  24 from librarian.cover import WLCover
  25
  26
  27 functions.reg_substitute_entities()
  28 functions.reg_strip()
  29 functions.reg_starts_white()
  30 functions.reg_ends_white()
  31 functions.reg_texcommand()
  32
  33 STYLESHEETS = {
  34     'wl2tex': 'pdf/wl2tex.xslt',
  35 }
  36
  37 CUSTOMIZATIONS = [
  38     'nofootnotes',
  39     'nothemes',
  40     'onehalfleading',
  41     'doubleleading',
  42     'nowlfont',
  43     ]
  44
  45 def insert_tags(doc, split_re, tagname, exclude=None):
  46     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  47
  48     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  49     >>> insert_tags(t, re.compile('-'), 'd');
  50     >>> print etree.tostring(t)
  51     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  52     """
  53
  54     for elem in doc.iter(tag=etree.Element):
  55         if exclude and elem.tag in exclude:
  56             continue
  57         if elem.text:
  58             chunks = split_re.split(elem.text)
  59             while len(chunks) > 1:
  60                 ins = etree.Element(tagname)
  61                 ins.tail = chunks.pop()
  62                 elem.insert(0, ins)
  63             elem.text = chunks.pop(0)
  64         if elem.tail:
  65             chunks = split_re.split(elem.tail)
  66             parent = elem.getparent()
  67             ins_index = parent.index(elem) + 1
  68             while len(chunks) > 1:
  69                 ins = etree.Element(tagname)
  70                 ins.tail = chunks.pop()
  71                 parent.insert(ins_index, ins)
  72             elem.tail = chunks.pop(0)
  73
  74
  75 def substitute_hyphens(doc):
  76     insert_tags(doc,
  77                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  78                 "dywiz",
  79                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  80                 )
  81
  82
  83 def fix_hanging(doc):
  84     insert_tags(doc,
  85                 re.compile("(?<=\s\w)\s+"),
  86                 "nbsp",
  87                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  88                 )
  89
  90
  91 def move_motifs_inside(doc):
  92     """ moves motifs to be into block elements """
  93     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  94         for motif in master.xpath('motyw'):
  95             for sib in motif.itersiblings():
  96                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  97                     # motif shouldn't have a tail - it would be untagged text
  98                     motif.tail = None
  99                     motif.getparent().remove(motif)
 100                     sib.insert(0, motif)
 101                     break
 102
 103
 104 def hack_motifs(doc):
 105     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 106     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 107
 108     moves motifs in stanzas from first verse to second
 109     and from next to last to last, then inserts negative vspace before them
 110     """
 111     for motif in doc.findall('//strofa//motyw'):
 112         # find relevant verse-level tag
 113         verse, stanza = motif, motif.getparent()
 114         while stanza is not None and stanza.tag != 'strofa':
 115             verse, stanza = stanza, stanza.getparent()
 116         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 117         breaks_after = sum(1 for i in verse.itersiblings('br'))
 118         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 119             move_by = 1
 120             if breaks_after == 2:
 121                 move_by += 1
 122             moved_motif = deepcopy(motif)
 123             motif.tag = 'span'
 124             motif.text = None
 125             moved_motif.tail = None
 126             moved_motif.set('moved', str(move_by))
 127
 128             for br in verse.itersiblings('br'):
 129                 if move_by > 1:
 130                     move_by -= 1
 131                     continue
 132                 br.addnext(moved_motif)
 133                 break
 134
 135
 136 def parse_creator(doc):
 137     """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first """
 138     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 139                     'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
 140                     namespaces = {'dc': str(DCNS)})[::-1]:
 141         if not person.text:
 142             continue
 143         p = Person.from_text(person.text)
 144         person_parsed = deepcopy(person)
 145         person_parsed.tag = person.tag + '_parsed'
 146         person_parsed.set('sortkey', person.text)
 147         person_parsed.text = p.readable()
 148         person.getparent().insert(0, person_parsed)
 149
 150
 151 def get_stylesheet(name):
 152     return get_resource(STYLESHEETS[name])
 153
 154
 155 def package_available(package, args='', verbose=False):
 156     """ check if a verion of a latex package accepting given args is available """
 157     tempdir = mkdtemp('-wl2pdf-test')
 158     fpath = os.path.join(tempdir, 'test.tex')
 159     f = open(fpath, 'w')
 160     f.write(r"""
 161         \documentclass{wl}
 162         \usepackage[%s]{%s}
 163         \begin{document}
 164         \end{document}
 165         """ % (args, package))
 166     f.close()
 167     if verbose:
 168         p = call(['xelatex', '-output-directory', tempdir, fpath])
 169     else:
 170         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 171     shutil.rmtree(tempdir)
 172     return p == 0
 173
 174
 175 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 176               cover=None, flags=None, customizations=None):
 177     """ produces a PDF file with XeLaTeX
 178
 179     wldoc: a WLDocument
 180     verbose: prints all output from LaTeX
 181     save_tex: path to save the intermediary LaTeX file to
 182     morefloats (old/new/none): force specific morefloats
 183     cover: a cover.Cover object or True for default
 184     flags: less-advertising, not-wl, images
 185     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 186     """
 187     # TODO: images
 188
 189     # Parse XSLT
 190     try:
 191         document = load_including_children(wldoc)
 192
 193         if cover:
 194             if cover is True:
 195                 cover = WLCover
 196             document.edoc.getroot().set('data-cover-width', str(cover.width))
 197             document.edoc.getroot().set('data-cover-height', str(cover.height))
 198             if cover.uses_dc_cover:
 199                 if document.book_info.cover_by:
 200                     document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 201                 if document.book_info.cover_source:
 202                     document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 203         if flags:
 204             for flag in flags:
 205                 document.edoc.getroot().set('flag-' + flag, 'yes')
 206
 207         # check for LaTeX packages
 208         if morefloats:
 209             document.edoc.getroot().set('morefloats', morefloats.lower())
 210         elif package_available('morefloats', 'maxfloats=19'):
 211             document.edoc.getroot().set('morefloats', 'new')
 212
 213         # add customizations
 214         if customizations is not None:
 215             document.edoc.getroot().set('customizations', u','.join(customizations))
 216
 217         # hack the tree
 218         move_motifs_inside(document.edoc)
 219         hack_motifs(document.edoc)
 220         parse_creator(document.edoc)
 221         substitute_hyphens(document.edoc)
 222         fix_hanging(document.edoc)
 223
 224         # wl -> TeXML
 225         style_filename = get_stylesheet("wl2tex")
 226         style = etree.parse(style_filename)
 227
 228         texml = document.transform(style)
 229
 230         # TeXML -> LaTeX
 231         temp = mkdtemp('-wl2pdf')
 232
 233         if cover:
 234             c = cover(document.book_info)
 235             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 236                 c.save(f)
 237
 238         del document # no longer needed large object :)
 239
 240         tex_path = os.path.join(temp, 'doc.tex')
 241         fout = open(tex_path, 'w')
 242         process(StringIO(texml), fout, 'utf-8')
 243         fout.close()
 244         del texml
 245
 246         if save_tex:
 247             shutil.copy(tex_path, save_tex)
 248
 249         # LaTeX -> PDF
 250         shutil.copy(get_resource('pdf/wl.cls'), temp)
 251         shutil.copy(get_resource('res/wl-logo.png'), temp)
 252
 253         # FIXME: temporary
 254         shutil.copy(get_resource('res/ofop-logo.png'), temp)
 255         shutil.copy(get_resource('res/logo-fio.jpg'), temp)
 256
 257         cwd = os.getcwd()
 258         os.chdir(temp)
 259
 260         if verbose:
 261             p = call(['xelatex', tex_path])
 262         else:
 263             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 264         if p:
 265             raise ParseError("Error parsing .tex file")
 266
 267         os.chdir(cwd)
 268
 269         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 270         pdf_path = os.path.join(temp, 'doc.pdf')
 271         shutil.move(pdf_path, output_file.name)
 272         shutil.rmtree(temp)
 273         return OutputFile.from_filename(output_file.name)
 274
 275     except (XMLSyntaxError, XSLTApplyError), e:
 276         raise ParseError(e)
 277
 278
 279 def load_including_children(wldoc=None, provider=None, uri=None):
 280     """ Makes one big xml file with children inserted at end.
 281
 282     Either wldoc or provider and URI must be provided.
 283     """
 284
 285     if uri and provider:
 286         f = provider.by_uri(uri)
 287         text = f.read().decode('utf-8')
 288         f.close()
 289     elif wldoc is not None:
 290         text = etree.tostring(wldoc.edoc, encoding=unicode)
 291         provider = wldoc.provider
 292     else:
 293         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 294
 295     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 296
 297     document = WLDocument.from_string(text, parse_dublincore=True)
 298     document.swap_endlines()
 299
 300     for child_uri in document.book_info.parts:
 301         child = load_including_children(provider=provider, uri=child_uri)
 302         document.edoc.getroot().append(child.edoc.getroot())
 303     return document