librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp, NamedTemporaryFile
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 from Texml.processor import process
  17 from lxml import etree
  18 from lxml.etree import XMLSyntaxError, XSLTApplyError
  19
  20 from librarian.dcparser import Person
  21 from librarian.parser import WLDocument
  22 from librarian import ParseError, DCNS, get_resource, OutputFile
  23 from librarian import functions
  24 from librarian.cover import WLCover
  25
  26
# Import-time side effect: call the reg_* hooks of librarian.functions once.
# NOTE(review): these presumably register the extension functions that the
# wl2tex stylesheet relies on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()
  32
  33 STYLESHEETS = {
  34     'wl2tex': 'pdf/wl2tex.xslt',
  35 }
  36
  37 CUSTOMIZATIONS = [
  38     'nofootnotes',
  39     'nothemes',
  40     'onehalfleading',
  41     'doubleleading',
  42     'nowlfont',
  43     ]
  44
  45 def insert_tags(doc, split_re, tagname, exclude=None):
  46     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  47
  48     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  49     >>> insert_tags(t, re.compile('-'), 'd');
  50     >>> print etree.tostring(t)
  51     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  52     """
  53
  54     for elem in doc.iter(tag=etree.Element):
  55         if exclude and elem.tag in exclude:
  56             continue
  57         if elem.text:
  58             chunks = split_re.split(elem.text)
  59             while len(chunks) > 1:
  60                 ins = etree.Element(tagname)
  61                 ins.tail = chunks.pop()
  62                 elem.insert(0, ins)
  63             elem.text = chunks.pop(0)
  64         if elem.tail:
  65             chunks = split_re.split(elem.tail)
  66             parent = elem.getparent()
  67             ins_index = parent.index(elem) + 1
  68             while len(chunks) > 1:
  69                 ins = etree.Element(tagname)
  70                 ins.tail = chunks.pop()
  71                 parent.insert(ins_index, ins)
  72             elem.tail = chunks.pop(0)
  73
  74
  75 def substitute_hyphens(doc):
  76     insert_tags(doc,
  77                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  78                 "dywiz",
  79                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  80                 )
  81
  82
  83 def fix_hanging(doc):
  84     insert_tags(doc,
  85                 re.compile("(?<=\s\w)\s+"),
  86                 "nbsp",
  87                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  88                 )
  89
  90
  91 def move_motifs_inside(doc):
  92     """ moves motifs to be into block elements """
  93     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  94         for motif in master.xpath('motyw'):
  95             for sib in motif.itersiblings():
  96                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  97                     # motif shouldn't have a tail - it would be untagged text
  98                     motif.tail = None
  99                     motif.getparent().remove(motif)
 100                     sib.insert(0, motif)
 101                     break
 102
 103
 104 def hack_motifs(doc):
 105     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 106     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 107
 108     moves motifs in stanzas from first verse to second
 109     and from next to last to last, then inserts negative vspace before them
 110     """
 111     for motif in doc.findall('//strofa//motyw'):
 112         # find relevant verse-level tag
 113         verse, stanza = motif, motif.getparent()
 114         while stanza is not None and stanza.tag != 'strofa':
 115             verse, stanza = stanza, stanza.getparent()
 116         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 117         breaks_after = sum(1 for i in verse.itersiblings('br'))
 118         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 119             move_by = 1
 120             if breaks_after == 2:
 121                 move_by += 1
 122             moved_motif = deepcopy(motif)
 123             motif.tag = 'span'
 124             motif.text = None
 125             moved_motif.tail = None
 126             moved_motif.set('moved', str(move_by))
 127
 128             for br in verse.itersiblings('br'):
 129                 if move_by > 1:
 130                     move_by -= 1
 131                     continue
 132                 br.addnext(moved_motif)
 133                 break
 134
 135
 136 def parse_creator(doc):
 137     """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first """
 138     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 139                     'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
 140                     namespaces = {'dc': str(DCNS)})[::-1]:
 141         if not person.text:
 142             continue
 143         p = Person.from_text(person.text)
 144         person_parsed = deepcopy(person)
 145         person_parsed.tag = person.tag + '_parsed'
 146         person_parsed.set('sortkey', person.text)
 147         person_parsed.text = p.readable()
 148         person.getparent().insert(0, person_parsed)
 149
 150
 151 def get_stylesheet(name):
 152     return get_resource(STYLESHEETS[name])
 153
 154
 155 def package_available(package, args='', verbose=False):
 156     """ check if a verion of a latex package accepting given args is available """
 157     tempdir = mkdtemp('-wl2pdf-test')
 158     fpath = os.path.join(tempdir, 'test.tex')
 159     f = open(fpath, 'w')
 160     f.write(r"""
 161         \documentclass{wl}
 162         \usepackage[%s]{%s}
 163         \begin{document}
 164         \end{document}
 165         """ % (args, package))
 166     f.close()
 167     if verbose:
 168         p = call(['xelatex', '-output-directory', tempdir, fpath])
 169     else:
 170         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 171     shutil.rmtree(tempdir)
 172     return p == 0
 173
 174
 175 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 176               cover=None, flags=None, customizations=None):
 177     """ produces a PDF file with XeLaTeX
 178
 179     wldoc: a WLDocument
 180     verbose: prints all output from LaTeX
 181     save_tex: path to save the intermediary LaTeX file to
 182     morefloats (old/new/none): force specific morefloats
 183     cover: a cover.Cover object
 184     flags: less-advertising,
 185     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 186     """
 187
 188     # Parse XSLT
 189     try:
 190         document = load_including_children(wldoc)
 191
 192         if cover:
 193             if cover is True:
 194                 cover = WLCover
 195             document.edoc.getroot().set('data-cover-width', str(cover.width))
 196             document.edoc.getroot().set('data-cover-height', str(cover.height))
 197             if cover.uses_dc_cover:
 198                 if document.book_info.cover_by:
 199                     document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
 200                 if document.book_info.cover_source:
 201                     document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
 202         if flags:
 203             for flag in flags:
 204                 document.edoc.getroot().set('flag-' + flag, 'yes')
 205
 206         # check for LaTeX packages
 207         if morefloats:
 208             document.edoc.getroot().set('morefloats', morefloats.lower())
 209         elif package_available('morefloats', 'maxfloats=19'):
 210             document.edoc.getroot().set('morefloats', 'new')
 211
 212         # add customizations
 213         if customizations is not None:
 214             document.edoc.getroot().set('customizations', u','.join(customizations))
 215
 216         # hack the tree
 217         move_motifs_inside(document.edoc)
 218         hack_motifs(document.edoc)
 219         parse_creator(document.edoc)
 220         substitute_hyphens(document.edoc)
 221         fix_hanging(document.edoc)
 222
 223         # wl -> TeXML
 224         style_filename = get_stylesheet("wl2tex")
 225         style = etree.parse(style_filename)
 226
 227         texml = document.transform(style)
 228
 229         # TeXML -> LaTeX
 230         temp = mkdtemp('-wl2pdf')
 231
 232         if cover:
 233             c = cover(document.book_info)
 234             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 235                 c.save(f)
 236
 237         del document # no longer needed large object :)
 238
 239         tex_path = os.path.join(temp, 'doc.tex')
 240         fout = open(tex_path, 'w')
 241         process(StringIO(texml), fout, 'utf-8')
 242         fout.close()
 243         del texml
 244
 245         if save_tex:
 246             shutil.copy(tex_path, save_tex)
 247
 248         # LaTeX -> PDF
 249         shutil.copy(get_resource('pdf/wl.cls'), temp)
 250         shutil.copy(get_resource('res/wl-logo.png'), temp)
 251
 252         cwd = os.getcwd()
 253         os.chdir(temp)
 254
 255         if verbose:
 256             p = call(['xelatex', tex_path])
 257         else:
 258             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 259         if p:
 260             raise ParseError("Error parsing .tex file")
 261
 262         os.chdir(cwd)
 263
 264         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 265         pdf_path = os.path.join(temp, 'doc.pdf')
 266         shutil.move(pdf_path, output_file.name)
 267         shutil.rmtree(temp)
 268         return OutputFile.from_filename(output_file.name)
 269
 270     except (XMLSyntaxError, XSLTApplyError), e:
 271         raise ParseError(e)
 272
 273
 274 def load_including_children(wldoc=None, provider=None, uri=None):
 275     """ Makes one big xml file with children inserted at end.
 276
 277     Either wldoc or provider and URI must be provided.
 278     """
 279
 280     if uri and provider:
 281         f = provider.by_uri(uri)
 282         text = f.read().decode('utf-8')
 283         f.close()
 284     elif wldoc is not None:
 285         text = etree.tostring(wldoc.edoc, encoding=unicode)
 286         provider = wldoc.provider
 287     else:
 288         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 289
 290     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 291
 292     document = WLDocument.from_string(text, parse_dublincore=True)
 293     document.swap_endlines()
 294
 295     for child_uri in document.book_info.parts:
 296         child = load_including_children(provider=provider, uri=child_uri)
 297         document.edoc.getroot().append(child.edoc.getroot())
 298     return document