librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24 from lxml.etree import XMLSyntaxError, XSLTApplyError
  25
  26 from librarian.dcparser import Person
  27 from librarian.parser import WLDocument
  28 from librarian import ParseError, DCNS, get_resource, IOFile, Format
  29 from librarian import functions
  30
  31
# Import-time side effect: run the registration hooks from librarian.functions.
# NOTE(review): judging by the names, these presumably register extension
# functions used by the XSLT stylesheet -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a logical stylesheet name to its resource path; get_stylesheet()
# looks names up here and resolves the path with get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  41
  42 def insert_tags(doc, split_re, tagname, exclude=None):
  43     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  44
  45     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  46     >>> insert_tags(t, re.compile('-'), 'd');
  47     >>> print etree.tostring(t)
  48     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  49     """
  50
  51     for elem in doc.iter(tag=etree.Element):
  52         if exclude and elem.tag in exclude:
  53             continue
  54         if elem.text:
  55             chunks = split_re.split(elem.text)
  56             while len(chunks) > 1:
  57                 ins = etree.Element(tagname)
  58                 ins.tail = chunks.pop()
  59                 elem.insert(0, ins)
  60             elem.text = chunks.pop(0)
  61         if elem.tail:
  62             chunks = split_re.split(elem.tail)
  63             parent = elem.getparent()
  64             ins_index = parent.index(elem) + 1
  65             while len(chunks) > 1:
  66                 ins = etree.Element(tagname)
  67                 ins.tail = chunks.pop()
  68                 parent.insert(ins_index, ins)
  69             elem.tail = chunks.pop(0)
  70
  71
  72 def substitute_hyphens(doc):
  73     insert_tags(doc,
  74                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  75                 "dywiz",
  76                 exclude=[DCNS("identifier.url"), DCNS("rights.license"), 'www']
  77                 )
  78
  79
  80 def fix_hanging(doc):
  81     insert_tags(doc,
  82                 re.compile("(?<=\s\w)\s+"),
  83                 "nbsp",
  84                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  85                 )
  86
  87
  88 def move_motifs_inside(doc):
  89     """ moves motifs to be into block elements """
  90     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  91         for motif in master.xpath('motyw'):
  92             for sib in motif.itersiblings():
  93                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  94                     # motif shouldn't have a tail - it would be untagged text
  95                     motif.tail = None
  96                     motif.getparent().remove(motif)
  97                     sib.insert(0, motif)
  98                     break
  99
 100
 101 def hack_motifs(doc):
 102     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 103     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 104
 105     moves motifs in stanzas from first verse to second
 106     and from next to last to last, then inserts negative vspace before them
 107     """
 108     for motif in doc.findall('//strofa//motyw'):
 109         # find relevant verse-level tag
 110         verse, stanza = motif, motif.getparent()
 111         while stanza is not None and stanza.tag != 'strofa':
 112             verse, stanza = stanza, stanza.getparent()
 113         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 114         breaks_after = sum(1 for i in verse.itersiblings('br'))
 115         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 116             move_by = 1
 117             if breaks_after == 2:
 118                 move_by += 1
 119             moved_motif = deepcopy(motif)
 120             motif.tag = 'span'
 121             motif.text = None
 122             moved_motif.tail = None
 123             moved_motif.set('moved', str(move_by))
 124
 125             for br in verse.itersiblings('br'):
 126                 if move_by > 1:
 127                     move_by -= 1
 128                     continue
 129                 br.addnext(moved_motif)
 130                 break
 131
 132
 133 def parse_creator(doc):
 134     """Generates readable versions of creator and translator tags.
 135
 136     Finds all dc:creator and dc.contributor.translator tags
 137     and adds *_parsed versions with forenames first.
 138     """
 139     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 140                     'creator', 'contributor.translator')),
 141                     namespaces = {'dc': str(DCNS)})[::-1]:
 142         if not person.text:
 143             continue
 144         p = Person.from_text(person.text)
 145         person_parsed = deepcopy(person)
 146         person_parsed.tag = person.tag + '_parsed'
 147         person_parsed.set('sortkey', person.text)
 148         person_parsed.text = p.readable()
 149         person.getparent().insert(0, person_parsed)
 150
 151
 152 def get_stylesheet(name):
 153     return get_resource(STYLESHEETS[name])
 154
 155
 156 def package_available(package, args='', verbose=False):
 157     """ check if a verion of a latex package accepting given args is available """
 158     tempdir = mkdtemp('-wl2pdf-test')
 159     fpath = os.path.join(tempdir, 'test.tex')
 160     f = open(fpath, 'w')
 161     f.write(r"""
 162         \documentclass{wl}
 163         \usepackage[%s]{%s}
 164         \begin{document}
 165         \end{document}
 166         """ % (args, package))
 167     f.close()
 168     if verbose:
 169         p = call(['xelatex', '-output-directory', tempdir, fpath])
 170     else:
 171         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 172     shutil.rmtree(tempdir)
 173     return p == 0
 174
 175
 176 def load_including_children(wldoc=None, provider=None, uri=None):
 177     """ Makes one big xml file with children inserted at end.
 178
 179     Either wldoc or provider and URI must be provided.
 180     """
 181
 182     if uri and provider:
 183         f = provider.by_uri(uri)
 184         text = f.read().decode('utf-8')
 185         f.close()
 186     elif wldoc is not None:
 187         text = etree.tostring(wldoc.edoc, encoding=unicode)
 188         provider = wldoc.provider
 189     else:
 190         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 191
 192     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 193
 194     document = WLDocument.from_string(text,
 195                 parse_dublincore=True, provider=provider)
 196     document.swap_endlines()
 197
 198     for child_uri in document.book_info.parts:
 199         child = load_including_children(provider=provider, uri=child_uri)
 200         document.edoc.getroot().append(child.edoc.getroot())
 201     return document
 202
 203
 204 class PDFFormat(Format):
 205     """ Base PDF format.
 206
 207     Available customization:
 208         nofootnotes: Doesn't do footnotes.
 209         nothemes: Doesn't do themes.
 210         defaultleading: Default leading.
 211         onehalfleading: Bigger leading.
 212         doubleleading: Big leading.
 213         nowlfont: Uses standard TeX font instead of JUnicodeWL.
 214
 215     """
 216
 217     cover_class = None
 218     tex_passes = 1
 219     style = get_resource('pdf/default.sty')
 220     cover = None
 221
 222     @property
 223     def has_cover(self):
 224         """ For use in XSLT. """
 225         return self.cover is not None
 226
 227     @property
 228     def customization_str(self):
 229         """ For use in XSLT. """
 230         return u','.join(k for k, v in self.customization.items() if v)
 231
 232     def get_document(self):
 233         document = load_including_children(self.wldoc)
 234         root = document.edoc.getroot()
 235         root.set('editors', u', '.join(sorted(
 236             editor.readable() for editor in document.editors())))
 237
 238         # hack the tree
 239         move_motifs_inside(document.edoc)
 240         hack_motifs(document.edoc)
 241         parse_creator(document.edoc)
 242         substitute_hyphens(document.edoc)
 243         fix_hanging(document.edoc)
 244         return document
 245
 246     def get_texml(self):
 247         style_filename = get_stylesheet("wl2tex")
 248         functions.reg_get(self)
 249         try:
 250             style = etree.parse(style_filename)
 251             texml = self.get_document().transform(style)
 252             return texml
 253         except (XMLSyntaxError, XSLTApplyError), e:
 254             raise ParseError(e)
 255
 256     def get_tex_dir(self):
 257         texml = self.get_texml()
 258         temp = mkdtemp('-wl2pdf')
 259         # Save TeX file
 260         tex_path = os.path.join(temp, 'doc.tex')
 261         with open(tex_path, 'w') as fout:
 262             process(StringIO(texml), fout, 'utf-8')
 263         if self.save_tex:
 264             shutil.copy(tex_path, self.save_tex)
 265         # Copy style
 266         shutil.copy(get_resource('pdf/wl.cls'), temp)
 267         shutil.copy(self.style, os.path.join(temp, 'style.sty'))
 268         #for sfile in ['wasysym.sty', 'uwasyvar.fd', 'uwasy.fd']:
 269         #    shutil.copy(get_resource(os.path.join('res/wasysym', sfile)), temp)
 270
 271         # Save attachments
 272         if self.cover:
 273             self.cover.for_pdf().dump_to(os.path.join(temp, 'makecover.sty'))
 274         return temp
 275
 276     def get_pdf(self):
 277         temp = self.get_tex_dir()
 278         tex_path = os.path.join(temp, 'doc.tex')
 279         try:
 280             cwd = os.getcwd()
 281         except OSError:
 282             cwd = None
 283         os.chdir(temp)
 284
 285         if self.verbose:
 286             for i in range(self.tex_passes):
 287                 p = call(['xelatex', tex_path])
 288         else:
 289             for i in range(self.tex_passes):
 290                 p = call(['xelatex', '-interaction=batchmode', tex_path],
 291                             stdout=PIPE, stderr=PIPE)
 292         if p:
 293             raise ParseError("Error parsing .tex file")
 294
 295         if cwd is not None:
 296             os.chdir(cwd)
 297
 298         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 299         pdf_path = os.path.join(temp, 'doc.pdf')
 300         shutil.move(pdf_path, output_file.name)
 301         shutil.rmtree(temp)
 302         return IOFile.from_filename(output_file.name)
 303
 304     def build(self, verbose=False, save_tex=None, morefloats=None):
 305         """ morefloats: new/old/none
 306         """
 307         self.verbose = verbose
 308         self.save_tex = save_tex
 309
 310         if morefloats is None and package_available('morefloats', 'maxfloats=19'):
 311             morefloats = 'new'
 312         self.morefloats = morefloats
 313
 314         book_info = self.wldoc.book_info
 315         if self.cover_class:
 316             self.cover = self.cover_class(book_info)
 317
 318         return self.get_pdf()