librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24
  25 from librarian.dcparser import Person
  26 from librarian.parser import WLDocument
  27 from librarian import ParseError, DCNS, get_resource, IOFile, Format
  28 from librarian import functions
  29
  30
# Run librarian's reg_* helpers once at import time.
# NOTE(review): presumably these register the extension functions that the
# XSLT stylesheets call -- confirm against librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a short stylesheet name to its resource path; get_stylesheet() looks
# names up here.
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  40
  41
  42 def insert_tags(doc, split_re, tagname, exclude=None):
  43     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  44
  45     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
  46     >>> insert_tags(t, re.compile('-'), 'd')
  47     >>> print etree.tostring(t)
  48     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  49     """
  50
  51     for elem in doc.iter(tag=etree.Element):
  52         if exclude and elem.tag in exclude:
  53             continue
  54         if elem.text:
  55             chunks = split_re.split(elem.text)
  56             while len(chunks) > 1:
  57                 ins = etree.Element(tagname)
  58                 ins.tail = chunks.pop()
  59                 elem.insert(0, ins)
  60             elem.text = chunks.pop(0)
  61         if elem.tail:
  62             chunks = split_re.split(elem.tail)
  63             parent = elem.getparent()
  64             ins_index = parent.index(elem) + 1
  65             while len(chunks) > 1:
  66                 ins = etree.Element(tagname)
  67                 ins.tail = chunks.pop()
  68                 parent.insert(ins_index, ins)
  69             elem.tail = chunks.pop(0)
  70
  71
  72 def substitute_hyphens(doc):
  73     insert_tags(
  74         doc, re.compile("(?<=[^-\s])-(?=[^-\s])"), "dywiz",
  75         exclude=[
  76             DCNS("identifier.url"),
  77             DCNS("rights.license"),
  78             DCNS("title"),
  79             DCNS("description"),
  80             DCNS("subject.curriculum"),
  81             'www',
  82         ]
  83     )
  84
  85
  86 def fix_hanging(doc):
  87     insert_tags(
  88         doc, re.compile("(?<=\s\w)\s+"), "nbsp",
  89         exclude=[
  90             DCNS("identifier.url"),
  91             DCNS("rights.license"),
  92             DCNS("title"),
  93             DCNS("description"),
  94             DCNS("subject.curriculum"),
  95         ]
  96     )
  97
  98
  99 def move_motifs_inside(doc):
 100     """ moves motifs to be into block elements """
 101     main_tags = ('powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp',
 102                  'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny')
 103     for master in doc.xpath('|'.join('//' + tag for tag in main_tags)):
 104         for motif in master.xpath('motyw'):
 105             for sib in motif.itersiblings():
 106                 special_tags = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
 107                                 'begin', 'end', 'motyw', 'extra', 'uwaga')
 108                 if sib.tag not in special_tags:
 109                     # motif shouldn't have a tail - it would be untagged text
 110                     motif.tail = None
 111                     motif.getparent().remove(motif)
 112                     sib.insert(0, motif)
 113                     break
 114
 115
 116 def hack_motifs(doc):
 117     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 118     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 119
 120     moves motifs in stanzas from first verse to second
 121     and from next to last to last, then inserts negative vspace before them
 122     """
 123     for motif in doc.findall('//strofa//motyw'):
 124         # find relevant verse-level tag
 125         verse, stanza = motif, motif.getparent()
 126         while stanza is not None and stanza.tag != 'strofa':
 127             verse, stanza = stanza, stanza.getparent()
 128         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 129         breaks_after = sum(1 for i in verse.itersiblings('br'))
 130         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 131             move_by = 1
 132             if breaks_after == 2:
 133                 move_by += 1
 134             moved_motif = deepcopy(motif)
 135             motif.tag = 'span'
 136             motif.text = None
 137             moved_motif.tail = None
 138             moved_motif.set('moved', str(move_by))
 139
 140             for br in verse.itersiblings('br'):
 141                 if move_by > 1:
 142                     move_by -= 1
 143                     continue
 144                 br.addnext(moved_motif)
 145                 break
 146
 147
 148 def parse_creator(doc):
 149     """Generates readable versions of creator and translator tags.
 150
 151     Finds all dc:creator and dc.contributor.translator tags
 152     and adds *_parsed versions with forenames first.
 153     """
 154     persons = doc.xpath(
 155         "|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
 156         namespaces={'dc': str(DCNS)})[::-1]
 157     for person in persons:
 158         if not person.text:
 159             continue
 160         p = Person.from_text(person.text)
 161         person_parsed = deepcopy(person)
 162         person_parsed.tag = person.tag + '_parsed'
 163         person_parsed.set('sortkey', person.text)
 164         person_parsed.text = p.readable()
 165         person.getparent().insert(0, person_parsed)
 166
 167
 168 def get_stylesheet(name):
 169     return get_resource(STYLESHEETS[name])
 170
 171
 172 def package_available(package, args='', verbose=False):
 173     """ check if a verion of a latex package accepting given args is available """
 174     tempdir = mkdtemp('-wl2pdf-test')
 175     fpath = os.path.join(tempdir, 'test.tex')
 176     f = open(fpath, 'w')
 177     f.write(r"""
 178         \documentclass{wl}
 179         \usepackage[%s]{%s}
 180         \begin{document}
 181         \end{document}
 182         """ % (args, package))
 183     f.close()
 184     if verbose:
 185         p = call(['xelatex', '-output-directory', tempdir, fpath])
 186     else:
 187         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 188     shutil.rmtree(tempdir)
 189     return p == 0
 190
 191
 192 # not used
 193 def load_including_children(wldoc=None, provider=None, uri=None):
 194     """ Makes one big xml file with children inserted at end.
 195
 196     Either wldoc or provider and URI must be provided.
 197     """
 198
 199     if uri and provider:
 200         f = provider.by_uri(uri)
 201         # WTF DocProvider.by_uri() returns IOFile, so no .read() there
 202         text = f.read().decode('utf-8')
 203         f.close()
 204     elif wldoc is not None:
 205         text = etree.tostring(wldoc.edoc, encoding=unicode)
 206         provider = wldoc.provider
 207     else:
 208         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 209
 210     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 211
 212     document = WLDocument.from_string(text, parse_dublincore=True, provider=provider)
 213     document.swap_endlines()
 214
 215     for child_uri in document.book_info.parts:
 216         child = load_including_children(provider=provider, uri=child_uri)
 217         document.edoc.getroot().append(child.edoc.getroot())
 218     return document
 219
 220
 221 class PDFFormat(Format):
 222     """ Base PDF format.
 223
 224     Available customization:
 225         nofootnotes: Doesn't do footnotes.
 226         nothemes: Doesn't do themes.
 227         defaultleading: Default leading.
 228         onehalfleading: Bigger leading.
 229         doubleleading: Big leading.
 230         nowlfont: Uses standard TeX font instead of JUnicodeWL.
 231
 232     """
 233
 234     cover_class = None
 235     tex_passes = 1
 236     style = get_resource('pdf/default.sty')
 237     cover = None
 238
 239     @property
 240     def has_cover(self):
 241         """ For use in XSLT. """
 242         return self.cover is not None
 243
 244     @property
 245     def customization_str(self):
 246         """ For use in XSLT. """
 247         return u','.join(k for k, v in self.customization.items() if v)
 248
 249     def get_texml(self):
 250         raise NotImplementedError
 251
 252     def get_tex_dir(self):
 253         texml = self.get_texml()
 254         temp = mkdtemp('-wl2pdf')
 255         # Save TeX file
 256         tex_path = os.path.join(temp, 'doc.tex')
 257         with open(tex_path, 'w') as fout:
 258             process(StringIO(texml), fout, 'utf-8')
 259         if self.save_tex:
 260             shutil.copy(tex_path, self.save_tex)
 261         # Copy style
 262         shutil.copy(get_resource('pdf/wl.cls'), temp)
 263         shutil.copy(self.style, os.path.join(temp, 'style.sty'))
 264         # for sfile in ['wasysym.sty', 'uwasyvar.fd', 'uwasy.fd']:
 265         #     shutil.copy(get_resource(os.path.join('res/wasysym', sfile)), temp)
 266
 267         # Save attachments
 268         if self.cover:
 269             self.cover.for_pdf().dump_to(os.path.join(temp, 'makecover.sty'))
 270         return temp
 271
 272     def get_pdf(self):
 273         temp = self.get_tex_dir()
 274         tex_path = os.path.join(temp, 'doc.tex')
 275         try:
 276             cwd = os.getcwd()
 277         except OSError:
 278             cwd = None
 279         os.chdir(temp)
 280
 281         p = None
 282         if self.verbose:
 283             for i in xrange(self.tex_passes):
 284                 p = call(['xelatex', tex_path])
 285         else:
 286             for i in xrange(self.tex_passes):
 287                 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 288         if p:
 289             raise ParseError("Error parsing .tex file: %s" % tex_path)
 290
 291         if cwd is not None:
 292             os.chdir(cwd)
 293
 294         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 295         pdf_path = os.path.join(temp, 'doc.pdf')
 296         shutil.move(pdf_path, output_file.name)
 297         shutil.rmtree(temp)
 298         return IOFile.from_filename(output_file.name)
 299
 300     def build(self, verbose=False, save_tex=None, morefloats=None):
 301         """ morefloats: new/old/none
 302         """
 303         self.verbose = verbose
 304         self.save_tex = save_tex
 305
 306         if morefloats is None and package_available('morefloats', 'maxfloats=19'):
 307             morefloats = 'new'
 308         self.morefloats = morefloats
 309
 310         book_info = self.wldoc.book_info
 311         if self.cover_class:
 312             self.cover = self.cover_class(book_info)
 313
 314         return self.get_pdf()