librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24
  25 from librarian.dcparser import Person
  26 from librarian.parser import WLDocument
  27 from librarian import ParseError, DCNS, get_resource, IOFile, Format
  28 from librarian import functions
  29
  30
# Run the registration hooks exposed by librarian.functions once, at import
# time of this module.
# NOTE(review): presumably these make helper functions available to the XSLT
# stylesheets used below -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a short stylesheet name to its resource path; get_stylesheet() looks
# names up here and resolves the path with get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  40
  41
  42 def insert_tags(doc, split_re, tagname, exclude=None):
  43     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  44
  45     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
  46     >>> insert_tags(t, re.compile('-'), 'd')
  47     >>> print etree.tostring(t)
  48     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  49     """
  50
  51     for elem in doc.iter(tag=etree.Element):
  52         if exclude and elem.tag in exclude:
  53             continue
  54         if elem.text:
  55             chunks = split_re.split(elem.text)
  56             while len(chunks) > 1:
  57                 ins = etree.Element(tagname)
  58                 ins.tail = chunks.pop()
  59                 elem.insert(0, ins)
  60             elem.text = chunks.pop(0)
  61         if elem.tail:
  62             chunks = split_re.split(elem.tail)
  63             parent = elem.getparent()
  64             ins_index = parent.index(elem) + 1
  65             while len(chunks) > 1:
  66                 ins = etree.Element(tagname)
  67                 ins.tail = chunks.pop()
  68                 parent.insert(ins_index, ins)
  69             elem.tail = chunks.pop(0)
  70
  71
  72 def substitute_hyphens(doc):
  73     insert_tags(
  74         doc, re.compile("(?<=[^-\s])-(?=[^-\s])"), "dywiz",
  75         exclude=[
  76             DCNS("identifier.url"),
  77             DCNS("rights.license"),
  78             DCNS("title"),
  79             DCNS("description"),
  80             DCNS("subject.curriculum"),
  81             DCNS("subject.curriculum.new"),
  82             'www',
  83         ]
  84     )
  85
  86
  87 def fix_hanging(doc):
  88     insert_tags(
  89         doc, re.compile("(?<=\s\w)\s+"), "nbsp",
  90         exclude=[
  91             DCNS("identifier.url"),
  92             DCNS("rights.license"),
  93             DCNS("title"),
  94             DCNS("description"),
  95             DCNS("subject.curriculum"),
  96             DCNS("subject.curriculum.new"),
  97         ]
  98     )
  99
 100
 101 def move_motifs_inside(doc):
 102     """ moves motifs to be into block elements """
 103     main_tags = ('powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp',
 104                  'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny')
 105     for master in doc.xpath('|'.join('//' + tag for tag in main_tags)):
 106         for motif in master.xpath('motyw'):
 107             for sib in motif.itersiblings():
 108                 special_tags = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
 109                                 'begin', 'end', 'motyw', 'extra', 'uwaga')
 110                 if sib.tag not in special_tags:
 111                     # motif shouldn't have a tail - it would be untagged text
 112                     motif.tail = None
 113                     motif.getparent().remove(motif)
 114                     sib.insert(0, motif)
 115                     break
 116
 117
 118 def hack_motifs(doc):
 119     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 120     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 121
 122     moves motifs in stanzas from first verse to second
 123     and from next to last to last, then inserts negative vspace before them
 124     """
 125     for motif in doc.findall('//strofa//motyw'):
 126         # find relevant verse-level tag
 127         verse, stanza = motif, motif.getparent()
 128         while stanza is not None and stanza.tag != 'strofa':
 129             verse, stanza = stanza, stanza.getparent()
 130         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 131         breaks_after = sum(1 for i in verse.itersiblings('br'))
 132         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 133             move_by = 1
 134             if breaks_after == 2:
 135                 move_by += 1
 136             moved_motif = deepcopy(motif)
 137             motif.tag = 'span'
 138             motif.text = None
 139             moved_motif.tail = None
 140             moved_motif.set('moved', str(move_by))
 141
 142             for br in verse.itersiblings('br'):
 143                 if move_by > 1:
 144                     move_by -= 1
 145                     continue
 146                 br.addnext(moved_motif)
 147                 break
 148
 149
 150 def parse_creator(doc):
 151     """Generates readable versions of creator and translator tags.
 152
 153     Finds all dc:creator and dc.contributor.translator tags
 154     and adds *_parsed versions with forenames first.
 155     """
 156     persons = doc.xpath(
 157         "|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
 158         namespaces={'dc': str(DCNS)})[::-1]
 159     for person in persons:
 160         if not person.text:
 161             continue
 162         p = Person.from_text(person.text)
 163         person_parsed = deepcopy(person)
 164         person_parsed.tag = person.tag + '_parsed'
 165         person_parsed.set('sortkey', person.text)
 166         person_parsed.text = p.readable()
 167         person.getparent().insert(0, person_parsed)
 168
 169
 170 def get_stylesheet(name):
 171     return get_resource(STYLESHEETS[name])
 172
 173
 174 def package_available(package, args='', verbose=False):
 175     """ check if a verion of a latex package accepting given args is available """
 176     tempdir = mkdtemp('-wl2pdf-test')
 177     fpath = os.path.join(tempdir, 'test.tex')
 178     f = open(fpath, 'w')
 179     f.write(r"""
 180         \documentclass{wl}
 181         \usepackage[%s]{%s}
 182         \begin{document}
 183         \end{document}
 184         """ % (args, package))
 185     f.close()
 186     if verbose:
 187         p = call(['xelatex', '-output-directory', tempdir, fpath])
 188     else:
 189         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 190     shutil.rmtree(tempdir)
 191     return p == 0
 192
 193
 194 # not used
 195 def load_including_children(wldoc=None, provider=None, uri=None):
 196     """ Makes one big xml file with children inserted at end.
 197
 198     Either wldoc or provider and URI must be provided.
 199     """
 200
 201     if uri and provider:
 202         f = provider.by_uri(uri)
 203         # WTF DocProvider.by_uri() returns IOFile, so no .read() there
 204         text = f.read().decode('utf-8')
 205         f.close()
 206     elif wldoc is not None:
 207         text = etree.tostring(wldoc.edoc, encoding=unicode)
 208         provider = wldoc.provider
 209     else:
 210         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 211
 212     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 213
 214     document = WLDocument.from_string(text, parse_dublincore=True, provider=provider)
 215     document.swap_endlines()
 216
 217     for child_uri in document.book_info.parts:
 218         child = load_including_children(provider=provider, uri=child_uri)
 219         document.edoc.getroot().append(child.edoc.getroot())
 220     return document
 221
 222
 223 class PDFFormat(Format):
 224     """ Base PDF format.
 225
 226     Available customization:
 227         nofootnotes: Doesn't do footnotes.
 228         nothemes: Doesn't do themes.
 229         defaultleading: Default leading.
 230         onehalfleading: Bigger leading.
 231         doubleleading: Big leading.
 232         nowlfont: Uses standard TeX font instead of JUnicodeWL.
 233
 234     """
 235
 236     cover_class = None
 237     tex_passes = 1
 238     style = get_resource('pdf/default.sty')
 239     cover = None
 240
 241     @property
 242     def has_cover(self):
 243         """ For use in XSLT. """
 244         return self.cover is not None
 245
 246     @property
 247     def customization_str(self):
 248         """ For use in XSLT. """
 249         return u','.join(k for k, v in self.customization.items() if v)
 250
 251     def get_texml(self):
 252         raise NotImplementedError
 253
 254     def get_tex_dir(self):
 255         texml = self.get_texml()
 256         temp = mkdtemp('-wl2pdf')
 257         # Save TeX file
 258         tex_path = os.path.join(temp, 'doc.tex')
 259         with open(tex_path, 'w') as fout:
 260             process(StringIO(texml), fout, 'utf-8')
 261         if self.save_tex:
 262             shutil.copy(tex_path, self.save_tex)
 263         # Copy style
 264         shutil.copy(get_resource('pdf/wl.cls'), temp)
 265         shutil.copy(self.style, os.path.join(temp, 'style.sty'))
 266         # for sfile in ['wasysym.sty', 'uwasyvar.fd', 'uwasy.fd']:
 267         #     shutil.copy(get_resource(os.path.join('res/wasysym', sfile)), temp)
 268
 269         # Save attachments
 270         if self.cover:
 271             self.cover.for_pdf().dump_to(os.path.join(temp, 'makecover.sty'))
 272         return temp
 273
 274     def get_pdf(self):
 275         temp = self.get_tex_dir()
 276         tex_path = os.path.join(temp, 'doc.tex')
 277         try:
 278             cwd = os.getcwd()
 279         except OSError:
 280             cwd = None
 281         os.chdir(temp)
 282
 283         p = None
 284         if self.verbose:
 285             for i in xrange(self.tex_passes):
 286                 p = call(['xelatex', tex_path])
 287         else:
 288             for i in xrange(self.tex_passes):
 289                 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 290         if p:
 291             raise ParseError("Error parsing .tex file: %s" % tex_path)
 292
 293         if cwd is not None:
 294             os.chdir(cwd)
 295
 296         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 297         pdf_path = os.path.join(temp, 'doc.pdf')
 298         shutil.move(pdf_path, output_file.name)
 299         shutil.rmtree(temp)
 300         return IOFile.from_filename(output_file.name)
 301
 302     def build(self, verbose=False, save_tex=None, morefloats=None):
 303         """ morefloats: new/old/none
 304         """
 305         self.verbose = verbose
 306         self.save_tex = save_tex
 307
 308         if morefloats is None and package_available('morefloats', 'maxfloats=19'):
 309             morefloats = 'new'
 310         self.morefloats = morefloats
 311
 312         book_info = self.wldoc.book_info
 313         if self.cover_class:
 314             self.cover = self.cover_class(book_info)
 315
 316         return self.get_pdf()