librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24
  25 from librarian.dcparser import Person
  26 from librarian.parser import WLDocument
  27 from librarian import ParseError, DCNS, get_resource, IOFile, Format
  28 from librarian import functions
  29
  30
# Call each reg_* hook from librarian.functions once, at import time.
# NOTE(review): presumably these register extension functions used by the
# wl2tex stylesheet -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Logical stylesheet name -> resource path; get_stylesheet() looks a name up
# here and passes the path to get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  40
  41
  42 def insert_tags(doc, split_re, tagname, exclude=None):
  43     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  44
  45     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
  46     >>> insert_tags(t, re.compile('-'), 'd')
  47     >>> print etree.tostring(t)
  48     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  49     """
  50
  51     for elem in doc.iter(tag=etree.Element):
  52         if exclude and elem.tag in exclude:
  53             continue
  54         if elem.text:
  55             chunks = split_re.split(elem.text)
  56             while len(chunks) > 1:
  57                 ins = etree.Element(tagname)
  58                 ins.tail = chunks.pop()
  59                 elem.insert(0, ins)
  60             elem.text = chunks.pop(0)
  61         if elem.tail:
  62             chunks = split_re.split(elem.tail)
  63             parent = elem.getparent()
  64             ins_index = parent.index(elem) + 1
  65             while len(chunks) > 1:
  66                 ins = etree.Element(tagname)
  67                 ins.tail = chunks.pop()
  68                 parent.insert(ins_index, ins)
  69             elem.tail = chunks.pop(0)
  70
  71
  72 def substitute_hyphens(doc):
  73     insert_tags(doc,
  74                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  75                 "dywiz",
  76                 exclude=[DCNS("identifier.url"), DCNS("rights.license"), 'www']
  77                 )
  78
  79
  80 def fix_hanging(doc):
  81     insert_tags(doc,
  82                 re.compile("(?<=\s\w)\s+"),
  83                 "nbsp",
  84                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  85                 )
  86
  87
  88 def move_motifs_inside(doc):
  89     """ moves motifs to be into block elements """
  90     main_tags = ('powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp',
  91                  'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny')
  92     for master in doc.xpath('|'.join('//' + tag for tag in main_tags)):
  93         for motif in master.xpath('motyw'):
  94             for sib in motif.itersiblings():
  95                 special_tags = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
  96                                 'begin', 'end', 'motyw', 'extra', 'uwaga')
  97                 if sib.tag not in special_tags:
  98                     # motif shouldn't have a tail - it would be untagged text
  99                     motif.tail = None
 100                     motif.getparent().remove(motif)
 101                     sib.insert(0, motif)
 102                     break
 103
 104
 105 def hack_motifs(doc):
 106     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 107     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 108
 109     moves motifs in stanzas from first verse to second
 110     and from next to last to last, then inserts negative vspace before them
 111     """
 112     for motif in doc.findall('//strofa//motyw'):
 113         # find relevant verse-level tag
 114         verse, stanza = motif, motif.getparent()
 115         while stanza is not None and stanza.tag != 'strofa':
 116             verse, stanza = stanza, stanza.getparent()
 117         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 118         breaks_after = sum(1 for i in verse.itersiblings('br'))
 119         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 120             move_by = 1
 121             if breaks_after == 2:
 122                 move_by += 1
 123             moved_motif = deepcopy(motif)
 124             motif.tag = 'span'
 125             motif.text = None
 126             moved_motif.tail = None
 127             moved_motif.set('moved', str(move_by))
 128
 129             for br in verse.itersiblings('br'):
 130                 if move_by > 1:
 131                     move_by -= 1
 132                     continue
 133                 br.addnext(moved_motif)
 134                 break
 135
 136
 137 def parse_creator(doc):
 138     """Generates readable versions of creator and translator tags.
 139
 140     Finds all dc:creator and dc.contributor.translator tags
 141     and adds *_parsed versions with forenames first.
 142     """
 143     persons = doc.xpath(
 144         "|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
 145         namespaces={'dc': str(DCNS)})[::-1]
 146     for person in persons:
 147         if not person.text:
 148             continue
 149         p = Person.from_text(person.text)
 150         person_parsed = deepcopy(person)
 151         person_parsed.tag = person.tag + '_parsed'
 152         person_parsed.set('sortkey', person.text)
 153         person_parsed.text = p.readable()
 154         person.getparent().insert(0, person_parsed)
 155
 156
 157 def get_stylesheet(name):
 158     return get_resource(STYLESHEETS[name])
 159
 160
 161 def package_available(package, args='', verbose=False):
 162     """ check if a verion of a latex package accepting given args is available """
 163     tempdir = mkdtemp('-wl2pdf-test')
 164     fpath = os.path.join(tempdir, 'test.tex')
 165     f = open(fpath, 'w')
 166     f.write(r"""
 167         \documentclass{wl}
 168         \usepackage[%s]{%s}
 169         \begin{document}
 170         \end{document}
 171         """ % (args, package))
 172     f.close()
 173     if verbose:
 174         p = call(['xelatex', '-output-directory', tempdir, fpath])
 175     else:
 176         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 177     shutil.rmtree(tempdir)
 178     return p == 0
 179
 180
 181 # not used
 182 def load_including_children(wldoc=None, provider=None, uri=None):
 183     """ Makes one big xml file with children inserted at end.
 184
 185     Either wldoc or provider and URI must be provided.
 186     """
 187
 188     if uri and provider:
 189         f = provider.by_uri(uri)
 190         # WTF DocProvider.by_uri() returns IOFile, so no .read() there
 191         text = f.read().decode('utf-8')
 192         f.close()
 193     elif wldoc is not None:
 194         text = etree.tostring(wldoc.edoc, encoding=unicode)
 195         provider = wldoc.provider
 196     else:
 197         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 198
 199     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 200
 201     document = WLDocument.from_string(text, parse_dublincore=True, provider=provider)
 202     document.swap_endlines()
 203
 204     for child_uri in document.book_info.parts:
 205         child = load_including_children(provider=provider, uri=child_uri)
 206         document.edoc.getroot().append(child.edoc.getroot())
 207     return document
 208
 209
 210 class PDFFormat(Format):
 211     """ Base PDF format.
 212
 213     Available customization:
 214         nofootnotes: Doesn't do footnotes.
 215         nothemes: Doesn't do themes.
 216         defaultleading: Default leading.
 217         onehalfleading: Bigger leading.
 218         doubleleading: Big leading.
 219         nowlfont: Uses standard TeX font instead of JUnicodeWL.
 220
 221     """
 222
 223     cover_class = None
 224     tex_passes = 1
 225     style = get_resource('pdf/default.sty')
 226     cover = None
 227
 228     @property
 229     def has_cover(self):
 230         """ For use in XSLT. """
 231         return self.cover is not None
 232
 233     @property
 234     def customization_str(self):
 235         """ For use in XSLT. """
 236         return u','.join(k for k, v in self.customization.items() if v)
 237
 238     def get_texml(self):
 239         raise NotImplementedError
 240
 241     def get_tex_dir(self):
 242         texml = self.get_texml()
 243         temp = mkdtemp('-wl2pdf')
 244         # Save TeX file
 245         tex_path = os.path.join(temp, 'doc.tex')
 246         with open(tex_path, 'w') as fout:
 247             process(StringIO(texml), fout, 'utf-8')
 248         if self.save_tex:
 249             shutil.copy(tex_path, self.save_tex)
 250         # Copy style
 251         shutil.copy(get_resource('pdf/wl.cls'), temp)
 252         shutil.copy(self.style, os.path.join(temp, 'style.sty'))
 253         # for sfile in ['wasysym.sty', 'uwasyvar.fd', 'uwasy.fd']:
 254         #     shutil.copy(get_resource(os.path.join('res/wasysym', sfile)), temp)
 255
 256         # Save attachments
 257         if self.cover:
 258             self.cover.for_pdf().dump_to(os.path.join(temp, 'makecover.sty'))
 259         return temp
 260
 261     def get_pdf(self):
 262         temp = self.get_tex_dir()
 263         tex_path = os.path.join(temp, 'doc.tex')
 264         try:
 265             cwd = os.getcwd()
 266         except OSError:
 267             cwd = None
 268         os.chdir(temp)
 269
 270         p = None
 271         if self.verbose:
 272             for i in xrange(self.tex_passes):
 273                 p = call(['xelatex', tex_path])
 274         else:
 275             for i in xrange(self.tex_passes):
 276                 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 277         if p:
 278             raise ParseError("Error parsing .tex file: %s" % tex_path)
 279
 280         if cwd is not None:
 281             os.chdir(cwd)
 282
 283         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 284         pdf_path = os.path.join(temp, 'doc.pdf')
 285         shutil.move(pdf_path, output_file.name)
 286         shutil.rmtree(temp)
 287         return IOFile.from_filename(output_file.name)
 288
 289     def build(self, verbose=False, save_tex=None, morefloats=None):
 290         """ morefloats: new/old/none
 291         """
 292         self.verbose = verbose
 293         self.save_tex = save_tex
 294
 295         if morefloats is None and package_available('morefloats', 'maxfloats=19'):
 296             morefloats = 'new'
 297         self.morefloats = morefloats
 298
 299         book_info = self.wldoc.book_info
 300         if self.cover_class:
 301             self.cover = self.cover_class(book_info)
 302
 303         return self.get_pdf()