librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
Creates one big XML from the book and its children, converts it to LaTeX
with TeXML, then runs it through XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24 from lxml.etree import XMLSyntaxError, XSLTApplyError
  25
  26 from librarian.dcparser import Person
  27 from librarian.parser import WLDocument
  28 from librarian import ParseError, DCNS, get_resource, OutputFile
  29 from librarian import functions
  30 from librarian.cover import DefaultEbookCover
  31 from .sponsor import sponsor_logo
  32
  33
# Run librarian's registration hooks once, when this module is imported.
# NOTE(review): these presumably register the XPath/XSLT extension functions
# that the wl2tex stylesheet calls -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Logical stylesheet name -> resource path; looked up by get_stylesheet().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  43
  44 #CUSTOMIZATIONS = [
  45 #    'nofootnotes',
  46 #    'nothemes',
  47 #    'defaultleading',
  48 #    'onehalfleading',
  49 #    'doubleleading',
  50 #    'nowlfont',
  51 #    ]
  52
  53 def insert_tags(doc, split_re, tagname, exclude=None):
  54     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  55
  56     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  57     >>> insert_tags(t, re.compile('-'), 'd');
  58     >>> print etree.tostring(t)
  59     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  60     """
  61
  62     for elem in doc.iter(tag=etree.Element):
  63         if exclude and elem.tag in exclude:
  64             continue
  65         if elem.text:
  66             chunks = split_re.split(elem.text)
  67             while len(chunks) > 1:
  68                 ins = etree.Element(tagname)
  69                 ins.tail = chunks.pop()
  70                 elem.insert(0, ins)
  71             elem.text = chunks.pop(0)
  72         if elem.tail:
  73             chunks = split_re.split(elem.tail)
  74             parent = elem.getparent()
  75             ins_index = parent.index(elem) + 1
  76             while len(chunks) > 1:
  77                 ins = etree.Element(tagname)
  78                 ins.tail = chunks.pop()
  79                 parent.insert(ins_index, ins)
  80             elem.tail = chunks.pop(0)
  81
  82
  83 def substitute_hyphens(doc):
  84     insert_tags(doc,
  85                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  86                 "dywiz",
  87                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  88                 )
  89
  90
  91 def fix_hanging(doc):
  92     insert_tags(doc,
  93                 re.compile("(?<=\s\w)\s+"),
  94                 "nbsp",
  95                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  96                 )
  97
  98
  99 def move_motifs_inside(doc):
 100     """ moves motifs to be into block elements """
 101     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 102         for motif in master.xpath('motyw'):
 103             for sib in motif.itersiblings():
 104                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
 105                     # motif shouldn't have a tail - it would be untagged text
 106                     motif.tail = None
 107                     motif.getparent().remove(motif)
 108                     sib.insert(0, motif)
 109                     break
 110
 111
 112 def hack_motifs(doc):
 113     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 114     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 115
 116     moves motifs in stanzas from first verse to second
 117     and from next to last to last, then inserts negative vspace before them
 118     """
 119     for motif in doc.findall('//strofa//motyw'):
 120         # find relevant verse-level tag
 121         verse, stanza = motif, motif.getparent()
 122         while stanza is not None and stanza.tag != 'strofa':
 123             verse, stanza = stanza, stanza.getparent()
 124         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 125         breaks_after = sum(1 for i in verse.itersiblings('br'))
 126         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 127             move_by = 1
 128             if breaks_after == 2:
 129                 move_by += 1
 130             moved_motif = deepcopy(motif)
 131             motif.tag = 'span'
 132             motif.text = None
 133             moved_motif.tail = None
 134             moved_motif.set('moved', str(move_by))
 135
 136             for br in verse.itersiblings('br'):
 137                 if move_by > 1:
 138                     move_by -= 1
 139                     continue
 140                 br.addnext(moved_motif)
 141                 break
 142
 143
 144 def parse_creator(doc):
 145     """Generates readable versions of creator and translator tags.
 146
 147     Finds all dc:creator and dc.contributor.translator tags
 148     and adds *_parsed versions with forenames first.
 149     """
 150     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 151                     'creator', 'contributor.translator')),
 152                     namespaces = {'dc': str(DCNS)})[::-1]:
 153         if not person.text:
 154             continue
 155         p = Person.from_text(person.text)
 156         person_parsed = deepcopy(person)
 157         person_parsed.tag = person.tag + '_parsed'
 158         person_parsed.set('sortkey', person.text)
 159         person_parsed.text = p.readable()
 160         person.getparent().insert(0, person_parsed)
 161
 162
 163 def get_stylesheet(name):
 164     return get_resource(STYLESHEETS[name])
 165
 166
 167 def package_available(package, args='', verbose=False):
 168     """ check if a verion of a latex package accepting given args is available """
 169     tempdir = mkdtemp('-wl2pdf-test')
 170     fpath = os.path.join(tempdir, 'test.tex')
 171     f = open(fpath, 'w')
 172     f.write(r"""
 173         \documentclass{wl}
 174         \usepackage[%s]{%s}
 175         \begin{document}
 176         \end{document}
 177         """ % (args, package))
 178     f.close()
 179     if verbose:
 180         p = call(['xelatex', '-output-directory', tempdir, fpath])
 181     else:
 182         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 183     shutil.rmtree(tempdir)
 184     return p == 0
 185
 186
 187 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 188               cover=None, flags=None, customizations=None):
 189     """ produces a PDF file with XeLaTeX
 190
 191     wldoc: a WLDocument
 192     verbose: prints all output from LaTeX
 193     save_tex: path to save the intermediary LaTeX file to
 194     morefloats (old/new/none): force specific morefloats
 195     cover: a cover.Cover factory or True for default
 196     flags: less-advertising,
 197     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 198     """
 199
 200     # Parse XSLT
 201     try:
 202         book_info = wldoc.book_info
 203         document = load_including_children(wldoc)
 204         root = document.edoc.getroot()
 205
 206         if cover:
 207             if cover is True:
 208                 cover = DefaultEbookCover
 209             bound_cover = cover(book_info, width=1200)
 210             root.set('data-cover-width', str(bound_cover.width))
 211             root.set('data-cover-height', str(bound_cover.height))
 212             if bound_cover.uses_dc_cover:
 213                 if book_info.cover_by:
 214                     root.set('data-cover-by', book_info.cover_by)
 215                 if book_info.cover_source:
 216                     root.set('data-cover-source',
 217                             book_info.cover_source)
 218         if flags:
 219             for flag in flags:
 220                 root.set('flag-' + flag, 'yes')
 221
 222         # check for LaTeX packages
 223         if morefloats:
 224             root.set('morefloats', morefloats.lower())
 225         elif package_available('morefloats', 'maxfloats=19'):
 226             root.set('morefloats', 'new')
 227
 228         # add customizations
 229         if customizations is not None:
 230             root.set('customizations', u','.join(customizations))
 231
 232         # add editors info
 233         root.set('editors', u', '.join(sorted(
 234             editor.readable() for editor in document.editors())))
 235         if document.book_info.funders:
 236             root.set('funders', u', '.join(document.book_info.funders))
 237         if document.book_info.thanks:
 238             root.set('thanks', document.book_info.thanks)
 239
 240         # hack the tree
 241         move_motifs_inside(document.edoc)
 242         hack_motifs(document.edoc)
 243         parse_creator(document.edoc)
 244         substitute_hyphens(document.edoc)
 245         fix_hanging(document.edoc)
 246
 247         # wl -> TeXML
 248         style_filename = get_stylesheet("wl2tex")
 249         style = etree.parse(style_filename)
 250
 251         # TeXML -> LaTeX
 252         temp = mkdtemp('-wl2pdf')
 253
 254         for sponsor in book_info.sponsors:
 255             ins = etree.Element("data-sponsor", name=sponsor)
 256             logo = sponsor_logo(sponsor)
 257             if logo:
 258                 fname = 'sponsor-%s' % os.path.basename(logo)
 259                 shutil.copy(logo, os.path.join(temp, fname))
 260                 ins.set('src', fname)
 261             root.insert(0, ins)
 262
 263         if book_info.sponsor_note:
 264             root.set("sponsor-note", book_info.sponsor_note)
 265
 266         texml = document.transform(style)
 267
 268         if cover:
 269             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 270                 bound_cover.save(f, quality=80)
 271
 272         del document # no longer needed large object :)
 273
 274         tex_path = os.path.join(temp, 'doc.tex')
 275         fout = open(tex_path, 'w')
 276         process(StringIO(texml), fout, 'utf-8')
 277         fout.close()
 278         del texml
 279
 280         if save_tex:
 281             shutil.copy(tex_path, save_tex)
 282
 283         # LaTeX -> PDF
 284         shutil.copy(get_resource('pdf/wl.cls'), temp)
 285         shutil.copy(get_resource('res/wl-logo.png'), temp)
 286
 287         try:
 288             cwd = os.getcwd()
 289         except OSError:
 290             cwd = None
 291         os.chdir(temp)
 292
 293         if verbose:
 294             p = call(['xelatex', tex_path])
 295         else:
 296             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 297         if p:
 298             raise ParseError("Error parsing .tex file")
 299
 300         if cwd is not None:
 301             os.chdir(cwd)
 302
 303         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 304         pdf_path = os.path.join(temp, 'doc.pdf')
 305         shutil.move(pdf_path, output_file.name)
 306         shutil.rmtree(temp)
 307         return OutputFile.from_filename(output_file.name)
 308
 309     except (XMLSyntaxError, XSLTApplyError), e:
 310         raise ParseError(e)
 311
 312
 313 def load_including_children(wldoc=None, provider=None, uri=None):
 314     """ Makes one big xml file with children inserted at end.
 315
 316     Either wldoc or provider and URI must be provided.
 317     """
 318
 319     if uri and provider:
 320         f = provider.by_uri(uri)
 321         text = f.read().decode('utf-8')
 322         f.close()
 323     elif wldoc is not None:
 324         text = etree.tostring(wldoc.edoc, encoding=unicode)
 325         provider = wldoc.provider
 326     else:
 327         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 328
 329     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 330
 331     document = WLDocument.from_string(text,
 332                 parse_dublincore=True, provider=provider)
 333     document.swap_endlines()
 334
 335     for child_uri in document.book_info.parts:
 336         child = load_including_children(provider=provider, uri=child_uri)
 337         document.edoc.getroot().append(child.edoc.getroot())
 338     return document