librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24 from lxml.etree import XMLSyntaxError, XSLTApplyError
  25
  26 from librarian.dcparser import Person
  27 from librarian.parser import WLDocument
  28 from librarian import ParseError, DCNS, get_resource, OutputFile
  29 from librarian import functions
  30 from librarian.cover import DefaultEbookCover
  31 from .sponsor import sponsor_logo
  32
  33
# Run the registration helpers from librarian.functions once, at import time.
# NOTE(review): presumably these make the named helpers available to the
# wl2tex stylesheet as extension functions -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a stylesheet name (the key accepted by get_stylesheet) to the resource
# path that is handed to get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  43
  44 #CUSTOMIZATIONS = [
  45 #    'nofootnotes',
  46 #    'nothemes',
  47 #    'defaultleading',
  48 #    'onehalfleading',
  49 #    'doubleleading',
  50 #    'nowlfont',
  51 #    ]
  52
  53 def insert_tags(doc, split_re, tagname, exclude=None):
  54     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  55
  56     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  57     >>> insert_tags(t, re.compile('-'), 'd');
  58     >>> print etree.tostring(t)
  59     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  60     """
  61
  62     for elem in doc.iter(tag=etree.Element):
  63         if exclude and elem.tag in exclude:
  64             continue
  65         if elem.text:
  66             chunks = split_re.split(elem.text)
  67             while len(chunks) > 1:
  68                 ins = etree.Element(tagname)
  69                 ins.tail = chunks.pop()
  70                 elem.insert(0, ins)
  71             elem.text = chunks.pop(0)
  72         if elem.tail:
  73             chunks = split_re.split(elem.tail)
  74             parent = elem.getparent()
  75             ins_index = parent.index(elem) + 1
  76             while len(chunks) > 1:
  77                 ins = etree.Element(tagname)
  78                 ins.tail = chunks.pop()
  79                 parent.insert(ins_index, ins)
  80             elem.tail = chunks.pop(0)
  81
  82
  83 def substitute_hyphens(doc):
  84     insert_tags(doc,
  85                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  86                 "dywiz",
  87                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  88                 )
  89
  90
  91 def fix_hanging(doc):
  92     insert_tags(doc,
  93                 re.compile("(?<=\s\w)\s+"),
  94                 "nbsp",
  95                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  96                 )
  97
  98
  99 def move_motifs_inside(doc):
 100     """ moves motifs to be into block elements """
 101     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 102         for motif in master.xpath('motyw'):
 103             for sib in motif.itersiblings():
 104                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
 105                     # motif shouldn't have a tail - it would be untagged text
 106                     motif.tail = None
 107                     motif.getparent().remove(motif)
 108                     sib.insert(0, motif)
 109                     break
 110
 111
 112 def hack_motifs(doc):
 113     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 114     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 115
 116     moves motifs in stanzas from first verse to second
 117     and from next to last to last, then inserts negative vspace before them
 118     """
 119     for motif in doc.findall('//strofa//motyw'):
 120         # find relevant verse-level tag
 121         verse, stanza = motif, motif.getparent()
 122         while stanza is not None and stanza.tag != 'strofa':
 123             verse, stanza = stanza, stanza.getparent()
 124         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 125         breaks_after = sum(1 for i in verse.itersiblings('br'))
 126         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 127             move_by = 1
 128             if breaks_after == 2:
 129                 move_by += 1
 130             moved_motif = deepcopy(motif)
 131             motif.tag = 'span'
 132             motif.text = None
 133             moved_motif.tail = None
 134             moved_motif.set('moved', str(move_by))
 135
 136             for br in verse.itersiblings('br'):
 137                 if move_by > 1:
 138                     move_by -= 1
 139                     continue
 140                 br.addnext(moved_motif)
 141                 break
 142
 143
 144 def parse_creator(doc):
 145     """Generates readable versions of creator and translator tags.
 146
 147     Finds all dc:creator and dc.contributor.translator tags
 148     and adds *_parsed versions with forenames first.
 149     """
 150     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 151                     'creator', 'contributor.translator')),
 152                     namespaces = {'dc': str(DCNS)})[::-1]:
 153         if not person.text:
 154             continue
 155         p = Person.from_text(person.text)
 156         person_parsed = deepcopy(person)
 157         person_parsed.tag = person.tag + '_parsed'
 158         person_parsed.set('sortkey', person.text)
 159         person_parsed.text = p.readable()
 160         person.getparent().insert(0, person_parsed)
 161
 162
 163 def get_stylesheet(name):
 164     return get_resource(STYLESHEETS[name])
 165
 166
 167 def package_available(package, args='', verbose=False):
 168     """ check if a verion of a latex package accepting given args is available """
 169     tempdir = mkdtemp('-wl2pdf-test')
 170     fpath = os.path.join(tempdir, 'test.tex')
 171     f = open(fpath, 'w')
 172     f.write(r"""
 173         \documentclass{wl}
 174         \usepackage[%s]{%s}
 175         \begin{document}
 176         \end{document}
 177         """ % (args, package))
 178     f.close()
 179     if verbose:
 180         p = call(['xelatex', '-output-directory', tempdir, fpath])
 181     else:
 182         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 183     shutil.rmtree(tempdir)
 184     return p == 0
 185
 186
 187 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 188               cover=None, flags=None, customizations=None):
 189     """ produces a PDF file with XeLaTeX
 190
 191     wldoc: a WLDocument
 192     verbose: prints all output from LaTeX
 193     save_tex: path to save the intermediary LaTeX file to
 194     morefloats (old/new/none): force specific morefloats
 195     cover: a cover.Cover factory or True for default
 196     flags: less-advertising,
 197     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 198     """
 199
 200     # Parse XSLT
 201     try:
 202         book_info = wldoc.book_info
 203         document = load_including_children(wldoc)
 204         root = document.edoc.getroot()
 205
 206         if cover:
 207             if cover is True:
 208                 cover = DefaultEbookCover
 209             bound_cover = cover(book_info, width=1200)
 210             root.set('data-cover-width', str(bound_cover.width))
 211             root.set('data-cover-height', str(bound_cover.height))
 212             if bound_cover.uses_dc_cover:
 213                 if book_info.cover_by:
 214                     root.set('data-cover-by', book_info.cover_by)
 215                 if book_info.cover_source:
 216                     root.set('data-cover-source',
 217                             book_info.cover_source)
 218         if flags:
 219             for flag in flags:
 220                 root.set('flag-' + flag, 'yes')
 221
 222         # check for LaTeX packages
 223         if morefloats:
 224             root.set('morefloats', morefloats.lower())
 225         elif package_available('morefloats', 'maxfloats=19'):
 226             root.set('morefloats', 'new')
 227
 228         # add customizations
 229         if customizations is not None:
 230             root.set('customizations', u','.join(customizations))
 231
 232         # add editors info
 233         editors = document.editors()
 234         if editors:
 235             root.set('editors', u', '.join(sorted(
 236                 editor.readable() for editor in editors)))
 237         if document.book_info.funders:
 238             root.set('funders', u', '.join(document.book_info.funders))
 239         if document.book_info.thanks:
 240             root.set('thanks', document.book_info.thanks)
 241
 242         # hack the tree
 243         move_motifs_inside(document.edoc)
 244         hack_motifs(document.edoc)
 245         parse_creator(document.edoc)
 246         substitute_hyphens(document.edoc)
 247         fix_hanging(document.edoc)
 248
 249         # wl -> TeXML
 250         style_filename = get_stylesheet("wl2tex")
 251         style = etree.parse(style_filename)
 252
 253         # TeXML -> LaTeX
 254         temp = mkdtemp('-wl2pdf')
 255
 256         for sponsor in book_info.sponsors:
 257             ins = etree.Element("data-sponsor", name=sponsor)
 258             logo = sponsor_logo(sponsor)
 259             if logo:
 260                 fname = 'sponsor-%s' % os.path.basename(logo)
 261                 shutil.copy(logo, os.path.join(temp, fname))
 262                 ins.set('src', fname)
 263             root.insert(0, ins)
 264
 265         if book_info.sponsor_note:
 266             root.set("sponsor-note", book_info.sponsor_note)
 267
 268         texml = document.transform(style)
 269
 270         if cover:
 271             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 272                 bound_cover.save(f, quality=80)
 273
 274         del document # no longer needed large object :)
 275
 276         tex_path = os.path.join(temp, 'doc.tex')
 277         fout = open(tex_path, 'w')
 278         process(StringIO(texml), fout, 'utf-8')
 279         fout.close()
 280         del texml
 281
 282         if save_tex:
 283             shutil.copy(tex_path, save_tex)
 284
 285         # LaTeX -> PDF
 286         shutil.copy(get_resource('pdf/wl.cls'), temp)
 287         shutil.copy(get_resource('res/wl-logo.png'), temp)
 288
 289         try:
 290             cwd = os.getcwd()
 291         except OSError:
 292             cwd = None
 293         os.chdir(temp)
 294
 295         if verbose:
 296             p = call(['xelatex', tex_path])
 297         else:
 298             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 299         if p:
 300             raise ParseError("Error parsing .tex file")
 301
 302         if cwd is not None:
 303             os.chdir(cwd)
 304
 305         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 306         pdf_path = os.path.join(temp, 'doc.pdf')
 307         shutil.move(pdf_path, output_file.name)
 308         shutil.rmtree(temp)
 309         return OutputFile.from_filename(output_file.name)
 310
 311     except (XMLSyntaxError, XSLTApplyError), e:
 312         raise ParseError(e)
 313
 314
 315 def load_including_children(wldoc=None, provider=None, uri=None):
 316     """ Makes one big xml file with children inserted at end.
 317
 318     Either wldoc or provider and URI must be provided.
 319     """
 320
 321     if uri and provider:
 322         f = provider.by_uri(uri)
 323         text = f.read().decode('utf-8')
 324         f.close()
 325     elif wldoc is not None:
 326         text = etree.tostring(wldoc.edoc, encoding=unicode)
 327         provider = wldoc.provider
 328     else:
 329         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 330
 331     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 332
 333     document = WLDocument.from_string(text,
 334                 parse_dublincore=True, provider=provider)
 335     document.swap_endlines()
 336
 337     for child_uri in document.book_info.parts:
 338         child = load_including_children(provider=provider, uri=child_uri)
 339         document.edoc.getroot().append(child.edoc.getroot())
 340     return document