librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24 from lxml.etree import XMLSyntaxError, XSLTApplyError
  25
  26 from librarian.dcparser import Person
  27 from librarian.parser import WLDocument
  28 from librarian import ParseError, DCNS, get_resource, OutputFile
  29 from librarian import functions
  30 from librarian.cover import DefaultEbookCover
  31 from .sponsor import sponsor_logo
  32
  33
  34 functions.reg_substitute_entities()
  35 functions.reg_strip()
  36 functions.reg_starts_white()
  37 functions.reg_ends_white()
  38 functions.reg_texcommand()
  39
  40 STYLESHEETS = {
  41     'wl2tex': 'pdf/wl2tex.xslt',
  42 }
  43
  44 #CUSTOMIZATIONS = [
  45 #    'nofootnotes',
  46 #    'nothemes',
  47 #    'defaultleading',
  48 #    'onehalfleading',
  49 #    'doubleleading',
  50 #    'nowlfont',
  51 #    ]
  52
  53 def insert_tags(doc, split_re, tagname, exclude=None):
  54     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  55
  56     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  57     >>> insert_tags(t, re.compile('-'), 'd');
  58     >>> print etree.tostring(t)
  59     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  60     """
  61
  62     for elem in doc.iter(tag=etree.Element):
  63         if exclude and elem.tag in exclude:
  64             continue
  65         if elem.text:
  66             chunks = split_re.split(elem.text)
  67             while len(chunks) > 1:
  68                 ins = etree.Element(tagname)
  69                 ins.tail = chunks.pop()
  70                 elem.insert(0, ins)
  71             elem.text = chunks.pop(0)
  72         if elem.tail:
  73             chunks = split_re.split(elem.tail)
  74             parent = elem.getparent()
  75             ins_index = parent.index(elem) + 1
  76             while len(chunks) > 1:
  77                 ins = etree.Element(tagname)
  78                 ins.tail = chunks.pop()
  79                 parent.insert(ins_index, ins)
  80             elem.tail = chunks.pop(0)
  81
  82
  83 def substitute_hyphens(doc):
  84     insert_tags(doc,
  85                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  86                 "dywiz",
  87                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  88                 )
  89
  90
  91 def fix_hanging(doc):
  92     insert_tags(doc,
  93                 re.compile("(?<=\s\w)\s+"),
  94                 "nbsp",
  95                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  96                 )
  97
  98 def fix_tables(doc):
  99     for kol in doc.iter(tag='kol'):
 100         if kol.tail is not None:
 101             if not kol.tail.strip():
 102                 kol.tail = None
 103     for table in doc.iter(tag='tabela'):
 104         if table.get('ramka') == '1' or table.get('ramki') == '1':
 105             table.set('_format', '|' + 'X|' * len(table[0]))
 106         else:
 107             table.set('_format', 'X' * len(table[0]))
 108
 109
 110 def move_motifs_inside(doc):
 111     """ moves motifs to be into block elements """
 112     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 113         for motif in master.xpath('motyw'):
 114             for sib in motif.itersiblings():
 115                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
 116                     # motif shouldn't have a tail - it would be untagged text
 117                     motif.tail = None
 118                     motif.getparent().remove(motif)
 119                     sib.insert(0, motif)
 120                     break
 121
 122
 123 def hack_motifs(doc):
 124     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 125     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 126
 127     moves motifs in stanzas from first verse to second
 128     and from next to last to last, then inserts negative vspace before them
 129     """
 130     for motif in doc.findall('//strofa//motyw'):
 131         # find relevant verse-level tag
 132         verse, stanza = motif, motif.getparent()
 133         while stanza is not None and stanza.tag != 'strofa':
 134             verse, stanza = stanza, stanza.getparent()
 135         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 136         breaks_after = sum(1 for i in verse.itersiblings('br'))
 137         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 138             move_by = 1
 139             if breaks_after == 2:
 140                 move_by += 1
 141             moved_motif = deepcopy(motif)
 142             motif.tag = 'span'
 143             motif.text = None
 144             moved_motif.tail = None
 145             moved_motif.set('moved', str(move_by))
 146
 147             for br in verse.itersiblings('br'):
 148                 if move_by > 1:
 149                     move_by -= 1
 150                     continue
 151                 br.addnext(moved_motif)
 152                 break
 153
 154
 155 def parse_creator(doc):
 156     """Generates readable versions of creator and translator tags.
 157
 158     Finds all dc:creator and dc.contributor.translator tags
 159     and adds *_parsed versions with forenames first.
 160     """
 161     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 162                     'creator', 'contributor.translator')),
 163                     namespaces = {'dc': str(DCNS)})[::-1]:
 164         if not person.text:
 165             continue
 166         p = Person.from_text(person.text)
 167         person_parsed = deepcopy(person)
 168         person_parsed.tag = person.tag + '_parsed'
 169         person_parsed.set('sortkey', person.text)
 170         person_parsed.text = p.readable()
 171         person.getparent().insert(0, person_parsed)
 172
 173
 174 def get_stylesheet(name):
 175     return get_resource(STYLESHEETS[name])
 176
 177
 178 def package_available(package, args='', verbose=False):
 179     """ check if a verion of a latex package accepting given args is available """
 180     tempdir = mkdtemp('-wl2pdf-test')
 181     fpath = os.path.join(tempdir, 'test.tex')
 182     f = open(fpath, 'w')
 183     f.write(r"""
 184         \documentclass{wl}
 185         \usepackage[%s]{%s}
 186         \begin{document}
 187         \end{document}
 188         """ % (args, package))
 189     f.close()
 190     if verbose:
 191         p = call(['xelatex', '-output-directory', tempdir, fpath])
 192     else:
 193         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 194     shutil.rmtree(tempdir)
 195     return p == 0
 196
 197
 198 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 199               cover=None, flags=None, customizations=None):
 200     """ produces a PDF file with XeLaTeX
 201
 202     wldoc: a WLDocument
 203     verbose: prints all output from LaTeX
 204     save_tex: path to save the intermediary LaTeX file to
 205     morefloats (old/new/none): force specific morefloats
 206     cover: a cover.Cover factory or True for default
 207     flags: less-advertising,
 208     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 209     """
 210
 211     # Parse XSLT
 212     try:
 213         book_info = wldoc.book_info
 214         document = load_including_children(wldoc)
 215         root = document.edoc.getroot()
 216
 217         if cover:
 218             if cover is True:
 219                 cover = DefaultEbookCover
 220             bound_cover = cover(book_info, width=1200)
 221             root.set('data-cover-width', str(bound_cover.width))
 222             root.set('data-cover-height', str(bound_cover.height))
 223             if bound_cover.uses_dc_cover:
 224                 if book_info.cover_by:
 225                     root.set('data-cover-by', book_info.cover_by)
 226                 if book_info.cover_source:
 227                     root.set('data-cover-source',
 228                             book_info.cover_source)
 229         if flags:
 230             for flag in flags:
 231                 root.set('flag-' + flag, 'yes')
 232
 233         # check for LaTeX packages
 234         if morefloats:
 235             root.set('morefloats', morefloats.lower())
 236         elif package_available('morefloats', 'maxfloats=19'):
 237             root.set('morefloats', 'new')
 238
 239         # add customizations
 240         if customizations is not None:
 241             root.set('customizations', u','.join(customizations))
 242
 243         # add editors info
 244         editors = document.editors()
 245         if editors:
 246             root.set('editors', u', '.join(sorted(
 247                 editor.readable() for editor in editors)))
 248         if document.book_info.funders:
 249             root.set('funders', u', '.join(document.book_info.funders))
 250         if document.book_info.thanks:
 251             root.set('thanks', document.book_info.thanks)
 252
 253         # hack the tree
 254         move_motifs_inside(document.edoc)
 255         hack_motifs(document.edoc)
 256         parse_creator(document.edoc)
 257         substitute_hyphens(document.edoc)
 258         fix_hanging(document.edoc)
 259         fix_tables(document.edoc)
 260
 261         # wl -> TeXML
 262         style_filename = get_stylesheet("wl2tex")
 263         style = etree.parse(style_filename)
 264         functions.reg_mathml_latex()
 265
 266         # TeXML -> LaTeX
 267         temp = mkdtemp('-wl2pdf')
 268
 269         for sponsor in book_info.sponsors:
 270             ins = etree.Element("data-sponsor", name=sponsor)
 271             logo = sponsor_logo(sponsor)
 272             if logo:
 273                 fname = 'sponsor-%s' % os.path.basename(logo)
 274                 shutil.copy(logo, os.path.join(temp, fname))
 275                 ins.set('src', fname)
 276             root.insert(0, ins)
 277
 278         if book_info.sponsor_note:
 279             root.set("sponsor-note", book_info.sponsor_note)
 280
 281         texml = document.transform(style)
 282
 283         if cover:
 284             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 285                 bound_cover.save(f, quality=80)
 286
 287         del document # no longer needed large object :)
 288
 289         tex_path = os.path.join(temp, 'doc.tex')
 290         fout = open(tex_path, 'w')
 291         process(StringIO(texml), fout, 'utf-8')
 292         fout.close()
 293         del texml
 294
 295         if save_tex:
 296             shutil.copy(tex_path, save_tex)
 297
 298         # LaTeX -> PDF
 299         shutil.copy(get_resource('pdf/wl.cls'), temp)
 300         shutil.copy(get_resource('res/wl-logo.png'), temp)
 301
 302         try:
 303             cwd = os.getcwd()
 304         except OSError:
 305             cwd = None
 306         os.chdir(temp)
 307
 308         if verbose:
 309             p = call(['xelatex', tex_path])
 310         else:
 311             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 312         if p:
 313             raise ParseError("Error parsing .tex file")
 314
 315         if cwd is not None:
 316             os.chdir(cwd)
 317
 318         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 319         pdf_path = os.path.join(temp, 'doc.pdf')
 320         shutil.move(pdf_path, output_file.name)
 321         shutil.rmtree(temp)
 322         return OutputFile.from_filename(output_file.name)
 323
 324     except (XMLSyntaxError, XSLTApplyError), e:
 325         raise ParseError(e)
 326
 327
 328 def load_including_children(wldoc=None, provider=None, uri=None):
 329     """ Makes one big xml file with children inserted at end.
 330
 331     Either wldoc or provider and URI must be provided.
 332     """
 333
 334     if uri and provider:
 335         f = provider.by_uri(uri)
 336         text = f.read().decode('utf-8')
 337         f.close()
 338     elif wldoc is not None:
 339         text = etree.tostring(wldoc.edoc, encoding=unicode)
 340         provider = wldoc.provider
 341     else:
 342         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 343
 344     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 345
 346     document = WLDocument.from_string(text,
 347                 parse_dublincore=True, provider=provider)
 348     document.swap_endlines()
 349
 350     for child_uri in document.book_info.parts:
 351         child = load_including_children(provider=provider, uri=child_uri)
 352         document.edoc.getroot().append(child.edoc.getroot())
 353     return document