librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24 from lxml.etree import XMLSyntaxError, XSLTApplyError
  25
  26 from librarian.dcparser import Person
  27 from librarian.parser import WLDocument
  28 from librarian import ParseError, DCNS, get_resource, OutputFile
  29 from librarian import functions
  30 from librarian.cover import DefaultEbookCover
  31 from .sponsor import sponsor_logo
  32
  33
# Import-time registration of helpers from librarian.functions.
# NOTE(review): presumably these register extension functions that the XSLT
# stylesheet(s) listed below rely on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a stylesheet name (the argument of get_stylesheet) to the resource
# path that get_stylesheet hands over to get_resource.
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  43
  44 # CUSTOMIZATIONS = [
  45 #     'nofootnotes',
  46 #     'nothemes',
  47 #     'defaultleading',
  48 #     'onehalfleading',
  49 #     'doubleleading',
  50 #     'nowlfont',
  51 # ]
  52
  53
  54 def insert_tags(doc, split_re, tagname, exclude=None):
  55     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  56
  57     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
  58     >>> insert_tags(t, re.compile('-'), 'd')
  59     >>> print etree.tostring(t)
  60     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  61     """
  62
  63     for elem in doc.iter(tag=etree.Element):
  64         if exclude and elem.tag in exclude:
  65             continue
  66         if elem.text:
  67             chunks = split_re.split(elem.text)
  68             while len(chunks) > 1:
  69                 ins = etree.Element(tagname)
  70                 ins.tail = chunks.pop()
  71                 elem.insert(0, ins)
  72             elem.text = chunks.pop(0)
  73         if elem.tail:
  74             chunks = split_re.split(elem.tail)
  75             parent = elem.getparent()
  76             ins_index = parent.index(elem) + 1
  77             while len(chunks) > 1:
  78                 ins = etree.Element(tagname)
  79                 ins.tail = chunks.pop()
  80                 parent.insert(ins_index, ins)
  81             elem.tail = chunks.pop(0)
  82
  83
  84 def substitute_hyphens(doc):
  85     insert_tags(doc,
  86                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  87                 "dywiz",
  88                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  89                 )
  90
  91
  92 def fix_hanging(doc):
  93     insert_tags(doc,
  94                 re.compile("(?<=\s\w)\s+"),
  95                 "nbsp",
  96                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  97                 )
  98
  99
 100 def fix_tables(doc):
 101     for kol in doc.iter(tag='kol'):
 102         if kol.tail is not None:
 103             if not kol.tail.strip():
 104                 kol.tail = None
 105     for table in doc.iter(tag='tabela'):
 106         if table.get('ramka') == '1' or table.get('ramki') == '1':
 107             table.set('_format', '|' + 'X|' * len(table[0]))
 108         else:
 109             table.set('_format', 'X' * len(table[0]))
 110
 111
 112 def move_motifs_inside(doc):
 113     """ moves motifs to be into block elements """
 114     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
 115                             '//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 116         for motif in master.xpath('motyw'):
 117             for sib in motif.itersiblings():
 118                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
 119                                    'begin', 'end', 'motyw', 'extra', 'uwaga'):
 120                     # motif shouldn't have a tail - it would be untagged text
 121                     motif.tail = None
 122                     motif.getparent().remove(motif)
 123                     sib.insert(0, motif)
 124                     break
 125
 126
 127 def hack_motifs(doc):
 128     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 129     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 130
 131     moves motifs in stanzas from first verse to second
 132     and from next to last to last, then inserts negative vspace before them
 133     """
 134     for motif in doc.findall('//strofa//motyw'):
 135         # find relevant verse-level tag
 136         verse, stanza = motif, motif.getparent()
 137         while stanza is not None and stanza.tag != 'strofa':
 138             verse, stanza = stanza, stanza.getparent()
 139         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 140         breaks_after = sum(1 for i in verse.itersiblings('br'))
 141         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 142             move_by = 1
 143             if breaks_after == 2:
 144                 move_by += 1
 145             moved_motif = deepcopy(motif)
 146             motif.tag = 'span'
 147             motif.text = None
 148             moved_motif.tail = None
 149             moved_motif.set('moved', str(move_by))
 150
 151             for br in verse.itersiblings('br'):
 152                 if move_by > 1:
 153                     move_by -= 1
 154                     continue
 155                 br.addnext(moved_motif)
 156                 break
 157
 158
 159 def parse_creator(doc):
 160     """Generates readable versions of creator and translator tags.
 161
 162     Finds all dc:creator and dc.contributor.translator tags
 163     and adds *_parsed versions with forenames first.
 164     """
 165     for person in doc.xpath("|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
 166                             namespaces={'dc': str(DCNS)})[::-1]:
 167         if not person.text:
 168             continue
 169         p = Person.from_text(person.text)
 170         person_parsed = deepcopy(person)
 171         person_parsed.tag = person.tag + '_parsed'
 172         person_parsed.set('sortkey', person.text)
 173         person_parsed.text = p.readable()
 174         person.getparent().insert(0, person_parsed)
 175
 176
 177 def get_stylesheet(name):
 178     return get_resource(STYLESHEETS[name])
 179
 180
 181 def package_available(package, args='', verbose=False):
 182     """ check if a verion of a latex package accepting given args is available """
 183     tempdir = mkdtemp('-wl2pdf-test')
 184     fpath = os.path.join(tempdir, 'test.tex')
 185     f = open(fpath, 'w')
 186     f.write(r"""
 187         \documentclass{wl}
 188         \usepackage[%s]{%s}
 189         \begin{document}
 190         \end{document}
 191         """ % (args, package))
 192     f.close()
 193     if verbose:
 194         p = call(['xelatex', '-output-directory', tempdir, fpath])
 195     else:
 196         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 197     shutil.rmtree(tempdir)
 198     return p == 0
 199
 200
 201 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 202               cover=None, flags=None, customizations=None, ilustr_path=''):
 203     """ produces a PDF file with XeLaTeX
 204
 205     wldoc: a WLDocument
 206     verbose: prints all output from LaTeX
 207     save_tex: path to save the intermediary LaTeX file to
 208     morefloats (old/new/none): force specific morefloats
 209     cover: a cover.Cover factory or True for default
 210     flags: less-advertising,
 211     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 212     """
 213
 214     # Parse XSLT
 215     try:
 216         book_info = wldoc.book_info
 217         document = load_including_children(wldoc)
 218         root = document.edoc.getroot()
 219
 220         if cover:
 221             if cover is True:
 222                 cover = DefaultEbookCover
 223             bound_cover = cover(book_info, width=1200)
 224             root.set('data-cover-width', str(bound_cover.width))
 225             root.set('data-cover-height', str(bound_cover.height))
 226             if bound_cover.uses_dc_cover:
 227                 if book_info.cover_by:
 228                     root.set('data-cover-by', book_info.cover_by)
 229                 if book_info.cover_source:
 230                     root.set('data-cover-source', book_info.cover_source)
 231         if flags:
 232             for flag in flags:
 233                 root.set('flag-' + flag, 'yes')
 234
 235         # check for LaTeX packages
 236         if morefloats:
 237             root.set('morefloats', morefloats.lower())
 238         elif package_available('morefloats', 'maxfloats=19'):
 239             root.set('morefloats', 'new')
 240
 241         # add customizations
 242         if customizations is not None:
 243             root.set('customizations', u','.join(customizations))
 244
 245         # add editors info
 246         editors = document.editors()
 247         if editors:
 248             root.set('editors', u', '.join(sorted(
 249                 editor.readable() for editor in editors)))
 250         if document.book_info.funders:
 251             root.set('funders', u', '.join(document.book_info.funders))
 252         if document.book_info.thanks:
 253             root.set('thanks', document.book_info.thanks)
 254
 255         # hack the tree
 256         move_motifs_inside(document.edoc)
 257         hack_motifs(document.edoc)
 258         parse_creator(document.edoc)
 259         substitute_hyphens(document.edoc)
 260         fix_hanging(document.edoc)
 261         fix_tables(document.edoc)
 262
 263         # wl -> TeXML
 264         style_filename = get_stylesheet("wl2tex")
 265         style = etree.parse(style_filename)
 266         functions.reg_mathml_latex()
 267
 268         # TeXML -> LaTeX
 269         temp = mkdtemp('-wl2pdf')
 270
 271         for ilustr in document.edoc.findall("//ilustr"):
 272             shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
 273
 274         for sponsor in book_info.sponsors:
 275             ins = etree.Element("data-sponsor", name=sponsor)
 276             logo = sponsor_logo(sponsor)
 277             if logo:
 278                 fname = 'sponsor-%s' % os.path.basename(logo)
 279                 shutil.copy(logo, os.path.join(temp, fname))
 280                 ins.set('src', fname)
 281             root.insert(0, ins)
 282
 283         if book_info.sponsor_note:
 284             root.set("sponsor-note", book_info.sponsor_note)
 285
 286         texml = document.transform(style)
 287
 288         if cover:
 289             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 290                 bound_cover.save(f, quality=80)
 291
 292         del document  # no longer needed large object :)
 293
 294         tex_path = os.path.join(temp, 'doc.tex')
 295         fout = open(tex_path, 'w')
 296         process(StringIO(texml), fout, 'utf-8')
 297         fout.close()
 298         del texml
 299
 300         if save_tex:
 301             shutil.copy(tex_path, save_tex)
 302
 303         # LaTeX -> PDF
 304         shutil.copy(get_resource('pdf/wl.cls'), temp)
 305         shutil.copy(get_resource('res/wl-logo.png'), temp)
 306
 307         try:
 308             cwd = os.getcwd()
 309         except OSError:
 310             cwd = None
 311         os.chdir(temp)
 312
 313         if verbose:
 314             p = call(['xelatex', tex_path])
 315         else:
 316             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 317         if p:
 318             raise ParseError("Error parsing .tex file")
 319
 320         if cwd is not None:
 321             os.chdir(cwd)
 322
 323         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 324         pdf_path = os.path.join(temp, 'doc.pdf')
 325         shutil.move(pdf_path, output_file.name)
 326         shutil.rmtree(temp)
 327         return OutputFile.from_filename(output_file.name)
 328
 329     except (XMLSyntaxError, XSLTApplyError), e:
 330         raise ParseError(e)
 331
 332
 333 def load_including_children(wldoc=None, provider=None, uri=None):
 334     """ Makes one big xml file with children inserted at end.
 335
 336     Either wldoc or provider and URI must be provided.
 337     """
 338
 339     if uri and provider:
 340         f = provider.by_uri(uri)
 341         text = f.read().decode('utf-8')
 342         f.close()
 343     elif wldoc is not None:
 344         text = etree.tostring(wldoc.edoc, encoding=unicode)
 345         provider = wldoc.provider
 346     else:
 347         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 348
 349     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 350
 351     document = WLDocument.from_string(text, parse_dublincore=True, provider=provider)
 352     document.swap_endlines()
 353
 354     for child_uri in document.book_info.parts:
 355         child = load_including_children(provider=provider, uri=child_uri)
 356         document.edoc.getroot().append(child.edoc.getroot())
 357     return document