librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21 from itertools import chain
  22
  23 from Texml.processor import process
  24 from lxml import etree
  25 from lxml.etree import XMLSyntaxError, XSLTApplyError
  26
  27 from librarian.dcparser import Person
  28 from librarian.parser import WLDocument
  29 from librarian import ParseError, DCNS, get_resource, OutputFile
  30 from librarian import functions
  31 from librarian.cover import DefaultEbookCover
  32 from .sponsor import sponsor_logo
  33
  34
# Import-time setup: call the registration helpers from librarian.functions.
# NOTE(review): presumably these register the extension functions that the
# XSLT stylesheet relies on -- confirm against librarian/functions.py.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a short stylesheet name to the resource path that get_stylesheet()
# passes on to get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  44
  45 # CUSTOMIZATIONS = [
  46 #     'nofootnotes',
  47 #     'nothemes',
  48 #     'defaultleading',
  49 #     'onehalfleading',
  50 #     'doubleleading',
  51 #     'nowlfont',
  52 # ]
  53
  54
  55 def insert_tags(doc, split_re, tagname, exclude=None):
  56     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  57
  58     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
  59     >>> insert_tags(t, re.compile('-'), 'd')
  60     >>> print etree.tostring(t)
  61     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  62     """
  63
  64     for elem in doc.iter(tag=etree.Element):
  65         if exclude and elem.tag in exclude:
  66             continue
  67         if elem.text:
  68             chunks = split_re.split(elem.text)
  69             while len(chunks) > 1:
  70                 ins = etree.Element(tagname)
  71                 ins.tail = chunks.pop()
  72                 elem.insert(0, ins)
  73             elem.text = chunks.pop(0)
  74         if elem.tail:
  75             chunks = split_re.split(elem.tail)
  76             parent = elem.getparent()
  77             ins_index = parent.index(elem) + 1
  78             while len(chunks) > 1:
  79                 ins = etree.Element(tagname)
  80                 ins.tail = chunks.pop()
  81                 parent.insert(ins_index, ins)
  82             elem.tail = chunks.pop(0)
  83
  84
  85 def substitute_hyphens(doc):
  86     insert_tags(doc,
  87                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  88                 "dywiz",
  89                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  90                 )
  91
  92
  93 def fix_hanging(doc):
  94     insert_tags(doc,
  95                 re.compile("(?<=\s\w)\s+"),
  96                 "nbsp",
  97                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  98                 )
  99
 100
 101 def fix_tables(doc):
 102     for kol in doc.iter(tag='kol'):
 103         if kol.tail is not None:
 104             if not kol.tail.strip():
 105                 kol.tail = None
 106     for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
 107         if table.get('ramka') == '1' or table.get('ramki') == '1':
 108             table.set('_format', '|' + 'X|' * len(table[0]))
 109         else:
 110             table.set('_format', 'X' * len(table[0]))
 111
 112
 113 def move_motifs_inside(doc):
 114     """ moves motifs to be into block elements """
 115     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
 116                             '//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 117         for motif in master.xpath('motyw'):
 118             for sib in motif.itersiblings():
 119                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
 120                                    'begin', 'end', 'motyw', 'extra', 'uwaga'):
 121                     # motif shouldn't have a tail - it would be untagged text
 122                     motif.tail = None
 123                     motif.getparent().remove(motif)
 124                     sib.insert(0, motif)
 125                     break
 126
 127
 128 def hack_motifs(doc):
 129     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 130     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 131
 132     moves motifs in stanzas from first verse to second
 133     and from next to last to last, then inserts negative vspace before them
 134     """
 135     for motif in doc.findall('//strofa//motyw'):
 136         # find relevant verse-level tag
 137         verse, stanza = motif, motif.getparent()
 138         while stanza is not None and stanza.tag != 'strofa':
 139             verse, stanza = stanza, stanza.getparent()
 140         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 141         breaks_after = sum(1 for i in verse.itersiblings('br'))
 142         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 143             move_by = 1
 144             if breaks_after == 2:
 145                 move_by += 1
 146             moved_motif = deepcopy(motif)
 147             motif.tag = 'span'
 148             motif.text = None
 149             moved_motif.tail = None
 150             moved_motif.set('moved', str(move_by))
 151
 152             for br in verse.itersiblings('br'):
 153                 if move_by > 1:
 154                     move_by -= 1
 155                     continue
 156                 br.addnext(moved_motif)
 157                 break
 158
 159
 160 def parse_creator(doc):
 161     """Generates readable versions of creator and translator tags.
 162
 163     Finds all dc:creator and dc.contributor.translator tags
 164     and adds *_parsed versions with forenames first.
 165     """
 166     for person in doc.xpath("|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
 167                             namespaces={'dc': str(DCNS)})[::-1]:
 168         if not person.text:
 169             continue
 170         p = Person.from_text(person.text)
 171         person_parsed = deepcopy(person)
 172         person_parsed.tag = person.tag + '_parsed'
 173         person_parsed.set('sortkey', person.text)
 174         person_parsed.text = p.readable()
 175         person.getparent().insert(0, person_parsed)
 176
 177
 178 def get_stylesheet(name):
 179     return get_resource(STYLESHEETS[name])
 180
 181
 182 def package_available(package, args='', verbose=False):
 183     """ check if a verion of a latex package accepting given args is available """
 184     tempdir = mkdtemp('-wl2pdf-test')
 185     fpath = os.path.join(tempdir, 'test.tex')
 186     f = open(fpath, 'w')
 187     f.write(r"""
 188         \documentclass{wl}
 189         \usepackage[%s]{%s}
 190         \begin{document}
 191         \end{document}
 192         """ % (args, package))
 193     f.close()
 194     if verbose:
 195         p = call(['xelatex', '-output-directory', tempdir, fpath])
 196     else:
 197         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 198     shutil.rmtree(tempdir)
 199     return p == 0
 200
 201
 202 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 203               cover=None, flags=None, customizations=None, ilustr_path='', latex_dir=False):
 204     """ produces a PDF file with XeLaTeX
 205
 206     wldoc: a WLDocument
 207     verbose: prints all output from LaTeX
 208     save_tex: path to save the intermediary LaTeX file to
 209     morefloats (old/new/none): force specific morefloats
 210     cover: a cover.Cover factory or True for default
 211     flags: less-advertising,
 212     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 213     """
 214
 215     # Parse XSLT
 216     try:
 217         book_info = wldoc.book_info
 218         document = load_including_children(wldoc)
 219         root = document.edoc.getroot()
 220
 221         if cover:
 222             if cover is True:
 223                 cover = DefaultEbookCover
 224             bound_cover = cover(book_info, width=1200)
 225             root.set('data-cover-width', str(bound_cover.width))
 226             root.set('data-cover-height', str(bound_cover.height))
 227             if bound_cover.uses_dc_cover:
 228                 if book_info.cover_by:
 229                     root.set('data-cover-by', book_info.cover_by)
 230                 if book_info.cover_source:
 231                     root.set('data-cover-source', book_info.cover_source)
 232         if flags:
 233             for flag in flags:
 234                 root.set('flag-' + flag, 'yes')
 235
 236         # check for LaTeX packages
 237         if morefloats:
 238             root.set('morefloats', morefloats.lower())
 239         elif package_available('morefloats', 'maxfloats=19'):
 240             root.set('morefloats', 'new')
 241
 242         # add customizations
 243         if customizations is not None:
 244             root.set('customizations', u','.join(customizations))
 245
 246         # add editors info
 247         editors = document.editors()
 248         if editors:
 249             root.set('editors', u', '.join(sorted(
 250                 editor.readable() for editor in editors)))
 251         if document.book_info.funders:
 252             root.set('funders', u', '.join(document.book_info.funders))
 253         if document.book_info.thanks:
 254             root.set('thanks', document.book_info.thanks)
 255
 256         # hack the tree
 257         move_motifs_inside(document.edoc)
 258         hack_motifs(document.edoc)
 259         parse_creator(document.edoc)
 260         substitute_hyphens(document.edoc)
 261         fix_hanging(document.edoc)
 262         fix_tables(document.edoc)
 263
 264         # wl -> TeXML
 265         style_filename = get_stylesheet("wl2tex")
 266         style = etree.parse(style_filename)
 267         functions.reg_mathml_latex()
 268
 269         # TeXML -> LaTeX
 270         temp = mkdtemp('-wl2pdf')
 271
 272         for ilustr in document.edoc.findall("//ilustr"):
 273             shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
 274
 275         for sponsor in book_info.sponsors:
 276             ins = etree.Element("data-sponsor", name=sponsor)
 277             logo = sponsor_logo(sponsor)
 278             if logo:
 279                 fname = 'sponsor-%s' % os.path.basename(logo)
 280                 shutil.copy(logo, os.path.join(temp, fname))
 281                 ins.set('src', fname)
 282             root.insert(0, ins)
 283
 284         if book_info.sponsor_note:
 285             root.set("sponsor-note", book_info.sponsor_note)
 286
 287         texml = document.transform(style)
 288
 289         if cover:
 290             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 291                 bound_cover.save(f, quality=80)
 292
 293         del document  # no longer needed large object :)
 294
 295         tex_path = os.path.join(temp, 'doc.tex')
 296         fout = open(tex_path, 'w')
 297         process(StringIO(texml), fout, 'utf-8')
 298         fout.close()
 299         del texml
 300
 301         if save_tex:
 302             shutil.copy(tex_path, save_tex)
 303
 304         # LaTeX -> PDF
 305         shutil.copy(get_resource('pdf/wl.cls'), temp)
 306         shutil.copy(get_resource('res/wl-logo.png'), temp)
 307
 308         if latex_dir:
 309             return temp
 310
 311         try:
 312             cwd = os.getcwd()
 313         except OSError:
 314             cwd = None
 315         os.chdir(temp)
 316
 317         # some things work better when compiled twice
 318         # but they are not enabled now (line numbers)
 319         for run in xrange(1):
 320             if verbose:
 321                 p = call(['xelatex', tex_path])
 322             else:
 323                 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 324             if p:
 325                 raise ParseError("Error parsing .tex file")
 326
 327         if cwd is not None:
 328             os.chdir(cwd)
 329
 330         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 331         pdf_path = os.path.join(temp, 'doc.pdf')
 332         shutil.move(pdf_path, output_file.name)
 333         shutil.rmtree(temp)
 334         return OutputFile.from_filename(output_file.name)
 335
 336     except (XMLSyntaxError, XSLTApplyError), e:
 337         raise ParseError(e)
 338
 339
 340 def load_including_children(wldoc=None, provider=None, uri=None):
 341     """ Makes one big xml file with children inserted at end.
 342
 343     Either wldoc or provider and URI must be provided.
 344     """
 345
 346     if uri and provider:
 347         f = provider.by_uri(uri)
 348         text = f.read().decode('utf-8')
 349         f.close()
 350     elif wldoc is not None:
 351         text = etree.tostring(wldoc.edoc, encoding=unicode)
 352         provider = wldoc.provider
 353     else:
 354         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 355
 356     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 357
 358     document = WLDocument.from_string(text, parse_dublincore=True, provider=provider)
 359     document.swap_endlines()
 360
 361     for child_uri in document.book_info.parts:
 362         child = load_including_children(provider=provider, uri=child_uri)
 363         document.edoc.getroot().append(child.edoc.getroot())
 364     return document