src/librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import print_function, unicode_literals
  13
  14 import os
  15 import os.path
  16 import shutil
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21 from itertools import chain
  22
  23 from Texml.processor import process
  24 from lxml import etree
  25 from lxml.etree import XMLSyntaxError, XSLTApplyError
  26 import six
  27
  28 from librarian.dcparser import Person
  29 from librarian.parser import WLDocument
  30 from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
  31 from librarian import functions
  32 from librarian.cover import make_cover
  33 from .sponsor import sponsor_logo
  34
  35
# Import-time setup: call the registration hooks exposed by
# librarian.functions.
# NOTE(review): presumably these register the XPath/XSLT extension functions
# used by the wl2tex stylesheet -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a stylesheet name to the resource path that get_stylesheet() hands
# to get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  45
  46 # CUSTOMIZATIONS = [
  47 #     'nofootnotes',
  48 #     'nothemes',
  49 #     'defaultleading',
  50 #     'onehalfleading',
  51 #     'doubleleading',
  52 #     'nowlfont',
  53 # ]
  54
  55
  56 def insert_tags(doc, split_re, tagname, exclude=None):
  57     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  58
  59     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
  60     >>> insert_tags(t, re.compile('-'), 'd')
  61     >>> print(etree.tostring(t, encoding='unicode'))
  62     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  63     """
  64
  65     for elem in doc.iter(tag=etree.Element):
  66         if exclude and elem.tag in exclude:
  67             continue
  68         if elem.text:
  69             chunks = split_re.split(elem.text)
  70             while len(chunks) > 1:
  71                 ins = etree.Element(tagname)
  72                 ins.tail = chunks.pop()
  73                 elem.insert(0, ins)
  74             elem.text = chunks.pop(0)
  75         if elem.tail:
  76             chunks = split_re.split(elem.tail)
  77             parent = elem.getparent()
  78             ins_index = parent.index(elem) + 1
  79             while len(chunks) > 1:
  80                 ins = etree.Element(tagname)
  81                 ins.tail = chunks.pop()
  82                 parent.insert(ins_index, ins)
  83             elem.tail = chunks.pop(0)
  84
  85
  86 def substitute_hyphens(doc):
  87     insert_tags(doc,
  88                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  89                 "dywiz",
  90                 exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
  91                 )
  92
  93
  94 def fix_hanging(doc):
  95     insert_tags(doc,
  96                 re.compile("(?<=\s\w)\s+"),
  97                 "nbsp",
  98                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  99                 )
 100
 101
 102 def fix_tables(doc):
 103     for kol in doc.iter(tag='kol'):
 104         if kol.tail is not None:
 105             if not kol.tail.strip():
 106                 kol.tail = None
 107     for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
 108         if table.get('ramka') == '1' or table.get('ramki') == '1':
 109             table.set('_format', '|' + 'X|' * len(table[0]))
 110         else:
 111             table.set('_format', 'X' * len(table[0]))
 112
 113
 114 def mark_subauthors(doc):
 115     root_author = ', '.join(elem.text for elem in doc.findall('./' + RDFNS('RDF') + '//' + DCNS('creator_parsed')))
 116     last_author = None
 117     # jeśli autor jest inny niż autor całości i niż poprzedni autor
 118     # to wstawiamy jakiś znacznik w rdf?
 119     for subutwor in doc.xpath('/utwor/utwor'):
 120         author = ', '.join(elem.text for elem in subutwor.findall('.//' + DCNS('creator_parsed')))
 121         if author not in (last_author, root_author):
 122             subutwor.find('.//' + RDFNS('RDF')).append(etree.Element('use_subauthor'))
 123         last_author = author
 124
 125
 126 def move_motifs_inside(doc):
 127     """ moves motifs to be into block elements """
 128     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
 129                             '//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 130         for motif in master.xpath('motyw'):
 131             for sib in motif.itersiblings():
 132                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
 133                                    'begin', 'end', 'motyw', 'extra', 'uwaga'):
 134                     # motif shouldn't have a tail - it would be untagged text
 135                     motif.tail = None
 136                     motif.getparent().remove(motif)
 137                     sib.insert(0, motif)
 138                     break
 139
 140
 141 def hack_motifs(doc):
 142     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 143     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 144
 145     moves motifs in stanzas from first verse to second
 146     and from next to last to last, then inserts negative vspace before them
 147     """
 148     for motif in doc.findall('//strofa//motyw'):
 149         # find relevant verse-level tag
 150         verse, stanza = motif, motif.getparent()
 151         while stanza is not None and stanza.tag != 'strofa':
 152             verse, stanza = stanza, stanza.getparent()
 153         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 154         breaks_after = sum(1 for i in verse.itersiblings('br'))
 155         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 156             move_by = 1
 157             if breaks_after == 2:
 158                 move_by += 1
 159             moved_motif = deepcopy(motif)
 160             motif.tag = 'span'
 161             motif.text = None
 162             moved_motif.tail = None
 163             moved_motif.set('moved', str(move_by))
 164
 165             for br in verse.itersiblings('br'):
 166                 if move_by > 1:
 167                     move_by -= 1
 168                     continue
 169                 br.addnext(moved_motif)
 170                 break
 171
 172
 173 def parse_creator(doc):
 174     """Generates readable versions of creator and translator tags.
 175
 176     Finds all dc:creator and dc.contributor.translator tags
 177     and adds *_parsed versions with forenames first.
 178     """
 179     for person in doc.xpath("|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
 180                             namespaces={'dc': str(DCNS)})[::-1]:
 181         if not person.text:
 182             continue
 183         p = Person.from_text(person.text)
 184         person_parsed = deepcopy(person)
 185         person_parsed.tag = person.tag + '_parsed'
 186         person_parsed.set('sortkey', person.text)
 187         person_parsed.text = p.readable()
 188         person.getparent().insert(0, person_parsed)
 189
 190
 191 def get_stylesheet(name):
 192     return get_resource(STYLESHEETS[name])
 193
 194
 195 def package_available(package, args='', verbose=False):
 196     """ check if a verion of a latex package accepting given args is available """
 197     tempdir = mkdtemp('-wl2pdf-test')
 198     fpath = os.path.join(tempdir, 'test.tex')
 199     f = open(fpath, 'w')
 200     f.write("""
 201         \\documentclass{wl}
 202         \\usepackage[%s]{%s}
 203         \\begin{document}
 204         \\end{document}
 205         """ % (args, package))
 206     f.close()
 207     if verbose:
 208         p = call(['xelatex', '-output-directory', tempdir, fpath])
 209     else:
 210         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 211     shutil.rmtree(tempdir)
 212     return p == 0
 213
 214
 215 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 216               cover=None, flags=None, customizations=None, ilustr_path='', latex_dir=False):
 217     """ produces a PDF file with XeLaTeX
 218
 219     wldoc: a WLDocument
 220     verbose: prints all output from LaTeX
 221     save_tex: path to save the intermediary LaTeX file to
 222     morefloats (old/new/none): force specific morefloats
 223     cover: a cover.Cover factory or True for default
 224     flags: less-advertising,
 225     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 226     """
 227
 228     # Parse XSLT
 229     try:
 230         book_info = wldoc.book_info
 231         document = load_including_children(wldoc)
 232         root = document.edoc.getroot()
 233
 234         if cover:
 235             if cover is True:
 236                 cover = make_cover
 237             bound_cover = cover(book_info, width=1200)
 238             root.set('data-cover-width', str(bound_cover.width))
 239             root.set('data-cover-height', str(bound_cover.height))
 240             if bound_cover.uses_dc_cover:
 241                 if book_info.cover_by:
 242                     root.set('data-cover-by', book_info.cover_by)
 243                 if book_info.cover_source:
 244                     root.set('data-cover-source', book_info.cover_source)
 245         if flags:
 246             for flag in flags:
 247                 root.set('flag-' + flag, 'yes')
 248
 249         # check for LaTeX packages
 250         if morefloats:
 251             root.set('morefloats', morefloats.lower())
 252         elif package_available('morefloats', 'maxfloats=19'):
 253             root.set('morefloats', 'new')
 254
 255         # add customizations
 256         if customizations is not None:
 257             root.set('customizations', u','.join(customizations))
 258
 259         # add editors info
 260         editors = document.editors()
 261         if editors:
 262             root.set('editors', u', '.join(sorted(
 263                 editor.readable() for editor in editors)))
 264         if document.book_info.funders:
 265             root.set('funders', u', '.join(document.book_info.funders))
 266         if document.book_info.thanks:
 267             root.set('thanks', document.book_info.thanks)
 268
 269         # hack the tree
 270         move_motifs_inside(document.edoc)
 271         hack_motifs(document.edoc)
 272         parse_creator(document.edoc)
 273         substitute_hyphens(document.edoc)
 274         fix_hanging(document.edoc)
 275         fix_tables(document.edoc)
 276         mark_subauthors(document.edoc)
 277
 278         # wl -> TeXML
 279         style_filename = get_stylesheet("wl2tex")
 280         style = etree.parse(style_filename)
 281         functions.reg_mathml_latex()
 282
 283         # TeXML -> LaTeX
 284         temp = mkdtemp('-wl2pdf')
 285
 286         for ilustr in document.edoc.findall("//ilustr"):
 287             shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
 288
 289         for sponsor in book_info.sponsors:
 290             ins = etree.Element("data-sponsor", name=sponsor)
 291             logo = sponsor_logo(sponsor)
 292             if logo:
 293                 fname = 'sponsor-%s' % os.path.basename(logo)
 294                 shutil.copy(logo, os.path.join(temp, fname))
 295                 ins.set('src', fname)
 296             root.insert(0, ins)
 297
 298         if book_info.sponsor_note:
 299             root.set("sponsor-note", book_info.sponsor_note)
 300
 301         texml = document.transform(style)
 302
 303         if cover:
 304             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 305                 bound_cover.save(f, quality=80)
 306
 307         del document  # no longer needed large object :)
 308
 309         tex_path = os.path.join(temp, 'doc.tex')
 310         fout = open(tex_path, 'wb')
 311         process(six.BytesIO(texml), fout, 'utf-8')
 312         fout.close()
 313         del texml
 314
 315         if save_tex:
 316             shutil.copy(tex_path, save_tex)
 317
 318         # LaTeX -> PDF
 319         shutil.copy(get_resource('pdf/wl.cls'), temp)
 320         shutil.copy(get_resource('res/wl-logo.png'), temp)
 321
 322         if latex_dir:
 323             return temp
 324
 325         try:
 326             cwd = os.getcwd()
 327         except OSError:
 328             cwd = None
 329         os.chdir(temp)
 330
 331         # some things work better when compiled twice
 332         # (table of contents, [line numbers - disabled])
 333         for run in range(2):
 334             if verbose:
 335                 p = call(['xelatex', tex_path])
 336             else:
 337                 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 338             if p:
 339                 raise ParseError("Error parsing .tex file")
 340
 341         if cwd is not None:
 342             os.chdir(cwd)
 343
 344         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 345         pdf_path = os.path.join(temp, 'doc.pdf')
 346         shutil.move(pdf_path, output_file.name)
 347         shutil.rmtree(temp)
 348         return OutputFile.from_filename(output_file.name)
 349
 350     except (XMLSyntaxError, XSLTApplyError) as e:
 351         raise ParseError(e)
 352
 353
 354 def load_including_children(wldoc=None, provider=None, uri=None):
 355     """ Makes one big xml file with children inserted at end.
 356
 357     Either wldoc or provider and URI must be provided.
 358     """
 359
 360     if uri and provider:
 361         f = provider.by_uri(uri)
 362         text = f.read().decode('utf-8')
 363         f.close()
 364     elif wldoc is not None:
 365         text = etree.tostring(wldoc.edoc, encoding='unicode')
 366         provider = wldoc.provider
 367     else:
 368         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 369
 370     text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
 371
 372     document = WLDocument.from_bytes(text.encode('utf-8'), parse_dublincore=True, provider=provider)
 373     document.swap_endlines()
 374
 375     for child_uri in document.book_info.parts:
 376         child = load_including_children(provider=provider, uri=child_uri)
 377         document.edoc.getroot().append(child.edoc.getroot())
 378     return document