src/librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import print_function, unicode_literals
  13
  14 import os
  15 import os.path
  16 import shutil
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21 from itertools import chain
  22
  23 from Texml.processor import process
  24 from lxml import etree
  25 from lxml.etree import XMLSyntaxError, XSLTApplyError
  26 import six
  27
  28 from librarian.dcparser import Person
  29 from librarian.parser import WLDocument
  30 from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
  31 from librarian import functions
  32 from librarian.cover import make_cover
  33 from .sponsor import sponsor_logo
  34
  35
# Run librarian's registration helpers once, at import time.
# NOTE(review): presumably these register extension functions used by the
# XSLT stylesheet listed below -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a stylesheet name to the resource path that get_stylesheet()
# hands over to get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  45
  46 # CUSTOMIZATIONS = [
  47 #     'nofootnotes',
  48 #     'nothemes',
  49 #     'defaultleading',
  50 #     'onehalfleading',
  51 #     'doubleleading',
  52 #     'nowlfont',
  53 # ]
  54
  55
  56 def insert_tags(doc, split_re, tagname, exclude=None):
  57     """
  58     Inserts <tagname> for every occurence of `split_re'
  59     in text nodes in the `doc' tree.
  60
  61     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
  62     >>> insert_tags(t, re.compile('-'), 'd')
  63     >>> print(etree.tostring(t, encoding='unicode'))
  64     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  65     """
  66
  67     for elem in doc.iter(tag=etree.Element):
  68         if exclude and elem.tag in exclude:
  69             continue
  70         if elem.text:
  71             chunks = split_re.split(elem.text)
  72             while len(chunks) > 1:
  73                 ins = etree.Element(tagname)
  74                 ins.tail = chunks.pop()
  75                 elem.insert(0, ins)
  76             elem.text = chunks.pop(0)
  77         if elem.tail:
  78             chunks = split_re.split(elem.tail)
  79             parent = elem.getparent()
  80             ins_index = parent.index(elem) + 1
  81             while len(chunks) > 1:
  82                 ins = etree.Element(tagname)
  83                 ins.tail = chunks.pop()
  84                 parent.insert(ins_index, ins)
  85             elem.tail = chunks.pop(0)
  86
  87
  88 def substitute_hyphens(doc):
  89     insert_tags(
  90         doc,
  91         re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
  92         "dywiz",
  93         exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
  94     )
  95
  96
  97 def fix_hanging(doc):
  98     insert_tags(
  99         doc,
 100         re.compile(r"(?<=\s\w)\s+"),
 101         "nbsp",
 102         exclude=[DCNS("identifier.url"), DCNS("rights.license")]
 103     )
 104
 105
 106 def fix_tables(doc):
 107     for kol in doc.iter(tag='kol'):
 108         if kol.tail is not None:
 109             if not kol.tail.strip():
 110                 kol.tail = None
 111     for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
 112         if table.get('ramka') == '1' or table.get('ramki') == '1':
 113             table.set('_format', '|' + 'X|' * len(table[0]))
 114         else:
 115             table.set('_format', 'X' * len(table[0]))
 116
 117
 118 def mark_subauthors(doc):
 119     root_author = ', '.join(
 120         elem.text
 121         for elem in doc.findall(
 122                 './' + RDFNS('RDF') + '//' + DCNS('creator_parsed')
 123         )
 124     )
 125     last_author = None
 126     # jeśli autor jest inny niż autor całości i niż poprzedni autor
 127     # to wstawiamy jakiś znacznik w rdf?
 128     for subutwor in doc.xpath('/utwor/utwor'):
 129         author = ', '.join(
 130             elem.text
 131             for elem in subutwor.findall('.//' + DCNS('creator_parsed'))
 132         )
 133         if author not in (last_author, root_author):
 134             subutwor.find('.//' + RDFNS('RDF')).append(
 135                 etree.Element('use_subauthor')
 136             )
 137         last_author = author
 138
 139
 140 def move_motifs_inside(doc):
 141     """ moves motifs to be into block elements """
 142     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
 143                             '//dramat_wierszowany_l|//dramat_wierszowany_lp|'
 144                             '//dramat_wspolczesny'):
 145         for motif in master.xpath('motyw'):
 146             for sib in motif.itersiblings():
 147                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk',
 148                                    'separator_linia', 'begin', 'end',
 149                                    'motyw', 'extra', 'uwaga'):
 150                     # motif shouldn't have a tail - it would be untagged text
 151                     motif.tail = None
 152                     motif.getparent().remove(motif)
 153                     sib.insert(0, motif)
 154                     break
 155
 156
 157 def hack_motifs(doc):
 158     """
 159     Dirty hack for the marginpar-creates-orphans LaTeX problem
 160     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 161
 162     Moves motifs in stanzas from first verse to second and from next
 163     to last to last, then inserts negative vspace before them.
 164     """
 165     for motif in doc.findall('//strofa//motyw'):
 166         # find relevant verse-level tag
 167         verse, stanza = motif, motif.getparent()
 168         while stanza is not None and stanza.tag != 'strofa':
 169             verse, stanza = stanza, stanza.getparent()
 170         breaks_before = sum(
 171             1 for i in verse.itersiblings('br', preceding=True)
 172         )
 173         breaks_after = sum(1 for i in verse.itersiblings('br'))
 174         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 175             move_by = 1
 176             if breaks_after == 2:
 177                 move_by += 1
 178             moved_motif = deepcopy(motif)
 179             motif.tag = 'span'
 180             motif.text = None
 181             moved_motif.tail = None
 182             moved_motif.set('moved', str(move_by))
 183
 184             for br in verse.itersiblings('br'):
 185                 if move_by > 1:
 186                     move_by -= 1
 187                     continue
 188                 br.addnext(moved_motif)
 189                 break
 190
 191
 192 def parse_creator(doc):
 193     """Generates readable versions of creator and translator tags.
 194
 195     Finds all dc:creator and dc.contributor.translator tags
 196     and adds *_parsed versions with forenames first.
 197     """
 198     for person in doc.xpath(
 199             "|".join('//dc:' + tag for tag in (
 200                 'creator', 'contributor.translator'
 201             )),
 202             namespaces={'dc': str(DCNS)})[::-1]:
 203         if not person.text:
 204             continue
 205         p = Person.from_text(person.text)
 206         person_parsed = deepcopy(person)
 207         person_parsed.tag = person.tag + '_parsed'
 208         person_parsed.set('sortkey', person.text)
 209         person_parsed.text = p.readable()
 210         person.getparent().insert(0, person_parsed)
 211
 212
 213 def get_stylesheet(name):
 214     return get_resource(STYLESHEETS[name])
 215
 216
 217 def package_available(package, args='', verbose=False):
 218     """
 219     Check if a verion of a latex package accepting given args
 220     is available.
 221     """
 222     tempdir = mkdtemp('-wl2pdf-test')
 223     fpath = os.path.join(tempdir, 'test.tex')
 224     f = open(fpath, 'w')
 225     f.write("""
 226         \\documentclass{wl}
 227         \\usepackage[%s]{%s}
 228         \\begin{document}
 229         \\end{document}
 230         """ % (args, package))
 231     f.close()
 232     if verbose:
 233         p = call(['xelatex', '-output-directory', tempdir, fpath])
 234     else:
 235         p = call(
 236             ['xelatex', '-interaction=batchmode', '-output-directory',
 237              tempdir, fpath],
 238             stdout=PIPE, stderr=PIPE
 239         )
 240     shutil.rmtree(tempdir)
 241     return p == 0
 242
 243
 244 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 245               cover=None, flags=None, customizations=None, ilustr_path='',
 246               latex_dir=False):
 247     """ produces a PDF file with XeLaTeX
 248
 249     wldoc: a WLDocument
 250     verbose: prints all output from LaTeX
 251     save_tex: path to save the intermediary LaTeX file to
 252     morefloats (old/new/none): force specific morefloats
 253     cover: a cover.Cover factory or True for default
 254     flags: less-advertising,
 255     customizations: user requested customizations regarding various
 256         formatting parameters (passed to wl LaTeX class)
 257     """
 258
 259     # Parse XSLT
 260     try:
 261         book_info = wldoc.book_info
 262         document = load_including_children(wldoc)
 263         root = document.edoc.getroot()
 264
 265         if cover:
 266             if cover is True:
 267                 cover = make_cover
 268             bound_cover = cover(book_info, width=1200)
 269             root.set('data-cover-width', str(bound_cover.width))
 270             root.set('data-cover-height', str(bound_cover.height))
 271             if bound_cover.uses_dc_cover:
 272                 if book_info.cover_by:
 273                     root.set('data-cover-by', book_info.cover_by)
 274                 if book_info.cover_source:
 275                     root.set('data-cover-source', book_info.cover_source)
 276         if flags:
 277             for flag in flags:
 278                 root.set('flag-' + flag, 'yes')
 279
 280         # check for LaTeX packages
 281         if morefloats:
 282             root.set('morefloats', morefloats.lower())
 283         elif package_available('morefloats', 'maxfloats=19'):
 284             root.set('morefloats', 'new')
 285
 286         # add customizations
 287         if customizations is not None:
 288             root.set('customizations', u','.join(customizations))
 289
 290         # add editors info
 291         editors = document.editors()
 292         if editors:
 293             root.set('editors', u', '.join(sorted(
 294                 editor.readable() for editor in editors)))
 295         if document.book_info.funders:
 296             root.set('funders', u', '.join(document.book_info.funders))
 297         if document.book_info.thanks:
 298             root.set('thanks', document.book_info.thanks)
 299
 300         # hack the tree
 301         move_motifs_inside(document.edoc)
 302         hack_motifs(document.edoc)
 303         parse_creator(document.edoc)
 304         substitute_hyphens(document.edoc)
 305         fix_hanging(document.edoc)
 306         fix_tables(document.edoc)
 307         mark_subauthors(document.edoc)
 308
 309         # wl -> TeXML
 310         style_filename = get_stylesheet("wl2tex")
 311         style = etree.parse(style_filename)
 312         functions.reg_mathml_latex()
 313
 314         # TeXML -> LaTeX
 315         temp = mkdtemp('-wl2pdf')
 316
 317         for ilustr in document.edoc.findall("//ilustr"):
 318             shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
 319
 320         for sponsor in book_info.sponsors:
 321             ins = etree.Element("data-sponsor", name=sponsor)
 322             logo = sponsor_logo(sponsor)
 323             if logo:
 324                 fname = 'sponsor-%s' % os.path.basename(logo)
 325                 shutil.copy(logo, os.path.join(temp, fname))
 326                 ins.set('src', fname)
 327             root.insert(0, ins)
 328
 329         if book_info.sponsor_note:
 330             root.set("sponsor-note", book_info.sponsor_note)
 331
 332         texml = document.transform(style)
 333
 334         if cover:
 335             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 336                 bound_cover.save(f, quality=80)
 337
 338         del document  # no longer needed large object :)
 339
 340         tex_path = os.path.join(temp, 'doc.tex')
 341         fout = open(tex_path, 'wb')
 342         process(six.BytesIO(texml), fout, 'utf-8')
 343         fout.close()
 344         del texml
 345
 346         if save_tex:
 347             shutil.copy(tex_path, save_tex)
 348
 349         # LaTeX -> PDF
 350         shutil.copy(get_resource('pdf/wl.cls'), temp)
 351         shutil.copy(get_resource('res/wl-logo.png'), temp)
 352
 353         if latex_dir:
 354             return temp
 355
 356         try:
 357             cwd = os.getcwd()
 358         except OSError:
 359             cwd = None
 360         os.chdir(temp)
 361
 362         # some things work better when compiled twice
 363         # (table of contents, [line numbers - disabled])
 364         for run in range(2):
 365             if verbose:
 366                 p = call(['xelatex', tex_path])
 367             else:
 368                 p = call(
 369                     ['xelatex', '-interaction=batchmode', tex_path],
 370                     stdout=PIPE, stderr=PIPE
 371                 )
 372             if p:
 373                 raise ParseError("Error parsing .tex file")
 374
 375         if cwd is not None:
 376             os.chdir(cwd)
 377
 378         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf',
 379                                          delete=False)
 380         pdf_path = os.path.join(temp, 'doc.pdf')
 381         shutil.move(pdf_path, output_file.name)
 382         shutil.rmtree(temp)
 383         return OutputFile.from_filename(output_file.name)
 384
 385     except (XMLSyntaxError, XSLTApplyError) as e:
 386         raise ParseError(e)
 387
 388
 389 def load_including_children(wldoc=None, provider=None, uri=None):
 390     """ Makes one big xml file with children inserted at end.
 391
 392     Either wldoc or provider and URI must be provided.
 393     """
 394
 395     if uri and provider:
 396         f = provider.by_uri(uri)
 397         text = f.read().decode('utf-8')
 398         f.close()
 399     elif wldoc is not None:
 400         text = etree.tostring(wldoc.edoc, encoding='unicode')
 401         provider = wldoc.provider
 402     else:
 403         raise ValueError(
 404             'Neither a WLDocument, nor provider and URI were provided.'
 405         )
 406
 407     text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
 408
 409     document = WLDocument.from_bytes(text.encode('utf-8'),
 410                                      parse_dublincore=True, provider=provider)
 411     document.swap_endlines()
 412
 413     for child_uri in document.book_info.parts:
 414         child = load_including_children(provider=provider, uri=child_uri)
 415         document.edoc.getroot().append(child.edoc.getroot())
 416     return document