src/librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import print_function, unicode_literals
  13
  14 import os
  15 import os.path
  16 import shutil
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21 from itertools import chain
  22
  23 from PIL import Image
  24 from Texml.processor import process
  25 from lxml import etree
  26 from lxml.etree import XMLSyntaxError, XSLTApplyError
  27 import six
  28
  29 from librarian.dcparser import Person
  30 from librarian.parser import WLDocument
  31 from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
  32 from librarian import functions
  33 from librarian.cover import make_cover
  34 from .sponsor import sponsor_logo
  35
  36
# Module-level setup: each reg_*() call below runs once, when this module is
# imported.
# NOTE(review): presumably these register the extension functions that the
# wl2tex stylesheet calls -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a stylesheet name to its resource path; looked up by get_stylesheet().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  46
  47 # CUSTOMIZATIONS = [
  48 #     'nofootnotes',
  49 #     'nothemes',
  50 #     'defaultleading',
  51 #     'onehalfleading',
  52 #     'doubleleading',
  53 #     'nowlfont',
  54 # ]
  55
  56
  57 def insert_tags(doc, split_re, tagname, exclude=None):
  58     """
  59     Inserts <tagname> for every occurence of `split_re'
  60     in text nodes in the `doc' tree.
  61
  62     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
  63     >>> insert_tags(t, re.compile('-'), 'd')
  64     >>> print(etree.tostring(t, encoding='unicode'))
  65     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  66     """
  67
  68     for elem in doc.iter(tag=etree.Element):
  69         if exclude and elem.tag in exclude:
  70             continue
  71         if elem.text:
  72             chunks = split_re.split(elem.text)
  73             while len(chunks) > 1:
  74                 ins = etree.Element(tagname)
  75                 ins.tail = chunks.pop()
  76                 elem.insert(0, ins)
  77             elem.text = chunks.pop(0)
  78         if elem.tail:
  79             chunks = split_re.split(elem.tail)
  80             parent = elem.getparent()
  81             ins_index = parent.index(elem) + 1
  82             while len(chunks) > 1:
  83                 ins = etree.Element(tagname)
  84                 ins.tail = chunks.pop()
  85                 parent.insert(ins_index, ins)
  86             elem.tail = chunks.pop(0)
  87
  88
  89 def substitute_hyphens(doc):
  90     insert_tags(
  91         doc,
  92         re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
  93         "dywiz",
  94         exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
  95     )
  96
  97
  98 def fix_hanging(doc):
  99     insert_tags(
 100         doc,
 101         re.compile(r"(?<=\s\w)\s+"),
 102         "nbsp",
 103         exclude=[DCNS("identifier.url"), DCNS("rights.license")]
 104     )
 105
 106
 107 def fix_tables(doc):
 108     for kol in doc.iter(tag='kol'):
 109         if kol.tail is not None:
 110             if not kol.tail.strip():
 111                 kol.tail = None
 112     for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
 113         if table.get('ramka') == '1' or table.get('ramki') == '1':
 114             table.set('_format', '|' + 'X|' * len(table[0]))
 115         else:
 116             table.set('_format', 'X' * len(table[0]))
 117
 118
 119 def mark_subauthors(doc):
 120     root_author = ', '.join(
 121         elem.text
 122         for elem in doc.findall(
 123                 './' + RDFNS('RDF') + '//' + DCNS('creator_parsed')
 124         )
 125     )
 126     last_author = None
 127     # jeśli autor jest inny niż autor całości i niż poprzedni autor
 128     # to wstawiamy jakiś znacznik w rdf?
 129     for subutwor in doc.xpath('/utwor/utwor'):
 130         author = ', '.join(
 131             elem.text
 132             for elem in subutwor.findall('.//' + DCNS('creator_parsed'))
 133         )
 134         if author not in (last_author, root_author):
 135             subutwor.find('.//' + RDFNS('RDF')).append(
 136                 etree.Element('use_subauthor')
 137             )
 138         last_author = author
 139
 140
 141 def move_motifs_inside(doc):
 142     """ moves motifs to be into block elements """
 143     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
 144                             '//dramat_wierszowany_l|//dramat_wierszowany_lp|'
 145                             '//dramat_wspolczesny'):
 146         for motif in master.xpath('motyw'):
 147             for sib in motif.itersiblings():
 148                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk',
 149                                    'separator_linia', 'begin', 'end',
 150                                    'motyw', 'extra', 'uwaga'):
 151                     # motif shouldn't have a tail - it would be untagged text
 152                     motif.tail = None
 153                     motif.getparent().remove(motif)
 154                     sib.insert(0, motif)
 155                     break
 156
 157
 158 def hack_motifs(doc):
 159     """
 160     Dirty hack for the marginpar-creates-orphans LaTeX problem
 161     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 162
 163     Moves motifs in stanzas from first verse to second and from next
 164     to last to last, then inserts negative vspace before them.
 165     """
 166     for motif in doc.findall('//strofa//motyw'):
 167         # find relevant verse-level tag
 168         verse, stanza = motif, motif.getparent()
 169         while stanza is not None and stanza.tag != 'strofa':
 170             verse, stanza = stanza, stanza.getparent()
 171         breaks_before = sum(
 172             1 for i in verse.itersiblings('br', preceding=True)
 173         )
 174         breaks_after = sum(1 for i in verse.itersiblings('br'))
 175         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 176             move_by = 1
 177             if breaks_after == 2:
 178                 move_by += 1
 179             moved_motif = deepcopy(motif)
 180             motif.tag = 'span'
 181             motif.text = None
 182             moved_motif.tail = None
 183             moved_motif.set('moved', str(move_by))
 184
 185             for br in verse.itersiblings('br'):
 186                 if move_by > 1:
 187                     move_by -= 1
 188                     continue
 189                 br.addnext(moved_motif)
 190                 break
 191
 192
 193 def parse_creator(doc):
 194     """Generates readable versions of creator and translator tags.
 195
 196     Finds all dc:creator and dc.contributor.translator tags
 197     and adds *_parsed versions with forenames first.
 198     """
 199     for person in doc.xpath(
 200             "|".join('//dc:' + tag for tag in (
 201                 'creator', 'contributor.translator'
 202             )),
 203             namespaces={'dc': str(DCNS)})[::-1]:
 204         if not person.text:
 205             continue
 206         p = Person.from_text(person.text)
 207         person_parsed = deepcopy(person)
 208         person_parsed.tag = person.tag + '_parsed'
 209         person_parsed.set('sortkey', person.text)
 210         person_parsed.text = p.readable()
 211         person.getparent().insert(0, person_parsed)
 212
 213
 214 def get_stylesheet(name):
 215     return get_resource(STYLESHEETS[name])
 216
 217
 218 def package_available(package, args='', verbose=False):
 219     """
 220     Check if a verion of a latex package accepting given args
 221     is available.
 222     """
 223     tempdir = mkdtemp('-wl2pdf-test')
 224     fpath = os.path.join(tempdir, 'test.tex')
 225     f = open(fpath, 'w')
 226     f.write("""
 227         \\documentclass{wl}
 228         \\usepackage[%s]{%s}
 229         \\begin{document}
 230         \\end{document}
 231         """ % (args, package))
 232     f.close()
 233     if verbose:
 234         p = call(['xelatex', '-output-directory', tempdir, fpath])
 235     else:
 236         p = call(
 237             ['xelatex', '-interaction=batchmode', '-output-directory',
 238              tempdir, fpath],
 239             stdout=PIPE, stderr=PIPE
 240         )
 241     shutil.rmtree(tempdir)
 242     return p == 0
 243
 244
 245 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 246               cover=None, flags=None, customizations=None, base_url='file://./',
 247               latex_dir=False):
 248     """ produces a PDF file with XeLaTeX
 249
 250     wldoc: a WLDocument
 251     verbose: prints all output from LaTeX
 252     save_tex: path to save the intermediary LaTeX file to
 253     morefloats (old/new/none): force specific morefloats
 254     cover: a cover.Cover factory or True for default
 255     flags: less-advertising,
 256     customizations: user requested customizations regarding various
 257         formatting parameters (passed to wl LaTeX class)
 258     """
 259
 260     # Parse XSLT
 261     try:
 262         book_info = wldoc.book_info
 263         document = load_including_children(wldoc)
 264         root = document.edoc.getroot()
 265
 266         if cover:
 267             if cover is True:
 268                 cover = make_cover
 269             bound_cover = cover(book_info, width=1200)
 270             root.set('data-cover-width', str(bound_cover.width))
 271             root.set('data-cover-height', str(bound_cover.height))
 272             if bound_cover.uses_dc_cover:
 273                 if book_info.cover_by:
 274                     root.set('data-cover-by', book_info.cover_by)
 275                 if book_info.cover_source:
 276                     root.set('data-cover-source', book_info.cover_source)
 277         if flags:
 278             for flag in flags:
 279                 root.set('flag-' + flag, 'yes')
 280
 281         # check for LaTeX packages
 282         if morefloats:
 283             root.set('morefloats', morefloats.lower())
 284         elif package_available('morefloats', 'maxfloats=19'):
 285             root.set('morefloats', 'new')
 286
 287         if customizations is None:
 288             customizations = []
 289         else:
 290             customizations = list(customizations)
 291
 292         if book_info.endnotes:
 293             customizations.append('endnotes')
 294
 295         # add customizations
 296         if customizations is not None:
 297             root.set('customizations', u','.join(customizations))
 298
 299         # add editors info
 300         editors = document.editors()
 301         if editors:
 302             root.set('editors', u', '.join(sorted(
 303                 editor.readable() for editor in editors)))
 304         if document.book_info.funders:
 305             root.set('funders', u', '.join(document.book_info.funders))
 306         if document.book_info.thanks:
 307             root.set('thanks', document.book_info.thanks)
 308
 309         # hack the tree
 310         move_motifs_inside(document.edoc)
 311         hack_motifs(document.edoc)
 312         parse_creator(document.edoc)
 313         substitute_hyphens(document.edoc)
 314         fix_hanging(document.edoc)
 315         fix_tables(document.edoc)
 316         mark_subauthors(document.edoc)
 317         document.fix_pa_akap()
 318
 319         # wl -> TeXML
 320         style_filename = get_stylesheet("wl2tex")
 321         style = etree.parse(style_filename)
 322         functions.reg_mathml_latex()
 323
 324         # TeXML -> LaTeX
 325         temp = mkdtemp('-wl2pdf')
 326
 327         for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
 328             url = six.moves.urllib.parse.urljoin(
 329                 base_url,
 330                 ilustr.get('src')
 331             )
 332             imgfile = six.moves.urllib.request.urlopen(url)
 333             img = Image.open(imgfile)
 334
 335             th_format, ext, media_type = {
 336                 'GIF': ('GIF', 'gif', 'image/gif'),
 337                 'PNG': ('PNG', 'png', 'image/png'),
 338             }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
 339
 340             width = 2400
 341             if img.size[0] < width:
 342                 th = img
 343             else:
 344                 th = img.resize((width, round(width * img.size[1] / img.size[0])))
 345
 346             file_name = 'image%d.%s' % (i, ext)
 347             th.save(os.path.join(temp, file_name))
 348             ilustr.set('src', file_name)
 349
 350             imgfile.close()
 351
 352         for sponsor in book_info.sponsors:
 353             ins = etree.Element("data-sponsor", name=sponsor)
 354             logo = sponsor_logo(sponsor)
 355             if logo:
 356                 fname = 'sponsor-%s' % os.path.basename(logo)
 357                 shutil.copy(logo, os.path.join(temp, fname))
 358                 ins.set('src', fname)
 359             root.insert(0, ins)
 360
 361         if book_info.sponsor_note:
 362             root.set("sponsor-note", book_info.sponsor_note)
 363
 364         texml = document.transform(style)
 365
 366         if cover:
 367             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 368                 bound_cover.save(f, quality=80)
 369
 370         del document  # no longer needed large object :)
 371
 372         tex_path = os.path.join(temp, 'doc.tex')
 373         fout = open(tex_path, 'wb')
 374         process(six.BytesIO(texml), fout, 'utf-8')
 375         fout.close()
 376         del texml
 377
 378         if save_tex:
 379             shutil.copy(tex_path, save_tex)
 380
 381         # LaTeX -> PDF
 382         shutil.copy(get_resource('pdf/wl.cls'), temp)
 383         shutil.copy(get_resource('res/wl-logo.png'), temp)
 384
 385         if latex_dir:
 386             return temp
 387
 388         try:
 389             cwd = os.getcwd()
 390         except OSError:
 391             cwd = None
 392         os.chdir(temp)
 393
 394         # some things work better when compiled twice
 395         # (table of contents, [line numbers - disabled])
 396         for run in range(2):
 397             if verbose:
 398                 p = call(['xelatex', tex_path])
 399             else:
 400                 p = call(
 401                     ['xelatex', '-interaction=batchmode', tex_path],
 402                     stdout=PIPE, stderr=PIPE
 403                 )
 404             if p:
 405                 raise ParseError("Error parsing .tex file")
 406
 407         if cwd is not None:
 408             os.chdir(cwd)
 409
 410         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf',
 411                                          delete=False)
 412         pdf_path = os.path.join(temp, 'doc.pdf')
 413         shutil.move(pdf_path, output_file.name)
 414         shutil.rmtree(temp)
 415         return OutputFile.from_filename(output_file.name)
 416
 417     except (XMLSyntaxError, XSLTApplyError) as e:
 418         raise ParseError(e)
 419
 420
 421 def load_including_children(wldoc=None, provider=None, uri=None):
 422     """ Makes one big xml file with children inserted at end.
 423
 424     Either wldoc or provider and URI must be provided.
 425     """
 426
 427     if uri and provider:
 428         f = provider.by_uri(uri)
 429         text = f.read().decode('utf-8')
 430         f.close()
 431     elif wldoc is not None:
 432         text = etree.tostring(wldoc.edoc, encoding='unicode')
 433         provider = wldoc.provider
 434     else:
 435         raise ValueError(
 436             'Neither a WLDocument, nor provider and URI were provided.'
 437         )
 438
 439     # Cyrrilic
 440     text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
 441     # Geometric shapes.
 442     text = re.sub(r"([\u25a0-\u25ff]+)", r"<alien>\1</alien>", text)
 443
 444     document = WLDocument.from_bytes(text.encode('utf-8'),
 445                                      parse_dublincore=True, provider=provider)
 446     document.swap_endlines()
 447
 448     for child_uri in document.book_info.parts:
 449         child = load_including_children(provider=provider, uri=child_uri)
 450         document.edoc.getroot().append(child.edoc.getroot())
 451     return document