src/librarian/pdf.py

   1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
   3 #
   4 """PDF creation library.
   5
   6 Creates one big XML from the book and its children, converts it to LaTeX
   7 with TeXML, then runs it by XeLaTeX.
   8
   9 """
  10 import io
  11 import os
  12 import os.path
  13 import shutil
  14 from tempfile import mkdtemp, NamedTemporaryFile
  15 import re
  16 from copy import deepcopy
  17 from subprocess import call, PIPE
  18 from itertools import chain
  19 import urllib.parse
  20 import urllib.request
  21
  22 from PIL import Image
  23 from Texml.processor import process
  24 from lxml import etree
  25 from lxml.etree import XMLSyntaxError, XSLTApplyError
  26
  27 from librarian.dcparser import Person
  28 from librarian.parser import WLDocument
  29 from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
  30 from librarian import functions
  31 from librarian.cover import make_cover
  32 from .sponsor import sponsor_logo
  33
  34
  35 functions.reg_substitute_entities()
  36 functions.reg_strip()
  37 functions.reg_starts_white()
  38 functions.reg_ends_white()
  39 functions.reg_texcommand()
  40
  41 STYLESHEETS = {
  42     'wl2tex': 'pdf/wl2tex.xslt',
  43 }
  44
  45 # CUSTOMIZATIONS = [
  46 #     'nofootnotes',
  47 #     'nothemes',
  48 #     'defaultleading',
  49 #     'onehalfleading',
  50 #     'doubleleading',
  51 #     'nowlfont',
  52 # ]
  53
  54
  55 def insert_tags(doc, split_re, tagname, exclude=None):
  56     """
  57     Inserts <tagname> for every occurence of `split_re'
  58     in text nodes in the `doc' tree.
  59
  60     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
  61     >>> insert_tags(t, re.compile('-'), 'd')
  62     >>> print(etree.tostring(t, encoding='unicode'))
  63     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  64     """
  65
  66     for elem in doc.iter(tag=etree.Element):
  67         if exclude and elem.tag in exclude:
  68             continue
  69         if elem.text:
  70             chunks = split_re.split(elem.text)
  71             while len(chunks) > 1:
  72                 ins = etree.Element(tagname)
  73                 ins.tail = chunks.pop()
  74                 elem.insert(0, ins)
  75             elem.text = chunks.pop(0)
  76         if elem.tail:
  77             chunks = split_re.split(elem.tail)
  78             parent = elem.getparent()
  79             ins_index = parent.index(elem) + 1
  80             while len(chunks) > 1:
  81                 ins = etree.Element(tagname)
  82                 ins.tail = chunks.pop()
  83                 parent.insert(ins_index, ins)
  84             elem.tail = chunks.pop(0)
  85
  86
  87 def substitute_hyphens(doc):
  88     insert_tags(
  89         doc,
  90         re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
  91         "dywiz",
  92         exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
  93     )
  94
  95
  96 def fix_hanging(doc):
  97     insert_tags(
  98         doc,
  99         re.compile(r"(?<=\s\w)\s+"),
 100         "nbsp",
 101         exclude=[DCNS("identifier.url"), DCNS("rights.license")]
 102     )
 103
 104
 105 def fix_tables(doc):
 106     for kol in doc.iter(tag='kol'):
 107         if kol.tail is not None:
 108             if not kol.tail.strip():
 109                 kol.tail = None
 110     for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
 111         if table.get('ramka') == '1' or table.get('ramki') == '1':
 112             table.set('_format', '|' + 'X|' * len(table[0]))
 113         else:
 114             table.set('_format', 'X' * len(table[0]))
 115
 116
 117 def mark_subauthors(doc):
 118     root_author = ', '.join(
 119         elem.text
 120         for elem in doc.findall(
 121                 './' + RDFNS('RDF') + '//' + DCNS('creator_parsed')
 122         )
 123     )
 124     last_author = None
 125     # jeśli autor jest inny niż autor całości i niż poprzedni autor
 126     # to wstawiamy jakiś znacznik w rdf?
 127     for subutwor in doc.xpath('/utwor/utwor'):
 128         author = ', '.join(
 129             elem.text
 130             for elem in subutwor.findall('.//' + DCNS('creator_parsed'))
 131         )
 132         if author not in (last_author, root_author):
 133             subutwor.find('.//' + RDFNS('RDF')).append(
 134                 etree.Element('use_subauthor')
 135             )
 136         last_author = author
 137
 138
 139 def move_motifs_inside(doc):
 140     """ moves motifs to be into block elements """
 141     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
 142                             '//dramat_wierszowany_l|//dramat_wierszowany_lp|'
 143                             '//dramat_wspolczesny'):
 144         for motif in master.xpath('motyw'):
 145             for sib in motif.itersiblings():
 146                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk',
 147                                    'separator_linia', 'begin', 'end',
 148                                    'motyw', 'extra', 'uwaga'):
 149                     # motif shouldn't have a tail - it would be untagged text
 150                     motif.tail = None
 151                     motif.getparent().remove(motif)
 152                     sib.insert(0, motif)
 153                     break
 154
 155
 156 def hack_motifs(doc):
 157     """
 158     Dirty hack for the marginpar-creates-orphans LaTeX problem
 159     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 160
 161     Moves motifs in stanzas from first verse to second and from next
 162     to last to last, then inserts negative vspace before them.
 163     """
 164     for motif in doc.findall('//strofa//motyw'):
 165         # find relevant verse-level tag
 166         verse, stanza = motif, motif.getparent()
 167         while stanza is not None and stanza.tag != 'strofa':
 168             verse, stanza = stanza, stanza.getparent()
 169         breaks_before = sum(
 170             1 for i in verse.itersiblings('br', preceding=True)
 171         )
 172         breaks_after = sum(1 for i in verse.itersiblings('br'))
 173         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 174             move_by = 1
 175             if breaks_after == 2:
 176                 move_by += 1
 177             moved_motif = deepcopy(motif)
 178             motif.tag = 'span'
 179             motif.text = None
 180             moved_motif.tail = None
 181             moved_motif.set('moved', str(move_by))
 182
 183             for br in verse.itersiblings('br'):
 184                 if move_by > 1:
 185                     move_by -= 1
 186                     continue
 187                 br.addnext(moved_motif)
 188                 break
 189
 190
 191 def add_fundraising(doc, fundraising):
 192     # Before each naglowek_rozdzial and naglowek_scena and in the end
 193     spots = []
 194     for naglowek in doc.xpath('//naglowek_czesc|//naglowek_akt'):
 195         spot = etree.Element('f_spot')
 196         naglowek.addprevious(spot)
 197         spots.append(spot)
 198     spot = etree.Element('f_spot')
 199     doc.getroot()[-1].append(spot)
 200     spots.append(spot)
 201     e = len(spots)
 202     nfunds = len(fundraising)
 203     if e > 4 * nfunds:
 204         nfunds *= 2
 205     for f in range(nfunds):
 206         spot_index = int(f / nfunds * e)
 207         spots[spot_index].set('active', 'true')
 208         elem = etree.fromstring('<f_spot>' + fundraising[f % len(fundraising)] + '</f_spot>')
 209         spots[spot_index].text = elem.text
 210         for c in elem:
 211             spots[spot_index].append(c)
 212
 213
 214 def parse_creator(doc):
 215     """Generates readable versions of creator and translator tags.
 216
 217     Finds all dc:creator and dc.contributor.translator tags
 218     and adds *_parsed versions with forenames first.
 219     """
 220     for person in doc.xpath(
 221             "|".join('//dc:' + tag for tag in (
 222                 'creator', 'contributor.translator'
 223             )),
 224             namespaces={'dc': str(DCNS)})[::-1]:
 225         if not person.text:
 226             continue
 227         p = Person.from_text(person.text)
 228         person_parsed = deepcopy(person)
 229         person_parsed.tag = person.tag + '_parsed'
 230         person_parsed.set('sortkey', person.text)
 231         person_parsed.text = p.readable()
 232         person.getparent().insert(0, person_parsed)
 233
 234
 235 def get_stylesheet(name):
 236     return get_resource(STYLESHEETS[name])
 237
 238
 239 def package_available(package, args='', verbose=False):
 240     """
 241     Check if a verion of a latex package accepting given args
 242     is available.
 243     """
 244     tempdir = mkdtemp('-wl2pdf-test')
 245     fpath = os.path.join(tempdir, 'test.tex')
 246     f = open(fpath, 'w')
 247     f.write("""
 248         \\documentclass{wl}
 249         \\usepackage[%s]{%s}
 250         \\begin{document}
 251         \\end{document}
 252         """ % (args, package))
 253     f.close()
 254     if verbose:
 255         p = call(['xelatex', '-output-directory', tempdir, fpath])
 256     else:
 257         p = call(
 258             ['xelatex', '-interaction=batchmode', '-output-directory',
 259              tempdir, fpath],
 260             stdout=PIPE, stderr=PIPE
 261         )
 262     shutil.rmtree(tempdir)
 263     return p == 0
 264
 265
 266 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 267               cover=None, flags=None, customizations=None, base_url='file://./',
 268               latex_dir=False, fundraising=None):
 269     """ produces a PDF file with XeLaTeX
 270
 271     wldoc: a WLDocument
 272     verbose: prints all output from LaTeX
 273     save_tex: path to save the intermediary LaTeX file to
 274     morefloats (old/new/none): force specific morefloats
 275     cover: a cover.Cover factory or True for default
 276     flags: less-advertising,
 277     customizations: user requested customizations regarding various
 278         formatting parameters (passed to wl LaTeX class)
 279     """
 280
 281     # Parse XSLT
 282     try:
 283         book_info = wldoc.book_info
 284         document = load_including_children(wldoc)
 285         root = document.edoc.getroot()
 286
 287         if cover:
 288             if cover is True:
 289                 cover = make_cover
 290             bound_cover = cover(book_info, width=1200)
 291             root.set('data-cover-width', str(bound_cover.width))
 292             root.set('data-cover-height', str(bound_cover.height))
 293             if bound_cover.uses_dc_cover:
 294                 if book_info.cover_by:
 295                     root.set('data-cover-by', book_info.cover_by)
 296                 if book_info.cover_source:
 297                     root.set('data-cover-source', book_info.cover_source)
 298         if flags:
 299             for flag in flags:
 300                 root.set('flag-' + flag, 'yes')
 301
 302         # check for LaTeX packages
 303         if morefloats:
 304             root.set('morefloats', morefloats.lower())
 305         elif package_available('morefloats', 'maxfloats=19'):
 306             root.set('morefloats', 'new')
 307
 308         if customizations is None:
 309             customizations = []
 310         else:
 311             customizations = list(customizations)
 312
 313         if book_info.endnotes:
 314             customizations.append('endnotes')
 315
 316         # add customizations
 317         if customizations is not None:
 318             root.set('customizations', ','.join(customizations))
 319
 320         # add editors info
 321         editors = document.editors()
 322         if editors:
 323             root.set('editors', ', '.join(sorted(
 324                 editor.readable() for editor in editors)))
 325         if document.book_info.funders:
 326             root.set('funders', ', '.join(document.book_info.funders))
 327         if document.book_info.thanks:
 328             root.set('thanks', document.book_info.thanks)
 329
 330         # hack the tree
 331         if fundraising:
 332             add_fundraising(document.edoc, fundraising)
 333         move_motifs_inside(document.edoc)
 334         hack_motifs(document.edoc)
 335         parse_creator(document.edoc)
 336         substitute_hyphens(document.edoc)
 337         fix_hanging(document.edoc)
 338         fix_tables(document.edoc)
 339         mark_subauthors(document.edoc)
 340         document.fix_pa_akap()
 341
 342         # wl -> TeXML
 343         style_filename = get_stylesheet("wl2tex")
 344         style = etree.parse(style_filename)
 345         functions.reg_mathml_latex()
 346
 347         # TeXML -> LaTeX
 348         temp = mkdtemp('-wl2pdf')
 349
 350         for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
 351             url = urllib.parse.urljoin(
 352                 base_url,
 353                 ilustr.get('src')
 354             )
 355             imgfile = urllib.request.urlopen(url)
 356             img = Image.open(imgfile)
 357
 358             th_format, ext, media_type = {
 359                 'GIF': ('GIF', 'gif', 'image/gif'),
 360                 'PNG': ('PNG', 'png', 'image/png'),
 361             }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
 362
 363             width = 2400
 364             if img.size[0] < width:
 365                 th = img
 366             else:
 367                 th = img.resize((width, round(width * img.size[1] / img.size[0])))
 368
 369             file_name = 'image%d.%s' % (i, ext)
 370             th.save(os.path.join(temp, file_name))
 371             ilustr.set('src', file_name)
 372
 373             imgfile.close()
 374
 375         for sponsor in book_info.sponsors:
 376             ins = etree.Element("data-sponsor", name=sponsor)
 377             logo = sponsor_logo(sponsor)
 378             if logo:
 379                 fname = 'sponsor-%s' % os.path.basename(logo)
 380                 shutil.copy(logo, os.path.join(temp, fname))
 381                 ins.set('src', fname)
 382             root.insert(0, ins)
 383
 384         if book_info.sponsor_note:
 385             root.set("sponsor-note", book_info.sponsor_note)
 386
 387         texml = document.transform(style)
 388
 389         if cover:
 390             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 391                 bound_cover.save(f, quality=80)
 392
 393         del document  # no longer needed large object :)
 394
 395         tex_path = os.path.join(temp, 'doc.tex')
 396         fout = open(tex_path, 'wb')
 397         process(io.BytesIO(texml), fout, 'utf-8')
 398         fout.close()
 399         del texml
 400
 401         if save_tex:
 402             shutil.copy(tex_path, save_tex)
 403
 404         # LaTeX -> PDF
 405         shutil.copy(get_resource('pdf/wl.cls'), temp)
 406         shutil.copy(get_resource('res/wl-logo.png'), temp)
 407
 408         if latex_dir:
 409             return temp
 410
 411         try:
 412             cwd = os.getcwd()
 413         except OSError:
 414             cwd = None
 415         os.chdir(temp)
 416
 417         # some things work better when compiled twice
 418         # (table of contents, [line numbers - disabled])
 419         for run in range(2):
 420             if verbose:
 421                 p = call(['xelatex', tex_path])
 422             else:
 423                 p = call(
 424                     ['xelatex', '-interaction=batchmode', tex_path],
 425                     stdout=PIPE, stderr=PIPE
 426                 )
 427             if p:
 428                 raise ParseError("Error parsing .tex file")
 429
 430         if cwd is not None:
 431             os.chdir(cwd)
 432
 433         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf',
 434                                          delete=False)
 435         pdf_path = os.path.join(temp, 'doc.pdf')
 436         shutil.move(pdf_path, output_file.name)
 437         shutil.rmtree(temp)
 438         return OutputFile.from_filename(output_file.name)
 439
 440     except (XMLSyntaxError, XSLTApplyError) as e:
 441         raise ParseError(e)
 442
 443
 444 def load_including_children(wldoc=None, provider=None, uri=None):
 445     """ Makes one big xml file with children inserted at end.
 446
 447     Either wldoc or provider and URI must be provided.
 448     """
 449
 450     if uri and provider:
 451         f = provider.by_slug(uri.slug)
 452         text = f.read().decode('utf-8')
 453         f.close()
 454     elif wldoc is not None:
 455         text = etree.tostring(wldoc.edoc, encoding='unicode')
 456         provider = wldoc.provider
 457     else:
 458         raise ValueError(
 459             'Neither a WLDocument, nor provider and URI were provided.'
 460         )
 461
 462     # Cyrrilic
 463     text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
 464     # Geometric shapes.
 465     text = re.sub(r"([\u25a0-\u25ff]+)", r"<alien>\1</alien>", text)
 466     # Hebrew
 467     text = re.sub(r"([\u0590-\u05ff]+)", r"<fallback>\1</fallback>", text)
 468
 469     document = WLDocument.from_bytes(text.encode('utf-8'),
 470                                      parse_dublincore=True, provider=provider)
 471     document.swap_endlines()
 472
 473     for child_uri in document.book_info.parts:
 474         child = load_including_children(provider=provider, uri=child_uri)
 475         document.edoc.getroot().append(child.edoc.getroot())
 476     return document