src/librarian/pdf.py

   1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
   3 #
   4 """PDF creation library.
   5
   6 Creates one big XML from the book and its children, converts it to LaTeX
   7 with TeXML, then runs it by XeLaTeX.
   8
   9 """
  10 import io
  11 import os
  12 import os.path
  13 import shutil
  14 from tempfile import mkdtemp, NamedTemporaryFile
  15 import re
  16 from copy import deepcopy
  17 from subprocess import call, PIPE
  18 from itertools import chain
  19 import urllib.parse
  20 import urllib.request
  21
  22 from PIL import Image
  23 from Texml.processor import process
  24 from lxml import etree
  25 from lxml.etree import XMLSyntaxError, XSLTApplyError
  26
  27 from librarian.dcparser import Person
  28 from librarian.parser import WLDocument
  29 from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
  30 from librarian import functions
  31 from librarian.cover import make_cover
  32 from .sponsor import sponsor_logo
  33
  34
# Set up the librarian.functions helpers once, at import time.
# NOTE(review): presumably these register the XPath/XSLT extension
# functions that the wl2tex stylesheet relies on — confirm in
# librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Stylesheet name -> resource path; looked up by get_stylesheet(),
# which resolves the path with get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  44
  45 # CUSTOMIZATIONS = [
  46 #     'nofootnotes',
  47 #     'nothemes',
  48 #     'defaultleading',
  49 #     'onehalfleading',
  50 #     'doubleleading',
  51 #     'nowlfont',
  52 # ]
  53
  54
  55 def insert_tags(doc, split_re, tagname, exclude=None):
  56     """
  57     Inserts <tagname> for every occurence of `split_re'
  58     in text nodes in the `doc' tree.
  59
  60     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
  61     >>> insert_tags(t, re.compile('-'), 'd')
  62     >>> print(etree.tostring(t, encoding='unicode'))
  63     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  64     """
  65
  66     for elem in doc.iter(tag=etree.Element):
  67         if exclude and elem.tag in exclude:
  68             continue
  69         if elem.text:
  70             chunks = split_re.split(elem.text)
  71             while len(chunks) > 1:
  72                 ins = etree.Element(tagname)
  73                 ins.tail = chunks.pop()
  74                 elem.insert(0, ins)
  75             elem.text = chunks.pop(0)
  76         if elem.tail:
  77             chunks = split_re.split(elem.tail)
  78             parent = elem.getparent()
  79             ins_index = parent.index(elem) + 1
  80             while len(chunks) > 1:
  81                 ins = etree.Element(tagname)
  82                 ins.tail = chunks.pop()
  83                 parent.insert(ins_index, ins)
  84             elem.tail = chunks.pop(0)
  85
  86
  87 def substitute_hyphens(doc):
  88     insert_tags(
  89         doc,
  90         re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
  91         "dywiz",
  92         exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
  93     )
  94
  95
  96 def fix_hanging(doc):
  97     insert_tags(
  98         doc,
  99         re.compile(r"(?<=\s\w)\s+"),
 100         "nbsp",
 101         exclude=[DCNS("identifier.url"), DCNS("rights.license")]
 102     )
 103
 104
 105 def fix_tables(doc):
 106     for kol in doc.iter(tag='kol'):
 107         if kol.tail is not None:
 108             if not kol.tail.strip():
 109                 kol.tail = None
 110     for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
 111         if table.get('ramka') == '1' or table.get('ramki') == '1':
 112             table.set('_format', '|' + 'X|' * len(table[0]))
 113         else:
 114             table.set('_format', 'X' * len(table[0]))
 115
 116
 117 def mark_subauthors(doc):
 118     root_author = ', '.join(
 119         elem.text
 120         for elem in doc.findall(
 121                 './' + RDFNS('RDF') + '//' + DCNS('creator_parsed')
 122         )
 123     )
 124     last_author = None
 125     # jeśli autor jest inny niż autor całości i niż poprzedni autor
 126     # to wstawiamy jakiś znacznik w rdf?
 127     for subutwor in doc.xpath('/utwor/utwor'):
 128         author = ', '.join(
 129             elem.text
 130             for elem in subutwor.findall('.//' + DCNS('creator_parsed'))
 131         )
 132         if author not in (last_author, root_author):
 133             subutwor.find('.//' + RDFNS('RDF')).append(
 134                 etree.Element('use_subauthor')
 135             )
 136         last_author = author
 137
 138
 139 def move_motifs_inside(doc):
 140     """ moves motifs to be into block elements """
 141     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
 142                             '//dramat_wierszowany_l|//dramat_wierszowany_lp|'
 143                             '//dramat_wspolczesny'):
 144         for motif in master.xpath('motyw'):
 145             for sib in motif.itersiblings():
 146                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk',
 147                                    'separator_linia', 'begin', 'end',
 148                                    'motyw', 'extra', 'uwaga'):
 149                     # motif shouldn't have a tail - it would be untagged text
 150                     motif.tail = None
 151                     motif.getparent().remove(motif)
 152                     sib.insert(0, motif)
 153                     break
 154
 155
 156 def hack_motifs(doc):
 157     """
 158     Dirty hack for the marginpar-creates-orphans LaTeX problem
 159     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 160
 161     Moves motifs in stanzas from first verse to second and from next
 162     to last to last, then inserts negative vspace before them.
 163     """
 164     for motif in doc.findall('//strofa//motyw'):
 165         # find relevant verse-level tag
 166         verse, stanza = motif, motif.getparent()
 167         while stanza is not None and stanza.tag != 'strofa':
 168             verse, stanza = stanza, stanza.getparent()
 169         breaks_before = sum(
 170             1 for i in verse.itersiblings('br', preceding=True)
 171         )
 172         breaks_after = sum(1 for i in verse.itersiblings('br'))
 173         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 174             move_by = 1
 175             if breaks_after == 2:
 176                 move_by += 1
 177             moved_motif = deepcopy(motif)
 178             motif.tag = 'span'
 179             motif.text = None
 180             moved_motif.tail = None
 181             moved_motif.set('moved', str(move_by))
 182
 183             for br in verse.itersiblings('br'):
 184                 if move_by > 1:
 185                     move_by -= 1
 186                     continue
 187                 br.addnext(moved_motif)
 188                 break
 189
 190
 191 def parse_creator(doc):
 192     """Generates readable versions of creator and translator tags.
 193
 194     Finds all dc:creator and dc.contributor.translator tags
 195     and adds *_parsed versions with forenames first.
 196     """
 197     for person in doc.xpath(
 198             "|".join('//dc:' + tag for tag in (
 199                 'creator', 'contributor.translator'
 200             )),
 201             namespaces={'dc': str(DCNS)})[::-1]:
 202         if not person.text:
 203             continue
 204         p = Person.from_text(person.text)
 205         person_parsed = deepcopy(person)
 206         person_parsed.tag = person.tag + '_parsed'
 207         person_parsed.set('sortkey', person.text)
 208         person_parsed.text = p.readable()
 209         person.getparent().insert(0, person_parsed)
 210
 211
 212 def get_stylesheet(name):
 213     return get_resource(STYLESHEETS[name])
 214
 215
 216 def package_available(package, args='', verbose=False):
 217     """
 218     Check if a verion of a latex package accepting given args
 219     is available.
 220     """
 221     tempdir = mkdtemp('-wl2pdf-test')
 222     fpath = os.path.join(tempdir, 'test.tex')
 223     f = open(fpath, 'w')
 224     f.write("""
 225         \\documentclass{wl}
 226         \\usepackage[%s]{%s}
 227         \\begin{document}
 228         \\end{document}
 229         """ % (args, package))
 230     f.close()
 231     if verbose:
 232         p = call(['xelatex', '-output-directory', tempdir, fpath])
 233     else:
 234         p = call(
 235             ['xelatex', '-interaction=batchmode', '-output-directory',
 236              tempdir, fpath],
 237             stdout=PIPE, stderr=PIPE
 238         )
 239     shutil.rmtree(tempdir)
 240     return p == 0
 241
 242
 243 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 244               cover=None, flags=None, customizations=None, base_url='file://./',
 245               latex_dir=False):
 246     """ produces a PDF file with XeLaTeX
 247
 248     wldoc: a WLDocument
 249     verbose: prints all output from LaTeX
 250     save_tex: path to save the intermediary LaTeX file to
 251     morefloats (old/new/none): force specific morefloats
 252     cover: a cover.Cover factory or True for default
 253     flags: less-advertising,
 254     customizations: user requested customizations regarding various
 255         formatting parameters (passed to wl LaTeX class)
 256     """
 257
 258     # Parse XSLT
 259     try:
 260         book_info = wldoc.book_info
 261         document = load_including_children(wldoc)
 262         root = document.edoc.getroot()
 263
 264         if cover:
 265             if cover is True:
 266                 cover = make_cover
 267             bound_cover = cover(book_info, width=1200)
 268             root.set('data-cover-width', str(bound_cover.width))
 269             root.set('data-cover-height', str(bound_cover.height))
 270             if bound_cover.uses_dc_cover:
 271                 if book_info.cover_by:
 272                     root.set('data-cover-by', book_info.cover_by)
 273                 if book_info.cover_source:
 274                     root.set('data-cover-source', book_info.cover_source)
 275         if flags:
 276             for flag in flags:
 277                 root.set('flag-' + flag, 'yes')
 278
 279         # check for LaTeX packages
 280         if morefloats:
 281             root.set('morefloats', morefloats.lower())
 282         elif package_available('morefloats', 'maxfloats=19'):
 283             root.set('morefloats', 'new')
 284
 285         if customizations is None:
 286             customizations = []
 287         else:
 288             customizations = list(customizations)
 289
 290         if book_info.endnotes:
 291             customizations.append('endnotes')
 292
 293         # add customizations
 294         if customizations is not None:
 295             root.set('customizations', ','.join(customizations))
 296
 297         # add editors info
 298         editors = document.editors()
 299         if editors:
 300             root.set('editors', ', '.join(sorted(
 301                 editor.readable() for editor in editors)))
 302         if document.book_info.funders:
 303             root.set('funders', ', '.join(document.book_info.funders))
 304         if document.book_info.thanks:
 305             root.set('thanks', document.book_info.thanks)
 306
 307         # hack the tree
 308         move_motifs_inside(document.edoc)
 309         hack_motifs(document.edoc)
 310         parse_creator(document.edoc)
 311         substitute_hyphens(document.edoc)
 312         fix_hanging(document.edoc)
 313         fix_tables(document.edoc)
 314         mark_subauthors(document.edoc)
 315         document.fix_pa_akap()
 316
 317         # wl -> TeXML
 318         style_filename = get_stylesheet("wl2tex")
 319         style = etree.parse(style_filename)
 320         functions.reg_mathml_latex()
 321
 322         # TeXML -> LaTeX
 323         temp = mkdtemp('-wl2pdf')
 324
 325         for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
 326             url = urllib.parse.urljoin(
 327                 base_url,
 328                 ilustr.get('src')
 329             )
 330             imgfile = urllib.request.urlopen(url)
 331             img = Image.open(imgfile)
 332
 333             th_format, ext, media_type = {
 334                 'GIF': ('GIF', 'gif', 'image/gif'),
 335                 'PNG': ('PNG', 'png', 'image/png'),
 336             }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
 337
 338             width = 2400
 339             if img.size[0] < width:
 340                 th = img
 341             else:
 342                 th = img.resize((width, round(width * img.size[1] / img.size[0])))
 343
 344             file_name = 'image%d.%s' % (i, ext)
 345             th.save(os.path.join(temp, file_name))
 346             ilustr.set('src', file_name)
 347
 348             imgfile.close()
 349
 350         for sponsor in book_info.sponsors:
 351             ins = etree.Element("data-sponsor", name=sponsor)
 352             logo = sponsor_logo(sponsor)
 353             if logo:
 354                 fname = 'sponsor-%s' % os.path.basename(logo)
 355                 shutil.copy(logo, os.path.join(temp, fname))
 356                 ins.set('src', fname)
 357             root.insert(0, ins)
 358
 359         if book_info.sponsor_note:
 360             root.set("sponsor-note", book_info.sponsor_note)
 361
 362         texml = document.transform(style)
 363
 364         if cover:
 365             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 366                 bound_cover.save(f, quality=80)
 367
 368         del document  # no longer needed large object :)
 369
 370         tex_path = os.path.join(temp, 'doc.tex')
 371         fout = open(tex_path, 'wb')
 372         process(io.BytesIO(texml), fout, 'utf-8')
 373         fout.close()
 374         del texml
 375
 376         if save_tex:
 377             shutil.copy(tex_path, save_tex)
 378
 379         # LaTeX -> PDF
 380         shutil.copy(get_resource('pdf/wl.cls'), temp)
 381         shutil.copy(get_resource('res/wl-logo.png'), temp)
 382
 383         if latex_dir:
 384             return temp
 385
 386         try:
 387             cwd = os.getcwd()
 388         except OSError:
 389             cwd = None
 390         os.chdir(temp)
 391
 392         # some things work better when compiled twice
 393         # (table of contents, [line numbers - disabled])
 394         for run in range(2):
 395             if verbose:
 396                 p = call(['xelatex', tex_path])
 397             else:
 398                 p = call(
 399                     ['xelatex', '-interaction=batchmode', tex_path],
 400                     stdout=PIPE, stderr=PIPE
 401                 )
 402             if p:
 403                 raise ParseError("Error parsing .tex file")
 404
 405         if cwd is not None:
 406             os.chdir(cwd)
 407
 408         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf',
 409                                          delete=False)
 410         pdf_path = os.path.join(temp, 'doc.pdf')
 411         shutil.move(pdf_path, output_file.name)
 412         shutil.rmtree(temp)
 413         return OutputFile.from_filename(output_file.name)
 414
 415     except (XMLSyntaxError, XSLTApplyError) as e:
 416         raise ParseError(e)
 417
 418
 419 def load_including_children(wldoc=None, provider=None, uri=None):
 420     """ Makes one big xml file with children inserted at end.
 421
 422     Either wldoc or provider and URI must be provided.
 423     """
 424
 425     if uri and provider:
 426         f = provider.by_slug(uri.slug)
 427         text = f.read().decode('utf-8')
 428         f.close()
 429     elif wldoc is not None:
 430         text = etree.tostring(wldoc.edoc, encoding='unicode')
 431         provider = wldoc.provider
 432     else:
 433         raise ValueError(
 434             'Neither a WLDocument, nor provider and URI were provided.'
 435         )
 436
 437     # Cyrrilic
 438     text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
 439     # Geometric shapes.
 440     text = re.sub(r"([\u25a0-\u25ff]+)", r"<alien>\1</alien>", text)
 441
 442     document = WLDocument.from_bytes(text.encode('utf-8'),
 443                                      parse_dublincore=True, provider=provider)
 444     document.swap_endlines()
 445
 446     for child_uri in document.book_info.parts:
 447         child = load_including_children(provider=provider, uri=child_uri)
 448         document.edoc.getroot().append(child.edoc.getroot())
 449     return document