librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21 from itertools import chain
  22
  23 from Texml.processor import process
  24 from lxml import etree
  25 from lxml.etree import XMLSyntaxError, XSLTApplyError
  26
  27 from librarian.dcparser import Person
  28 from librarian.parser import WLDocument
  29 from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
  30 from librarian import functions
  31 from librarian.cover import make_cover
  32 from .sponsor import sponsor_logo
  33
  34
  35 functions.reg_substitute_entities()
  36 functions.reg_strip()
  37 functions.reg_starts_white()
  38 functions.reg_ends_white()
  39 functions.reg_texcommand()
  40
  41 STYLESHEETS = {
  42     'wl2tex': 'pdf/wl2tex.xslt',
  43 }
  44
  45 # CUSTOMIZATIONS = [
  46 #     'nofootnotes',
  47 #     'nothemes',
  48 #     'defaultleading',
  49 #     'onehalfleading',
  50 #     'doubleleading',
  51 #     'nowlfont',
  52 # ]
  53
  54
  55 def insert_tags(doc, split_re, tagname, exclude=None):
  56     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  57
  58     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
  59     >>> insert_tags(t, re.compile('-'), 'd')
  60     >>> print etree.tostring(t)
  61     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  62     """
  63
  64     for elem in doc.iter(tag=etree.Element):
  65         if exclude and elem.tag in exclude:
  66             continue
  67         if elem.text:
  68             chunks = split_re.split(elem.text)
  69             while len(chunks) > 1:
  70                 ins = etree.Element(tagname)
  71                 ins.tail = chunks.pop()
  72                 elem.insert(0, ins)
  73             elem.text = chunks.pop(0)
  74         if elem.tail:
  75             chunks = split_re.split(elem.tail)
  76             parent = elem.getparent()
  77             ins_index = parent.index(elem) + 1
  78             while len(chunks) > 1:
  79                 ins = etree.Element(tagname)
  80                 ins.tail = chunks.pop()
  81                 parent.insert(ins_index, ins)
  82             elem.tail = chunks.pop(0)
  83
  84
  85 def substitute_hyphens(doc):
  86     insert_tags(doc,
  87                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  88                 "dywiz",
  89                 exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
  90                 )
  91
  92
  93 def fix_hanging(doc):
  94     insert_tags(doc,
  95                 re.compile("(?<=\s\w)\s+"),
  96                 "nbsp",
  97                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  98                 )
  99
 100
 101 def fix_tables(doc):
 102     for kol in doc.iter(tag='kol'):
 103         if kol.tail is not None:
 104             if not kol.tail.strip():
 105                 kol.tail = None
 106     for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
 107         if table.get('ramka') == '1' or table.get('ramki') == '1':
 108             table.set('_format', '|' + 'X|' * len(table[0]))
 109         else:
 110             table.set('_format', 'X' * len(table[0]))
 111
 112
 113 def mark_subauthors(doc):
 114     root_author = ', '.join(elem.text for elem in doc.findall('./' + RDFNS('RDF') + '//' + DCNS('creator_parsed')))
 115     last_author = None
 116     # jeśli autor jest inny niż autor całości i niż poprzedni autor
 117     # to wstawiamy jakiś znacznik w rdf?
 118     for subutwor in doc.xpath('/utwor/utwor'):
 119         author = ', '.join(elem.text for elem in subutwor.findall('.//' + DCNS('creator_parsed')))
 120         if author not in (last_author, root_author):
 121             subutwor.find('.//' + RDFNS('RDF')).append(etree.Element('use_subauthor'))
 122         last_author = author
 123
 124
 125 def move_motifs_inside(doc):
 126     """ moves motifs to be into block elements """
 127     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
 128                             '//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 129         for motif in master.xpath('motyw'):
 130             for sib in motif.itersiblings():
 131                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
 132                                    'begin', 'end', 'motyw', 'extra', 'uwaga'):
 133                     # motif shouldn't have a tail - it would be untagged text
 134                     motif.tail = None
 135                     motif.getparent().remove(motif)
 136                     sib.insert(0, motif)
 137                     break
 138
 139
 140 def hack_motifs(doc):
 141     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 142     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 143
 144     moves motifs in stanzas from first verse to second
 145     and from next to last to last, then inserts negative vspace before them
 146     """
 147     for motif in doc.findall('//strofa//motyw'):
 148         # find relevant verse-level tag
 149         verse, stanza = motif, motif.getparent()
 150         while stanza is not None and stanza.tag != 'strofa':
 151             verse, stanza = stanza, stanza.getparent()
 152         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 153         breaks_after = sum(1 for i in verse.itersiblings('br'))
 154         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 155             move_by = 1
 156             if breaks_after == 2:
 157                 move_by += 1
 158             moved_motif = deepcopy(motif)
 159             motif.tag = 'span'
 160             motif.text = None
 161             moved_motif.tail = None
 162             moved_motif.set('moved', str(move_by))
 163
 164             for br in verse.itersiblings('br'):
 165                 if move_by > 1:
 166                     move_by -= 1
 167                     continue
 168                 br.addnext(moved_motif)
 169                 break
 170
 171
 172 def parse_creator(doc):
 173     """Generates readable versions of creator and translator tags.
 174
 175     Finds all dc:creator and dc.contributor.translator tags
 176     and adds *_parsed versions with forenames first.
 177     """
 178     for person in doc.xpath("|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
 179                             namespaces={'dc': str(DCNS)})[::-1]:
 180         if not person.text:
 181             continue
 182         p = Person.from_text(person.text)
 183         person_parsed = deepcopy(person)
 184         person_parsed.tag = person.tag + '_parsed'
 185         person_parsed.set('sortkey', person.text)
 186         person_parsed.text = p.readable()
 187         person.getparent().insert(0, person_parsed)
 188
 189
 190 def get_stylesheet(name):
 191     return get_resource(STYLESHEETS[name])
 192
 193
 194 def package_available(package, args='', verbose=False):
 195     """ check if a verion of a latex package accepting given args is available """
 196     tempdir = mkdtemp('-wl2pdf-test')
 197     fpath = os.path.join(tempdir, 'test.tex')
 198     f = open(fpath, 'w')
 199     f.write(r"""
 200         \documentclass{wl}
 201         \usepackage[%s]{%s}
 202         \begin{document}
 203         \end{document}
 204         """ % (args, package))
 205     f.close()
 206     if verbose:
 207         p = call(['xelatex', '-output-directory', tempdir, fpath])
 208     else:
 209         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 210     shutil.rmtree(tempdir)
 211     return p == 0
 212
 213
 214 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 215               cover=None, flags=None, customizations=None, ilustr_path='', latex_dir=False):
 216     """ produces a PDF file with XeLaTeX
 217
 218     wldoc: a WLDocument
 219     verbose: prints all output from LaTeX
 220     save_tex: path to save the intermediary LaTeX file to
 221     morefloats (old/new/none): force specific morefloats
 222     cover: a cover.Cover factory or True for default
 223     flags: less-advertising,
 224     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 225     """
 226
 227     # Parse XSLT
 228     try:
 229         book_info = wldoc.book_info
 230         document = load_including_children(wldoc)
 231         root = document.edoc.getroot()
 232
 233         if cover:
 234             if cover is True:
 235                 cover = make_cover
 236             bound_cover = cover(book_info, width=1200)
 237             root.set('data-cover-width', str(bound_cover.width))
 238             root.set('data-cover-height', str(bound_cover.height))
 239             if bound_cover.uses_dc_cover:
 240                 if book_info.cover_by:
 241                     root.set('data-cover-by', book_info.cover_by)
 242                 if book_info.cover_source:
 243                     root.set('data-cover-source', book_info.cover_source)
 244         if flags:
 245             for flag in flags:
 246                 root.set('flag-' + flag, 'yes')
 247
 248         # check for LaTeX packages
 249         if morefloats:
 250             root.set('morefloats', morefloats.lower())
 251         elif package_available('morefloats', 'maxfloats=19'):
 252             root.set('morefloats', 'new')
 253
 254         # add customizations
 255         if customizations is not None:
 256             root.set('customizations', u','.join(customizations))
 257
 258         # add editors info
 259         editors = document.editors()
 260         if editors:
 261             root.set('editors', u', '.join(sorted(
 262                 editor.readable() for editor in editors)))
 263         if document.book_info.funders:
 264             root.set('funders', u', '.join(document.book_info.funders))
 265         if document.book_info.thanks:
 266             root.set('thanks', document.book_info.thanks)
 267
 268         # hack the tree
 269         move_motifs_inside(document.edoc)
 270         hack_motifs(document.edoc)
 271         parse_creator(document.edoc)
 272         substitute_hyphens(document.edoc)
 273         fix_hanging(document.edoc)
 274         fix_tables(document.edoc)
 275         mark_subauthors(document.edoc)
 276
 277         # wl -> TeXML
 278         style_filename = get_stylesheet("wl2tex")
 279         style = etree.parse(style_filename)
 280         functions.reg_mathml_latex()
 281
 282         # TeXML -> LaTeX
 283         temp = mkdtemp('-wl2pdf')
 284
 285         for ilustr in document.edoc.findall("//ilustr"):
 286             shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
 287
 288         for sponsor in book_info.sponsors:
 289             ins = etree.Element("data-sponsor", name=sponsor)
 290             logo = sponsor_logo(sponsor)
 291             if logo:
 292                 fname = 'sponsor-%s' % os.path.basename(logo)
 293                 shutil.copy(logo, os.path.join(temp, fname))
 294                 ins.set('src', fname)
 295             root.insert(0, ins)
 296
 297         if book_info.sponsor_note:
 298             root.set("sponsor-note", book_info.sponsor_note)
 299
 300         texml = document.transform(style)
 301
 302         if cover:
 303             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 304                 bound_cover.save(f, quality=80)
 305
 306         del document  # no longer needed large object :)
 307
 308         tex_path = os.path.join(temp, 'doc.tex')
 309         fout = open(tex_path, 'w')
 310         process(StringIO(texml), fout, 'utf-8')
 311         fout.close()
 312         del texml
 313
 314         if save_tex:
 315             shutil.copy(tex_path, save_tex)
 316
 317         # LaTeX -> PDF
 318         shutil.copy(get_resource('pdf/wl.cls'), temp)
 319         shutil.copy(get_resource('res/wl-logo.png'), temp)
 320
 321         if latex_dir:
 322             return temp
 323
 324         try:
 325             cwd = os.getcwd()
 326         except OSError:
 327             cwd = None
 328         os.chdir(temp)
 329
 330         # some things work better when compiled twice
 331         # (table of contents, [line numbers - disabled])
 332         for run in xrange(2):
 333             if verbose:
 334                 p = call(['xelatex', tex_path])
 335             else:
 336                 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 337             if p:
 338                 raise ParseError("Error parsing .tex file")
 339
 340         if cwd is not None:
 341             os.chdir(cwd)
 342
 343         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 344         pdf_path = os.path.join(temp, 'doc.pdf')
 345         shutil.move(pdf_path, output_file.name)
 346         shutil.rmtree(temp)
 347         return OutputFile.from_filename(output_file.name)
 348
 349     except (XMLSyntaxError, XSLTApplyError), e:
 350         raise ParseError(e)
 351
 352
 353 def load_including_children(wldoc=None, provider=None, uri=None):
 354     """ Makes one big xml file with children inserted at end.
 355
 356     Either wldoc or provider and URI must be provided.
 357     """
 358
 359     if uri and provider:
 360         f = provider.by_uri(uri)
 361         text = f.read().decode('utf-8')
 362         f.close()
 363     elif wldoc is not None:
 364         text = etree.tostring(wldoc.edoc, encoding=unicode)
 365         provider = wldoc.provider
 366     else:
 367         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 368
 369     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 370
 371     document = WLDocument.from_string(text, parse_dublincore=True, provider=provider)
 372     document.swap_endlines()
 373
 374     for child_uri in document.book_info.parts:
 375         child = load_including_children(provider=provider, uri=child_uri)
 376         document.edoc.getroot().append(child.edoc.getroot())
 377     return document