librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 """PDF creation library.
   7
   8 Creates one big XML from the book and its children, converts it to LaTeX
   9 with TeXML, then runs it by XeLaTeX.
  10
  11 """
  12 from __future__ import with_statement
  13 import os
  14 import os.path
  15 import shutil
  16 from StringIO import StringIO
  17 from tempfile import mkdtemp, NamedTemporaryFile
  18 import re
  19 from copy import deepcopy
  20 from subprocess import call, PIPE
  21
  22 from Texml.processor import process
  23 from lxml import etree
  24 from lxml.etree import XMLSyntaxError, XSLTApplyError
  25
  26 from librarian.dcparser import Person
  27 from librarian.parser import WLDocument
  28 from librarian import ParseError, DCNS, get_resource, OutputFile
  29 from librarian import functions
  30 from librarian.cover import WLCover
  31
  32
# Call every reg_* hook from librarian.functions once, at import time.
# NOTE(review): presumably these register the extension functions that the
# wl2tex stylesheet calls -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()
functions.reg_urlquote()
functions.reg_breakurl()

# Maps a stylesheet name to the resource path that get_stylesheet()
# hands to get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  44
  45 #CUSTOMIZATIONS = [
  46 #    'nofootnotes',
  47 #    'nothemes',
  48 #    'defaultleading',
  49 #    'onehalfleading',
  50 #    'doubleleading',
  51 #    'nowlfont',
  52 #    ]
  53
  54 def insert_tags(doc, split_re, tagname, exclude=None):
  55     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  56
  57     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  58     >>> insert_tags(t, re.compile('-'), 'd');
  59     >>> print etree.tostring(t)
  60     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  61     """
  62
  63     for elem in doc.iter(tag=etree.Element):
  64         if exclude and elem.tag in exclude:
  65             continue
  66         if elem.text:
  67             chunks = split_re.split(elem.text)
  68             while len(chunks) > 1:
  69                 ins = etree.Element(tagname)
  70                 ins.tail = chunks.pop()
  71                 elem.insert(0, ins)
  72             elem.text = chunks.pop(0)
  73         if elem.tail:
  74             chunks = split_re.split(elem.tail)
  75             parent = elem.getparent()
  76             ins_index = parent.index(elem) + 1
  77             while len(chunks) > 1:
  78                 ins = etree.Element(tagname)
  79                 ins.tail = chunks.pop()
  80                 parent.insert(ins_index, ins)
  81             elem.tail = chunks.pop(0)
  82
  83
  84 def substitute_hyphens(doc):
  85     insert_tags(doc,
  86                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  87                 "dywiz",
  88                 exclude=[DCNS("identifier.url"), DCNS("rights.license"), 'www']
  89                 )
  90
  91
  92 def fix_hanging(doc):
  93     insert_tags(doc,
  94                 re.compile("(?<=\s\w)\s+"),
  95                 "nbsp",
  96                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  97                 )
  98
  99 def fix_tables(doc):
 100     for kol in doc.iter(tag='kol'):
 101         if kol.tail is not None:
 102             if not kol.tail.strip():
 103                 kol.tail = None
 104     for table in doc.iter(tag='tabela'):
 105         if table.get('ramka') == '1' or table.get('ramki') == '1':
 106             table.set('_format', '|' + 'X|' * len(table[0]))
 107         else:
 108             table.set('_format', 'X' * len(table[0]))
 109
 110
 111
 112 def move_motifs_inside(doc):
 113     """ moves motifs to be into block elements """
 114     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
 115         for motif in master.xpath('motyw'):
 116             for sib in motif.itersiblings():
 117                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
 118                     # motif shouldn't have a tail - it would be untagged text
 119                     motif.tail = None
 120                     motif.getparent().remove(motif)
 121                     sib.insert(0, motif)
 122                     break
 123
 124
 125 def hack_motifs(doc):
 126     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 127     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 128
 129     moves motifs in stanzas from first verse to second
 130     and from next to last to last, then inserts negative vspace before them
 131     """
 132     for motif in doc.findall('//strofa//motyw'):
 133         # find relevant verse-level tag
 134         verse, stanza = motif, motif.getparent()
 135         while stanza is not None and stanza.tag != 'strofa':
 136             verse, stanza = stanza, stanza.getparent()
 137         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 138         breaks_after = sum(1 for i in verse.itersiblings('br'))
 139         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 140             move_by = 1
 141             if breaks_after == 2:
 142                 move_by += 1
 143             moved_motif = deepcopy(motif)
 144             motif.tag = 'span'
 145             motif.text = None
 146             moved_motif.tail = None
 147             moved_motif.set('moved', str(move_by))
 148
 149             for br in verse.itersiblings('br'):
 150                 if move_by > 1:
 151                     move_by -= 1
 152                     continue
 153                 br.addnext(moved_motif)
 154                 break
 155
 156
 157 def parse_creator(doc):
 158     """Generates readable versions of creator and translator tags.
 159
 160     Finds all dc:creator and dc.contributor.translator tags
 161     and adds *_parsed versions with forenames first.
 162     """
 163     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 164                     'creator', 'contributor.translator',
 165                     'contributor.editor', 'contributor.technical_editor')),
 166                     namespaces = {'dc': str(DCNS)})[::-1]:
 167         if not person.text:
 168             continue
 169         p = Person.from_text(person.text)
 170         person_parsed = deepcopy(person)
 171         person_parsed.tag = person.tag + '_parsed'
 172         person_parsed.set('sortkey', person.text)
 173         person_parsed.text = p.readable()
 174         person.getparent().insert(0, person_parsed)
 175
 176
 177 def get_stylesheet(name):
 178     return get_resource(STYLESHEETS[name])
 179
 180
 181 def package_available(package, args='', verbose=False):
 182     """ check if a verion of a latex package accepting given args is available """
 183     tempdir = mkdtemp('-wl2pdf-test')
 184     fpath = os.path.join(tempdir, 'test.tex')
 185     f = open(fpath, 'w')
 186     f.write(r"""
 187         \documentclass{wl}
 188         \usepackage[%s]{%s}
 189         \begin{document}
 190         \end{document}
 191         """ % (args, package))
 192     f.close()
 193     if verbose:
 194         p = call(['xelatex', '-output-directory', tempdir, fpath])
 195     else:
 196         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 197     shutil.rmtree(tempdir)
 198     return p == 0
 199
 200
 201 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
 202               cover=None, flags=None, customizations=None, ilustr_path=''):
 203     """ produces a PDF file with XeLaTeX
 204
 205     wldoc: a WLDocument
 206     verbose: prints all output from LaTeX
 207     save_tex: path to save the intermediary LaTeX file to
 208     morefloats (old/new/none): force specific morefloats
 209     cover: a cover.Cover factory or True for default
 210     flags: less-advertising,
 211     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 212     """
 213
 214     # Parse XSLT
 215     try:
 216         book_info = wldoc.book_info
 217         document = load_including_children(wldoc)
 218         root = document.edoc.getroot()
 219
 220         if cover:
 221             if cover is True:
 222                 cover = WLCover
 223             bound_cover = cover(book_info, width=2400)
 224             root.set('data-cover-width', str(bound_cover.width))
 225             root.set('data-cover-height', str(bound_cover.height))
 226             if bound_cover.uses_dc_cover:
 227                 if book_info.cover_by:
 228                     root.set('data-cover-by', book_info.cover_by)
 229                 if book_info.cover_source:
 230                     root.set('data-cover-source',
 231                             book_info.cover_source)
 232         if flags:
 233             for flag in flags:
 234                 root.set('flag-' + flag, 'yes')
 235
 236         # check for LaTeX packages
 237         if morefloats:
 238             root.set('morefloats', morefloats.lower())
 239         elif package_available('morefloats', 'maxfloats=19'):
 240             root.set('morefloats', 'new')
 241
 242         # add customizations
 243         if customizations is not None:
 244             root.set('customizations', u','.join(customizations))
 245
 246         # add editors info
 247         root.set('editors', u', '.join(sorted(
 248             editor.readable() for editor in document.editors())))
 249
 250         # hack the tree
 251         move_motifs_inside(document.edoc)
 252         hack_motifs(document.edoc)
 253         parse_creator(document.edoc)
 254         substitute_hyphens(document.edoc)
 255         fix_hanging(document.edoc)
 256         fix_tables(document.edoc)
 257
 258         # wl -> TeXML
 259         style_filename = get_stylesheet("wl2tex")
 260         style = etree.parse(style_filename)
 261
 262         texml = document.transform(style)
 263
 264         # TeXML -> LaTeX
 265         temp = mkdtemp('-wl2pdf')
 266
 267         for ilustr in document.edoc.findall("//ilustr"):
 268             shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
 269
 270         if cover:
 271             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 272                 bound_cover.save(f)
 273
 274         del document # no longer needed large object :)
 275
 276         tex_path = os.path.join(temp, 'doc.tex')
 277         fout = open(tex_path, 'w')
 278         process(StringIO(texml), fout, 'utf-8')
 279         fout.close()
 280         del texml
 281
 282         if save_tex:
 283             shutil.copy(tex_path, save_tex)
 284
 285         # LaTeX -> PDF
 286         shutil.copy(get_resource('pdf/wl.cls'), temp)
 287         shutil.copy(get_resource('res/wl-logo.png'), temp)
 288         shutil.copy(get_resource('res/prawokultury-logo.png'), temp)
 289         # shutil.copy(get_resource('res/trust-logo.eps'), temp)
 290         shutil.copy(get_resource('res/fnp-logo.eps'), temp)
 291         # shutil.copy(get_resource('res/koed-logo.eps'), temp)
 292
 293         try:
 294             cwd = os.getcwd()
 295         except OSError:
 296             cwd = None
 297         os.chdir(temp)
 298
 299         if verbose:
 300             p = call(['xelatex', tex_path])
 301         else:
 302             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 303         if p:
 304             raise ParseError("Error parsing .tex file")
 305
 306         if cwd is not None:
 307             os.chdir(cwd)
 308
 309         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
 310         pdf_path = os.path.join(temp, 'doc.pdf')
 311         shutil.move(pdf_path, output_file.name)
 312         shutil.rmtree(temp)
 313         return OutputFile.from_filename(output_file.name)
 314
 315     except (XMLSyntaxError, XSLTApplyError), e:
 316         raise ParseError(e)
 317
 318
 319 def load_including_children(wldoc=None, provider=None, uri=None):
 320     """ Makes one big xml file with children inserted at end.
 321
 322     Either wldoc or provider and URI must be provided.
 323     """
 324
 325     if uri and provider:
 326         f = provider.by_uri(uri)
 327         text = f.read().decode('utf-8')
 328         f.close()
 329     elif wldoc is not None:
 330         text = etree.tostring(wldoc.edoc, encoding=unicode)
 331         provider = wldoc.provider
 332     else:
 333         raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
 334
 335     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 336
 337     document = WLDocument.from_string(text,
 338                 parse_dublincore=True, provider=provider)
 339     document.swap_endlines()
 340
 341     for child_uri in document.book_info.parts:
 342         child = load_including_children(provider=provider, uri=child_uri)
 343         document.edoc.getroot().append(child.edoc.getroot())
 344     return document