librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 import sys
  17
  18 from Texml.processor import process
  19 from lxml import etree
  20 from lxml.etree import XMLSyntaxError, XSLTApplyError
  21
  22 from librarian.dcparser import Person
  23 from librarian.parser import WLDocument
  24 from librarian import ParseError, DCNS, get_resource
  25 from librarian import functions
  26
  27
  28 functions.reg_substitute_entities()
  29 functions.reg_strip()
  30 functions.reg_starts_white()
  31 functions.reg_ends_white()
  32 functions.reg_texcommand()
  33
  34 STYLESHEETS = {
  35     'wl2tex': 'pdf/wl2tex.xslt',
  36 }
  37
  38 CUSTOMIZATIONS = [
  39     'nofootnotes',
  40     'nothemes',
  41     'onehalfleading',
  42     'doubleleading',
  43     'nowlfont',
  44     ]
  45
  46 def insert_tags(doc, split_re, tagname, exclude=None):
  47     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  48
  49     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  50     >>> insert_tags(t, re.compile('-'), 'd');
  51     >>> print etree.tostring(t)
  52     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  53     """
  54
  55     for elem in doc.iter(tag=etree.Element):
  56         if exclude and elem.tag in exclude:
  57             continue
  58         if elem.text:
  59             chunks = split_re.split(elem.text)
  60             while len(chunks) > 1:
  61                 ins = etree.Element(tagname)
  62                 ins.tail = chunks.pop()
  63                 elem.insert(0, ins)
  64             elem.text = chunks.pop(0)
  65         if elem.tail:
  66             chunks = split_re.split(elem.tail)
  67             parent = elem.getparent()
  68             ins_index = parent.index(elem) + 1
  69             while len(chunks) > 1:
  70                 ins = etree.Element(tagname)
  71                 ins.tail = chunks.pop()
  72                 parent.insert(ins_index, ins)
  73             elem.tail = chunks.pop(0)
  74
  75
  76 def substitute_hyphens(doc):
  77     insert_tags(doc,
  78                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  79                 "dywiz",
  80                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  81                 )
  82
  83
  84 def fix_hanging(doc):
  85     insert_tags(doc,
  86                 re.compile("(?<=\s\w)\s+"),
  87                 "nbsp",
  88                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  89                 )
  90
  91
  92 def move_motifs_inside(doc):
  93     """ moves motifs to be into block elements """
  94     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  95         for motif in master.xpath('motyw'):
  96             for sib in motif.itersiblings():
  97                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  98                     # motif shouldn't have a tail - it would be untagged text
  99                     motif.tail = None
 100                     motif.getparent().remove(motif)
 101                     sib.insert(0, motif)
 102                     break
 103
 104
 105 def hack_motifs(doc):
 106     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 107     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 108
 109     moves motifs in stanzas from first verse to second
 110     and from next to last to last, then inserts negative vspace before them
 111     """
 112     for motif in doc.findall('//strofa//motyw'):
 113         # find relevant verse-level tag
 114         verse, stanza = motif, motif.getparent()
 115         while stanza is not None and stanza.tag != 'strofa':
 116             verse, stanza = stanza, stanza.getparent()
 117         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 118         breaks_after = sum(1 for i in verse.itersiblings('br'))
 119         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 120             move_by = 1
 121             if breaks_after == 2:
 122                 move_by += 1
 123             moved_motif = deepcopy(motif)
 124             motif.tag = 'span'
 125             motif.text = None
 126             moved_motif.tail = None
 127             moved_motif.set('moved', str(move_by))
 128
 129             for br in verse.itersiblings('br'):
 130                 if move_by > 1:
 131                     move_by -= 1
 132                     continue
 133                 br.addnext(moved_motif)
 134                 break
 135
 136
 137 def parse_creator(doc):
 138     """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first """
 139     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 140                     'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
 141                     namespaces = {'dc': str(DCNS)})[::-1]:
 142         if not person.text:
 143             continue
 144         p = Person.from_text(person.text)
 145         person_parsed = deepcopy(person)
 146         person_parsed.tag = person.tag + '_parsed'
 147         person_parsed.set('sortkey', person.text)
 148         person_parsed.text = p.readable()
 149         person.getparent().insert(0, person_parsed)
 150
 151
 152 def get_stylesheet(name):
 153     return get_resource(STYLESHEETS[name])
 154
 155
 156 def package_available(package, args='', verbose=False):
 157     """ check if a verion of a latex package accepting given args is available """
 158     tempdir = mkdtemp('-wl2pdf-test')
 159     fpath = os.path.join(tempdir, 'test.tex')
 160     f = open(fpath, 'w')
 161     f.write(r"""
 162         \documentclass{wl}
 163         \usepackage[%s]{%s}
 164         \begin{document}
 165         \end{document}
 166         """ % (args, package))
 167     f.close()
 168     if verbose:
 169         p = call(['xelatex', '-output-directory', tempdir, fpath])
 170     else:
 171         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 172     shutil.rmtree(tempdir)
 173     return p == 0
 174
 175
 176 def transform(provider, slug=None, file_path=None,
 177               output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None,
 178               cover=None, flags=None, customizations=None):
 179     """ produces a PDF file with XeLaTeX
 180
 181     provider: a DocProvider
 182     slug: slug of file to process, available by provider
 183     file_path can be provided instead of a slug
 184     output_file: file-like object or path to output file
 185     output_dir: path to directory to save output file to; either this or output_file must be present
 186     make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
 187     verbose: prints all output from LaTeX
 188     save_tex: path to save the intermediary LaTeX file to
 189     morefloats (old/new/none): force specific morefloats
 190     cover: a cover.Cover object
 191     flags: less-advertising,
 192     customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
 193     """
 194
 195     # Parse XSLT
 196     try:
 197         if file_path:
 198             if slug:
 199                 raise ValueError('slug or file_path should be specified, not both')
 200             document = load_including_children(provider, file_path=file_path)
 201         else:
 202             if not slug:
 203                 raise ValueError('either slug or file_path should be specified')
 204             document = load_including_children(provider, slug=slug)
 205
 206         if cover:
 207             document.edoc.getroot().set('data-cover-width', str(cover.width))
 208             document.edoc.getroot().set('data-cover-height', str(cover.height))
 209         if flags:
 210             for flag in flags:
 211                 document.edoc.getroot().set('flag-' + flag, 'yes')
 212
 213         # check for LaTeX packages
 214         if morefloats:
 215             document.edoc.getroot().set('morefloats', morefloats.lower())
 216         elif package_available('morefloats', 'maxfloats=19'):
 217             document.edoc.getroot().set('morefloats', 'new')
 218
 219         # add customizations
 220         if customizations is not None:
 221             document.edoc.getroot().set('customizations', u','.join(customizations))
 222
 223         # hack the tree
 224         move_motifs_inside(document.edoc)
 225         hack_motifs(document.edoc)
 226         parse_creator(document.edoc)
 227         substitute_hyphens(document.edoc)
 228         fix_hanging(document.edoc)
 229
 230         # find output dir
 231         if make_dir and output_dir is not None:
 232             author = unicode(document.book_info.author)
 233             output_dir = os.path.join(output_dir, author)
 234
 235         # wl -> TeXML
 236         style_filename = get_stylesheet("wl2tex")
 237         style = etree.parse(style_filename)
 238
 239         texml = document.transform(style)
 240
 241         # TeXML -> LaTeX
 242         temp = mkdtemp('-wl2pdf')
 243
 244         if cover:
 245             c = cover(document.book_info.author.readable(), document.book_info.title)
 246             with open(os.path.join(temp, 'cover.png'), 'w') as f:
 247                 c.save(f)
 248
 249         del document # no longer needed large object :)
 250
 251         tex_path = os.path.join(temp, 'doc.tex')
 252         fout = open(tex_path, 'w')
 253         process(StringIO(texml), fout, 'utf-8')
 254         fout.close()
 255         del texml
 256
 257         if save_tex:
 258             shutil.copy(tex_path, save_tex)
 259
 260         # LaTeX -> PDF
 261         shutil.copy(get_resource('pdf/wl.cls'), temp)
 262         shutil.copy(get_resource('res/wl-logo.png'), temp)
 263
 264         cwd = os.getcwd()
 265         os.chdir(temp)
 266
 267         if verbose:
 268             p = call(['xelatex', tex_path])
 269         else:
 270             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 271         if p:
 272             raise ParseError("Error parsing .tex file")
 273
 274         os.chdir(cwd)
 275
 276         # save the PDF
 277         pdf_path = os.path.join(temp, 'doc.pdf')
 278         if output_dir is not None:
 279             try:
 280                 os.makedirs(output_dir)
 281             except OSError:
 282                 pass
 283             if slug:
 284                 output_path = os.path.join(output_dir, '%s.pdf' % slug)
 285             else:
 286                 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
 287             shutil.move(pdf_path, output_path)
 288         else:
 289             if hasattr(output_file, 'write'):
 290                 # file-like object
 291                 with open(pdf_path) as f:
 292                     output_file.write(f.read())
 293                 output_file.close()
 294             else:
 295                 # path to output file
 296                 shutil.copy(pdf_path, output_file)
 297         shutil.rmtree(temp)
 298
 299     except (XMLSyntaxError, XSLTApplyError), e:
 300         raise ParseError(e)
 301
 302
 303 def load_including_children(provider, slug=None, uri=None, file_path=None):
 304     """ makes one big xml file with children inserted at end
 305     either slug or uri must be provided
 306     """
 307
 308     if uri:
 309         f = provider.by_uri(uri)
 310     elif slug:
 311         f = provider[slug]
 312     elif file_path:
 313         f = open(file_path, 'r')
 314     else:
 315         raise ValueError('Neither slug, URI nor file path provided for a book.')
 316
 317     text = f.read().decode('utf-8')
 318     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 319
 320     document = WLDocument.from_string(text, True,
 321         parse_dublincore=True)
 322
 323     f.close()
 324     for child_uri in document.book_info.parts:
 325         print child_uri
 326         child = load_including_children(provider, uri=child_uri)
 327         document.edoc.getroot().append(child.edoc.getroot())
 328     return document