librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 import sys
  17
  18 from Texml.processor import process
  19 from lxml import etree
  20 from lxml.etree import XMLSyntaxError, XSLTApplyError
  21
  22 from librarian.dcparser import Person
  23 from librarian.parser import WLDocument
  24 from librarian import ParseError, DCNS, get_resource
  25 from librarian import functions
  26
  27
# Run the registration hooks of librarian.functions once, at import time.
# NOTE(review): presumably these register the custom XPath/XSLT extension
# functions used by the wl2tex stylesheet -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a stylesheet name to its resource path; get_stylesheet() resolves the
# path with get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  37
  38
  39 def insert_tags(doc, split_re, tagname, exclude=None):
  40     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  41
  42     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  43     >>> insert_tags(t, re.compile('-'), 'd');
  44     >>> print etree.tostring(t)
  45     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  46     """
  47
  48     for elem in doc.iter(tag=etree.Element):
  49         if exclude and elem.tag in exclude:
  50             continue
  51         if elem.text:
  52             chunks = split_re.split(elem.text)
  53             while len(chunks) > 1:
  54                 ins = etree.Element(tagname)
  55                 ins.tail = chunks.pop()
  56                 elem.insert(0, ins)
  57             elem.text = chunks.pop(0)
  58         if elem.tail:
  59             chunks = split_re.split(elem.tail)
  60             parent = elem.getparent()
  61             ins_index = parent.index(elem) + 1
  62             while len(chunks) > 1:
  63                 ins = etree.Element(tagname)
  64                 ins.tail = chunks.pop()
  65                 parent.insert(ins_index, ins)
  66             elem.tail = chunks.pop(0)
  67
  68
  69 def substitute_hyphens(doc):
  70     insert_tags(doc,
  71                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  72                 "dywiz",
  73                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  74                 )
  75
  76
  77 def fix_hanging(doc):
  78     insert_tags(doc,
  79                 re.compile("(?<=\s\w)\s+"),
  80                 "nbsp",
  81                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  82                 )
  83
  84
  85 def move_motifs_inside(doc):
  86     """ moves motifs to be into block elements """
  87     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  88         for motif in master.xpath('motyw'):
  89             for sib in motif.itersiblings():
  90                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  91                     # motif shouldn't have a tail - it would be untagged text
  92                     motif.tail = None
  93                     motif.getparent().remove(motif)
  94                     sib.insert(0, motif)
  95                     break
  96
  97
  98 def hack_motifs(doc):
  99     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 100     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 101
 102     moves motifs in stanzas from first verse to second
 103     and from next to last to last, then inserts negative vspace before them
 104     """
 105     for motif in doc.findall('//strofa//motyw'):
 106         # find relevant verse-level tag
 107         verse, stanza = motif, motif.getparent()
 108         while stanza is not None and stanza.tag != 'strofa':
 109             verse, stanza = stanza, stanza.getparent()
 110         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 111         breaks_after = sum(1 for i in verse.itersiblings('br'))
 112         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 113             move_by = 1
 114             if breaks_after == 2:
 115                 move_by += 1
 116             moved_motif = deepcopy(motif)
 117             motif.tag = 'span'
 118             motif.text = None
 119             moved_motif.tail = None
 120             moved_motif.set('moved', str(move_by))
 121
 122             for br in verse.itersiblings('br'):
 123                 if move_by > 1:
 124                     move_by -= 1
 125                     continue
 126                 br.addnext(moved_motif)
 127                 break
 128
 129
 130 def parse_creator(doc):
 131     """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first """
 132     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 133                     'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
 134                     namespaces = {'dc': str(DCNS)})[::-1]:
 135         if not person.text:
 136             continue
 137         p = Person.from_text(person.text)
 138         person_parsed = deepcopy(person)
 139         person_parsed.tag = person.tag + '_parsed'
 140         person_parsed.set('sortkey', person.text)
 141         person_parsed.text = p.readable()
 142         person.getparent().insert(0, person_parsed)
 143
 144
 145 def get_stylesheet(name):
 146     return get_resource(STYLESHEETS[name])
 147
 148
 149 def package_available(package, args='', verbose=False):
 150     """ check if a verion of a latex package accepting given args is available """
 151     tempdir = mkdtemp('-wl2pdf-test')
 152     fpath = os.path.join(tempdir, 'test.tex')
 153     f = open(fpath, 'w')
 154     f.write(r"""
 155         \documentclass{book}
 156         \usepackage[%s]{%s}
 157         \begin{document}
 158         \end{document}
 159         """ % (args, package))
 160     f.close()
 161     if verbose:
 162         p = call(['xelatex', '-output-directory', tempdir, fpath])
 163     else:
 164         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 165     shutil.rmtree(tempdir)
 166     return p == 0
 167
 168
 169 def transform(provider, slug=None, file_path=None,
 170               output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None):
 171     """ produces a PDF file with XeLaTeX
 172
 173     provider: a DocProvider
 174     slug: slug of file to process, available by provider
 175     file_path can be provided instead of a slug
 176     output_file: file-like object or path to output file
 177     output_dir: path to directory to save output file to; either this or output_file must be present
 178     make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
 179     verbose: prints all output from LaTeX
 180     save_tex: path to save the intermediary LaTeX file to
 181     morefloats (old/new/none): force specific morefloats
 182     """
 183
 184     # Parse XSLT
 185     try:
 186         if file_path:
 187             if slug:
 188                 raise ValueError('slug or file_path should be specified, not both')
 189             document = load_including_children(provider, file_path=file_path)
 190         else:
 191             if not slug:
 192                 raise ValueError('either slug or file_path should be specified')
 193             document = load_including_children(provider, slug=slug)
 194
 195         # check for LaTeX packages
 196         if morefloats:
 197             document.edoc.getroot().set('morefloats', morefloats.lower())
 198         elif package_available('morefloats', 'maxfloats=19'):
 199             document.edoc.getroot().set('morefloats', 'new')
 200
 201         # hack the tree
 202         move_motifs_inside(document.edoc)
 203         hack_motifs(document.edoc)
 204         parse_creator(document.edoc)
 205         substitute_hyphens(document.edoc)
 206         fix_hanging(document.edoc)
 207
 208         # find output dir
 209         if make_dir and output_dir is not None:
 210             author = unicode(document.book_info.author)
 211             output_dir = os.path.join(output_dir, author)
 212
 213         # wl -> TeXML
 214         style_filename = get_stylesheet("wl2tex")
 215         style = etree.parse(style_filename)
 216         texml = document.transform(style)
 217         del document # no longer needed large object :)
 218
 219         # TeXML -> LaTeX
 220         temp = mkdtemp('-wl2pdf')
 221         tex_path = os.path.join(temp, 'doc.tex')
 222         fout = open(tex_path, 'w')
 223         process(StringIO(texml), fout, 'utf-8')
 224         fout.close()
 225         del texml
 226
 227         if save_tex:
 228             shutil.copy(tex_path, save_tex)
 229
 230         # LaTeX -> PDF
 231         shutil.copy(get_resource('pdf/wl.sty'), temp)
 232         shutil.copy(get_resource('res/wl-logo.png'), temp)
 233
 234         cwd = os.getcwd()
 235         os.chdir(temp)
 236
 237         if verbose:
 238             p = call(['xelatex', tex_path])
 239         else:
 240             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 241         if p:
 242             raise ParseError("Error parsing .tex file")
 243
 244         os.chdir(cwd)
 245
 246         # save the PDF
 247         pdf_path = os.path.join(temp, 'doc.pdf')
 248         if output_dir is not None:
 249             try:
 250                 os.makedirs(output_dir)
 251             except OSError:
 252                 pass
 253             if slug:
 254                 output_path = os.path.join(output_dir, '%s.pdf' % slug)
 255             else:
 256                 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
 257             shutil.move(pdf_path, output_path)
 258         else:
 259             if hasattr(output_file, 'write'):
 260                 # file-like object
 261                 with open(pdf_path) as f:
 262                     output_file.write(f.read())
 263                 output_file.close()
 264             else:
 265                 # path to output file
 266                 shutil.copy(pdf_path, output_file)
 267         shutil.rmtree(temp)
 268
 269     except (XMLSyntaxError, XSLTApplyError), e:
 270         raise ParseError(e)
 271
 272
 273 def load_including_children(provider, slug=None, uri=None, file_path=None):
 274     """ makes one big xml file with children inserted at end
 275     either slug or uri must be provided
 276     """
 277
 278     if uri:
 279         f = provider.by_uri(uri)
 280     elif slug:
 281         f = provider[slug]
 282     elif file_path:
 283         f = open(file_path, 'r')
 284     else:
 285         raise ValueError('Neither slug, URI nor file path provided for a book.')
 286
 287     text = f.read().decode('utf-8')
 288     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 289
 290     document = WLDocument.from_string(text, True,
 291         parse_dublincore=True)
 292
 293     f.close()
 294     for child_uri in document.book_info.parts:
 295         print child_uri
 296         child = load_including_children(provider, uri=child_uri)
 297         document.edoc.getroot().append(child.edoc.getroot())
 298     return document