librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 import sys
  17
  18 from Texml.processor import process
  19 from lxml import etree
  20 from lxml.etree import XMLSyntaxError, XSLTApplyError
  21
  22 from librarian.dcparser import Person
  23 from librarian.parser import WLDocument
  24 from librarian import ParseError, DCNS, get_resource
  25 from librarian import functions
  26
  27
# Call the reg_* hooks of librarian.functions once, when this module is imported.
# NOTE(review): presumably each one registers an XPath/XSLT extension function
# that the wl2tex stylesheet relies on -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Stylesheet name -> resource path; get_stylesheet() looks names up here and
# hands the path to get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  37
  38
  39 def insert_tags(doc, split_re, tagname, exclude=None):
  40     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  41
  42     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  43     >>> insert_tags(t, re.compile('-'), 'd');
  44     >>> print etree.tostring(t)
  45     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  46     """
  47
  48     for elem in doc.iter(tag=etree.Element):
  49         if exclude and elem.tag in exclude:
  50             continue
  51         if elem.text:
  52             chunks = split_re.split(elem.text)
  53             while len(chunks) > 1:
  54                 ins = etree.Element(tagname)
  55                 ins.tail = chunks.pop()
  56                 elem.insert(0, ins)
  57             elem.text = chunks.pop(0)
  58         if elem.tail:
  59             chunks = split_re.split(elem.tail)
  60             parent = elem.getparent()
  61             ins_index = parent.index(elem) + 1
  62             while len(chunks) > 1:
  63                 ins = etree.Element(tagname)
  64                 ins.tail = chunks.pop()
  65                 parent.insert(ins_index, ins)
  66             elem.tail = chunks.pop(0)
  67
  68
  69 def substitute_hyphens(doc):
  70     insert_tags(doc,
  71                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  72                 "dywiz",
  73                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  74                 )
  75
  76
  77 def fix_hanging(doc):
  78     insert_tags(doc,
  79                 re.compile("(?<=\s\w)\s+"),
  80                 "nbsp",
  81                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  82                 )
  83
  84
  85 def move_motifs_inside(doc):
  86     """ moves motifs to be into block elements """
  87     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  88         for motif in master.xpath('motyw'):
  89             for sib in motif.itersiblings():
  90                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  91                     # motif shouldn't have a tail - it would be untagged text
  92                     motif.tail = None
  93                     motif.getparent().remove(motif)
  94                     sib.insert(0, motif)
  95                     break
  96
  97
  98 def hack_motifs(doc):
  99     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 100     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 101
 102     moves motifs in stanzas from first verse to second
 103     and from next to last to last, then inserts negative vspace before them
 104     """
 105     for motif in doc.findall('//strofa//motyw'):
 106         # find relevant verse-level tag
 107         verse, stanza = motif, motif.getparent()
 108         while stanza is not None and stanza.tag != 'strofa':
 109             verse, stanza = stanza, stanza.getparent()
 110         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 111         breaks_after = sum(1 for i in verse.itersiblings('br'))
 112         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 113             move_by = 1
 114             if breaks_after == 2:
 115                 move_by += 1
 116             moved_motif = deepcopy(motif)
 117             motif.tag = 'span'
 118             motif.text = None
 119             moved_motif.tail = None
 120             moved_motif.set('moved', str(move_by))
 121
 122             for br in verse.itersiblings('br'):
 123                 if move_by > 1:
 124                     move_by -= 1
 125                     continue
 126                 br.addnext(moved_motif)
 127                 break
 128
 129
 130 def parse_creator(doc):
 131     """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first """
 132     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 133                     'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
 134                     namespaces = {'dc': str(DCNS)})[::-1]:
 135         p = Person.from_text(person.text)
 136         person_parsed = deepcopy(person)
 137         person_parsed.tag = person.tag + '_parsed'
 138         person_parsed.set('sortkey', person.text)
 139         person_parsed.text = p.readable()
 140         person.getparent().insert(0, person_parsed)
 141
 142
 143 def get_stylesheet(name):
 144     return get_resource(STYLESHEETS[name])
 145
 146
 147 def package_available(package, args='', verbose=False):
 148     """ check if a verion of a latex package accepting given args is available """
 149     tempdir = mkdtemp('-wl2pdf-test')
 150     fpath = os.path.join(tempdir, 'test.tex')
 151     f = open(fpath, 'w')
 152     f.write(r"""
 153         \documentclass{book}
 154         \usepackage[%s]{%s}
 155         \begin{document}
 156         \end{document}
 157         """ % (args, package))
 158     f.close()
 159     if verbose:
 160         p = call(['xelatex', '-output-directory', tempdir, fpath])
 161     else:
 162         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 163     shutil.rmtree(tempdir)
 164     return p == 0
 165
 166
 167 def transform(provider, slug=None, file_path=None,
 168               output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None):
 169     """ produces a PDF file with XeLaTeX
 170
 171     provider: a DocProvider
 172     slug: slug of file to process, available by provider
 173     file_path can be provided instead of a slug
 174     output_file: file-like object or path to output file
 175     output_dir: path to directory to save output file to; either this or output_file must be present
 176     make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
 177     verbose: prints all output from LaTeX
 178     save_tex: path to save the intermediary LaTeX file to
 179     morefloats (old/new/none): force specific morefloats
 180     """
 181
 182     # Parse XSLT
 183     try:
 184         if file_path:
 185             if slug:
 186                 raise ValueError('slug or file_path should be specified, not both')
 187             document = load_including_children(provider, file_path=file_path)
 188         else:
 189             if not slug:
 190                 raise ValueError('either slug or file_path should be specified')
 191             document = load_including_children(provider, slug=slug)
 192
 193         # check for LaTeX packages
 194         if morefloats:
 195             document.edoc.getroot().set('morefloats', morefloats.lower())
 196         elif package_available('morefloats', 'maxfloats=19'):
 197             document.edoc.getroot().set('morefloats', 'new')
 198
 199         # hack the tree
 200         move_motifs_inside(document.edoc)
 201         hack_motifs(document.edoc)
 202         parse_creator(document.edoc)
 203         substitute_hyphens(document.edoc)
 204         fix_hanging(document.edoc)
 205
 206         # find output dir
 207         if make_dir and output_dir is not None:
 208             author = unicode(document.book_info.author)
 209             output_dir = os.path.join(output_dir, author)
 210
 211         # wl -> TeXML
 212         style_filename = get_stylesheet("wl2tex")
 213         style = etree.parse(style_filename)
 214         texml = document.transform(style)
 215         del document # no longer needed large object :)
 216
 217         # TeXML -> LaTeX
 218         temp = mkdtemp('-wl2pdf')
 219         tex_path = os.path.join(temp, 'doc.tex')
 220         fout = open(tex_path, 'w')
 221         process(StringIO(texml), fout, 'utf-8')
 222         fout.close()
 223         del texml
 224
 225         if save_tex:
 226             shutil.copy(tex_path, save_tex)
 227
 228         # LaTeX -> PDF
 229         shutil.copy(get_resource('pdf/wl.sty'), temp)
 230         shutil.copy(get_resource('res/wl-logo.png'), temp)
 231
 232         cwd = os.getcwd()
 233         os.chdir(temp)
 234
 235         if verbose:
 236             p = call(['xelatex', tex_path])
 237         else:
 238             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 239         if p:
 240             raise ParseError("Error parsing .tex file")
 241
 242         os.chdir(cwd)
 243
 244         # save the PDF
 245         pdf_path = os.path.join(temp, 'doc.pdf')
 246         if output_dir is not None:
 247             try:
 248                 os.makedirs(output_dir)
 249             except OSError:
 250                 pass
 251             if slug:
 252                 output_path = os.path.join(output_dir, '%s.pdf' % slug)
 253             else:
 254                 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
 255             shutil.move(pdf_path, output_path)
 256         else:
 257             if hasattr(output_file, 'write'):
 258                 # file-like object
 259                 with open(pdf_path) as f:
 260                     output_file.write(f.read())
 261                 output_file.close()
 262             else:
 263                 # path to output file
 264                 shutil.copy(pdf_path, output_file)
 265         shutil.rmtree(temp)
 266
 267     except (XMLSyntaxError, XSLTApplyError), e:
 268         raise ParseError(e)
 269
 270
 271 def load_including_children(provider, slug=None, uri=None, file_path=None):
 272     """ makes one big xml file with children inserted at end
 273     either slug or uri must be provided
 274     """
 275
 276     if uri:
 277         f = provider.by_uri(uri)
 278     elif slug:
 279         f = provider[slug]
 280     elif file_path:
 281         f = open(file_path, 'r')
 282     else:
 283         raise ValueError('Neither slug, URI nor file path provided for a book.')
 284
 285     text = f.read().decode('utf-8')
 286     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 287
 288     document = WLDocument.from_string(text, True,
 289         parse_dublincore=True)
 290
 291     f.close()
 292     for child_uri in document.book_info.parts:
 293         print child_uri
 294         child = load_including_children(provider, uri=child_uri)
 295         document.edoc.getroot().append(child.edoc.getroot())
 296     return document