librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 import sys
  17
  18 from Texml.processor import process
  19 from lxml import etree
  20 from lxml.etree import XMLSyntaxError, XSLTApplyError
  21
  22 from librarian.dcparser import Person
  23 from librarian.parser import WLDocument
  24 from librarian import ParseError, DCNS, get_resource
  25 from librarian import functions
  26
  27
# Run the registration hooks from librarian.functions at import time.
# NOTE(review): presumably these register extension functions that the XSLT
# stylesheets call -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a short stylesheet name to its resource path; looked up by
# get_stylesheet() and resolved through get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
  37
  38
  39 def insert_tags(doc, split_re, tagname, exclude=None):
  40     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  41
  42     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  43     >>> insert_tags(t, re.compile('-'), 'd');
  44     >>> print etree.tostring(t)
  45     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  46     """
  47
  48     for elem in doc.iter(tag=etree.Element):
  49         if exclude and elem.tag in exclude:
  50             continue
  51         if elem.text:
  52             chunks = split_re.split(elem.text)
  53             while len(chunks) > 1:
  54                 ins = etree.Element(tagname)
  55                 ins.tail = chunks.pop()
  56                 elem.insert(0, ins)
  57             elem.text = chunks.pop(0)
  58         if elem.tail:
  59             chunks = split_re.split(elem.tail)
  60             parent = elem.getparent()
  61             ins_index = parent.index(elem) + 1
  62             while len(chunks) > 1:
  63                 ins = etree.Element(tagname)
  64                 ins.tail = chunks.pop()
  65                 parent.insert(ins_index, ins)
  66             elem.tail = chunks.pop(0)
  67
  68
  69 def substitute_hyphens(doc):
  70     insert_tags(doc,
  71                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  72                 "dywiz",
  73                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  74                 )
  75
  76
  77 def fix_hanging(doc):
  78     insert_tags(doc,
  79                 re.compile("(?<=\s\w)\s+"),
  80                 "nbsp",
  81                 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
  82                 )
  83
  84
  85 def move_motifs_inside(doc):
  86     """ moves motifs to be into block elements """
  87     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  88         for motif in master.xpath('motyw'):
  89             for sib in motif.itersiblings():
  90                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  91                     # motif shouldn't have a tail - it would be untagged text
  92                     motif.tail = None
  93                     motif.getparent().remove(motif)
  94                     sib.insert(0, motif)
  95                     break
  96
  97
  98 def hack_motifs(doc):
  99     """ dirty hack for the marginpar-creates-orphans LaTeX problem
 100     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 101
 102     moves motifs in stanzas from first verse to second
 103     and from next to last to last, then inserts negative vspace before them
 104     """
 105     for motif in doc.findall('//strofa//motyw'):
 106         # find relevant verse-level tag
 107         verse, stanza = motif, motif.getparent()
 108         while stanza is not None and stanza.tag != 'strofa':
 109             verse, stanza = stanza, stanza.getparent()
 110         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 111         breaks_after = sum(1 for i in verse.itersiblings('br'))
 112         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 113             move_by = 1
 114             if breaks_after == 2:
 115                 move_by += 1
 116             moved_motif = deepcopy(motif)
 117             motif.tag = 'span'
 118             motif.text = None
 119             moved_motif.tail = None
 120             moved_motif.set('moved', str(move_by))
 121
 122             for br in verse.itersiblings('br'):
 123                 if move_by > 1:
 124                     move_by -= 1
 125                     continue
 126                 br.addnext(moved_motif)
 127                 break
 128
 129
 130 def parse_creator(doc):
 131     """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first """
 132     for person in doc.xpath("|".join('//dc:'+(tag) for tag in (
 133                     'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')),
 134                     namespaces = {'dc': str(DCNS)})[::-1]:
 135         p = Person.from_text(person.text)
 136         person_parsed = deepcopy(person)
 137         person_parsed.tag = person.tag + '_parsed'
 138         person_parsed.text = p.readable()
 139         person.getparent().insert(0, person_parsed)
 140
 141
 142 def get_stylesheet(name):
 143     return get_resource(STYLESHEETS[name])
 144
 145
 146 def package_available(package, args='', verbose=False):
 147     """ check if a verion of a latex package accepting given args is available """
 148     tempdir = mkdtemp('-wl2pdf-test')
 149     fpath = os.path.join(tempdir, 'test.tex')
 150     f = open(fpath, 'w')
 151     f.write(r"""
 152         \documentclass{book}
 153         \usepackage[%s]{%s}
 154         \begin{document}
 155         \end{document}
 156         """ % (args, package))
 157     f.close()
 158     if verbose:
 159         p = call(['xelatex', '-output-directory', tempdir, fpath])
 160     else:
 161         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 162     shutil.rmtree(tempdir)
 163     return p == 0
 164
 165
 166 def transform(provider, slug=None, file_path=None,
 167               output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None):
 168     """ produces a PDF file with XeLaTeX
 169
 170     provider: a DocProvider
 171     slug: slug of file to process, available by provider
 172     file_path can be provided instead of a slug
 173     output_file: file-like object or path to output file
 174     output_dir: path to directory to save output file to; either this or output_file must be present
 175     make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
 176     verbose: prints all output from LaTeX
 177     save_tex: path to save the intermediary LaTeX file to
 178     morefloats (old/new/none): force specific morefloats
 179     """
 180
 181     # Parse XSLT
 182     try:
 183         if file_path:
 184             if slug:
 185                 raise ValueError('slug or file_path should be specified, not both')
 186             document = load_including_children(provider, file_path=file_path)
 187         else:
 188             if not slug:
 189                 raise ValueError('either slug or file_path should be specified')
 190             document = load_including_children(provider, slug=slug)
 191
 192         # check for LaTeX packages
 193         if morefloats:
 194             document.edoc.getroot().set('morefloats', morefloats.lower())
 195         elif package_available('morefloats', 'maxfloats=19'):
 196             document.edoc.getroot().set('morefloats', 'new')
 197
 198         # hack the tree
 199         move_motifs_inside(document.edoc)
 200         hack_motifs(document.edoc)
 201         parse_creator(document.edoc)
 202         substitute_hyphens(document.edoc)
 203         fix_hanging(document.edoc)
 204
 205         # find output dir
 206         if make_dir and output_dir is not None:
 207             author = unicode(document.book_info.author)
 208             output_dir = os.path.join(output_dir, author)
 209
 210         # wl -> TeXML
 211         style_filename = get_stylesheet("wl2tex")
 212         style = etree.parse(style_filename)
 213         texml = document.transform(style)
 214         del document # no longer needed large object :)
 215
 216         # TeXML -> LaTeX
 217         temp = mkdtemp('-wl2pdf')
 218         tex_path = os.path.join(temp, 'doc.tex')
 219         fout = open(tex_path, 'w')
 220         process(StringIO(texml), fout, 'utf-8')
 221         fout.close()
 222         del texml
 223
 224         if save_tex:
 225             shutil.copy(tex_path, save_tex)
 226
 227         # LaTeX -> PDF
 228         shutil.copy(get_resource('pdf/wl.sty'), temp)
 229         shutil.copy(get_resource('res/wl-logo.png'), temp)
 230
 231         cwd = os.getcwd()
 232         os.chdir(temp)
 233
 234         if verbose:
 235             p = call(['xelatex', tex_path])
 236         else:
 237             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 238         if p:
 239             raise ParseError("Error parsing .tex file")
 240
 241         os.chdir(cwd)
 242
 243         # save the PDF
 244         pdf_path = os.path.join(temp, 'doc.pdf')
 245         if output_dir is not None:
 246             try:
 247                 os.makedirs(output_dir)
 248             except OSError:
 249                 pass
 250             if slug:
 251                 output_path = os.path.join(output_dir, '%s.pdf' % slug)
 252             else:
 253                 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
 254             shutil.move(pdf_path, output_path)
 255         else:
 256             if hasattr(output_file, 'write'):
 257                 # file-like object
 258                 with open(pdf_path) as f:
 259                     output_file.write(f.read())
 260                 output_file.close()
 261             else:
 262                 # path to output file
 263                 shutil.copy(pdf_path, output_file)
 264         shutil.rmtree(temp)
 265
 266     except (XMLSyntaxError, XSLTApplyError), e:
 267         raise ParseError(e)
 268
 269
 270 def load_including_children(provider, slug=None, uri=None, file_path=None):
 271     """ makes one big xml file with children inserted at end
 272     either slug or uri must be provided
 273     """
 274
 275     if uri:
 276         f = provider.by_uri(uri)
 277     elif slug:
 278         f = provider[slug]
 279     elif file_path:
 280         f = open(file_path, 'r')
 281     else:
 282         raise ValueError('Neither slug, URI nor file path provided for a book.')
 283
 284     text = f.read().decode('utf-8')
 285     text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
 286
 287     document = WLDocument.from_string(text, True,
 288         parse_dublincore=True)
 289
 290     f.close()
 291     print document.book_info
 292     for child_uri in document.book_info.parts:
 293         print child_uri
 294         child = load_including_children(provider, uri=child_uri)
 295         document.edoc.getroot().append(child.edoc.getroot())
 296     return document