librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp
  12 import re
  13 from copy import deepcopy
  14 from subprocess import call, PIPE
  15
  16 import sys
  17
  18 from Texml.processor import process
  19 from lxml import etree
  20 from lxml.etree import XMLSyntaxError, XSLTApplyError
  21
  22 from librarian.dcparser import Person
  23 from librarian.parser import WLDocument
  24 from librarian import ParseError, DCNS
  25 from librarian import functions
  26
  27
# Run the registration hooks from librarian.functions at import time.
# NOTE(review): presumably these register the extension functions used by the
# XSLT stylesheets - confirm in librarian/functions.py.
functions.reg_substitute_entities()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()

# Maps a short stylesheet name to its path relative to this module's
# directory; looked up by get_stylesheet() and resolved by get_resource().
STYLESHEETS = {
    'wl2tex': 'xslt/wl2tex.xslt',
}
  36
  37
  38 def insert_tags(doc, split_re, tagname):
  39     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  40
  41     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  42     >>> insert_tags(t, re.compile('-'), 'd');
  43     >>> print etree.tostring(t)
  44     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  45     """
  46
  47     for elem in doc.iter(tag=etree.Element):
  48         if elem.text:
  49             chunks = split_re.split(elem.text)
  50             while len(chunks) > 1:
  51                 ins = etree.Element(tagname)
  52                 ins.tail = chunks.pop()
  53                 elem.insert(0, ins)
  54             elem.text = chunks.pop(0)
  55         if elem.tail:
  56             chunks = split_re.split(elem.tail)
  57             parent = elem.getparent()
  58             ins_index = parent.index(elem) + 1
  59             while len(chunks) > 1:
  60                 ins = etree.Element(tagname)
  61                 ins.tail = chunks.pop()
  62                 parent.insert(ins_index, ins)
  63             elem.tail = chunks.pop(0)
  64
  65
  66 def substitute_hyphens(doc):
  67     insert_tags(doc,
  68                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  69                 "dywiz")
  70
  71
  72 def fix_hanging(doc):
  73     insert_tags(doc,
  74                 re.compile("(?<=\s\w)\s+"),
  75                 "nbsp")
  76
  77
  78 def move_motifs_inside(doc):
  79     """ moves motifs to be into block elements """
  80     for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
  81         for motif in master.xpath('motyw'):
  82             print motif.text
  83             for sib in motif.itersiblings():
  84                 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
  85                     # motif shouldn't have a tail - it would be untagged text
  86                     motif.tail = None
  87                     motif.getparent().remove(motif)
  88                     sib.insert(0, motif)
  89                     break
  90
  91
  92 def hack_motifs(doc):
  93     """ dirty hack for the marginpar-creates-orphans LaTeX problem
  94     see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
  95
  96     moves motifs in stanzas from first verse to second
  97     and from next to last to last, then inserts negative vspace before them
  98     """
  99     for motif in doc.findall('//strofa//motyw'):
 100         # find relevant verse-level tag
 101         verse, stanza = motif, motif.getparent()
 102         while stanza is not None and stanza.tag != 'strofa':
 103             verse, stanza = stanza, stanza.getparent()
 104         breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 105         breaks_after = sum(1 for i in verse.itersiblings('br'))
 106         if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 107             move_by = 1
 108             if breaks_after == 2:
 109                 move_by += 1
 110             moved_motif = deepcopy(motif)
 111             motif.tag = 'span'
 112             motif.text = None
 113             moved_motif.tail = None
 114             moved_motif.set('moved', str(move_by))
 115
 116             for br in verse.itersiblings('br'):
 117                 if move_by > 1:
 118                     move_by -= 1
 119                     continue
 120                 br.addnext(moved_motif)
 121                 break
 122
 123
 124 def parse_creator(doc):
 125     """ find all dc:creator tags and add dc:creator_parsed with forenames first """
 126     for creator in doc.findall('//'+DCNS('creator')):
 127         p = Person.from_text(creator.text)
 128         creator_parsed = deepcopy(creator)
 129         creator_parsed.tag = DCNS('creator_parsed')
 130         creator_parsed.text = ' '.join(p.first_names + (p.last_name,))
 131         creator.getparent().insert(0, creator_parsed)
 132
 133
 134 def get_resource(path):
 135     return os.path.join(os.path.dirname(__file__), path)
 136
 137 def get_stylesheet(name):
 138     return get_resource(STYLESHEETS[name])
 139
 140
 141 def package_available(package, args='', verbose=False):
 142     """ check if a verion of a latex package accepting given args is available """
 143     tempdir = mkdtemp('-wl2pdf-test')
 144     fpath = os.path.join(tempdir, 'test.tex')
 145     f = open(fpath, 'w')
 146     f.write(r"""
 147         \documentclass{book}
 148         \usepackage[%s]{%s}
 149         \begin{document}
 150         \end{document}
 151         """ % (args, package))
 152     f.close()
 153     if verbose:
 154         p = call(['xelatex', '-output-directory', tempdir, fpath])
 155     else:
 156         p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
 157     shutil.rmtree(tempdir)
 158     return p == 0
 159
 160
 161 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None):
 162     """ produces a PDF file with XeLaTeX
 163
 164     provider: a DocProvider
 165     slug: slug of file to process, available by provider
 166     file_path can be provided instead of a slug
 167     output_file: file-like object or path to output file
 168     output_dir: path to directory to save output file to; either this or output_file must be present
 169     make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
 170     verbose: prints all output from LaTeX
 171     save_tex: path to save the intermediary LaTeX file to
 172     """
 173
 174     # Parse XSLT
 175     try:
 176         if file_path:
 177             if slug:
 178                 raise ValueError('slug or file_path should be specified, not both')
 179             document = load_including_children(provider, file_path=file_path)
 180         else:
 181             if not slug:
 182                 raise ValueError('either slug or file_path should be specified')
 183             document = load_including_children(provider, slug=slug)
 184
 185         # check for LaTeX packages
 186         if not package_available('morefloats', 'maxfloats=19'):
 187             # using old morefloats or none at all
 188             document.edoc.getroot().set('old-morefloats', 'yes')
 189
 190         # hack the tree
 191         move_motifs_inside(document.edoc)
 192         hack_motifs(document.edoc)
 193         parse_creator(document.edoc)
 194         substitute_hyphens(document.edoc)
 195         fix_hanging(document.edoc)
 196
 197         # find output dir
 198         if make_dir and output_dir is not None:
 199             author = unicode(document.book_info.author)
 200             output_dir = os.path.join(output_dir, author)
 201
 202         # wl -> TeXML
 203         style_filename = get_stylesheet("wl2tex")
 204         style = etree.parse(style_filename)
 205         texml = document.transform(style)
 206         del document # no longer needed large object :)
 207
 208         # TeXML -> LaTeX
 209         temp = mkdtemp('-wl2pdf')
 210         tex_path = os.path.join(temp, 'doc.tex')
 211         fout = open(tex_path, 'w')
 212         process(StringIO(texml), fout, 'utf-8')
 213         fout.close()
 214         del texml
 215
 216         if save_tex:
 217             shutil.copy(tex_path, save_tex)
 218
 219         # LaTeX -> PDF
 220         shutil.copy(get_resource('pdf/wl.sty'), temp)
 221         shutil.copy(get_resource('pdf/wl-logo.png'), temp)
 222
 223         cwd = os.getcwd()
 224         os.chdir(temp)
 225
 226         if verbose:
 227             p = call(['xelatex', tex_path])
 228         else:
 229             p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
 230         if p:
 231             raise ParseError("Error parsing .tex file")
 232
 233         os.chdir(cwd)
 234
 235         # save the PDF
 236         pdf_path = os.path.join(temp, 'doc.pdf')
 237         if output_dir is not None:
 238             try:
 239                 os.makedirs(output_dir)
 240             except OSError:
 241                 pass
 242             if slug:
 243                 output_path = os.path.join(output_dir, '%s.pdf' % slug)
 244             else:
 245                 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
 246             shutil.move(pdf_path, output_path)
 247         else:
 248             if hasattr(output_file, 'write'):
 249                 # file-like object
 250                 with open(pdf_path) as f:
 251                     output_file.write(f.read())
 252                 output_file.close()
 253             else:
 254                 # path to output file
 255                 shutil.copy(pdf_path, output_file)
 256         shutil.rmtree(temp)
 257
 258     except (XMLSyntaxError, XSLTApplyError), e:
 259         raise ParseError(e)
 260
 261
 262 def load_including_children(provider, slug=None, uri=None, file_path=None):
 263     """ makes one big xml file with children inserted at end
 264     either slug or uri must be provided
 265     """
 266
 267     if uri:
 268         f = provider.by_uri(uri)
 269     elif slug:
 270         f = provider[slug]
 271     elif file_path:
 272         f = open(file_path, 'r')
 273     else:
 274         raise ValueError('Neither slug, URI nor file path provided for a book.')
 275
 276     document = WLDocument.from_file(f, True,
 277         parse_dublincore=True,
 278         preserve_lines=False)
 279
 280     f.close()
 281
 282     for child_uri in document.book_info.parts:
 283         child = load_including_children(provider, uri=child_uri)
 284         document.edoc.getroot().append(child.edoc.getroot())
 285
 286     return document