librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp
  12 import re
  13
  14 import sys
  15 sys.path.append('..') # for running from working copy
  16
  17 from Texml.processor import process
  18 from lxml import etree
  19 from lxml.etree import XMLSyntaxError, XSLTApplyError
  20
  21 from librarian.parser import WLDocument
  22 from librarian import ParseError
  23 from librarian import functions
  24
  25
  26
# Module-level calls into librarian.functions, executed once at import time.
# NOTE(review): presumably each call registers an extension function that the
# XSLT stylesheets rely on -- confirm in librarian/functions.py.
functions.reg_substitute_entities()
functions.reg_person_name()
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()

# Stylesheet name -> path relative to this module's directory
# (looked up by get_stylesheet()).
STYLESHEETS = {
    'wl2tex': 'xslt/wl2tex.xslt',
}
  36
  37
  38 def insert_tags(doc, split_re, tagname):
  39     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  40
  41     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  42     >>> insert_tags(t, re.compile('-'), 'd');
  43     >>> print etree.tostring(t)
  44     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  45     """
  46
  47     for elem in doc.iter():
  48         if elem.text:
  49             chunks = split_re.split(elem.text)
  50             elem.text = chunks.pop(0)
  51             while chunks:
  52                 ins = etree.Element(tagname)
  53                 ins.tail = chunks.pop()
  54                 elem.insert(0, ins)
  55         if elem.tail:
  56             chunks = split_re.split(elem.tail)
  57             parent = elem.getparent()
  58             ins_index = parent.index(elem) + 1
  59             elem.tail = chunks.pop(0)
  60             while chunks:
  61                 ins = etree.Element(tagname)
  62                 ins.tail = chunks.pop()
  63                 parent.insert(ins_index, ins)
  64
  65
  66 def substitute_hyphens(doc):
  67     insert_tags(doc,
  68                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  69                 "dywiz")
  70
  71
  72 def fix_hanging(doc):
  73     insert_tags(doc,
  74                 re.compile("(?<=\s\w)\s+"),
  75                 "nbsp")
  76
  77
  78 def get_resource(path):
  79     return os.path.join(os.path.dirname(__file__), path)
  80
  81 def get_stylesheet(name):
  82     return get_resource(STYLESHEETS[name])
  83
  84 def transform(provider, slug, output_file=None, output_dir=None):
  85     """ produces a pdf file
  86
  87     provider is a DocProvider
  88     either output_file (a file-like object) or output_dir (path to file/dir) should be specified
  89     if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.pdf
  90     """
  91
  92     # Parse XSLT
  93     try:
  94         style_filename = get_stylesheet("wl2tex")
  95         style = etree.parse(style_filename)
  96
  97         document = load_including_children(provider, slug)
  98
  99         substitute_hyphens(document.edoc)
 100         fix_hanging(document.edoc)
 101
 102         # if output to dir, create the file
 103         if output_dir is not None:
 104             author = unicode(document.book_info.author)
 105             output_dir = os.path.join(output_dir, author)
 106
 107         texml = document.transform(style)
 108         del document # no longer needed large object :)
 109
 110         temp = mkdtemp('wl2pdf-')
 111         tex_path = os.path.join(temp, 'doc.tex')
 112         fout = open(tex_path, 'w')
 113         process(StringIO(texml), fout, 'utf8', 255, 0, 0)
 114         fout.close()
 115         del texml
 116
 117         shutil.copy(get_resource('pdf/wl.sty'), temp)
 118         shutil.copy(get_resource('pdf/wl-logo.png'), temp)
 119         print "pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))
 120         if os.system("pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))):
 121             raise ParseError("Error parsing .tex file")
 122
 123         pdf_path = os.path.join(temp, 'doc.pdf')
 124         if output_dir is not None:
 125             try:
 126                 os.makedirs(output_dir)
 127             except OSError:
 128                 pass
 129             output_path = os.path.join(output_dir, '%s.pdf' % slug)
 130             shutil.move(pdf_path, output_path)
 131         else:
 132             with open(pdf_path) as f:
 133                 output_file.write(f.read())
 134             output_file.close()
 135
 136         return True
 137     except (XMLSyntaxError, XSLTApplyError), e:
 138         raise ParseError(e)
 139
 140
 141 def load_including_children(provider, slug=None, uri=None):
 142     """ makes one big xml file with children inserted at end
 143     either slug or uri must be provided
 144     """
 145
 146     if uri:
 147         f = provider.by_uri(uri)
 148     elif slug:
 149         f = provider[slug]
 150     else:
 151         raise ValueError('Neither slug nor URI provided for a book.')
 152
 153     document = WLDocument.from_file(f, True,
 154         parse_dublincore=True,
 155         preserve_lines=False)
 156
 157     for child_uri in document.book_info.parts:
 158         child = load_including_children(provider, uri=child_uri)
 159         document.edoc.getroot().append(child.edoc.getroot())
 160
 161     return document
 162
 163
 164 if __name__ == '__main__':
 165     import sys
 166     from librarian import DirDocProvider
 167
 168     if len(sys.argv) < 2:
 169         print >> sys.stderr, 'Usage: python pdf.py <input file>'
 170         sys.exit(1)
 171
 172     main_input = sys.argv[1]
 173     basepath, ext = os.path.splitext(main_input)
 174     path, slug = os.path.realpath(basepath).rsplit('/', 1)
 175     provider = DirDocProvider(path)
 176     transform(provider, slug, output_dir=path)
 177