librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 import os
   7 import os.path
   8 import shutil
   9 from StringIO import StringIO
  10 from tempfile import mkdtemp
  11 import re
  12
  13 from Texml.processor import process
  14 from lxml import etree
  15 from lxml.etree import XMLSyntaxError, XSLTApplyError
  16
  17 from librarian.parser import WLDocument
  18 from librarian import ParseError
  19 from librarian import functions
  20
# Import-time side effect: run librarian's registration hook once.
# NOTE(review): presumably this registers an extension function used by the
# XSLT stylesheets listed below -- confirm in librarian.functions.
functions.reg_substitute_entities()

# Maps a stylesheet name to its path relative to this module's directory;
# get_stylesheet() resolves an entry to a full path.
STYLESHEETS = {
    'wl2tex': 'xslt/wl2tex.xslt',
}
  26
  27
  28 def insert_tags(doc, split_re, tagname):
  29     print tagname
  30     for elem in doc.iter():
  31         if elem.text:
  32             chunks = split_re.split(elem.text)
  33             elem.text = chunks.pop(0)
  34             while chunks:
  35                 ins = etree.Element(tagname)
  36                 ins.tail = chunks.pop()
  37                 elem.insert(0, ins)
  38         if elem.tail:
  39             chunks = split_re.split(elem.tail)
  40             parent = elem.getparent()
  41             ins_index = parent.index(elem) + 1
  42             elem.tail = chunks.pop(0)
  43             while chunks:
  44                 ins = etree.Element(tagname)
  45                 ins.tail = chunks.pop(0)
  46                 parent.insert(ins_index, ins)
  47
  48
  49 def substitute_hyphens(doc):
  50     insert_tags(doc,
  51                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  52                 "dywiz")
  53
  54
  55 def fix_hanging(doc):
  56     insert_tags(doc,
  57                 re.compile("(?<=\s\w)\s+"),
  58                 "nbsp")
  59
  60
  61 def get_stylesheet(name):
  62     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  63
  64 def transform(provider, slug, output_file=None, output_dir=None):
  65     """ produces a pdf file
  66
  67     provider is a DocProvider
  68     either output_file (a file-like object) or output_dir (path to file/dir) should be specified
  69     if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.pdf
  70     """
  71
  72     # Parse XSLT
  73     try:
  74         style_filename = get_stylesheet("wl2tex")
  75         style = etree.parse(style_filename)
  76
  77         document = load_including_children(provider, slug)
  78
  79         substitute_hyphens(document.edoc)
  80         fix_hanging(document.edoc)
  81
  82         print etree.tostring(document.edoc)
  83
  84         # if output to dir, create the file
  85         if output_dir is not None:
  86             author = unicode(document.book_info.author)
  87             output_dir = os.path.join(output_dir, author)
  88
  89         texml = document.transform(style)
  90         del document # no longer needed large object :)
  91
  92         temp = mkdtemp('wl2pdf-')
  93         tex_path = os.path.join(temp, 'doc.tex')
  94         fout = open(tex_path, 'w')
  95         process(StringIO(texml), fout, 'utf8', 255, 0, 0)
  96         fout.close()
  97         del texml
  98
  99         print "pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))
 100         if os.system("pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))):
 101             raise ParseError("Error parsing .tex file")
 102
 103         pdf_path = os.path.join(temp, 'doc.pdf')
 104         if output_dir is not None:
 105             try:
 106                 os.makedirs(output_dir)
 107             except OSError:
 108                 pass
 109             output_path = os.path.join(output_dir, '%s.pdf' % slug)
 110             shutil.move(pdf_path, output_path)
 111         else:
 112             with open(pdf_path) as f:
 113                 output_file.write(f.read())
 114             output_file.close()
 115
 116         return True
 117     except (XMLSyntaxError, XSLTApplyError), e:
 118         raise ParseError(e)
 119
 120
 121 def load_including_children(provider, slug=None, uri=None):
 122     """ makes one big xml file with children inserted at end
 123     either slug or uri must be provided
 124     """
 125
 126     if uri:
 127         f = provider.by_uri(uri)
 128     elif slug:
 129         f = provider[slug]
 130     else:
 131         raise ValueError('Neither slug nor URI provided for a book.')
 132
 133     document = WLDocument.from_file(f, True,
 134         parse_dublincore=True,
 135         preserve_lines=False)
 136
 137     for child_uri in document.book_info.parts:
 138         child = load_including_children(provider, uri=child_uri)
 139         document.edoc.getroot().append(child.edoc.getroot())
 140
 141     return document
 142
 143
 144 if __name__ == '__main__':
 145     import sys
 146     from librarian import DirDocProvider
 147
 148     if len(sys.argv) < 2:
 149         print >> sys.stderr, 'Usage: python pdf.py <input file>'
 150         sys.exit(1)
 151
 152     main_input = sys.argv[1]
 153     basepath, ext = os.path.splitext(main_input)
 154     path, slug = os.path.realpath(basepath).rsplit('/', 1)
 155     provider = DirDocProvider(path)
 156     transform(provider, slug, output_dir=path)
 157