librarian/pdf.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import with_statement
   7 import os
   8 import os.path
   9 import shutil
  10 from StringIO import StringIO
  11 from tempfile import mkdtemp
  12 import re
  13 from copy import deepcopy
  14
  15 import sys
  16 sys.path.append('..') # for running from working copy
  17
  18 from Texml.processor import process
  19 from lxml import etree
  20 from lxml.etree import XMLSyntaxError, XSLTApplyError
  21
  22 from librarian.parser import WLDocument
  23 from librarian import ParseError
  24 from librarian import functions
  25
  26
  27
  28 functions.reg_substitute_entities()
  29 functions.reg_person_name()
  30 functions.reg_strip()
  31 functions.reg_starts_white()
  32 functions.reg_ends_white()
  33
  34 STYLESHEETS = {
  35     'wl2tex': 'xslt/wl2tex.xslt',
  36 }
  37
  38
  39 def insert_tags(doc, split_re, tagname):
  40     """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
  41
  42     >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
  43     >>> insert_tags(t, re.compile('-'), 'd');
  44     >>> print etree.tostring(t)
  45     <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
  46     """
  47
  48     for elem in doc.iter():
  49         if elem.text:
  50             chunks = split_re.split(elem.text)
  51             elem.text = chunks.pop(0)
  52             while chunks:
  53                 ins = etree.Element(tagname)
  54                 ins.tail = chunks.pop()
  55                 elem.insert(0, ins)
  56         if elem.tail:
  57             chunks = split_re.split(elem.tail)
  58             parent = elem.getparent()
  59             ins_index = parent.index(elem) + 1
  60             elem.tail = chunks.pop(0)
  61             while chunks:
  62                 ins = etree.Element(tagname)
  63                 ins.tail = chunks.pop()
  64                 parent.insert(ins_index, ins)
  65
  66
  67 def substitute_hyphens(doc):
  68     insert_tags(doc,
  69                 re.compile("(?<=[^-\s])-(?=[^-\s])"),
  70                 "dywiz")
  71
  72
  73 def fix_hanging(doc):
  74     insert_tags(doc,
  75                 re.compile("(?<=\s\w)\s+"),
  76                 "nbsp")
  77
  78
  79 def get_resource(path):
  80     return os.path.join(os.path.dirname(__file__), path)
  81
  82 def get_stylesheet(name):
  83     return get_resource(STYLESHEETS[name])
  84
  85 def transform(provider, slug, output_file=None, output_dir=None):
  86     """ produces a pdf file
  87
  88     provider is a DocProvider
  89     either output_file (a file-like object) or output_dir (path to file/dir) should be specified
  90     if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.pdf
  91     """
  92
  93     # Parse XSLT
  94     try:
  95         style_filename = get_stylesheet("wl2tex")
  96         style = etree.parse(style_filename)
  97
  98         document = load_including_children(provider, slug)
  99
 100         # dirty hack for the marginpar-creates-orphans LaTeX problem
 101         # see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
 102         for motif in document.edoc.findall('//strofa//motyw'):
 103             # find relevant verse-level tag
 104             verse, stanza = motif, motif.getparent()
 105             while stanza is not None and stanza.tag != 'strofa':
 106                 verse, stanza = stanza, stanza.getparent()
 107             breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
 108             breaks_after = sum(1 for i in verse.itersiblings('br'))
 109             if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
 110                 move_by = 1
 111                 if breaks_after == 2:
 112                     move_by += 1
 113                 moved_motif = deepcopy(motif)
 114                 motif.tag = 'span'
 115                 motif.text = None
 116                 moved_motif.tail = None
 117                 moved_motif.set('moved', str(move_by))
 118
 119                 for br in verse.itersiblings(tag='br'):
 120                     if move_by > 1:
 121                         move_by -= 1
 122                         continue
 123                     br.addnext(moved_motif)
 124                     break
 125
 126         substitute_hyphens(document.edoc)
 127         fix_hanging(document.edoc)
 128
 129         # if output to dir, create the file
 130         if output_dir is not None:
 131             author = unicode(document.book_info.author)
 132             output_dir = os.path.join(output_dir, author)
 133
 134         texml = document.transform(style)
 135         del document # no longer needed large object :)
 136
 137         temp = mkdtemp('wl2pdf-')
 138         tex_path = os.path.join(temp, 'doc.tex')
 139         fout = open(tex_path, 'w')
 140         process(StringIO(texml), fout, 'utf8', 255, 0, 0)
 141         fout.close()
 142         del texml
 143
 144         shutil.copy(get_resource('pdf/wl.sty'), temp)
 145         shutil.copy(get_resource('pdf/wl-logo.png'), temp)
 146         print "pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))
 147         if os.system("pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))):
 148             raise ParseError("Error parsing .tex file")
 149
 150         pdf_path = os.path.join(temp, 'doc.pdf')
 151         if output_dir is not None:
 152             try:
 153                 os.makedirs(output_dir)
 154             except OSError:
 155                 pass
 156             output_path = os.path.join(output_dir, '%s.pdf' % slug)
 157             shutil.move(pdf_path, output_path)
 158         else:
 159             with open(pdf_path) as f:
 160                 output_file.write(f.read())
 161             output_file.close()
 162
 163         return True
 164     except (XMLSyntaxError, XSLTApplyError), e:
 165         raise ParseError(e)
 166
 167
 168 def load_including_children(provider, slug=None, uri=None):
 169     """ makes one big xml file with children inserted at end
 170     either slug or uri must be provided
 171     """
 172
 173     if uri:
 174         f = provider.by_uri(uri)
 175     elif slug:
 176         f = provider[slug]
 177     else:
 178         raise ValueError('Neither slug nor URI provided for a book.')
 179
 180     document = WLDocument.from_file(f, True,
 181         parse_dublincore=True,
 182         preserve_lines=False)
 183
 184     for child_uri in document.book_info.parts:
 185         child = load_including_children(provider, uri=child_uri)
 186         document.edoc.getroot().append(child.edoc.getroot())
 187
 188     return document
 189
 190
 191 if __name__ == '__main__':
 192     import sys
 193     from librarian import DirDocProvider
 194
 195     if len(sys.argv) < 2:
 196         print >> sys.stderr, 'Usage: python pdf.py <input file>'
 197         sys.exit(1)
 198
 199     main_input = sys.argv[1]
 200     basepath, ext = os.path.splitext(main_input)
 201     path, slug = os.path.realpath(basepath).rsplit('/', 1)
 202     provider = DirDocProvider(path)
 203     transform(provider, slug, output_dir=path)
 204