1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
10 from StringIO import StringIO
11 from tempfile import mkdtemp
15 sys.path.append('..') # for running from working copy
17 from Texml.processor import process
18 from lxml import etree
19 from lxml.etree import XMLSyntaxError, XSLTApplyError
21 from librarian.parser import WLDocument
22 from librarian import ParseError
23 from librarian import functions
27 functions.reg_substitute_entities()
28 functions.reg_person_name()
30 functions.reg_starts_white()
31 functions.reg_ends_white()
34 'wl2tex': 'xslt/wl2tex.xslt',
def insert_tags(doc, split_re, tagname):
    """Insert an empty <tagname/> element at every match of `split_re`.

    Every text node in the `doc` tree is examined: both the text inside an
    element and the tail text following it. Each matched substring is
    removed and an empty `tagname` element is put in its place. The tree is
    modified in place; nothing is returned.

    doc      -- lxml element or tree to process
    split_re -- compiled regular expression marking the split points
    tagname  -- name of the element inserted at each split point

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter():
        # iter() also yields comments and processing instructions, whose
        # `tag` is a factory function rather than a string. They cannot hold
        # children, so only their tails are split.
        if elem.text and isinstance(elem.tag, basestring):
            chunks = split_re.split(elem.text)
            elem.text = chunks.pop(0)
            # Take pieces from the right and always insert at the front, so
            # the new elements end up in left-to-right order.
            while chunks:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)

        if elem.tail:
            parent = elem.getparent()
            if parent is None:
                # A parentless node has nowhere to put the new siblings.
                continue
            chunks = split_re.split(elem.tail)
            ins_index = parent.index(elem) + 1
            elem.tail = chunks.pop(0)
            # Same right-to-left trick, inserting just after `elem`.
            while chunks:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
66 def substitute_hyphens(doc):
68 re.compile("(?<=[^-\s])-(?=[^-\s])"),
74 re.compile("(?<=\s\w)\s+"),
def get_resource(path):
    """Return `path` joined onto the directory that contains this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, path)
def get_stylesheet(name):
    """Return the path of the stylesheet registered under `name`.

    The entry is looked up in STYLESHEETS and resolved with get_resource().
    """
    relative_path = STYLESHEETS[name]
    return get_resource(relative_path)
def transform(provider, slug, output_file=None, output_dir=None):
    """Produce a PDF file for the book identified by `slug`.

    provider is a DocProvider.
    Either output_file (a file-like object) or output_dir (path to a
    directory) should be specified. If output_dir is specified, the file is
    written to <output_dir>/<author>/<slug>.pdf; otherwise the PDF bytes are
    written to output_file.

    Raises ParseError when the XML cannot be parsed, when the XSLT
    transformation fails, or when pdflatex does not finish successfully.
    The temporary working directory is removed on both success and failure.
    """
    import subprocess

    temp = None
    try:
        style = etree.parse(get_stylesheet("wl2tex"))

        document = load_including_children(provider, slug)

        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)

        # When writing into a directory, file the result under the author.
        if output_dir is not None:
            author = unicode(document.book_info.author)
            output_dir = os.path.join(output_dir, author)

        texml = document.transform(style)
        del document  # no longer needed large object

        # 'wl2pdf-' is meant as a prefix; passed positionally it would be
        # used as the suffix of the directory name.
        temp = mkdtemp(prefix='wl2pdf-')
        tex_path = os.path.join(temp, 'doc.tex')
        with open(tex_path, 'w') as fout:
            process(StringIO(texml), fout, 'utf8', 255, 0, 0)
        del texml

        shutil.copy(get_resource('pdf/wl.sty'), temp)
        shutil.copy(get_resource('pdf/wl-logo.png'), temp)

        print("pdflatex -output-directory %s %s" % (temp, tex_path))
        # An argument list avoids the shell, so paths containing spaces or
        # shell metacharacters are passed through untouched.
        try:
            status = subprocess.call(
                ['pdflatex', '-output-directory', temp, tex_path])
        except OSError:
            # pdflatex could not be started at all; report it the same way
            # as a failed run.
            status = -1
        if status:
            raise ParseError("Error parsing .tex file")

        pdf_path = os.path.join(temp, 'doc.pdf')
        if output_dir is not None:
            try:
                os.makedirs(output_dir)
            except OSError:
                # Already existing is fine; anything else is a real error.
                if not os.path.isdir(output_dir):
                    raise
            output_path = os.path.join(output_dir, '%s.pdf' % slug)
            shutil.move(pdf_path, output_path)
        else:
            # PDF is binary data, so it must not go through text mode.
            with open(pdf_path, 'rb') as f:
                output_file.write(f.read())

    except (XMLSyntaxError, XSLTApplyError):
        # sys.exc_info() is used instead of binding the exception in the
        # except clause so the code parses on every Python version.
        # NOTE(review): confirm that wrapping in ParseError is what callers
        # expect for XML/XSLT failures.
        raise ParseError(sys.exc_info()[1])
    finally:
        if temp is not None:
            shutil.rmtree(temp, ignore_errors=True)
141 def load_including_children(provider, slug=None, uri=None):
142 """ makes one big xml file with children inserted at end
143 either slug or uri must be provided
147 f = provider.by_uri(uri)
151 raise ValueError('Neither slug nor URI provided for a book.')
153 document = WLDocument.from_file(f, True,
154 parse_dublincore=True,
155 preserve_lines=False)
157 for child_uri in document.book_info.parts:
158 child = load_including_children(provider, uri=child_uri)
159 document.edoc.getroot().append(child.edoc.getroot())
if __name__ == '__main__':
    # Command-line entry point: build a PDF next to the given input file.
    from librarian import DirDocProvider

    if len(sys.argv) < 2:
        sys.stderr.write('Usage: python pdf.py <input file>\n')
        sys.exit(1)

    main_input = sys.argv[1]
    # Drop the extension, then split the absolute path into the directory
    # and the book slug. os.path.split handles the platform's separator and
    # files that sit directly in the root directory.
    basepath = os.path.splitext(main_input)[0]
    path, slug = os.path.split(os.path.realpath(basepath))
    provider = DirDocProvider(path)
    transform(provider, slug, output_dir=path)