1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
9 from StringIO import StringIO
10 from tempfile import mkdtemp
13 from Texml.processor import process
14 from lxml import etree
15 from lxml.etree import XMLSyntaxError, XSLTApplyError
17 from librarian.parser import WLDocument
18 from librarian import ParseError
19 from librarian import functions
# Import-time side effect: call the registration hook exposed by librarian.functions.
# NOTE(review): presumably this makes an entity-substitution helper available to the
# XSLT stylesheet -- confirm in librarian.functions.
21 functions.reg_substitute_entities()
# Stylesheet registry entry: logical name -> path relative to this module's directory
# (get_stylesheet() joins the value with os.path.dirname(__file__)).
# NOTE(review): the header of the enclosing mapping is not visible in this view;
# presumably it is the STYLESHEETS dict read by get_stylesheet() -- confirm.
24 'wl2tex': 'xslt/wl2tex.xslt',
def insert_tags(doc, split_re, tagname):
    """Insert an empty ``tagname`` element wherever ``split_re`` matches in text.

    Every node yielded by ``doc.iter()`` has both its ``text`` and its
    ``tail`` split on ``split_re``.  The matched separators are dropped and
    a new, empty ``tagname`` element takes the place of each one, so
    ``<a><b>A-B-C</b>X-Y-Z</a>`` split on ``-`` with tag ``d`` becomes
    ``<a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>``.

    The tree is modified in place; nothing is returned.

    doc -- lxml element (or element tree) to process
    split_re -- compiled regular expression marking the split points
    tagname -- tag name of the element inserted at each split point
    """
    for elem in doc.iter():
        if elem.text:
            chunks = split_re.split(elem.text)
            elem.text = chunks[0]
            # Every marker is inserted at index 0, so feed the pieces
            # back to front: the last piece goes in first and ends up last.
            for chunk in reversed(chunks[1:]):
                ins = etree.Element(tagname)
                ins.tail = chunk
                elem.insert(0, ins)

        if elem.tail:
            parent = elem.getparent()
            # A parentless element has no sibling slot to insert into;
            # leave its tail untouched instead of failing on None.
            if parent is not None:
                chunks = split_re.split(elem.tail)
                elem.tail = chunks[0]
                ins_index = parent.index(elem) + 1
                # Same fixed-index insertion as above, so again walk the
                # pieces back to front to keep them in document order.
                for chunk in reversed(chunks[1:]):
                    ins = etree.Element(tagname)
                    ins.tail = chunk
                    parent.insert(ins_index, ins)
# Mark in-word hyphens in the document tree `doc`.
# The pattern matches a single '-' whose neighbours on both sides are neither '-'
# nor whitespace, so runs of dashes and free-standing dashes are not matched.
# NOTE(review): the call that consumes this pattern (presumably insert_tags) and the
# tag name passed to it are not visible in this view -- confirm.
49 def substitute_hyphens(doc):
51 re.compile("(?<=[^-\s])-(?=[^-\s])"),
# Pattern for the whitespace run that follows a one-character word: whitespace
# preceded by a single word character which itself comes right after whitespace.
# NOTE(review): presumably an argument inside fix_hanging() (called from transform());
# its def line and the tag name used are not visible in this view -- confirm.
57 re.compile("(?<=\s\w)\s+"),
def get_stylesheet(name):
    """Return the path of the stylesheet registered under ``name``.

    The path stored in STYLESHEETS is resolved against the directory that
    contains this module.  A failed lookup in STYLESHEETS propagates to the
    caller unchanged.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
# Render the book identified by `slug` to PDF: load the document together with its
# children, preprocess the tree, apply the wl2tex XSLT, turn the result into a .tex
# file, run pdflatex on it and hand the PDF over via output_dir or output_file.
# NOTE(review): several lines of this function are not visible in this view,
# including the `try:` that pairs with the `except` at the end and the branch header
# in front of the file-like output path.
64 def transform(provider, slug, output_file=None, output_dir=None):
65 """ produces a pdf file
67 provider is a DocProvider
68 either output_file (a file-like object) or output_dir (path to file/dir) should be specified
69 if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.pdf
# Parse the wl2tex XSLT stylesheet located next to this module.
74 style_filename = get_stylesheet("wl2tex")
75 style = etree.parse(style_filename)
# Load the main document with all of its child parts appended.
77 document = load_including_children(provider, slug)
# Preprocess the tree in place before the XSLT step.
79 substitute_hyphens(document.edoc)
80 fix_hanging(document.edoc)
# NOTE(review): dumps the whole preprocessed XML to stdout; looks like leftover
# debugging output -- confirm it is wanted.
82 print etree.tostring(document.edoc)
84 # if output to dir, create the file
85 if output_dir is not None:
# The PDF goes into a per-author subdirectory of output_dir.
86 author = unicode(document.book_info.author)
87 output_dir = os.path.join(output_dir, author)
89 texml = document.transform(style)
90 del document # no longer needed large object :)
# NOTE(review): the first positional argument of tempfile.mkdtemp is `suffix`, so
# 'wl2pdf-' ends up at the END of the directory name; a prefix was probably meant.
# NOTE(review): no removal of this temporary directory is visible in this view.
92 temp = mkdtemp('wl2pdf-')
93 tex_path = os.path.join(temp, 'doc.tex')
# NOTE(review): no matching close of `fout` is visible in this view; it must be
# flushed/closed before pdflatex reads doc.tex -- confirm.
94 fout = open(tex_path, 'w')
# Convert the XSLT output to TeX and write it to doc.tex.
# NOTE(review): meaning of the trailing 255, 0, 0 arguments is defined by
# Texml.processor.process -- confirm there.
95 process(StringIO(texml), fout, 'utf8', 255, 0, 0)
# NOTE(review): the command line is built by string interpolation and run through
# the shell; a temp path containing spaces or shell metacharacters would break it.
# An argument list passed to subprocess would avoid that.
99 print "pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))
# A non-zero status from os.system is reported as a ParseError.
100 if os.system("pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))):
101 raise ParseError("Error parsing .tex file")
103 pdf_path = os.path.join(temp, 'doc.pdf')
# Directory output: move the PDF to <output_dir>/<slug>.pdf.
104 if output_dir is not None:
106 os.makedirs(output_dir)
109 output_path = os.path.join(output_dir, '%s.pdf' % slug)
110 shutil.move(pdf_path, output_path)
# File-like output: copy the PDF contents into output_file.
# NOTE(review): the PDF is opened in text mode; 'rb' would be the safe mode for
# binary data.
112 with open(pdf_path) as f:
113 output_file.write(f.read())
# XML/XSLT failures are caught here (Python 2 `except ..., e` syntax); the handler
# body is not visible in this view.
117 except (XMLSyntaxError, XSLTApplyError), e:
# Load a book and recursively append the root element of every child part to the
# parent's root element, producing one combined document.
# NOTE(review): the branch headers that choose between `slug` and `uri`, the slug
# lookup itself and the final return are not visible in this view.
121 def load_including_children(provider, slug=None, uri=None):
122 """ makes one big xml file with children inserted at end
123 either slug or uri must be provided
# Lookup by URI through the provider.
127 f = provider.by_uri(uri)
# Raised when the caller supplied neither identifier.
131 raise ValueError('Neither slug nor URI provided for a book.')
# NOTE(review): meaning of the positional True is defined by WLDocument.from_file --
# confirm there.
133 document = WLDocument.from_file(f, True,
134 parse_dublincore=True,
135 preserve_lines=False)
# Each part listed in the book metadata is loaded recursively (so grandchildren are
# included) and its root element is appended under this document's root.
137 for child_uri in document.book_info.parts:
138 child = load_including_children(provider, uri=child_uri)
139 document.edoc.getroot().append(child.edoc.getroot())
# Command-line entry point: convert the single input file named on the command line.
# NOTE(review): the `import sys` this block relies on and whatever follows the usage
# message are not visible in this view.
144 if __name__ == '__main__':
146 from librarian import DirDocProvider
# Exactly one positional argument (the input file) is expected.
148 if len(sys.argv) < 2:
149 print >> sys.stderr, 'Usage: python pdf.py <input file>'
152 main_input = sys.argv[1]
# Strip the extension, then split the absolute path into directory and slug.
# NOTE(review): splitting on a literal '/' is not portable to platforms with another
# path separator; os.path.split would be.
153 basepath, ext = os.path.splitext(main_input)
154 path, slug = os.path.realpath(basepath).rsplit('/', 1)
# NOTE(review): presumably DirDocProvider serves documents from `path` -- confirm.
155 provider = DirDocProvider(path)
# With output_dir given, transform() writes to <path>/<author>/<slug>.pdf.
156 transform(provider, slug, output_dir=path)