1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
10 from StringIO import StringIO
11 from tempfile import mkdtemp
13 from copy import deepcopy
16 sys.path.append('..') # for running from working copy
18 from Texml.processor import process
19 from lxml import etree
20 from lxml.etree import XMLSyntaxError, XSLTApplyError
22 from librarian.parser import WLDocument
23 from librarian import ParseError
24 from librarian import functions
# Import-time setup: call the registration hooks exposed by librarian.functions.
# NOTE(review): presumably these install extension functions that the XSLT
# stylesheet relies on -- confirm against librarian.functions.
28 functions.reg_substitute_entities()
29 functions.reg_person_name()
31 functions.reg_starts_white()
32 functions.reg_ends_white()
# Entry of the STYLESHEETS mapping that get_stylesheet() indexes: logical
# stylesheet name -> path that get_resource() resolves against this module's
# directory.  (The lines opening and closing the mapping are not part of this
# listing.)
35 'wl2tex': 'xslt/wl2tex.xslt',
def insert_tags(doc, split_re, tagname):
    """Insert an empty <tagname/> element at every occurrence of `split_re`.

    Every non-empty text node in the `doc` tree (both element text and
    element tails) is split on `split_re`.  The matched separators are
    dropped and an empty `tagname` element is put in place of each one.
    The tree is modified in place; nothing is returned.

    doc      -- lxml element or element tree (anything offering ``iter()``)
    split_re -- compiled regular expression marking the split points
    tagname  -- tag name of the elements to insert

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    # Walk a snapshot: the loop body adds elements to the tree being walked.
    for elem in list(doc.iter()):
        if elem.text:
            chunks = split_re.split(elem.text)
            elem.text = chunks[0]
            # Inserting back-to-front at index 0 leaves the new elements in
            # document order, ahead of any children the element already had.
            for chunk in reversed(chunks[1:]):
                ins = etree.Element(tagname)
                ins.tail = chunk
                elem.insert(0, ins)

        if elem.tail:
            parent = elem.getparent()
            if parent is None:
                # A parentless element cannot receive following siblings,
                # so its tail is left untouched.
                continue
            chunks = split_re.split(elem.tail)
            ins_index = parent.index(elem) + 1
            elem.tail = chunks[0]
            # Same back-to-front trick, this time right after `elem`.
            for chunk in reversed(chunks[1:]):
                ins = etree.Element(tagname)
                ins.tail = chunk
                parent.insert(ins_index, ins)
# substitute_hyphens(doc): works on the tree `doc` with the pattern below.
# NOTE(review): the call that receives this pattern and the tag name passed
# with it are on lines missing from this listing -- presumably insert_tags();
# confirm in the full file.
67 def substitute_hyphens(doc):
# Matches a single '-' that has a character which is neither '-' nor
# whitespace directly before it and directly after it.
69 re.compile("(?<=[^-\s])-(?=[^-\s])"),
# Matches a run of whitespace that directly follows a single word character
# which is itself preceded by whitespace.
# NOTE(review): the embedded line numbers jump here; this pattern presumably
# belongs to a separate function (fix_hanging is called from transform) whose
# `def` line is not shown.
75 re.compile("(?<=\s\w)\s+"),
def get_resource(path):
    """Return `path` joined onto the directory that contains this module."""
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, path)
def get_stylesheet(name):
    """Return the resource path of the stylesheet registered under `name`.

    Raises KeyError when `name` is not a key of STYLESHEETS.
    """
    relative_path = STYLESHEETS[name]
    return get_resource(relative_path)
# transform: render the book `slug` (loaded through `provider`, children
# included) to a PDF.  Visible steps: parse the "wl2tex" stylesheet,
# pre-process the document tree, apply the stylesheet with
# document.transform(), let Texml's process() write doc.tex into a temporary
# directory, run pdflatex on it, then deliver doc.pdf either to
# <output_dir>/<author>/<slug>.pdf or into `output_file`.
# Raises ParseError when pdflatex exits with a non-zero status.
# NOTE(review): this listing omits several lines of the function (the
# embedded line numbers skip) -- e.g. the `try:` paired with the final
# `except`, the assignments to `move_by`, and the `else:` before the
# output_file branch are not visible here.
85 def transform(provider, slug, output_file=None, output_dir=None):
86 """ produces a pdf file
88 provider is a DocProvider
89 either output_file (a file-like object) or output_dir (path to file/dir) should be specified
90 if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.pdf
# Load the XSLT stylesheet as an lxml tree; it is applied further down.
95 style_filename = get_stylesheet("wl2tex")
96 style = etree.parse(style_filename)
# One document holding the book plus all of its child parts.
98 document = load_including_children(provider, slug)
100 # dirty hack for the marginpar-creates-orphans LaTeX problem
101 # see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
102 for motif in document.edoc.findall('//strofa//motyw'):
103 # find relevant verse-level tag
# Climb until the parent is the enclosing <strofa>; `verse` is then the
# ancestor of the motif (or the motif itself) sitting directly under it.
104 verse, stanza = motif, motif.getparent()
105 while stanza is not None and stanza.tag != 'strofa':
106 verse, stanza = stanza, stanza.getparent()
# Count the <br> siblings before and after that verse-level node.
107 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
108 breaks_after = sum(1 for i in verse.itersiblings('br'))
# Only motifs with no break before them but some after, or with exactly one
# break after them, are handled.
109 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
111 if breaks_after == 2:
# A copy of the motif is what gets relocated: its tail text is dropped and
# the offset is recorded in its 'moved' attribute.
113 moved_motif = deepcopy(motif)
116 moved_motif.tail = None
117 moved_motif.set('moved', str(move_by))
# The copy is attached right after one of the following <br> siblings (the
# lines selecting which one are not shown).
119 for br in verse.itersiblings(tag='br'):
123 br.addnext(moved_motif)
# Text-level fixes applied to the whole tree before the XSLT runs.
126 substitute_hyphens(document.edoc)
127 fix_hanging(document.edoc)
129 # if output to dir, create the file
130 if output_dir is not None:
# The author's name becomes a subdirectory of output_dir.
131 author = unicode(document.book_info.author)
132 output_dir = os.path.join(output_dir, author)
134 texml = document.transform(style)
135 del document # no longer needed large object :)
# NOTE(review): the first positional argument of tempfile.mkdtemp is
# `suffix`, so 'wl2pdf-' ends up at the END of the directory name; a
# prefix= keyword was probably intended.  No cleanup of this directory is
# visible in the listing.
137 temp = mkdtemp('wl2pdf-')
138 tex_path = os.path.join(temp, 'doc.tex')
# NOTE(review): the matching close of `fout` is not visible here -- confirm
# it is closed before pdflatex reads the file.
139 fout = open(tex_path, 'w')
140 process(StringIO(texml), fout, 'utf8', 255, 0, 0)
# The style file and logo are copied next to doc.tex for the pdflatex run.
144 shutil.copy(get_resource('pdf/wl.sty'), temp)
145 shutil.copy(get_resource('pdf/wl-logo.png'), temp)
146 print "pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))
# NOTE(review): the command is built by string interpolation and run through
# the shell; a path containing spaces or shell metacharacters would break it.
# A non-zero exit status is reported as ParseError.
147 if os.system("pdflatex -output-directory %s %s" % (temp, os.path.join(temp, 'doc.tex'))):
148 raise ParseError("Error parsing .tex file")
150 pdf_path = os.path.join(temp, 'doc.pdf')
# Directory output: create the target directory and move the PDF into it.
151 if output_dir is not None:
153 os.makedirs(output_dir)
156 output_path = os.path.join(output_dir, '%s.pdf' % slug)
157 shutil.move(pdf_path, output_path)
# File-object output: copy the PDF contents into `output_file`.
# NOTE(review): the PDF is opened in text mode; 'rb' would be the safe mode
# for binary data on platforms that translate line endings.
159 with open(pdf_path) as f:
160 output_file.write(f.read())
# lxml parse errors and XSLT application errors are caught here (the handler
# body is not part of this listing).
164 except (XMLSyntaxError, XSLTApplyError), e:
# load_including_children: build a WLDocument for a book and append the root
# element of every part listed in document.book_info.parts to its root.
# Each part is loaded recursively by URI.
# Raises ValueError when neither `slug` nor `uri` is given.
# NOTE(review): the branch lines choosing between slug and uri, and the
# return statement, are missing from this listing; transform() uses the
# result as the loaded document.
168 def load_including_children(provider, slug=None, uri=None):
169 """ makes one big xml file with children inserted at end
170 either slug or uri must be provided
# Open the source through the provider when addressed by URI.
174 f = provider.by_uri(uri)
178 raise ValueError('Neither slug nor URI provided for a book.')
# Parse with Dublin Core metadata, which supplies book_info.parts below.
180 document = WLDocument.from_file(f, True,
181 parse_dublincore=True,
182 preserve_lines=False)
# Recurse into each part and graft its root element onto this document.
184 for child_uri in document.book_info.parts:
185 child = load_including_children(provider, uri=child_uri)
186 document.edoc.getroot().append(child.edoc.getroot())
# Command-line entry point: `python pdf.py <input file>`.
# The input file's directory becomes the DirDocProvider root and its
# extension-less base name becomes the slug; output goes under that same
# directory via transform(..., output_dir=path).
# NOTE(review): some lines of this section are missing from the listing
# (e.g. whatever follows the usage message).
191 if __name__ == '__main__':
193 from librarian import DirDocProvider
# With no argument, print the usage line to stderr.
195 if len(sys.argv) < 2:
196 print >> sys.stderr, 'Usage: python pdf.py <input file>'
199 main_input = sys.argv[1]
200 basepath, ext = os.path.splitext(main_input)
# NOTE(review): splitting on '/' assumes POSIX-style path separators.
201 path, slug = os.path.realpath(basepath).rsplit('/', 1)
202 provider = DirDocProvider(path)
203 transform(provider, slug, output_dir=path)