1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
10 from StringIO import StringIO
11 from tempfile import mkdtemp
13 from copy import deepcopy
14 from subprocess import call, PIPE
18 from Texml.processor import process
19 from lxml import etree
20 from lxml.etree import XMLSyntaxError, XSLTApplyError
22 from librarian.dcparser import Person
23 from librarian.parser import WLDocument
24 from librarian import ParseError, DCNS
25 from librarian import functions
# Import-time setup: call the registration hooks exposed by librarian.functions.
# NOTE(review): presumably these install the extension functions used by the
# wl2tex stylesheet -- confirm in librarian.functions.
28 functions.reg_substitute_entities()
30 functions.reg_starts_white()
31 functions.reg_ends_white()
32 functions.reg_texcommand()
# Stylesheet registry entry: logical name -> path relative to this module.
# get_stylesheet() reads it via STYLESHEETS[name] and resolves it with get_resource().
# NOTE(review): the enclosing `STYLESHEETS = {` and `}` lines are not visible in this copy.
35 'wl2tex': 'xslt/wl2tex.xslt',
def insert_tags(doc, split_re, tagname):
    """ inserts <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree

    doc: an lxml element or element tree; it is modified in place
    split_re: compiled regular expression; every match found in an element's
        text or tail is removed and replaced with an empty <tagname/> element
    tagname: tag of the elements to insert

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        # Text and tail may be None (no text node at all) - nothing to split then.
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are consumed right-to-left and every new element is put
            # at index 0, so the inserted elements end up in document order,
            # ahead of the children the element already had.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # The slot right after `elem'; inserting there repeatedly while
            # going right-to-left also keeps the chunks in document order.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
# Hyphen handling for the document tree `doc'.
# NOTE(review): the call that consumes the pattern below (presumably
# insert_tags(doc, <pattern>, <tag name>)) and its tag-name argument are not
# visible in this copy.
67 def substitute_hyphens(doc):
# Matches a single '-' that has a character other than '-' or whitespace on
# both sides (lookbehind + lookahead), i.e. a hyphen inside a word.
69 re.compile("(?<=[^-\s])-(?=[^-\s])"),
# NOTE(review): this pattern appears to belong to a separate helper whose
# `def' line is missing here (transform() calls fix_hanging(document.edoc)).
# It matches a run of whitespace preceded by a whitespace character and one
# word character, i.e. the gap following a one-letter word.
75 re.compile("(?<=\s\w)\s+"),
# Relocates <motyw> elements that are direct children of a top-level genre
# container so that they live inside a neighbouring block element.
# NOTE(review): only the detaching step is visible in this copy; the
# statements that re-attach the motif (and end the sibling scan) are missing.
79 def move_motifs_inside(doc):
80 """ moves motifs to be into block elements """
# Every genre container anywhere in the document.
81 for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
# Only motifs that are direct children of the container.
82 for motif in master.xpath('motyw'):
# Walk the siblings that follow the motif in document order.
83 for sib in motif.itersiblings():
# Skip the listed tags (separators, begin/end markers, other motifs,
# extras, remarks); the body runs for the first sibling with any other tag.
84 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
85 # motif shouldn't have a tail - it would be untagged text
# Detach the motif from its current parent.
87 motif.getparent().remove(motif)
# NOTE(review): the `def' line of this function is not visible in this copy;
# transform() calls hack_motifs(document.edoc), which presumably is this body.
# Several statements (initialisation/updates of `move_by', changes to the
# original motif, loop exits) are missing as well.
93 """ dirty hack for the marginpar-creates-orphans LaTeX problem
94 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
96 moves motifs in stanzas from first verse to second
97 and from next to last to last, then inserts negative vspace before them
99 for motif in doc.findall('//strofa//motyw'):
100 # find relevant verse-level tag
# Climb until `stanza' is the enclosing <strofa>; `verse' is then the
# ancestor-or-self of the motif that is a direct child of that stanza.
101 verse, stanza = motif, motif.getparent()
102 while stanza is not None and stanza.tag != 'strofa':
103 verse, stanza = stanza, stanza.getparent()
# Count the <br> siblings before and after the verse-level node.
104 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
105 breaks_after = sum(1 for i in verse.itersiblings('br'))
# True when no <br> precedes the node but some follow (first verse), or
# when exactly one <br> follows it (next-to-last verse).
106 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
108 if breaks_after == 2:
# Work on a deep copy of the motif; the copy carries no tail text and
# records `move_by' in its `moved' attribute.
110 moved_motif = deepcopy(motif)
113 moved_motif.tail = None
114 moved_motif.set('moved', str(move_by))
# Walk the <br> siblings that follow and place the copy directly after one.
# NOTE(review): which <br> is chosen depends on lines missing from this copy.
116 for br in verse.itersiblings('br'):
120 br.addnext(moved_motif)
def parse_creator(doc):
    """ find all dc:creator tags and add dc:creator_parsed with forenames first

    doc: the document tree; it is modified in place

    Every dc:creator element is parsed with Person.from_text() and copied.
    The copy is retagged dc:creator_parsed and its text is set to the first
    names followed by the last name. It is inserted as the first child of
    the creator's parent.
    """
    for creator in doc.findall('//' + DCNS('creator')):
        person = Person.from_text(creator.text)
        # Copy the element so attributes of the original creator are kept.
        creator_parsed = deepcopy(creator)
        creator_parsed.tag = DCNS('creator_parsed')
        name_parts = person.first_names + (person.last_name,)
        creator_parsed.text = ' '.join(name_parts)
        creator.getparent().insert(0, creator_parsed)
def get_resource(path):
    """ returns `path' joined onto the directory that contains this module """
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, path)
def get_stylesheet(name):
    """ returns the path of the stylesheet registered under `name'

    The relative path is looked up in STYLESHEETS and resolved with
    get_resource(). Raises KeyError when `name' is not a key of STYLESHEETS.
    """
    relative_path = STYLESHEETS[name]
    return get_resource(relative_path)
# Probes the local XeLaTeX installation: a test document is placed at
# <tempdir>/test.tex, xelatex is run on it and the temporary directory is
# removed afterwards.
# NOTE(review): the lines that write the test document (only the tail of its
# `%'-formatted template is visible) and the return statement are missing in
# this copy; presumably the result is derived from the exit status `p'.
141 def package_available(package, args='', verbose=False):
142 """ check if a verion of a latex package accepting given args is available """
143 tempdir = mkdtemp('-wl2pdf-test')
144 fpath = os.path.join(tempdir, 'test.tex')
# The template is filled with (args, package), in that order.
151 """ % (args, package))
# Verbose variant: xelatex inherits this process's stdout/stderr.
154 p = call(['xelatex', '-output-directory', tempdir, fpath])
# Quiet variant: batch mode, output redirected to pipes that are never read.
# NOTE(review): the subprocess docs advise against stdout=PIPE/stderr=PIPE
# with call() - a child producing a lot of output can block on a full pipe.
156 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
# NOTE(review): no try/finally is visible, so an exception raised above
# would leave the temporary directory behind.
157 shutil.rmtree(tempdir)
# Entry point of the module: renders a document to PDF.
# Stages visible below: load the document (with its children), choose the
# `morefloats' variant, rewrite the tree, apply the wl2tex stylesheet, turn
# the result into doc.tex in a temporary directory, run xelatex on it and
# hand the resulting doc.pdf to the caller.
# Raises ValueError when slug/file_path are given together or not at all
# (per the messages below) and ParseError on the "Error parsing .tex file" path.
# NOTE(review): many control-flow lines (if/else/try, cleanup) of this
# function are missing in this copy; comments describe only what is visible.
161 def transform(provider, slug=None, file_path=None,
162 output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None):
163 """ produces a PDF file with XeLaTeX
165 provider: a DocProvider
166 slug: slug of file to process, available by provider
167 file_path can be provided instead of a slug
168 output_file: file-like object or path to output file
169 output_dir: path to directory to save output file to; either this or output_file must be present
170 make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
171 verbose: prints all output from LaTeX
172 save_tex: path to save the intermediary LaTeX file to
173 morefloats (old/new/none): force specific morefloats
180 raise ValueError('slug or file_path should be specified, not both')
# Load from an explicit file path ...
181 document = load_including_children(provider, file_path=file_path)
184 raise ValueError('either slug or file_path should be specified')
# ... or by slug through the provider.
185 document = load_including_children(provider, slug=slug)
187 # check for LaTeX packages
# A caller-supplied value is stored lower-cased on the root element; otherwise
# 'new' is stored when the morefloats package accepts `maxfloats=19'.
189 document.edoc.getroot().set('morefloats', morefloats.lower())
190 elif package_available('morefloats', 'maxfloats=19'):
191 document.edoc.getroot().set('morefloats', 'new')
# In-place rewrites of the document tree before the XSLT step.
194 move_motifs_inside(document.edoc)
195 hack_motifs(document.edoc)
196 parse_creator(document.edoc)
197 substitute_hyphens(document.edoc)
198 fix_hanging(document.edoc)
# With make_dir the output goes to a per-author subdirectory of output_dir.
201 if make_dir and output_dir is not None:
202 author = unicode(document.book_info.author)
203 output_dir = os.path.join(output_dir, author)
# Apply the wl2tex stylesheet to the document.
206 style_filename = get_stylesheet("wl2tex")
207 style = etree.parse(style_filename)
208 texml = document.transform(style)
209 del document # no longer needed large object :)
# Feed the stylesheet output through Texml's process() into <temp>/doc.tex.
# NOTE(review): the close of `fout' is not visible in this copy - confirm it
# happens before xelatex reads the file.
212 temp = mkdtemp('-wl2pdf')
213 tex_path = os.path.join(temp, 'doc.tex')
214 fout = open(tex_path, 'w')
215 process(StringIO(texml), fout, 'utf-8')
# Optionally keep a copy of the intermediate LaTeX source.
220 shutil.copy(tex_path, save_tex)
# Style file and logo are copied next to doc.tex.
223 shutil.copy(get_resource('pdf/wl.sty'), temp)
224 shutil.copy(get_resource('pdf/wl-logo.png'), temp)
# Verbose run prints xelatex output; the quiet run uses batch mode with
# output redirected to pipes that are never read.
# NOTE(review): the subprocess docs advise against PIPE with call() - a child
# producing a lot of output can block on a full pipe.
230 p = call(['xelatex', tex_path])
232 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
234 raise ParseError("Error parsing .tex file")
# Deliver the PDF: into output_dir (named after the slug, or after the input
# file's base name), or to output_file.
239 pdf_path = os.path.join(temp, 'doc.pdf')
240 if output_dir is not None:
242 os.makedirs(output_dir)
246 output_path = os.path.join(output_dir, '%s.pdf' % slug)
248 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
249 shutil.move(pdf_path, output_path)
# An object with a write() method is treated as file-like.
251 if hasattr(output_file, 'write'):
# NOTE(review): the PDF is read in text mode; 'rb' would be the safe choice
# for binary data.
253 with open(pdf_path) as f:
254 output_file.write(f.read())
257 # path to output file
258 shutil.copy(pdf_path, output_file)
# XML/XSLT failures are caught here (Python 2 `except ..., e' syntax).
# NOTE(review): the handler body is not visible in this copy.
261 except (XMLSyntaxError, XSLTApplyError), e:
# Loads a book as a WLDocument and recursively appends the root element of
# every part listed in its book_info.parts to the book's own root element.
# The source is chosen from slug, uri or file_path; ValueError is raised when
# none is given (per the message below).
# NOTE(review): the branch conditions, the slug lookup and the return
# statement are missing in this copy. The docstring mentions only slug and
# uri although file_path is accepted too.
265 def load_including_children(provider, slug=None, uri=None, file_path=None):
266 """ makes one big xml file with children inserted at end
267 either slug or uri must be provided
# Source given as a URI: resolved through the provider.
271 f = provider.by_uri(uri)
# Source given as a filesystem path.
# NOTE(review): no close of this file object is visible in this copy.
275 f = open(file_path, 'r')
277 raise ValueError('Neither slug, URI nor file path provided for a book.')
# NOTE(review): the meaning of the positional `True' is not visible here -
# confirm against WLDocument.from_file.
279 document = WLDocument.from_file(f, True,
280 parse_dublincore=True,
281 preserve_lines=False)
# Children are loaded through the provider by URI and merged depth-first.
285 for child_uri in document.book_info.parts:
286 child = load_including_children(provider, uri=child_uri)
287 document.edoc.getroot().append(child.edoc.getroot())