1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
from __future__ import with_statement

import os
import re
import shutil
import sys
from copy import deepcopy
from StringIO import StringIO
from subprocess import call, PIPE
from tempfile import mkdtemp

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError
from Texml.processor import process

from librarian import ParseError, DCNS
from librarian import functions
from librarian.dcparser import Person
from librarian.parser import WLDocument
# Module-level setup: call librarian's function registration hooks.
# NOTE(review): presumably these make helper functions available to the
# XSLT stylesheet - confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_starts_white()
functions.reg_ends_white()

# Maps a stylesheet name to its path relative to this module's directory;
# looked up by get_stylesheet().
STYLESHEETS = {
    'wl2tex': 'xslt/wl2tex.xslt',
}
def insert_tags(doc, split_re, tagname):
    """ inserts <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree

    The text and the tail of every element are split on `split_re' and an
    empty <tagname/> element is put between each two resulting chunks.
    The tree is modified in place; nothing is returned.

    doc: tree (or element) to walk with .iter()
    split_re: compiled regular expression marking the split points
    tagname: tag of the inserted empty elements

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        # text/tail may be None or empty - nothing to split then
        if elem.text:
            chunks = split_re.split(elem.text)
            # chunks are consumed from the end and every new element goes
            # in at the same index, so they end up in document order
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # new elements become siblings directly following `elem'
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
def substitute_hyphens(doc):
    """ replaces single hyphens inside words with an empty tag

    A hyphen is replaced only when the characters on both sides of it are
    neither whitespace nor another hyphen.

    doc: tree to modify in place (handed over to insert_tags)
    """
    # NOTE(review): the call around the regex and the tag name are not
    # visible in this view; 'dywiz' is an assumption - confirm that the
    # stylesheet has a matching template.
    insert_tags(doc,
                re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
                "dywiz")
def fix_hanging(doc):
    """ replaces whitespace after one-letter words with an empty tag

    The pattern matches a run of whitespace preceded by a single word
    character which itself follows whitespace.

    doc: tree to modify in place (handed over to insert_tags)
    """
    # NOTE(review): only the regex line is visible in this view; the
    # function name comes from its call in transform() and the tag name
    # 'nbsp' is an assumption - confirm that the stylesheet has a
    # matching template.
    insert_tags(doc,
                re.compile(r"(?<=\s\w)\s+"),
                "nbsp")
def move_motifs_inside(doc):
    """ moves motifs to be into block elements

    For every <motyw> that is a direct child of one of the master tags,
    the first following sibling that is not one of the skipped tags
    receives the motif.  The tree is modified in place.

    doc: tree to modify; must support .xpath()
    """
    # siblings a motif is never moved into
    skipped_tags = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga')
    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in skipped_tags:
                    # motif shouldn't have a tail - it would be untagged text
                    motif.tail = None
                    motif.getparent().remove(motif)
                    # NOTE(review): the re-attachment is not visible in this
                    # view; putting the motif at the start of the block and
                    # stopping at the first suitable sibling is assumed.
                    sib.insert(0, motif)
                    break
def hack_motifs(doc):
    """ dirty hack for the marginpar-creates-orphans LaTeX problem
    see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304

    moves motifs in stanzas from first verse to second
    and from next to last to last, then inserts negative vspace before them

    The moved copy gets a `moved' attribute holding the number of verses
    it was moved by.  The tree is modified in place.

    doc: tree to modify
    """
    for motif in doc.findall('//strofa//motyw'):
        # find relevant verse-level tag
        verse, stanza = motif, motif.getparent()
        while stanza is not None and stanza.tag != 'strofa':
            verse, stanza = stanza, stanza.getparent()
        # verses are delimited by <br/>, so the counts locate the verse
        breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
        breaks_after = sum(1 for i in verse.itersiblings('br'))
        if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
            # NOTE(review): the move_by bookkeeping is not visible in this
            # view; moving by one verse, or by two when exactly two breaks
            # follow, is assumed.
            move_by = 1
            if breaks_after == 2:
                move_by += 1
            moved_motif = deepcopy(motif)
            # NOTE(review): neutralising the original motif this way is
            # assumed - the corresponding lines are not visible here.
            motif.tag = 'span'
            motif.text = None
            moved_motif.tail = None
            moved_motif.set('moved', str(move_by))

            # put the copy right after the move_by-th following break
            for br in verse.itersiblings('br'):
                if move_by > 1:
                    move_by -= 1
                    continue
                br.addnext(moved_motif)
                break
def parse_creator(doc):
    """ adds a dc:creator_parsed tag for every dc:creator, with first names put before the last name """
    creator_path = '//' + DCNS('creator')
    for original in doc.findall(creator_path):
        person = Person.from_text(original.text)
        name_parts = person.first_names + (person.last_name,)
        parsed = deepcopy(original)
        parsed.tag = DCNS('creator_parsed')
        parsed.text = ' '.join(name_parts)
        # the copy becomes the first child of the creator's parent
        container = original.getparent()
        container.insert(0, parsed)
def get_resource(path):
    """ returns `path' joined onto the directory containing this module """
    module_dir = os.path.dirname(__file__)
    return os.path.join(module_dir, path)
def get_stylesheet(name):
    """ returns the path of the stylesheet registered in STYLESHEETS under `name' """
    relative_path = STYLESHEETS[name]
    return get_resource(relative_path)
def package_available(package, args='', verbose=False):
    """ check if a version of a latex package accepting given args is available

    Writes a small test document loading `package' with the options `args'
    into a scratch directory and runs xelatex on it.

    package: name of the LaTeX package
    args: option string for the package
    verbose: if true, xelatex output is not suppressed
    returns: True if xelatex exited with code 0, False otherwise
    """
    tempdir = mkdtemp('-wl2pdf-test')
    try:
        fpath = os.path.join(tempdir, 'test.tex')
        # NOTE(review): the test document template is not visible in this
        # view; this minimal one is an assumption.
        with open(fpath, 'w') as f:
            f.write(r"""
                \documentclass{book}
                \usepackage[%s]{%s}
                \begin{document}
                \end{document}
                """ % (args, package))
        if verbose:
            p = call(['xelatex', '-output-directory', tempdir, fpath])
        else:
            # discard the output instead of using PIPE: nobody reads the
            # pipes, so xelatex could fill them up and block forever
            devnull = open(os.devnull, 'w')
            try:
                p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=devnull, stderr=devnull)
            finally:
                devnull.close()
    finally:
        # remove the scratch directory even if xelatex could not be run
        shutil.rmtree(tempdir)
    return p == 0
def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None):
    """ produces a PDF file with XeLaTeX

    provider: a DocProvider
    slug: slug of file to process, available by provider
    file_path can be provided instead of a slug
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.pdf instead of <output_dir>/<slug>.pdf
    verbose: prints all output from LaTeX
    save_tex: path to save the intermediary LaTeX file to

    raises ValueError if both or neither of slug and file_path are given,
    and ParseError if XeLaTeX exits with an error
    """
    # validate the arguments before any work is done
    if file_path:
        if slug:
            raise ValueError('slug or file_path should be specified, not both')
    elif not slug:
        raise ValueError('either slug or file_path should be specified')

    temp = None
    try:
        if file_path:
            document = load_including_children(provider, file_path=file_path)
        else:
            document = load_including_children(provider, slug=slug)

        # check for LaTeX packages
        if not package_available('morefloats', 'maxfloats=19'):
            # using old morefloats or none at all
            document.edoc.getroot().set('old-morefloats', 'yes')

        # hack the tree
        move_motifs_inside(document.edoc)
        hack_motifs(document.edoc)
        parse_creator(document.edoc)
        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)

        # find output dir
        if make_dir and output_dir is not None:
            author = unicode(document.book_info.author)
            output_dir = os.path.join(output_dir, author)

        # wl -> TeXML
        style_filename = get_stylesheet("wl2tex")
        style = etree.parse(style_filename)
        texml = document.transform(style)
        del document # no longer needed large object :)

        # TeXML -> LaTeX
        temp = mkdtemp('-wl2pdf')
        tex_path = os.path.join(temp, 'doc.tex')
        fout = open(tex_path, 'w')
        try:
            process(StringIO(texml), fout, 'utf-8')
        finally:
            fout.close()
        del texml

        if save_tex:
            shutil.copy(tex_path, save_tex)

        # LaTeX -> PDF
        shutil.copy(get_resource('pdf/wl.sty'), temp)
        shutil.copy(get_resource('pdf/wl-logo.png'), temp)

        # xelatex runs inside the temp dir: the style and logo were copied
        # there and doc.pdf is picked up from there afterwards
        if verbose:
            p = call(['xelatex', tex_path], cwd=temp)
        else:
            # discard the output instead of using PIPE: nobody reads the
            # pipes, so xelatex could fill them up and block forever
            devnull = open(os.devnull, 'w')
            try:
                p = call(['xelatex', '-interaction=batchmode', tex_path], cwd=temp, stdout=devnull, stderr=devnull)
            finally:
                devnull.close()
        if p:
            raise ParseError("Error parsing .tex file")

        # save the PDF
        pdf_path = os.path.join(temp, 'doc.pdf')
        if output_dir is not None:
            if not os.path.isdir(output_dir):
                os.makedirs(output_dir)
            if slug:
                output_path = os.path.join(output_dir, '%s.pdf' % slug)
            else:
                output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
            shutil.move(pdf_path, output_path)
        elif hasattr(output_file, 'write'):
            # file-like object; the PDF is binary, so read it as bytes
            # NOTE(review): output_file is left open for the caller -
            # confirm no caller expects it to be closed here.
            with open(pdf_path, 'rb') as f:
                output_file.write(f.read())
        else:
            # path to output file
            shutil.copy(pdf_path, output_file)

    except (XMLSyntaxError, XSLTApplyError):
        # NOTE(review): the handler body is not visible in this view;
        # re-raising as ParseError is assumed.  sys.exc_info() is used so
        # the except clause needs no version-specific binding syntax.
        raise ParseError(sys.exc_info()[1])
    finally:
        # never leave the scratch directory behind
        if temp is not None:
            shutil.rmtree(temp, ignore_errors=True)
def load_including_children(provider, slug=None, uri=None, file_path=None):
    """ makes one big xml file with children inserted at end

    one of uri, slug or file_path must be provided; they are checked in
    that order and the first non-empty one is used

    provider: object used to fetch documents (children are always fetched by URI)
    returns: the WLDocument with the root elements of all its parts
        (loaded recursively) appended to its own root
    raises ValueError if none of uri, slug and file_path is given
    """
    if uri:
        f = provider.by_uri(uri)
    elif slug:
        # NOTE(review): the slug lookup is not visible in this view;
        # by_slug() is assumed by analogy with by_uri() - confirm against
        # the DocProvider API.
        f = provider.by_slug(slug)
    elif file_path:
        f = open(file_path, 'r')
    else:
        raise ValueError('Neither slug, URI nor file path provided for a book.')

    # NOTE(review): assumes the provider hands out closable file-like objects
    try:
        document = WLDocument.from_file(f, True,
            parse_dublincore=True,
            preserve_lines=False)
    finally:
        f.close()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())

    return document