# -*- coding: utf-8 -*-
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
from __future__ import with_statement

import os
import re
import shutil
from copy import deepcopy
from StringIO import StringIO
from subprocess import call, PIPE
from tempfile import mkdtemp

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError
from Texml.processor import process

from librarian import ParseError, DCNS
from librarian import functions
from librarian.dcparser import Person
from librarian.parser import WLDocument
# Module-level side effect: run the registration hooks from
# librarian.functions (presumably XPath/XSLT extension functions used by the
# stylesheet -- confirm in librarian.functions).
# NOTE(review): the line numbering embedded in the extracted source skips a
# line between the first two calls; a further registration call may belong
# there -- check against the upstream file.
functions.reg_substitute_entities()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Stylesheet name -> path relative to this module's directory; looked up by
# get_stylesheet() and resolved by get_resource().
STYLESHEETS = {
    'wl2tex': 'xslt/wl2tex.xslt',
}
# NOTE(review): the extracted source had lost this function's indentation and
# four short control-flow lines (`continue`, the two `if elem.text/tail`
# guards and `elem.insert(0, ins)`); they are restored here so that the
# doctest below holds -- confirm against the upstream file.
def insert_tags(doc, split_re, tagname, exclude=None):
    """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree

    Every element of `doc` is visited; its text and its tail are split on
    `split_re` and each separator is replaced by a new, empty `tagname`
    element, with the text pieces kept in order around it.  The tree is
    modified in place and nothing is returned.

    doc: lxml element or element tree to process
    split_re: compiled regular expression marking the places to split at
    tagname: tag of the elements to insert
    exclude: optional collection of tags; the text and tail of elements
        carrying one of these tags are not split

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue
        if elem.text:
            chunks = split_re.split(elem.text)
            # Consume chunks from the end and always insert at the front, so
            # the new elements end up in document order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # New elements go directly after `elem`; inserting the last chunk
            # first at a fixed index keeps them in document order.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
# substitute_hyphens(doc): what remains of a call that passes a compiled regex
# and an `exclude` list (presumably to insert_tags -- confirm).
# NOTE(review): the embedded line numbers (69, 71, 73) show that the call
# line, the tag-name argument and the closing parenthesis are missing from
# this copy; restore them from the upstream file.
69 def substitute_hyphens(doc):
# Matches a single hyphen that has a non-hyphen, non-whitespace character on
# both sides; look-behind/look-ahead mean only the hyphen itself is consumed.
71 re.compile("(?<=[^-\s])-(?=[^-\s])"),
# Tags left alone: DCNS identifier.url and rights.license.
73 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
# NOTE(review): the `def` line of this fragment is missing from this copy
# (embedded numbering jumps to 79); presumably it is fix_hanging(), which
# transform() calls after substitute_hyphens() -- confirm.  The call line and
# the tag-name argument are missing as well.
# Matches the run of whitespace that follows a single word character which is
# itself preceded by whitespace (the gap after a one-letter word).
79 re.compile("(?<=\s\w)\s+"),
# Tags left alone: DCNS identifier.url and rights.license.
81 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
def move_motifs_inside(doc):
    """ moves motifs to be into block elements

    For every `motyw` that is a direct child of one of the master elements
    selected below, the first following sibling whose tag is not in the
    skip list receives the motif as its first child.  A motif with no such
    sibling stays where it is.  The tree is modified in place.

    doc: lxml element or element tree to process
    """
    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
                    # motif shouldn't have a tail - it would be untagged text
                    # NOTE(review): the extracted source had lost the three
                    # statements `motif.tail = None`, `sib.insert(0, motif)`
                    # and `break`; they are reconstructed from the comment
                    # above and the docstring -- confirm against upstream.
                    motif.tail = None
                    motif.getparent().remove(motif)
                    sib.insert(0, motif)
                    # The motif has found its new parent; stop scanning.
                    break
# NOTE(review): the `def` line of this block is missing from this copy
# (embedded numbering starts at 99); presumably it is hack_motifs(doc), which
# transform() calls -- confirm.  Further short lines are missing too: the end
# of the docstring, the statements that assign and update `move_by`, whatever
# is done to the original `motif` after it is deep-copied, and the body of
# the final loop around `br.addnext(...)`.  Restore them from upstream before
# relying on this block.
#
# What the visible lines do: for every `motyw` below a `strofa`, climb the
# ancestors until the node directly under the `strofa` is found (`verse`),
# count the `br` siblings before and after that node, and -- when the verse is
# the first one with more following, or exactly one `br` follows -- deep-copy
# the motif, clear the copy's tail, record `move_by` in its 'moved'
# attribute, and attach the copy after a following `br`.
99 """ dirty hack for the marginpar-creates-orphans LaTeX problem
100 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
102 moves motifs in stanzas from first verse to second
103 and from next to last to last, then inserts negative vspace before them
105 for motif in doc.findall('//strofa//motyw'):
106 # find relevant verse-level tag
107 verse, stanza = motif, motif.getparent()
108 while stanza is not None and stanza.tag != 'strofa':
109 verse, stanza = stanza, stanza.getparent()
110 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
111 breaks_after = sum(1 for i in verse.itersiblings('br'))
112 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
114 if breaks_after == 2:
116 moved_motif = deepcopy(motif)
119 moved_motif.tail = None
120 moved_motif.set('moved', str(move_by))
122 for br in verse.itersiblings('br'):
126 br.addnext(moved_motif)
def parse_creator(doc):
    """ find all dc:creator tags and add dc:creator_parsed with forenames first

    For each creator element the text is parsed with Person.from_text; a deep
    copy of the element is retagged as creator_parsed, given the text
    "<first names> <last name>" (space separated) and inserted as the first
    child of the creator's parent.  The tree is modified in place.

    doc: lxml element tree to process
    """
    for creator in doc.findall('//'+DCNS('creator')):
        person = Person.from_text(creator.text)
        creator_parsed = deepcopy(creator)
        creator_parsed.tag = DCNS('creator_parsed')
        # first_names is concatenated with a 1-tuple, so it is expected to be
        # a tuple itself.
        creator_parsed.text = ' '.join(person.first_names + (person.last_name,))
        creator.getparent().insert(0, creator_parsed)
def get_resource(path):
    """ returns `path` joined onto the directory that contains this module """
    return os.path.join(os.path.dirname(__file__), path)
def get_stylesheet(name):
    """ returns the path of the stylesheet registered under `name`

    The relative path is looked up in STYLESHEETS and resolved with
    get_resource(); an unknown name raises KeyError.
    """
    return get_resource(STYLESHEETS[name])
def package_available(package, args='', verbose=False):
    """ check if a version of a latex package accepting given args is available

    Writes a minimal document that loads `package` with options `args` into a
    temporary directory and runs xelatex on it.

    package: name of the LaTeX package
    args: option string placed in the square brackets of \\usepackage
    verbose: when true, xelatex output is shown; otherwise it is discarded

    Returns True when xelatex exits with status 0, False otherwise.  An
    OSError from starting xelatex (e.g. not installed) propagates to the
    caller, but the temporary directory is removed in every case.
    """
    tempdir = mkdtemp('-wl2pdf-test')
    try:
        fpath = os.path.join(tempdir, 'test.tex')
        # NOTE(review): the lines that open the file and start this template
        # were missing from the extracted source; only the closing
        # `""" % (args, package))` survived.  The template below is a
        # reconstruction (the document class in particular) -- confirm
        # against the upstream file.
        with open(fpath, 'w') as f:
            f.write(r"""
                \documentclass{book}
                \usepackage[%s]{%s}
                \begin{document}
                \end{document}
                """ % (args, package))
        if verbose:
            p = call(['xelatex', '-output-directory', tempdir, fpath])
        else:
            # Discard the output instead of using PIPE: call() never reads
            # the pipes, so a chatty child could block on a full pipe buffer.
            with open(os.devnull, 'w') as devnull:
                p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=devnull, stderr=devnull)
    finally:
        # Always clean up, even when xelatex cannot be started.
        shutil.rmtree(tempdir)
    return p == 0
# transform(): render a document to PDF by way of TeXML and XeLaTeX.
# Steps visible below, in order:
#   1. load the document with load_including_children(), by file_path or by
#      slug (ValueError when both or neither are given);
#   2. set the root element's 'morefloats' attribute, either from the
#      `morefloats` argument (lower-cased) or to 'new' when
#      package_available('morefloats', 'maxfloats=19') is true;
#   3. apply the tree fix-ups: move_motifs_inside, hack_motifs, parse_creator,
#      substitute_hyphens, fix_hanging;
#   4. with make_dir, append the author's name to output_dir;
#   5. apply the 'wl2tex' stylesheet and let Texml's process() write doc.tex
#      into a fresh temporary directory, optionally copying it to save_tex;
#   6. copy pdf/wl.sty and pdf/wl-logo.png next to it and run xelatex,
#      raising ParseError("Error parsing .tex file") on failure;
#   7. move doc.pdf into output_dir (named after the slug or the input file),
#      or write/copy it to output_file.
# lxml's XMLSyntaxError / XSLTApplyError are caught at the end; the handler
# body is not present in this copy.
# NOTE(review): the embedded line numbers show that many short lines of this
# function are missing from this copy (the end of the docstring, `try:`,
# branch headers such as `if`/`else`, file closing and cleanup); restore them
# from the upstream file before relying on this block.
# NOTE(review): call(..., stdout=PIPE, stderr=PIPE) never reads the pipes; the
# subprocess documentation warns this can block when the child writes a lot.
# NOTE(review): Python 2 only (`unicode`, `except ..., e`, StringIO).
167 def transform(provider, slug=None, file_path=None,
168 output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None):
169 """ produces a PDF file with XeLaTeX
171 provider: a DocProvider
172 slug: slug of file to process, available by provider
173 file_path can be provided instead of a slug
174 output_file: file-like object or path to output file
175 output_dir: path to directory to save output file to; either this or output_file must be present
176 make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
177 verbose: prints all output from LaTeX
178 save_tex: path to save the intermediary LaTeX file to
179 morefloats (old/new/none): force specific morefloats
186 raise ValueError('slug or file_path should be specified, not both')
187 document = load_including_children(provider, file_path=file_path)
190 raise ValueError('either slug or file_path should be specified')
191 document = load_including_children(provider, slug=slug)
193 # check for LaTeX packages
195 document.edoc.getroot().set('morefloats', morefloats.lower())
196 elif package_available('morefloats', 'maxfloats=19'):
197 document.edoc.getroot().set('morefloats', 'new')
200 move_motifs_inside(document.edoc)
201 hack_motifs(document.edoc)
202 parse_creator(document.edoc)
203 substitute_hyphens(document.edoc)
204 fix_hanging(document.edoc)
207 if make_dir and output_dir is not None:
208 author = unicode(document.book_info.author)
209 output_dir = os.path.join(output_dir, author)
212 style_filename = get_stylesheet("wl2tex")
213 style = etree.parse(style_filename)
214 texml = document.transform(style)
215 del document # no longer needed large object :)
218 temp = mkdtemp('-wl2pdf')
219 tex_path = os.path.join(temp, 'doc.tex')
220 fout = open(tex_path, 'w')
221 process(StringIO(texml), fout, 'utf-8')
226 shutil.copy(tex_path, save_tex)
229 shutil.copy(get_resource('pdf/wl.sty'), temp)
230 shutil.copy(get_resource('pdf/wl-logo.png'), temp)
236 p = call(['xelatex', tex_path])
238 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
240 raise ParseError("Error parsing .tex file")
245 pdf_path = os.path.join(temp, 'doc.pdf')
246 if output_dir is not None:
248 os.makedirs(output_dir)
252 output_path = os.path.join(output_dir, '%s.pdf' % slug)
254 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
255 shutil.move(pdf_path, output_path)
257 if hasattr(output_file, 'write'):
259 with open(pdf_path) as f:
260 output_file.write(f.read())
263 # path to output file
264 shutil.copy(pdf_path, output_file)
267 except (XMLSyntaxError, XSLTApplyError), e:
# load_including_children(): build one WLDocument out of a book and all of
# its parts.
# Visible behaviour: the source is opened through provider.by_uri(uri) or
# from file_path (ValueError when no source is given); its content is read
# and decoded as UTF-8; every run of characters in U+0400..U+04FF (the
# Cyrillic block) is wrapped in <alien>...</alien>; the text is parsed with
# WLDocument.from_string(..., parse_dublincore=True); then each URI in
# document.book_info.parts is loaded recursively and the child's root element
# is appended to this document's root.
# NOTE(review): the embedded line numbers show missing lines -- the end of the
# docstring, the `if`/`elif` headers that pick the source (including the slug
# case named in the signature), and presumably the final `return document`;
# restore them from the upstream file.
# NOTE(review): the docstring mentions only slug or uri, while the signature
# and the error message also allow file_path.
# NOTE(review): Python 2 only (`ur"..."` literals, str.decode).
271 def load_including_children(provider, slug=None, uri=None, file_path=None):
272 """ makes one big xml file with children inserted at end
273 either slug or uri must be provided
277 f = provider.by_uri(uri)
281 f = open(file_path, 'r')
283 raise ValueError('Neither slug, URI nor file path provided for a book.')
285 text = f.read().decode('utf-8')
286 text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
288 document = WLDocument.from_string(text, True,
289 parse_dublincore=True)
293 for child_uri in document.book_info.parts:
294 child = load_including_children(provider, uri=child_uri)
295 document.edoc.getroot().append(child.edoc.getroot())