1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
10 from StringIO import StringIO
11 from tempfile import mkdtemp
13 from copy import deepcopy
14 from subprocess import call, PIPE
18 from Texml.processor import process
19 from lxml import etree
20 from lxml.etree import XMLSyntaxError, XSLTApplyError
22 from librarian.dcparser import Person
23 from librarian.parser import WLDocument
24 from librarian import ParseError, DCNS, get_resource
25 from librarian import functions
# Module-level setup: call the reg_* hooks exported by librarian.functions.
# NOTE(review): these presumably register the extension functions used by the
# wl2tex stylesheet -- confirm in librarian.functions.
28 functions.reg_substitute_entities()
30 functions.reg_starts_white()
31 functions.reg_ends_white()
32 functions.reg_texcommand()
# Stylesheet entry: get_stylesheet() reads STYLESHEETS[name] and transform()
# asks for the wl2tex key, so this line presumably belongs to that mapping.
35 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """ inserts an empty <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree

    Both the text and the tail of every element are split on `split_re';
    each match is replaced by a new, empty `tagname' element whose tail is
    the text that followed the match. The tree is modified in place and
    nothing is returned.

    doc: lxml element (or tree) to process
    split_re: compiled regular expression; it should not contain capturing
        groups, because re.split() would then put the captured text back
        into the pieces
    tagname: tag of the elements to insert
    exclude: optional collection of tags; an element carrying one of them is
        skipped (text and tail alike), its descendants are still visited

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        # text/tail are None when absent, and split() rejects None
        if elem.text:
            chunks = split_re.split(elem.text)
            elem.text = chunks[0]
            # Always insert at index 0, last piece first, so the new
            # elements end up in document order before existing children.
            for chunk in reversed(chunks[1:]):
                ins = etree.Element(tagname)
                ins.tail = chunk
                elem.insert(0, ins)

        parent = elem.getparent()
        # An element without a parent has nowhere to put new siblings.
        if elem.tail and parent is not None:
            chunks = split_re.split(elem.tail)
            elem.tail = chunks[0]
            ins_index = parent.index(elem) + 1
            # Same trick: a fixed index right after elem, last piece first.
            for chunk in reversed(chunks[1:]):
                ins = etree.Element(tagname)
                ins.tail = chunk
                parent.insert(ins_index, ins)
# The arguments below match the insert_tags(doc, split_re, tagname, exclude)
# signature, so this presumably wraps a single insert_tags() call.
# The pattern matches one hyphen that has a character which is neither a
# hyphen nor whitespace on each side.
# The elements tagged DCNS(identifier.url) and DCNS(rights.license) are
# passed as the exclude list.
# NOTE(review): the call line and the inserted tag name are missing from this listing.
69 def substitute_hyphens(doc):
71 re.compile("(?<=[^-\s])-(?=[^-\s])"),
73 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
# NOTE(review): the def line for these arguments is missing from this listing;
# transform() calls fix_hanging() right after substitute_hyphens(), so this is
# presumably its body -- confirm.
# The pattern matches a run of whitespace that directly follows a single word
# character which is itself preceded by whitespace.
# The same two DCNS tags as in substitute_hyphens() are passed as the exclude list.
79 re.compile("(?<=\s\w)\s+"),
81 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
85 def move_motifs_inside(doc):
86 """ moves motifs to be into block elements """
# Select every master element (novel, story, lyric and drama variants) anywhere in doc.
87 for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
# Only motyw elements that are direct children of the master are considered.
88 for motif in master.xpath('motyw'):
# Walk the siblings that follow the motif.
89 for sib in motif.itersiblings():
# Separators, markers, other motifs and notes are not accepted as a target.
90 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
91 # motif shouldn't have a tail - it would be untagged text
# Detach the motif from its current parent.
# NOTE(review): the statements that re-attach it are missing from this listing.
93 motif.getparent().remove(motif)
# NOTE(review): the def line for this docstring is missing from this listing;
# transform() calls hack_motifs(document.edoc), presumably this function -- confirm.
# Visible steps, for every motyw found under a strofa:
#   * climb from the motif until the parent is the strofa, which leaves verse
#     pointing at the child of the strofa that contains the motif;
#   * count the br siblings before and after that verse-level node;
#   * act only when there is no br before and at least one after, or when
#     exactly one br follows;
#   * deep-copy the motif, clear the tail of the copy, store move_by in its
#     moved attribute and attach the copy right after a following br with addnext().
# NOTE(review): the lines that set move_by and that pick the br are missing here.
99 """ dirty hack for the marginpar-creates-orphans LaTeX problem
100 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
102 moves motifs in stanzas from first verse to second
103 and from next to last to last, then inserts negative vspace before them
105 for motif in doc.findall('//strofa//motyw'):
106 # find relevant verse-level tag
107 verse, stanza = motif, motif.getparent()
108 while stanza is not None and stanza.tag != 'strofa':
109 verse, stanza = stanza, stanza.getparent()
110 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
111 breaks_after = sum(1 for i in verse.itersiblings('br'))
112 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
114 if breaks_after == 2:
116 moved_motif = deepcopy(motif)
119 moved_motif.tail = None
120 moved_motif.set('moved', str(move_by))
122 for br in verse.itersiblings('br'):
126 br.addnext(moved_motif)
def parse_creator(doc):
    """ find all dc:creator and dc:contributor.* tags and add *_parsed versions with forenames first

    Handles dc:creator, dc:contributor.translator, dc:contributor.editor and
    dc:contributor.technical_editor. For each such element a copy tagged
    <original tag>_parsed is inserted as the first child of the same parent;
    the copy carries the Person.readable() form of the name as its text and
    the original text in its `sortkey' attribute. Elements without text are
    left alone. The tree is modified in place and nothing is returned.
    """
    person_tags = (
        'creator',
        'contributor.translator',
        'contributor.editor',
        'contributor.technical_editor',
    )
    query = "|".join('//dc:' + tag for tag in person_tags)
    persons = doc.xpath(query, namespaces={'dc': str(DCNS)})

    # Every copy is inserted at index 0, so walking the matches backwards
    # keeps copies that share a parent in their original order.
    for person in reversed(persons):
        if not person.text:
            # Nothing to parse, and set() would reject a None sortkey.
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """ returns the resource path of the stylesheet registered under `name' in STYLESHEETS """
    relative_path = STYLESHEETS[name]
    return get_resource(relative_path)
# Probe for a LaTeX package by running xelatex on a throwaway document.
# Visible steps: create a scratch directory with mkdtemp(), build the path of
# test.tex inside it, format a document template with (args, package), run
# xelatex with that directory as -output-directory (batchmode with piped output
# in the second variant of the call), then remove the scratch directory.
# NOTE(review): the template text, the branch between the two xelatex calls
# (presumably on verbose) and the return statement are missing from this listing.
149 def package_available(package, args='', verbose=False):
150 """ check if a verion of a latex package accepting given args is available """
151 tempdir = mkdtemp('-wl2pdf-test')
152 fpath = os.path.join(tempdir, 'test.tex')
159 """ % (args, package))
162 p = call(['xelatex', '-output-directory', tempdir, fpath])
164 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
165 shutil.rmtree(tempdir)
# transform() drives the whole pipeline: WL XML -> TeXML -> LaTeX -> PDF.
# Steps visible below:
#   * load the document together with its children through
#     load_including_children(), either by file_path or by slug; ValueError is
#     raised with a message about both or neither of them being given;
#   * store the cover width and height, one flag-<name> attribute per flag and
#     the morefloats choice as attributes on the root element (when morefloats
#     is not forced, package_available() is probed with maxfloats=19);
#   * rewrite the tree with move_motifs_inside, hack_motifs, parse_creator,
#     substitute_hyphens and fix_hanging;
#   * with make_dir, append the author name to output_dir;
#   * apply the wl2tex stylesheet, then let Texml process() write doc.tex into
#     a fresh mkdtemp() directory; optionally copy it to save_tex;
#   * copy wl.sty and wl-logo.png into that directory and run xelatex
#     (batchmode with piped output in the second variant of the call);
#   * raise ParseError with the message Error parsing .tex file (the condition
#     is missing from this listing, presumably a non-zero exit status);
#   * deliver doc.pdf: move it into output_dir under the slug or the base name
#     of file_path, or else write it to the output_file object or copy it to
#     the output_file path.
# NOTE(review): cover.png and doc.pdf are opened in text mode here; binary
# mode would be safer on platforms that translate line endings.
# NOTE(review): no close() for fout is visible in this listing -- confirm.
# NOTE(review): the except clause uses the comma form, which is Python 2 only.
169 def transform(provider, slug=None, file_path=None,
170 output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None,
171 cover=None, flags=None):
172 """ produces a PDF file with XeLaTeX
174 provider: a DocProvider
175 slug: slug of file to process, available by provider
176 file_path can be provided instead of a slug
177 output_file: file-like object or path to output file
178 output_dir: path to directory to save output file to; either this or output_file must be present
179 make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
180 verbose: prints all output from LaTeX
181 save_tex: path to save the intermediary LaTeX file to
182 morefloats (old/new/none): force specific morefloats
183 cover: a cover.Cover object
184 flags: less-advertising,
191 raise ValueError('slug or file_path should be specified, not both')
192 document = load_including_children(provider, file_path=file_path)
195 raise ValueError('either slug or file_path should be specified')
196 document = load_including_children(provider, slug=slug)
199 document.edoc.getroot().set('data-cover-width', str(cover.width))
200 document.edoc.getroot().set('data-cover-height', str(cover.height))
203 document.edoc.getroot().set('flag-' + flag, 'yes')
205 # check for LaTeX packages
207 document.edoc.getroot().set('morefloats', morefloats.lower())
208 elif package_available('morefloats', 'maxfloats=19'):
209 document.edoc.getroot().set('morefloats', 'new')
212 move_motifs_inside(document.edoc)
213 hack_motifs(document.edoc)
214 parse_creator(document.edoc)
215 substitute_hyphens(document.edoc)
216 fix_hanging(document.edoc)
219 if make_dir and output_dir is not None:
220 author = unicode(document.book_info.author)
221 output_dir = os.path.join(output_dir, author)
224 style_filename = get_stylesheet("wl2tex")
225 style = etree.parse(style_filename)
226 texml = document.transform(style)
229 temp = mkdtemp('-wl2pdf')
232 c = cover(document.book_info.author.readable(), document.book_info.title)
233 with open(os.path.join(temp, 'cover.png'), 'w') as f:
236 del document # no longer needed large object :)
238 tex_path = os.path.join(temp, 'doc.tex')
239 fout = open(tex_path, 'w')
240 process(StringIO(texml), fout, 'utf-8')
245 shutil.copy(tex_path, save_tex)
248 shutil.copy(get_resource('pdf/wl.sty'), temp)
249 shutil.copy(get_resource('res/wl-logo.png'), temp)
255 p = call(['xelatex', tex_path])
257 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
259 raise ParseError("Error parsing .tex file")
264 pdf_path = os.path.join(temp, 'doc.pdf')
265 if output_dir is not None:
267 os.makedirs(output_dir)
271 output_path = os.path.join(output_dir, '%s.pdf' % slug)
273 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
274 shutil.move(pdf_path, output_path)
276 if hasattr(output_file, 'write'):
278 with open(pdf_path) as f:
279 output_file.write(f.read())
282 # path to output file
283 shutil.copy(pdf_path, output_file)
286 except (XMLSyntaxError, XSLTApplyError), e:
# Load one book and, recursively, every part listed in its metadata.
# Visible steps:
#   * obtain a readable object f, from provider.by_uri(uri) or by opening
#     file_path; ValueError is raised when no source was given (the slug
#     branch and the conditions are missing from this listing);
#   * read f and decode the bytes as UTF-8;
#   * wrap every run of characters in the range U+0400 to U+04FF (the Cyrillic
#     block) in an alien element before parsing;
#   * parse the text with WLDocument.from_string(), with Dublin Core parsing on;
#   * for each URI in document.book_info.parts, load that child the same way
#     and append its root element to the root of this document.
# NOTE(review): no close() for f and no return statement are visible in this
# listing -- confirm against the full file.
# NOTE(review): the ur string prefix is Python 2 only.
290 def load_including_children(provider, slug=None, uri=None, file_path=None):
291 """ makes one big xml file with children inserted at end
292 either slug or uri must be provided
296 f = provider.by_uri(uri)
300 f = open(file_path, 'r')
302 raise ValueError('Neither slug, URI nor file path provided for a book.')
304 text = f.read().decode('utf-8')
305 text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
307 document = WLDocument.from_string(text, True,
308 parse_dublincore=True)
311 for child_uri in document.book_info.parts:
313 child = load_including_children(provider, uri=child_uri)
314 document.edoc.getroot().append(child.edoc.getroot())