# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
"""PDF creation library.

Creates one big XML from the book and its children, converts it to LaTeX
with TeXML, then runs it through XeLaTeX.

"""
from __future__ import with_statement

import os
import re
import shutil
from copy import deepcopy
from itertools import chain
from StringIO import StringIO
from subprocess import call, PIPE
from tempfile import mkdtemp, NamedTemporaryFile

from Texml.processor import process
from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian.dcparser import Person
from librarian.parser import WLDocument
from librarian import ParseError, DCNS, get_resource, OutputFile
from librarian import functions
from librarian.cover import make_cover
from .sponsor import sponsor_logo
# Import-time side effect: call the registration hooks of librarian.functions.
# NOTE(review): presumably these register the XPath extension functions that
# the wl2tex stylesheet calls - confirm against librarian.functions.
functions.reg_substitute_entities()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Stylesheet name -> resource path; looked up by get_stylesheet().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
def insert_tags(doc, split_re, tagname, exclude=None):
    """Insert an empty <tagname/> element at every match of `split_re`.

    The text and the tail of every element in `doc` are split on
    `split_re`; each matched fragment is dropped and a new `tagname`
    element takes its place. The tree is modified in place and nothing
    is returned.

    doc: lxml element or tree to process
    split_re: compiled regular expression marking the split points
    tagname: tag of the inserted elements
    exclude: optional collection of tags; an element whose tag is listed
        is skipped entirely (neither its text nor its tail is split),
        but its descendants are still visited

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are consumed from the end and each new element is
            # inserted at the front, so they end up in document order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # Same trick for the tail: always insert right after `elem`.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
def substitute_hyphens(doc):
    """Replace in-word hyphens in `doc` with marker elements, in place.

    A hyphen counts only when both neighbouring characters are neither a
    hyphen nor whitespace. Elements tagged dc:identifier.url,
    dc:rights.license or "meta" are skipped by insert_tags().
    """
    insert_tags(
        doc,
        re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
        # NOTE(review): the tag-name argument is missing from the reviewed
        # extract; "dywiz" is the name used upstream - confirm it matches
        # the template in wl2tex.xslt.
        "dywiz",
        exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
    )
def fix_hanging(doc):
    """Replace the whitespace after one-character words with marker elements.

    The pattern matches a run of whitespace that directly follows a single
    word character which is itself preceded by whitespace. Elements tagged
    dc:identifier.url or dc:rights.license are skipped by insert_tags().
    The tree is modified in place.
    """
    insert_tags(
        doc,
        re.compile(r"(?<=\s\w)\s+"),
        # NOTE(review): the tag-name argument is missing from the reviewed
        # extract; "nbsp" is the name used upstream - confirm it matches
        # the template in wl2tex.xslt.
        "nbsp",
        exclude=[DCNS("identifier.url"), DCNS("rights.license")]
    )
def fix_tables(doc):
    """Prepare table elements in `doc` for typesetting, in place.

    * Whitespace-only tails of <kol> elements are removed.
    * Every <tabela>/<tabelka> gets a `_format` attribute holding one `X`
      column specifier per child of its first row; when the table has
      ramka="1" or ramki="1" the columns are separated and surrounded by
      `|` rules.

    Raises IndexError if a table has no child elements.
    """
    for kol in doc.iter(tag='kol'):
        if kol.tail is not None and not kol.tail.strip():
            kol.tail = None
    for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
        # The column count is taken from the first row only.
        columns = len(table[0])
        if table.get('ramka') == '1' or table.get('ramki') == '1':
            table.set('_format', '|' + 'X|' * columns)
        else:
            table.set('_format', 'X' * columns)
def move_motifs_inside(doc):
    """Move top-level motifs into the block element that follows them.

    For every master element (powiesc, opowiadanie, liryka_l, ...), each
    direct <motyw> child is removed and re-inserted as the first child of
    its first following sibling that is not one of the skipped tags
    listed below. A motif with no such sibling is left where it is.
    The tree is modified in place.
    """
    skipped_tags = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
                    'begin', 'end', 'motyw', 'extra', 'uwaga')
    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
                            '//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in skipped_tags:
                    # motif shouldn't have a tail - it would be untagged text
                    motif.tail = None
                    motif.getparent().remove(motif)
                    sib.insert(0, motif)
                    break
def hack_motifs(doc):
    """Dirty hack for the marginpar-creates-orphans LaTeX problem.

    See http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304

    Moves motifs in stanzas from the first verse to the second and from
    the next-to-last verse to the last one. The original <motyw> is turned
    into an empty <span>; a copy carrying a `moved` attribute (the number
    of verses it was moved by) is inserted right after the relevant <br>.
    The tree is modified in place.
    """
    # './/' is what lxml rewrites a leading '//' to on a tree anyway, but
    # without emitting a FutureWarning.
    for motif in doc.findall('.//strofa//motyw'):
        # find relevant verse-level tag (the ancestor directly under strofa)
        verse, stanza = motif, motif.getparent()
        while stanza is not None and stanza.tag != 'strofa':
            verse, stanza = stanza, stanza.getparent()
        breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
        breaks_after = sum(1 for i in verse.itersiblings('br'))
        if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
            move_by = 1
            if breaks_after == 2:
                move_by += 1
            moved_motif = deepcopy(motif)
            # Keep a placeholder so the original tail text stays in place.
            motif.tag = 'span'
            motif.text = None
            moved_motif.tail = None
            moved_motif.set('moved', str(move_by))

            # Skip move_by - 1 line breaks, then attach after the next one.
            for br in verse.itersiblings('br'):
                if move_by > 1:
                    move_by -= 1
                    continue
                br.addnext(moved_motif)
                break
def parse_creator(doc):
    """Generate readable versions of creator and translator tags.

    Finds all dc:creator and dc:contributor.translator elements and, for
    each one with non-empty text, inserts a `<tag>_parsed` copy at the
    start of its parent. The copy's text is the value of
    Person.from_text(...).readable(); the original text is kept in its
    `sortkey` attribute. The tree is modified in place.
    """
    people = doc.xpath(
        "|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
        namespaces={'dc': str(DCNS)})
    # Walk in reverse: each copy goes to index 0, so reversing keeps the
    # parsed copies in the same order as their sources.
    for person in people[::-1]:
        if not person.text:
            # Nothing to parse in an empty element.
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Return get_resource() of the STYLESHEETS entry registered as `name`.

    Raises KeyError when `name` is not a key of STYLESHEETS.
    """
    resource_path = STYLESHEETS[name]
    return get_resource(resource_path)
def package_available(package, args='', verbose=False):
    """Check if a version of a LaTeX package accepting given args is available.

    Writes a minimal test document that loads `package` with the options
    `args` into a fresh temporary directory and compiles it with XeLaTeX.
    The directory is removed afterwards, also when an error occurs.

    package: name of the LaTeX package to probe
    args: option string passed to the package
    verbose: if true, XeLaTeX output goes to the console; otherwise
        XeLaTeX runs in batch mode with its output discarded

    Returns True when xelatex exits with status 0, False otherwise.
    """
    tempdir = mkdtemp('-wl2pdf-test')
    try:
        fpath = os.path.join(tempdir, 'test.tex')
        # NOTE(review): the template body is missing from the reviewed
        # extract; only the "(args, package)" substitution order is visible.
        # Restored from the upstream source - confirm the document class.
        with open(fpath, 'w') as f:
            f.write(r"""
        \documentclass{wl}
        \usepackage[%s]{%s}
        \begin{document}
        \end{document}
        """ % (args, package))
        if verbose:
            p = call(['xelatex', '-output-directory', tempdir, fpath])
        else:
            # call() never drains a PIPE, so a chatty child could block
            # forever on a full pipe; discard the output instead.
            with open(os.devnull, 'w') as devnull:
                p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath],
                         stdout=devnull, stderr=devnull)
    finally:
        shutil.rmtree(tempdir)
    return p == 0
def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
              cover=None, flags=None, customizations=None, ilustr_path='', latex_dir=False):
    """Produce a PDF file with XeLaTeX.

    wldoc: the WLDocument to convert
    verbose: prints all output from LaTeX
    save_tex: path to save the intermediary LaTeX file to
    morefloats (old/new/none): force specific morefloats
    cover: a cover.Cover factory or True for default
    flags: iterable of flag names (e.g. less-advertising); each one is
        set on the root element as a flag-<name>="yes" attribute
    customizations: user requested customizations regarding various
        formatting parameters (passed to wl LaTeX class)
    ilustr_path: directory against which the src attributes of <ilustr>
        elements are resolved
    latex_dir: if true, XeLaTeX is not run and the path of the working
        directory holding the generated LaTeX sources is returned

    Returns an OutputFile created from the generated PDF (or the working
    directory path when `latex_dir` is set).

    Raises ParseError when lxml reports an XML/XSLT error or when XeLaTeX
    exits with a non-zero status.
    """
    try:
        book_info = wldoc.book_info
        document = load_including_children(wldoc)
        root = document.edoc.getroot()

        if cover:
            if cover is True:
                cover = make_cover
            bound_cover = cover(book_info, width=1200)
            root.set('data-cover-width', str(bound_cover.width))
            root.set('data-cover-height', str(bound_cover.height))
            if bound_cover.uses_dc_cover:
                if book_info.cover_by:
                    root.set('data-cover-by', book_info.cover_by)
                if book_info.cover_source:
                    root.set('data-cover-source', book_info.cover_source)
        if flags:
            for flag in flags:
                root.set('flag-' + flag, 'yes')

        # check for LaTeX packages
        if morefloats:
            root.set('morefloats', morefloats.lower())
        elif package_available('morefloats', 'maxfloats=19'):
            root.set('morefloats', 'new')

        # add customizations
        if customizations is not None:
            root.set('customizations', u','.join(customizations))

        # add editors info
        editors = document.editors()
        if editors:
            root.set('editors', u', '.join(sorted(
                editor.readable() for editor in editors)))
        if document.book_info.funders:
            root.set('funders', u', '.join(document.book_info.funders))
        if document.book_info.thanks:
            root.set('thanks', document.book_info.thanks)

        # hack the tree
        move_motifs_inside(document.edoc)
        hack_motifs(document.edoc)
        parse_creator(document.edoc)
        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)
        fix_tables(document.edoc)

        # wl -> TeXML
        style_filename = get_stylesheet("wl2tex")
        style = etree.parse(style_filename)
        functions.reg_mathml_latex()

        # TeXML -> LaTeX
        temp = mkdtemp('-wl2pdf')

        # './/' is what lxml rewrites a leading '//' to on a tree anyway,
        # but without emitting a FutureWarning.
        for ilustr in document.edoc.findall(".//ilustr"):
            shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)

        for sponsor in book_info.sponsors:
            ins = etree.Element("data-sponsor", name=sponsor)
            logo = sponsor_logo(sponsor)
            if logo:
                fname = 'sponsor-%s' % os.path.basename(logo)
                shutil.copy(logo, os.path.join(temp, fname))
                ins.set('src', fname)
            root.insert(0, ins)

        if book_info.sponsor_note:
            root.set("sponsor-note", book_info.sponsor_note)

        texml = document.transform(style)

        if cover:
            # PNG data is binary: open in 'wb' so no newline translation
            # can corrupt it on platforms that distinguish text mode.
            with open(os.path.join(temp, 'cover.png'), 'wb') as f:
                bound_cover.save(f, quality=80)

        del document  # no longer needed large object :)

        tex_path = os.path.join(temp, 'doc.tex')
        with open(tex_path, 'w') as fout:
            process(StringIO(texml), fout, 'utf-8')
        del texml

        if save_tex:
            shutil.copy(tex_path, save_tex)

        # LaTeX pass
        shutil.copy(get_resource('pdf/wl.cls'), temp)
        shutil.copy(get_resource('res/wl-logo.png'), temp)

        # NOTE(review): the use of `latex_dir` and the cwd bookkeeping below
        # are missing from the reviewed extract and were restored from the
        # upstream source - confirm.
        if latex_dir:
            return temp

        try:
            cwd = os.getcwd()
        except OSError:
            # The current directory may no longer exist.
            cwd = None
        os.chdir(temp)
        try:
            # some things work better when compiled twice
            # but they are not enabled now (line numbers)
            for run in range(1):
                if verbose:
                    p = call(['xelatex', tex_path])
                else:
                    # call() never drains a PIPE, so a chatty child could
                    # block forever on a full pipe; discard the output.
                    with open(os.devnull, 'w') as devnull:
                        p = call(['xelatex', '-interaction=batchmode', tex_path],
                                 stdout=devnull, stderr=devnull)
                if p:
                    raise ParseError("Error parsing .tex file")
        finally:
            # Restore the working directory even when XeLaTeX failed.
            if cwd is not None:
                os.chdir(cwd)

        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
        # Only the reserved name is needed; close the handle before the
        # PDF is moved over it.
        output_file.close()
        pdf_path = os.path.join(temp, 'doc.pdf')
        shutil.move(pdf_path, output_file.name)
        shutil.rmtree(temp)
        return OutputFile.from_filename(output_file.name)

    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
def load_including_children(wldoc=None, provider=None, uri=None):
    """Make one big XML document with the children inserted at the end.

    Either `wldoc`, or both `provider` and `uri`, must be provided. When
    `uri` and `provider` are given the text is read from
    provider.by_uri(uri) and decoded as UTF-8; otherwise `wldoc.edoc` is
    serialized and `wldoc.provider` is used for the children.

    Runs of characters in the U+0400-U+04FF range are wrapped in <alien>
    elements before parsing. Every URI in the parsed document's
    book_info.parts is loaded recursively and its root element appended
    to this document's root.

    Returns the combined WLDocument.

    Raises ValueError when neither input combination is supplied.
    """
    if uri and provider:
        f = provider.by_uri(uri)
        try:
            text = f.read().decode('utf-8')
        finally:
            # Release the handle even if reading or decoding fails.
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding=unicode)
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    # Same pattern and replacement as the former ur"" literals, spelled so
    # the module also parses on interpreters without the ur prefix.
    text = re.sub(u"([\u0400-\u04ff]+)", u"<alien>\\1</alien>", text)

    document = WLDocument.from_string(text, parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())
    return document