1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 """PDF creation library.
8 Creates one big XML from the book and its children, converts it to LaTeX
9 with TeXML, then runs it by XeLaTeX.
12 from __future__ import print_function, unicode_literals
17 from tempfile import mkdtemp, NamedTemporaryFile
19 from copy import deepcopy
20 from subprocess import call, PIPE
21 from itertools import chain
23 from Texml.processor import process
24 from lxml import etree
25 from lxml.etree import XMLSyntaxError, XSLTApplyError
28 from librarian.dcparser import Person
29 from librarian.parser import WLDocument
30 from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
31 from librarian import functions
32 from librarian.cover import make_cover
33 from .sponsor import sponsor_logo
36 functions.reg_substitute_entities()
38 functions.reg_starts_white()
39 functions.reg_ends_white()
40 functions.reg_texcommand()
43 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """
    Inserts <tagname> for every occurrence of `split_re'
    in text nodes in the `doc' tree.

    Both the text and the tail of each element are split on `split_re';
    the matched separators are dropped and an empty <tagname/> element
    is put in the place of each one.  For an element whose tag is listed
    in `exclude', neither its text nor its tail is processed.
    The tree is modified in place; nothing is returned.

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print(etree.tostring(t, encoding='unicode'))
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are consumed from the end and every new element is
            # inserted at position 0, so they end up in document order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # New siblings always go directly after `elem'; popping the
            # chunks from the end keeps them in document order.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
88 def substitute_hyphens(doc):
91 re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
93 exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
100 re.compile(r"(?<=\s\w)\s+"),
102 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
107 for kol in doc.iter(tag='kol'):
108 if kol.tail is not None:
109 if not kol.tail.strip():
111 for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
112 if table.get('ramka') == '1' or table.get('ramki') == '1':
113 table.set('_format', '|' + 'X|' * len(table[0]))
115 table.set('_format', 'X' * len(table[0]))
118 def mark_subauthors(doc):
119 root_author = ', '.join(
121 for elem in doc.findall(
122 './' + RDFNS('RDF') + '//' + DCNS('creator_parsed')
126 # if the author differs from both the author of the whole work and
127 # the previous author, do we insert some marker into the RDF?
128 for subutwor in doc.xpath('/utwor/utwor'):
131 for elem in subutwor.findall('.//' + DCNS('creator_parsed'))
133 if author not in (last_author, root_author):
134 subutwor.find('.//' + RDFNS('RDF')).append(
135 etree.Element('use_subauthor')
140 def move_motifs_inside(doc):
141 """ moves motifs to be into block elements """
142 for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
143 '//dramat_wierszowany_l|//dramat_wierszowany_lp|'
144 '//dramat_wspolczesny'):
145 for motif in master.xpath('motyw'):
146 for sib in motif.itersiblings():
147 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk',
148 'separator_linia', 'begin', 'end',
149 'motyw', 'extra', 'uwaga'):
150 # motif shouldn't have a tail - it would be untagged text
152 motif.getparent().remove(motif)
157 def hack_motifs(doc):
159 Dirty hack for the marginpar-creates-orphans LaTeX problem
160 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
162 Moves motifs in stanzas from first verse to second and from next
163 to last to last, then inserts negative vspace before them.
165 for motif in doc.findall('//strofa//motyw'):
166 # find relevant verse-level tag
167 verse, stanza = motif, motif.getparent()
168 while stanza is not None and stanza.tag != 'strofa':
169 verse, stanza = stanza, stanza.getparent()
171 1 for i in verse.itersiblings('br', preceding=True)
173 breaks_after = sum(1 for i in verse.itersiblings('br'))
174 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
176 if breaks_after == 2:
178 moved_motif = deepcopy(motif)
181 moved_motif.tail = None
182 moved_motif.set('moved', str(move_by))
184 for br in verse.itersiblings('br'):
188 br.addnext(moved_motif)
def parse_creator(doc):
    """Generates readable versions of creator and translator tags.

    Finds all dc:creator and dc:contributor.translator tags
    and adds *_parsed versions with forenames first.

    Each *_parsed element is a deep copy of the original tag whose text
    is replaced with Person.readable(); the original text is stored in
    its `sortkey' attribute.  The copy is inserted as the first child of
    the original tag's parent.  Tags without text are skipped.
    `doc' is modified in place; nothing is returned.
    """
    query = "|".join(
        '//dc:' + tag for tag in ('creator', 'contributor.translator')
    )
    persons = doc.xpath(query, namespaces={'dc': str(DCNS)})
    # Matches are walked in reverse: every copy is inserted at index 0,
    # so copies sharing a parent keep the relative order of the originals.
    for person in persons[::-1]:
        if not person.text:
            # Nothing to parse for an empty tag.
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Resolve the stylesheet registered under `name' via get_resource().

    Raises KeyError when `name' is not a key of STYLESHEETS.
    """
    resource_name = STYLESHEETS[name]
    return get_resource(resource_name)
217 def package_available(package, args='', verbose=False):
219 Check if a version of a latex package accepting given args
222 tempdir = mkdtemp('-wl2pdf-test')
223 fpath = os.path.join(tempdir, 'test.tex')
230 """ % (args, package))
233 p = call(['xelatex', '-output-directory', tempdir, fpath])
236 ['xelatex', '-interaction=batchmode', '-output-directory',
238 stdout=PIPE, stderr=PIPE
240 shutil.rmtree(tempdir)
244 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
245 cover=None, flags=None, customizations=None, ilustr_path='',
247 """ produces a PDF file with XeLaTeX
250 verbose: prints all output from LaTeX
251 save_tex: path to save the intermediary LaTeX file to
252 morefloats (old/new/none): force specific morefloats
253 cover: a cover.Cover factory or True for default
254 flags: less-advertising,
255 customizations: user requested customizations regarding various
256 formatting parameters (passed to wl LaTeX class)
261 book_info = wldoc.book_info
262 document = load_including_children(wldoc)
263 root = document.edoc.getroot()
268 bound_cover = cover(book_info, width=1200)
269 root.set('data-cover-width', str(bound_cover.width))
270 root.set('data-cover-height', str(bound_cover.height))
271 if bound_cover.uses_dc_cover:
272 if book_info.cover_by:
273 root.set('data-cover-by', book_info.cover_by)
274 if book_info.cover_source:
275 root.set('data-cover-source', book_info.cover_source)
278 root.set('flag-' + flag, 'yes')
280 # check for LaTeX packages
282 root.set('morefloats', morefloats.lower())
283 elif package_available('morefloats', 'maxfloats=19'):
284 root.set('morefloats', 'new')
287 if customizations is not None:
288 root.set('customizations', u','.join(customizations))
291 editors = document.editors()
293 root.set('editors', u', '.join(sorted(
294 editor.readable() for editor in editors)))
295 if document.book_info.funders:
296 root.set('funders', u', '.join(document.book_info.funders))
297 if document.book_info.thanks:
298 root.set('thanks', document.book_info.thanks)
301 move_motifs_inside(document.edoc)
302 hack_motifs(document.edoc)
303 parse_creator(document.edoc)
304 substitute_hyphens(document.edoc)
305 fix_hanging(document.edoc)
306 fix_tables(document.edoc)
307 mark_subauthors(document.edoc)
310 style_filename = get_stylesheet("wl2tex")
311 style = etree.parse(style_filename)
312 functions.reg_mathml_latex()
315 temp = mkdtemp('-wl2pdf')
317 for ilustr in document.edoc.findall("//ilustr"):
318 shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
320 for sponsor in book_info.sponsors:
321 ins = etree.Element("data-sponsor", name=sponsor)
322 logo = sponsor_logo(sponsor)
324 fname = 'sponsor-%s' % os.path.basename(logo)
325 shutil.copy(logo, os.path.join(temp, fname))
326 ins.set('src', fname)
329 if book_info.sponsor_note:
330 root.set("sponsor-note", book_info.sponsor_note)
332 texml = document.transform(style)
335 with open(os.path.join(temp, 'cover.png'), 'w') as f:
336 bound_cover.save(f, quality=80)
338 del document # no longer needed large object :)
340 tex_path = os.path.join(temp, 'doc.tex')
341 fout = open(tex_path, 'wb')
342 process(six.BytesIO(texml), fout, 'utf-8')
347 shutil.copy(tex_path, save_tex)
350 shutil.copy(get_resource('pdf/wl.cls'), temp)
351 shutil.copy(get_resource('res/wl-logo.png'), temp)
362 # some things work better when compiled twice
363 # (table of contents, [line numbers - disabled])
366 p = call(['xelatex', tex_path])
369 ['xelatex', '-interaction=batchmode', tex_path],
370 stdout=PIPE, stderr=PIPE
373 raise ParseError("Error parsing .tex file")
378 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf',
380 pdf_path = os.path.join(temp, 'doc.pdf')
381 shutil.move(pdf_path, output_file.name)
383 return OutputFile.from_filename(output_file.name)
385 except (XMLSyntaxError, XSLTApplyError) as e:
389 def load_including_children(wldoc=None, provider=None, uri=None):
390 """ Makes one big xml file with children inserted at end.
392 Either wldoc or provider and URI must be provided.
396 f = provider.by_uri(uri)
397 text = f.read().decode('utf-8')
399 elif wldoc is not None:
400 text = etree.tostring(wldoc.edoc, encoding='unicode')
401 provider = wldoc.provider
404 'Neither a WLDocument, nor provider and URI were provided.'
407 text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
409 document = WLDocument.from_bytes(text.encode('utf-8'),
410 parse_dublincore=True, provider=provider)
411 document.swap_endlines()
413 for child_uri in document.book_info.parts:
414 child = load_including_children(provider=provider, uri=child_uri)
415 document.edoc.getroot().append(child.edoc.getroot())