1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 """PDF creation library.
Creates one big XML document from the book and its children, converts it
to LaTeX with TeXML, then compiles it with XeLaTeX.
from __future__ import print_function, unicode_literals

import os
import re
import shutil
from copy import deepcopy
from itertools import chain
from subprocess import call, PIPE
from tempfile import mkdtemp, NamedTemporaryFile

from PIL import Image
from Texml.processor import process
from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError
import six

from librarian.dcparser import Person
from librarian.parser import WLDocument
from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
from librarian import functions
from librarian.cover import make_cover
from .sponsor import sponsor_logo
# Module-level setup calls into librarian.functions.
# NOTE(review): presumably these register the XSLT extension functions the
# stylesheet relies on -- confirm that no other registration is required.
functions.reg_substitute_entities()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a short stylesheet name to its resource path (used by get_stylesheet).
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
def insert_tags(doc, split_re, tagname, exclude=None):
    """
    Insert an empty <tagname/> element for every occurrence of `split_re`
    in the text nodes (element text and element tail) of the `doc` tree.

    The tree is modified in place; nothing is returned.

    doc: lxml element (or tree) whose descendants are processed
    split_re: compiled regular expression; every match is replaced by a tag
    tagname: name of the element to insert in place of each match
    exclude: optional collection of tag names; elements with such a tag
        are skipped entirely (neither their text nor their tail is split)

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print(etree.tostring(t, encoding='unicode'))
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        # Text directly inside the element. Guarded, because splitting
        # a missing (None) text would raise.
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are consumed from the end and always inserted at
            # position 0, so they end up in their original order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        # Text following the element, which belongs to the parent's content.
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            ins_index = parent.index(elem) + 1
            # Same trick as above: a fixed insertion index plus reversed
            # consumption preserves the order of the chunks.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
def substitute_hyphens(doc):
    """Replace every intra-word hyphen in `doc` with an empty marker element.

    A hyphen matches only when it has a character that is neither
    whitespace nor another hyphen on both sides. URL and license metadata
    and `meta` elements are left untouched.
    """
    insert_tags(
        doc,
        re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
        # NOTE(review): marker element name restored from the stylesheet's
        # vocabulary -- confirm against pdf/wl2tex.xslt.
        "dywiz",
        exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
    )
def fix_hanging(doc):
    """Replace the whitespace after a one-letter word with a marker element.

    The pattern matches whitespace preceded by a single word character that
    itself follows whitespace, i.e. the gap after a one-letter word that
    should not be left hanging at the end of a line. URL and license
    metadata are left untouched.
    """
    insert_tags(
        doc,
        re.compile(r"(?<=\s\w)\s+"),
        # NOTE(review): marker element name restored from the stylesheet's
        # vocabulary -- confirm against pdf/wl2tex.xslt.
        "nbsp",
        exclude=[DCNS("identifier.url"), DCNS("rights.license")]
    )
def fix_tables(doc):
    """Prepare table elements in `doc` for typesetting.

    Whitespace-only tails of `kol` elements are dropped, and every `tabela`
    / `tabelka` element receives a `_format` attribute with one `X` column
    specifier per child of its first child (presumably the cells of the
    first row -- confirm). Tables with `ramka="1"` or `ramki="1"` get
    vertical rules (`|`) around every column.
    """
    for kol in doc.iter(tag='kol'):
        if kol.tail is not None and not kol.tail.strip():
            kol.tail = None

    for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
        # An empty table has no first child to count columns from.
        columns = len(table[0]) if len(table) else 0
        if table.get('ramka') == '1' or table.get('ramki') == '1':
            table.set('_format', '|' + 'X|' * columns)
        else:
            table.set('_format', 'X' * columns)
def mark_subauthors(doc):
    """Flag sub-works whose author differs from the surrounding context.

    For every `/utwor/utwor` element, the comma-joined texts of its parsed
    creators are compared with those of the root document and of the
    previous sub-work. When they match neither, an empty `use_subauthor`
    element is appended to the sub-work's RDF element.
    """
    root_author = ', '.join(
        elem.text
        for elem in doc.findall(
            './' + RDFNS('RDF') + '//' + DCNS('creator_parsed')
        )
    )
    last_author = None
    # If the author differs both from the author of the whole work and from
    # the previous sub-work's author, put a marker into the RDF.
    for subutwor in doc.xpath('/utwor/utwor'):
        author = ', '.join(
            elem.text
            for elem in subutwor.findall('.//' + DCNS('creator_parsed'))
        )
        if author not in (last_author, root_author):
            rdf = subutwor.find('.//' + RDFNS('RDF'))
            # A sub-work without an RDF element has nowhere to put the marker.
            if rdf is not None:
                rdf.append(etree.Element('use_subauthor'))
        last_author = author
def move_motifs_inside(doc):
    """Move motifs that sit directly under a master element into a block.

    Every `motyw` that is a direct child of a master element is moved to
    the beginning of its first following sibling that is not one of the
    skipped tags (separators, markers, other motifs, notes). Motifs with
    no such sibling stay where they are.
    """
    skipped = ('sekcja_swiatlo', 'sekcja_asterysk',
               'separator_linia', 'begin', 'end',
               'motyw', 'extra', 'uwaga')
    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
                            '//dramat_wierszowany_l|//dramat_wierszowany_lp|'
                            '//dramat_wspolczesny'):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in skipped:
                    # motif shouldn't have a tail - it would be untagged text
                    motif.tail = None
                    motif.getparent().remove(motif)
                    sib.insert(0, motif)
                    # Only the first suitable sibling receives the motif.
                    break
def hack_motifs(doc):
    """
    Dirty hack for the marginpar-creates-orphans LaTeX problem
    see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304

    Moves motifs in stanzas from first verse to second and from next
    to last to last, then inserts negative vspace before them.

    The original motif element is turned into an empty `span`; a copy
    carrying a `moved` attribute (the number of verses it was moved by)
    is inserted after the appropriate `br`.
    """
    for motif in doc.findall('//strofa//motyw'):
        # find relevant verse-level tag
        verse, stanza = motif, motif.getparent()
        while stanza is not None and stanza.tag != 'strofa':
            verse, stanza = stanza, stanza.getparent()

        breaks_before = sum(
            1 for _ in verse.itersiblings('br', preceding=True)
        )
        breaks_after = sum(1 for _ in verse.itersiblings('br'))

        # Only the first verse (when more follow) and the next-to-last
        # verse are affected.
        if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
            move_by = 1
            if breaks_after == 2:
                move_by += 1

            moved_motif = deepcopy(motif)
            # Neutralise the original so the motif is not rendered twice.
            motif.tag = 'span'
            motif.text = None
            moved_motif.tail = None
            moved_motif.set('moved', str(move_by))

            # Skip `move_by - 1` line breaks, then attach the copy after
            # the next one.
            for br in verse.itersiblings('br'):
                if move_by > 1:
                    move_by -= 1
                    continue
                br.addnext(moved_motif)
                break
def parse_creator(doc):
    """Generates readable versions of creator and translator tags.

    Finds all dc:creator and dc.contributor.translator tags
    and adds *_parsed versions with forenames first.

    Each `*_parsed` element is a copy of the original with its text replaced
    by `Person.readable()` and the original text kept in a `sortkey`
    attribute; it is inserted as the first child of the original's parent.
    Elements without text are skipped.
    """
    query = "|".join(
        '//dc:' + tag for tag in ('creator', 'contributor.translator')
    )
    # Iterate in reverse: every copy is inserted at position 0, so reversing
    # keeps the parsed elements in the order of their originals.
    for person in doc.xpath(query, namespaces={'dc': str(DCNS)})[::-1]:
        if not person.text:
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Return the resource located for the stylesheet registered as `name`.

    `name` must be a key of STYLESHEETS; an unknown name raises KeyError.
    """
    resource_path = STYLESHEETS[name]
    return get_resource(resource_path)
def package_available(package, args='', verbose=False):
    """
    Check if a version of a LaTeX package accepting given args
    is available.

    A minimal test document loading the package is compiled with XeLaTeX
    in a temporary directory, which is removed afterwards even when
    running the compiler fails.

    package: name of the LaTeX package
    args: options passed to the package in \\usepackage[...]
    verbose: when true, XeLaTeX output is not suppressed

    Returns True when XeLaTeX exits with status 0, False otherwise.
    Raises OSError when the `xelatex` executable cannot be started.
    """
    tempdir = mkdtemp('-wl2pdf-test')
    try:
        fpath = os.path.join(tempdir, 'test.tex')
        with open(fpath, 'w') as f:
            f.write("""
                \\documentclass{wl}
                \\usepackage[%s]{%s}
                \\begin{document}
                \\end{document}
                """ % (args, package))

        if verbose:
            status = call(['xelatex', '-output-directory', tempdir, fpath])
        else:
            # Output is discarded rather than piped: nobody reads the pipe,
            # so a chatty run could fill it and block forever.
            with open(os.devnull, 'wb') as devnull:
                status = call(
                    ['xelatex', '-interaction=batchmode', '-output-directory',
                     tempdir, fpath],
                    stdout=devnull, stderr=devnull
                )
    finally:
        shutil.rmtree(tempdir)
    return status == 0
def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
              cover=None, flags=None, customizations=None, base_url='file://./',
              latex_dir=False):
    """ produces a PDF file with XeLaTeX

    wldoc: the document to convert
    verbose: prints all output from LaTeX
    save_tex: path to save the intermediary LaTeX file to
    morefloats (old/new/none): force specific morefloats
    cover: a cover.Cover factory or True for default
    flags: less-advertising,
    customizations: user requested customizations regarding various
        formatting parameters (passed to wl LaTeX class)
    base_url: base against which illustration `src` attributes are resolved
    latex_dir: when true, stop before running XeLaTeX and return the path
        of the temporary directory holding the LaTeX sources

    Returns an OutputFile created from the generated PDF (or the temporary
    directory path when `latex_dir` is set).
    Raises ParseError when the XML/XSLT processing fails or when XeLaTeX
    exits with a non-zero status.
    """
    try:
        book_info = wldoc.book_info
        document = load_including_children(wldoc)
        root = document.edoc.getroot()

        bound_cover = None
        if cover:
            if cover is True:
                cover = make_cover
            bound_cover = cover(book_info, width=1200)
            root.set('data-cover-width', str(bound_cover.width))
            root.set('data-cover-height', str(bound_cover.height))
            if bound_cover.uses_dc_cover:
                if book_info.cover_by:
                    root.set('data-cover-by', book_info.cover_by)
                if book_info.cover_source:
                    root.set('data-cover-source', book_info.cover_source)

        for flag in flags or ():
            root.set('flag-' + flag, 'yes')

        # check for LaTeX packages
        if morefloats:
            root.set('morefloats', morefloats.lower())
        elif package_available('morefloats', 'maxfloats=19'):
            root.set('morefloats', 'new')

        # Work on a copy so the caller's sequence is never modified.
        if customizations is None:
            customizations = []
        else:
            customizations = list(customizations)

        if book_info.endnotes:
            customizations.append('endnotes')

        # add customizations
        root.set('customizations', u','.join(customizations))

        # add editors info
        editors = document.editors()
        if editors:
            root.set('editors', u', '.join(sorted(
                editor.readable() for editor in editors)))
        if document.book_info.funders:
            root.set('funders', u', '.join(document.book_info.funders))
        if document.book_info.thanks:
            root.set('thanks', document.book_info.thanks)

        # hack the tree
        move_motifs_inside(document.edoc)
        hack_motifs(document.edoc)
        parse_creator(document.edoc)
        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)
        fix_tables(document.edoc)
        mark_subauthors(document.edoc)
        document.fix_pa_akap()

        # wl -> TeXML
        style_filename = get_stylesheet("wl2tex")
        style = etree.parse(style_filename)
        functions.reg_mathml_latex()

        # TeXML -> LaTeX
        temp = mkdtemp('-wl2pdf')

        # Fetch every illustration, downscale it if it is too wide and
        # store it next to the LaTeX source.
        for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
            url = six.moves.urllib.parse.urljoin(
                base_url,
                ilustr.get('src')
            )
            imgfile = six.moves.urllib.request.urlopen(url)
            try:
                img = Image.open(imgfile)

                # Anything that is not GIF or PNG is stored as JPEG.
                ext = {
                    'GIF': 'gif',
                    'PNG': 'png',
                }.get(img.format, 'jpg')

                width = 2400
                if img.size[0] < width:
                    th = img
                else:
                    # The image size must consist of integers.
                    height = int(round(width * img.size[1] / img.size[0]))
                    th = img.resize((width, height))

                file_name = 'image%d.%s' % (i, ext)
                th.save(os.path.join(temp, file_name))
                ilustr.set('src', file_name)
            finally:
                imgfile.close()

        for sponsor in book_info.sponsors:
            ins = etree.Element("data-sponsor", name=sponsor)
            logo = sponsor_logo(sponsor)
            if logo:
                fname = 'sponsor-%s' % os.path.basename(logo)
                shutil.copy(logo, os.path.join(temp, fname))
                ins.set('src', fname)
            root.insert(0, ins)

        if book_info.sponsor_note:
            root.set("sponsor-note", book_info.sponsor_note)

        texml = document.transform(style)

        if bound_cover is not None:
            # The cover is image data, so the file must be opened in
            # binary mode.
            with open(os.path.join(temp, 'cover.png'), 'wb') as f:
                bound_cover.save(f, quality=80)

        del document  # no longer needed large object :)

        tex_path = os.path.join(temp, 'doc.tex')
        with open(tex_path, 'wb') as fout:
            process(six.BytesIO(texml), fout, 'utf-8')
        del texml

        if save_tex:
            shutil.copy(tex_path, save_tex)

        # LaTeX -> PDF
        shutil.copy(get_resource('pdf/wl.cls'), temp)
        shutil.copy(get_resource('res/wl-logo.png'), temp)

        if latex_dir:
            return temp

        # XeLaTeX is run from inside the temporary directory; remember the
        # current one (if it still exists) to be able to go back.
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None
        os.chdir(temp)

        try:
            # some things work better when compiled twice
            # (table of contents, [line numbers - disabled])
            for _ in range(2):
                if verbose:
                    status = call(['xelatex', tex_path])
                else:
                    # Output is discarded rather than piped: nobody reads
                    # the pipe, so a long log could fill it and block.
                    with open(os.devnull, 'wb') as devnull:
                        status = call(
                            ['xelatex', '-interaction=batchmode', tex_path],
                            stdout=devnull, stderr=devnull
                        )
                if status:
                    raise ParseError("Error parsing .tex file")
        finally:
            # Restore the working directory even when compilation fails.
            if cwd is not None:
                os.chdir(cwd)

        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf',
                                         delete=False)
        # Only the name is needed; release the handle before the PDF is
        # moved over the placeholder file.
        output_file.close()
        pdf_path = os.path.join(temp, 'doc.pdf')
        shutil.move(pdf_path, output_file.name)
        shutil.rmtree(temp)
        return OutputFile.from_filename(output_file.name)

    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
def load_including_children(wldoc=None, provider=None, uri=None):
    """ Makes one big xml file with children inserted at end.

    Either wldoc or provider and URI must be provided.

    wldoc: an already parsed document; its tree is serialized and its
        provider is used for loading the children
    provider: object whose `by_uri(uri)` returns a readable binary file
    uri: URI of the document to load through `provider`

    Runs of Cyrillic characters and of geometric-shape characters are
    wrapped in <alien> elements before parsing. The documents listed in
    `book_info.parts` are loaded recursively and their root elements are
    appended to the root of the returned document.

    Returns the combined WLDocument.
    Raises ValueError when neither source of the document is given.
    """
    if uri and provider:
        f = provider.by_uri(uri)
        try:
            text = f.read().decode('utf-8')
        finally:
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding='unicode')
        provider = wldoc.provider
    else:
        raise ValueError(
            'Neither a WLDocument, nor provider and URI were provided.'
        )

    # Cyrillic
    text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
    # Geometric shapes
    text = re.sub(r"([\u25a0-\u25ff]+)", r"<alien>\1</alien>", text)

    document = WLDocument.from_bytes(text.encode('utf-8'),
                                     parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())
    return document