1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
4 """PDF creation library.
6 Creates one big XML from the book and its children, converts it to LaTeX
with TeXML, then runs it through XeLaTeX.
14 from tempfile import mkdtemp, NamedTemporaryFile
16 from copy import deepcopy
17 from subprocess import call, PIPE
18 from itertools import chain
23 from Texml.processor import process
24 from lxml import etree
25 from lxml.etree import XMLSyntaxError, XSLTApplyError
27 from librarian.dcparser import Person
28 from librarian.parser import WLDocument
29 from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
30 from librarian import functions
31 from librarian.cover import make_cover
32 from .sponsor import sponsor_logo
35 functions.reg_substitute_entities()
37 functions.reg_starts_white()
38 functions.reg_ends_white()
39 functions.reg_texcommand()
42 'wl2tex': 'pdf/wl2tex.xslt',
# NOTE(review): the excerpt under review had lost this function's short lines
# (the `continue`, the text/tail guards and `elem.insert(0, ins)`); they are
# restored below so that the doctest holds -- confirm against upstream.
def insert_tags(doc, split_re, tagname, exclude=None):
    """
    Inserts <tagname> for every occurrence of `split_re'
    in text nodes in the `doc' tree.

    Each match is replaced by an empty <tagname/> element; the pieces of
    text around it become the element text and the tails of the inserted
    elements.  The tree is modified in place and nothing is returned.

    :param doc: tree or element to process; walked with ``doc.iter()``
    :param split_re: compiled pattern whose ``split()`` locates the matches
    :param tagname: tag of the element inserted at every match
    :param exclude: optional container of tags; an element whose tag is in
        it has neither its text nor its tail processed

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print(etree.tostring(t, encoding='unicode'))
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue
        if elem.text:
            chunks = split_re.split(elem.text)
            # Pop pieces from the end and always insert at the front, so
            # the inserted elements end up in their original order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # Same trick for the tail: new siblings go right after `elem`.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
87 def substitute_hyphens(doc):
90 re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
92 exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
99 re.compile(r"(?<=\s\w)\s+"),
101 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
106 for kol in doc.iter(tag='kol'):
107 if kol.tail is not None:
108 if not kol.tail.strip():
110 for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
111 if table.get('ramka') == '1' or table.get('ramki') == '1':
112 table.set('_format', '|' + 'X|' * len(table[0]))
114 table.set('_format', 'X' * len(table[0]))
# NOTE(review): the excerpt under review had lost this function's short lines
# (`elem.text`, the `last_author` bookkeeping, the `author = ...` header);
# they are restored from the surrounding code -- confirm against upstream.
def mark_subauthors(doc):
    """Mark sub-works whose author differs from the book's and the previous one.

    The author of a (sub-)work is the comma-joined text of its
    ``creator_parsed`` elements.  Every ``/utwor/utwor`` child whose author
    is neither the root author nor the author of the preceding child gets
    an empty ``use_subauthor`` element appended to its RDF element.

    :param doc: document tree; modified in place, nothing is returned
    """
    root_author = ', '.join(
        elem.text
        for elem in doc.findall(
            './' + RDFNS('RDF') + '//' + DCNS('creator_parsed')
        )
    )
    last_author = None
    # If the author differs both from the author of the whole work and from
    # the previous author, we insert some marker into the RDF?
    for subutwor in doc.xpath('/utwor/utwor'):
        author = ', '.join(
            elem.text
            for elem in subutwor.findall('.//' + DCNS('creator_parsed'))
        )
        if author not in (last_author, root_author):
            rdf = subutwor.find('.//' + RDFNS('RDF'))
            # A sub-work without an RDF element has nowhere to carry the
            # marker; skip it instead of failing on None.
            if rdf is not None:
                rdf.append(etree.Element('use_subauthor'))
        last_author = author
# NOTE(review): the excerpt under review had lost the three short statements
# around the `remove()` call (tail reset, re-insertion, `break`); they are
# restored from the comment and docstring -- confirm the insert position
# against upstream.
def move_motifs_inside(doc):
    """Move motifs into the block element that follows them.

    For every <motyw> that is a direct child of a master element, the
    following siblings are scanned; the first one whose tag is not in the
    skipped set receives the motif as its first child.  A motif with no
    such sibling is left where it is.

    :param doc: document tree; modified in place, nothing is returned
    """
    # Siblings a motif must not be moved into.
    skipped_tags = ('sekcja_swiatlo', 'sekcja_asterysk',
                    'separator_linia', 'begin', 'end',
                    'motyw', 'extra', 'uwaga')
    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
                            '//dramat_wierszowany_l|//dramat_wierszowany_lp|'
                            '//dramat_wspolczesny'):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in skipped_tags:
                    # motif shouldn't have a tail - it would be untagged text
                    motif.tail = None
                    motif.getparent().remove(motif)
                    sib.insert(0, motif)
                    break
156 def hack_motifs(doc):
158 Dirty hack for the marginpar-creates-orphans LaTeX problem
159 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
161 Moves motifs in stanzas from first verse to second and from next
162 to last to last, then inserts negative vspace before them.
164 for motif in doc.findall('//strofa//motyw'):
165 # find relevant verse-level tag
166 verse, stanza = motif, motif.getparent()
167 while stanza is not None and stanza.tag != 'strofa':
168 verse, stanza = stanza, stanza.getparent()
170 1 for i in verse.itersiblings('br', preceding=True)
172 breaks_after = sum(1 for i in verse.itersiblings('br'))
173 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
175 if breaks_after == 2:
177 moved_motif = deepcopy(motif)
180 moved_motif.tail = None
181 moved_motif.set('moved', str(move_by))
183 for br in verse.itersiblings('br'):
187 br.addnext(moved_motif)
191 def add_fundraising(doc, fundraising):
# Before each naglowek_czesc and naglowek_akt, and at the end
194 for naglowek in doc.xpath('//naglowek_czesc|//naglowek_akt'):
195 spot = etree.Element('f_spot')
196 naglowek.addprevious(spot)
198 spot = etree.Element('f_spot')
199 doc.getroot()[-1][-1].append(spot)
202 nfunds = len(fundraising)
205 for f in range(nfunds):
206 spot_index = int(f / nfunds * e)
207 spots[spot_index].set('active', 'true')
208 elem = etree.fromstring('<f_spot>' + fundraising[f % len(fundraising)] + '</f_spot>')
209 spots[spot_index].text = elem.text
211 spots[spot_index].append(c)
# NOTE(review): the excerpt under review had lost two short lines just before
# `Person.from_text`; an empty-text guard is restored there -- confirm
# against upstream.
def parse_creator(doc):
    """Generates readable versions of creator and translator tags.

    Finds all dc:creator and dc.contributor.translator tags
    and adds *_parsed versions with forenames first.

    Each *_parsed element is a copy of the original with its text replaced
    by ``Person.readable()`` and the original text kept in a ``sortkey``
    attribute; it is inserted as the first child of the original's parent.

    :param doc: document tree; modified in place, nothing is returned
    """
    query = "|".join(
        '//dc:' + tag for tag in ('creator', 'contributor.translator')
    )
    # Walk the matches back to front: each copy is inserted at index 0, so
    # copies sharing a parent end up in the same order as the originals.
    for person in doc.xpath(query, namespaces={'dc': str(DCNS)})[::-1]:
        if not person.text:
            # Nothing to parse in an empty tag.
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Return the result of get_resource() for the stylesheet called `name`.

    `name` is looked up in STYLESHEETS; an unknown name raises KeyError.
    """
    resource_name = STYLESHEETS[name]
    return get_resource(resource_name)
239 def package_available(package, args='', verbose=False):
Check if a version of a LaTeX package accepting given args
244 tempdir = mkdtemp('-wl2pdf-test')
245 fpath = os.path.join(tempdir, 'test.tex')
252 """ % (args, package))
255 p = call(['xelatex', '-output-directory', tempdir, fpath])
258 ['xelatex', '-interaction=batchmode', '-output-directory',
260 stdout=PIPE, stderr=PIPE
262 shutil.rmtree(tempdir)
266 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
267 cover=None, flags=None, customizations=None, base_url='file://./',
268 latex_dir=False, fundraising=None):
269 """ produces a PDF file with XeLaTeX
272 verbose: prints all output from LaTeX
273 save_tex: path to save the intermediary LaTeX file to
274 morefloats (old/new/none): force specific morefloats
275 cover: a cover.Cover factory or True for default
276 flags: less-advertising,
277 customizations: user requested customizations regarding various
278 formatting parameters (passed to wl LaTeX class)
283 book_info = wldoc.book_info
284 document = load_including_children(wldoc)
285 root = document.edoc.getroot()
290 bound_cover = cover(book_info, width=1200)
291 root.set('data-cover-width', str(bound_cover.width))
292 root.set('data-cover-height', str(bound_cover.height))
293 if bound_cover.uses_dc_cover:
294 if book_info.cover_by:
295 root.set('data-cover-by', book_info.cover_by)
296 if book_info.cover_source:
297 root.set('data-cover-source', book_info.cover_source)
300 root.set('flag-' + flag, 'yes')
302 # check for LaTeX packages
304 root.set('morefloats', morefloats.lower())
305 elif package_available('morefloats', 'maxfloats=19'):
306 root.set('morefloats', 'new')
308 if customizations is None:
311 customizations = list(customizations)
313 if book_info.endnotes:
314 customizations.append('endnotes')
317 if customizations is not None:
318 root.set('customizations', ','.join(customizations))
321 editors = document.editors()
323 root.set('editors', ', '.join(sorted(
324 editor.readable() for editor in editors)))
325 if document.book_info.funders:
326 root.set('funders', ', '.join(document.book_info.funders))
327 if document.book_info.thanks:
328 root.set('thanks', document.book_info.thanks)
332 add_fundraising(document.edoc, fundraising)
333 move_motifs_inside(document.edoc)
334 hack_motifs(document.edoc)
335 parse_creator(document.edoc)
336 substitute_hyphens(document.edoc)
337 fix_hanging(document.edoc)
338 fix_tables(document.edoc)
339 mark_subauthors(document.edoc)
340 document.fix_pa_akap()
343 style_filename = get_stylesheet("wl2tex")
344 style = etree.parse(style_filename)
345 functions.reg_mathml_latex()
348 temp = mkdtemp('-wl2pdf')
350 for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
351 url = urllib.parse.urljoin(
355 imgfile = urllib.request.urlopen(url)
356 img = Image.open(imgfile)
358 th_format, ext, media_type = {
359 'GIF': ('GIF', 'gif', 'image/gif'),
360 'PNG': ('PNG', 'png', 'image/png'),
361 }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
364 if img.size[0] < width:
367 th = img.resize((width, round(width * img.size[1] / img.size[0])))
369 file_name = 'image%d.%s' % (i, ext)
370 th.save(os.path.join(temp, file_name))
371 ilustr.set('src', file_name)
375 for sponsor in book_info.sponsors:
376 ins = etree.Element("data-sponsor", name=sponsor)
377 logo = sponsor_logo(sponsor)
379 fname = 'sponsor-%s' % os.path.basename(logo)
380 shutil.copy(logo, os.path.join(temp, fname))
381 ins.set('src', fname)
384 if book_info.sponsor_note:
385 root.set("sponsor-note", book_info.sponsor_note)
387 texml = document.transform(style)
390 with open(os.path.join(temp, 'cover.png'), 'w') as f:
391 bound_cover.save(f, quality=80)
393 del document # no longer needed large object :)
395 tex_path = os.path.join(temp, 'doc.tex')
396 fout = open(tex_path, 'wb')
397 process(io.BytesIO(texml), fout, 'utf-8')
402 shutil.copy(tex_path, save_tex)
405 shutil.copy(get_resource('pdf/wl.cls'), temp)
406 shutil.copy(get_resource('res/wl-logo.png'), temp)
417 # some things work better when compiled twice
418 # (table of contents, [line numbers - disabled])
421 p = call(['xelatex', tex_path])
424 ['xelatex', '-interaction=batchmode', tex_path],
425 stdout=PIPE, stderr=PIPE
428 raise ParseError("Error parsing .tex file")
433 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf',
435 pdf_path = os.path.join(temp, 'doc.pdf')
436 shutil.move(pdf_path, output_file.name)
438 return OutputFile.from_filename(output_file.name)
440 except (XMLSyntaxError, XSLTApplyError) as e:
444 def load_including_children(wldoc=None, provider=None, uri=None):
445 """ Makes one big xml file with children inserted at end.
447 Either wldoc or provider and URI must be provided.
451 f = provider.by_slug(uri.slug)
452 text = f.read().decode('utf-8')
454 elif wldoc is not None:
455 text = etree.tostring(wldoc.edoc, encoding='unicode')
456 provider = wldoc.provider
459 'Neither a WLDocument, nor provider and URI were provided.'
463 text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
465 text = re.sub(r"([\u25a0-\u25ff]+)", r"<alien>\1</alien>", text)
467 document = WLDocument.from_bytes(text.encode('utf-8'),
468 parse_dublincore=True, provider=provider)
469 document.swap_endlines()
471 for child_uri in document.book_info.parts:
472 child = load_including_children(provider=provider, uri=child_uri)
473 document.edoc.getroot().append(child.edoc.getroot())