1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
4 """PDF creation library.

Creates one big XML from the book and its children, converts it to LaTeX
with TeXML, then runs it through XeLaTeX.
"""
import io
import os
import os.path
import re
import shutil
import urllib.parse
import urllib.request
from copy import deepcopy
from itertools import chain
from subprocess import DEVNULL, PIPE, call
from tempfile import mkdtemp, NamedTemporaryFile

from PIL import Image
from Texml.processor import process
from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian.dcparser import Person
from librarian.parser import WLDocument
from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
from librarian import functions
from librarian.cover import make_cover
from .sponsor import sponsor_logo
# Module-level registration calls from librarian.functions.
# NOTE(review): presumably these register the XSLT extension functions
# used by the wl2tex stylesheet -- confirm against librarian.functions.
functions.reg_substitute_entities()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a stylesheet name to the resource path handed to get_resource()
# (see get_stylesheet).
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
def insert_tags(doc, split_re, tagname, exclude=None):
    """
    Inserts <tagname> for every occurrence of `split_re'
    in text nodes in the `doc' tree.

    Elements whose tag is listed in `exclude' are skipped entirely
    (neither their text nor their tail is processed).

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print(etree.tostring(t, encoding='unicode'))
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        if elem.text:
            chunks = split_re.split(elem.text)
            # Consume chunks from the end; each new element is put at
            # index 0, so the final order matches the original text.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        if elem.tail:
            parent = elem.getparent()
            if parent is None:
                # A parentless element has nowhere to put siblings.
                continue
            chunks = split_re.split(elem.tail)
            ins_index = parent.index(elem) + 1
            # Same back-to-front trick, inserting right after `elem'.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
def substitute_hyphens(doc):
    """Insert a marker element in place of every intra-word hyphen.

    The pattern matches a single '-' that has a character other than
    a hyphen or whitespace on both sides. The identifier URL, license
    and "meta" elements are left untouched.
    """
    insert_tags(
        doc,
        re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
        # NOTE(review): the element name is not visible in the reviewed
        # fragment; "dywiz" is assumed -- confirm against wl2tex.xslt.
        "dywiz",
        exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
    )
def fix_hanging(doc):
    """Insert a marker element after every one-character word.

    The pattern matches the whitespace run that follows a single word
    character which itself is preceded by whitespace. The identifier
    URL and license elements are left untouched.
    """
    insert_tags(
        doc,
        re.compile(r"(?<=\s\w)\s+"),
        # NOTE(review): the element name is not visible in the reviewed
        # fragment; "nbsp" is assumed -- confirm against wl2tex.xslt.
        "nbsp",
        exclude=[DCNS("identifier.url"), DCNS("rights.license")]
    )
def fix_tables(doc):
    """Normalise table markup in `doc' in place.

    * Whitespace-only tails of <kol> elements are dropped.
    * Every <tabela>/<tabelka> gets a `_format' attribute with one 'X'
      per child of its first child element, separated and surrounded
      by '|' when the table has ramka="1" or ramki="1".

    A table without any children gets a zero-column format instead of
    raising IndexError.
    """
    for kol in doc.iter(tag='kol'):
        if kol.tail is not None and not kol.tail.strip():
            kol.tail = None

    for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
        # Column count is taken from the first child of the table.
        columns = len(table[0]) if len(table) else 0
        if table.get('ramka') == '1' or table.get('ramki') == '1':
            table.set('_format', '|' + 'X|' * columns)
        else:
            table.set('_format', 'X' * columns)
def mark_subauthors(doc):
    """Flag sub-works whose author differs from the surrounding context.

    For every /utwor/utwor element, the joined text of its
    creator_parsed elements is compared with that of the root RDF and
    of the previous sub-work; when it matches neither, an empty
    <use_subauthor/> element is appended to the sub-work's RDF.
    """
    root_author = ', '.join(
        elem.text
        for elem in doc.findall(
            './' + RDFNS('RDF') + '//' + DCNS('creator_parsed')
        )
    )
    last_author = None
    # If the author differs from the author of the whole work and from
    # the previous author, insert a marker into the RDF.
    for subutwor in doc.xpath('/utwor/utwor'):
        author = ', '.join(
            elem.text
            for elem in subutwor.findall('.//' + DCNS('creator_parsed'))
        )
        if author not in (last_author, root_author):
            subutwor.find('.//' + RDFNS('RDF')).append(
                etree.Element('use_subauthor')
            )
        last_author = author
def move_motifs_inside(doc):
    """Move motifs that sit directly in a master element into a block.

    Each <motyw> that is a direct child of a master element is moved
    to the beginning of its first following sibling that is not one
    of the separator/marker tags listed below.
    """
    skipped_tags = ('sekcja_swiatlo', 'sekcja_asterysk',
                    'separator_linia', 'begin', 'end',
                    'motyw', 'extra', 'uwaga')
    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
                            '//dramat_wierszowany_l|//dramat_wierszowany_lp|'
                            '//dramat_wspolczesny'):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in skipped_tags:
                    # motif shouldn't have a tail - it would be untagged text
                    motif.tail = None
                    motif.getparent().remove(motif)
                    sib.insert(0, motif)
                    break
def hack_motifs(doc):
    """
    Dirty hack for the marginpar-creates-orphans LaTeX problem
    see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304

    Moves motifs in stanzas from first verse to second and from next
    to last to last, then inserts negative vspace before them.
    (The vspace itself is not produced here; this function only sets
    the `moved' attribute on the relocated copy.)
    """
    # './/' works both on an ElementTree and on an Element.
    for motif in doc.findall('.//strofa//motyw'):
        # find relevant verse-level tag
        verse, stanza = motif, motif.getparent()
        while stanza is not None and stanza.tag != 'strofa':
            verse, stanza = stanza, stanza.getparent()

        # Verses are delimited by <br> siblings of the verse-level node.
        breaks_before = sum(
            1 for i in verse.itersiblings('br', preceding=True)
        )
        breaks_after = sum(1 for i in verse.itersiblings('br'))
        if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
            # NOTE(review): the move distances and the way the original
            # motif is neutralised are not visible in the reviewed
            # fragment; restored as 1 (+1 when two breaks follow) and
            # retagging to an empty <span> -- confirm.
            move_by = 1
            if breaks_after == 2:
                move_by += 1
            moved_motif = deepcopy(motif)
            motif.tag = 'span'
            motif.text = None
            moved_motif.tail = None
            moved_motif.set('moved', str(move_by))

            # Skip `move_by - 1` breaks, then attach after the next one.
            for br in verse.itersiblings('br'):
                if move_by > 1:
                    move_by -= 1
                    continue
                br.addnext(moved_motif)
                break
def parse_creator(doc):
    """Generates readable versions of creator and translator tags.

    Finds all dc:creator and dc.contributor.translator tags
    and adds *_parsed versions with forenames first. The original
    text is kept on the copy as the `sortkey' attribute. Tags without
    text are skipped.
    """
    people = doc.xpath(
        "|".join('//dc:' + tag for tag in (
            'creator', 'contributor.translator'
        )),
        namespaces={'dc': str(DCNS)})
    # Walk backwards: every copy is inserted at index 0 of its parent,
    # so reversed iteration keeps the copies in document order.
    for person in people[::-1]:
        if not person.text:
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Look up `name` in STYLESHEETS and resolve it with get_resource()."""
    resource_path = STYLESHEETS[name]
    return get_resource(resource_path)
def package_available(package, args='', verbose=False):
    """
    Check if a version of a LaTeX package accepting given args
    is available.

    Compiles a minimal document loading `package' with options `args'
    in a scratch directory and returns True when xelatex exits with
    status 0. With `verbose' the LaTeX output is left on the console,
    otherwise it is discarded. The scratch directory is always removed.
    """
    tempdir = mkdtemp('-wl2pdf-test')
    try:
        fpath = os.path.join(tempdir, 'test.tex')
        # NOTE(review): the probe document is not visible in the
        # reviewed fragment (only the `(args, package)` substitution
        # is); a minimal `article` document is assumed -- confirm.
        with open(fpath, 'w') as f:
            f.write(r"""
                \documentclass{article}
                \usepackage[%s]{%s}
                \begin{document}
                \end{document}
                """ % (args, package))
        if verbose:
            p = call(['xelatex', '-output-directory', tempdir, fpath])
        else:
            # DEVNULL rather than PIPE: call() never reads the pipes,
            # so a chatty child could block on a full pipe buffer.
            p = call(
                ['xelatex', '-interaction=batchmode', '-output-directory',
                 tempdir, fpath],
                stdout=DEVNULL, stderr=DEVNULL
            )
    finally:
        shutil.rmtree(tempdir)
    return p == 0
def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
              cover=None, flags=None, customizations=None, base_url='file://./',
              latex_dir=False):
    """ produces a PDF file with XeLaTeX

    wldoc: a WLDocument
    verbose: prints all output from LaTeX
    save_tex: path to save the intermediary LaTeX file to
    morefloats (old/new/none): force specific morefloats
    cover: a cover.Cover factory or True for default
    flags: less-advertising,
    customizations: user requested customizations regarding various
        formatting parameters (passed to wl LaTeX class)
    base_url: base URL against which illustration `src` values are
        resolved
    latex_dir: if true, stop before running XeLaTeX and return the
        path of the working directory holding the LaTeX sources

    Returns an OutputFile for the generated PDF (or the working
    directory path when `latex_dir` is set). Raises ParseError when
    the XML/XSLT stage fails or XeLaTeX exits with a non-zero status.
    """
    try:
        book_info = wldoc.book_info
        document = load_including_children(wldoc)
        root = document.edoc.getroot()

        bound_cover = None
        if cover:
            if cover is True:
                cover = make_cover
            bound_cover = cover(book_info, width=1200)
            root.set('data-cover-width', str(bound_cover.width))
            root.set('data-cover-height', str(bound_cover.height))
            if bound_cover.uses_dc_cover:
                if book_info.cover_by:
                    root.set('data-cover-by', book_info.cover_by)
                if book_info.cover_source:
                    root.set('data-cover-source', book_info.cover_source)
        if flags:
            for flag in flags:
                root.set('flag-' + flag, 'yes')

        # check for LaTeX packages
        if morefloats:
            root.set('morefloats', morefloats.lower())
        elif package_available('morefloats', 'maxfloats=19'):
            root.set('morefloats', 'new')

        # Work on a copy so the caller's sequence is never mutated.
        if customizations is None:
            customizations = []
        else:
            customizations = list(customizations)

        if book_info.endnotes:
            customizations.append('endnotes')

        # add customizations
        root.set('customizations', ','.join(customizations))

        # add editors info
        editors = document.editors()
        if editors:
            root.set('editors', ', '.join(sorted(
                editor.readable() for editor in editors)))
        if document.book_info.funders:
            root.set('funders', ', '.join(document.book_info.funders))
        if document.book_info.thanks:
            root.set('thanks', document.book_info.thanks)

        # hack the tree
        move_motifs_inside(document.edoc)
        hack_motifs(document.edoc)
        parse_creator(document.edoc)
        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)
        fix_tables(document.edoc)
        mark_subauthors(document.edoc)
        document.fix_pa_akap()

        # wl -> TeXML
        style_filename = get_stylesheet("wl2tex")
        style = etree.parse(style_filename)
        functions.reg_mathml_latex()

        # TeXML -> LaTeX
        temp = mkdtemp('-wl2pdf')

        # Download every illustration into the working directory and
        # point its `src` at the local copy.
        for i, ilustr in enumerate(document.edoc.findall('.//ilustr')):
            url = urllib.parse.urljoin(
                base_url,
                ilustr.get('src')
            )
            with urllib.request.urlopen(url) as imgfile:
                img = Image.open(imgfile)

                # Only the file extension is needed here; anything
                # that is not GIF/PNG is stored as JPEG.
                ext = {
                    'GIF': 'gif',
                    'PNG': 'png',
                }.get(img.format, 'jpg')

                # NOTE(review): the width limit is not visible in the
                # reviewed fragment; 2400 px is assumed -- confirm.
                width = 2400
                if img.size[0] < width:
                    th = img
                else:
                    th = img.resize(
                        (width, round(width * img.size[1] / img.size[0])))

                file_name = 'image%d.%s' % (i, ext)
                th.save(os.path.join(temp, file_name))
            ilustr.set('src', file_name)

        for sponsor in book_info.sponsors:
            ins = etree.Element("data-sponsor", name=sponsor)
            logo = sponsor_logo(sponsor)
            if logo:
                fname = 'sponsor-%s' % os.path.basename(logo)
                shutil.copy(logo, os.path.join(temp, fname))
                ins.set('src', fname)
            root.insert(0, ins)

        if book_info.sponsor_note:
            root.set("sponsor-note", book_info.sponsor_note)

        texml = document.transform(style)

        if bound_cover is not None:
            # Image data is binary: the file must be opened in 'wb'.
            with open(os.path.join(temp, 'cover.png'), 'wb') as f:
                bound_cover.save(f, quality=80)

        del document  # no longer needed large object :)

        tex_path = os.path.join(temp, 'doc.tex')
        with open(tex_path, 'wb') as fout:
            process(io.BytesIO(texml), fout, 'utf-8')
        del texml

        if save_tex:
            shutil.copy(tex_path, save_tex)

        # LaTeX -> PDF
        shutil.copy(get_resource('pdf/wl.cls'), temp)
        shutil.copy(get_resource('res/wl-logo.png'), temp)

        if latex_dir:
            return temp

        # some things work better when compiled twice
        # (table of contents, [line numbers - disabled])
        for _ in range(2):
            # Run inside the working directory so doc.pdf lands there,
            # without changing the cwd of the calling process.
            if verbose:
                p = call(['xelatex', tex_path], cwd=temp)
            else:
                # DEVNULL rather than PIPE: call() never reads the
                # pipes, so a full pipe buffer would block xelatex.
                p = call(
                    ['xelatex', '-interaction=batchmode', tex_path],
                    stdout=DEVNULL, stderr=DEVNULL, cwd=temp
                )
            if p:
                raise ParseError("Error parsing .tex file")

        # delete=False: the name is only reserved here; the finished
        # PDF is moved over it and must outlive this function.
        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf',
                                         delete=False)
        output_file.close()
        pdf_path = os.path.join(temp, 'doc.pdf')
        shutil.move(pdf_path, output_file.name)
        shutil.rmtree(temp)
        return OutputFile.from_filename(output_file.name)

    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
def load_including_children(wldoc=None, provider=None, uri=None):
    """ Makes one big xml file with children inserted at end.

    Either wldoc or provider and URI must be provided.

    The source text is taken from `provider.by_slug(uri.slug)` when
    both `uri` and `provider` are given, otherwise from `wldoc.edoc`.
    Runs of Cyrillic characters and of geometric-shape characters are
    wrapped in <alien> elements before parsing. Children listed in
    `book_info.parts` are loaded recursively and their root elements
    appended to this document's root.

    Returns the resulting WLDocument; raises ValueError when neither
    input form was supplied.
    """
    if uri and provider:
        f = provider.by_slug(uri.slug)
        try:
            text = f.read().decode('utf-8')
        finally:
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding='unicode')
        provider = wldoc.provider
    else:
        raise ValueError(
            'Neither a WLDocument, nor provider and URI were provided.'
        )

    # Cyrillic (U+0400-U+04FF)
    text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
    # Geometric shapes (U+25A0-U+25FF)
    text = re.sub(r"([\u25a0-\u25ff]+)", r"<alien>\1</alien>", text)

    document = WLDocument.from_bytes(text.encode('utf-8'),
                                     parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())
    return document