1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 """PDF creation library.
8 Creates one big XML from the book and its children, converts it to LaTeX
9 with TeXML, then runs it by XeLaTeX.
12 from __future__ import print_function, unicode_literals
17 from tempfile import mkdtemp, NamedTemporaryFile
19 from copy import deepcopy
20 from subprocess import call, PIPE
21 from itertools import chain
24 from Texml.processor import process
25 from lxml import etree
26 from lxml.etree import XMLSyntaxError, XSLTApplyError
29 from librarian.dcparser import Person
30 from librarian.parser import WLDocument
31 from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
32 from librarian import functions
33 from librarian.cover import make_cover
34 from .sponsor import sponsor_logo
37 functions.reg_substitute_entities()
39 functions.reg_starts_white()
40 functions.reg_ends_white()
41 functions.reg_texcommand()
44 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """
    Inserts <tagname> for every occurrence of `split_re'
    in text nodes in the `doc' tree.

    Both the text and the tail of each element are split on `split_re';
    an empty <tagname> element is placed at every split point.
    Elements whose tag is listed in `exclude' are skipped: their own
    text and tail are not split.

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print(etree.tostring(t, encoding='unicode'))
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are consumed from the end and each new element is
            # inserted at the same position, so the final order matches
            # the original text order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # New elements go right after `elem' in its parent.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
89 def substitute_hyphens(doc):
92 re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
94 exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
101 re.compile(r"(?<=\s\w)\s+"),
103 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
108 for kol in doc.iter(tag='kol'):
109 if kol.tail is not None:
110 if not kol.tail.strip():
112 for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
113 if table.get('ramka') == '1' or table.get('ramki') == '1':
114 table.set('_format', '|' + 'X|' * len(table[0]))
116 table.set('_format', 'X' * len(table[0]))
def mark_subauthors(doc):
    """Mark sub-works whose author differs from the surrounding context.

    For every /utwor/utwor element, the texts of its creator_parsed
    elements are joined into one author string. When that string
    differs from both the root author string and the previous
    sub-work's author string, an empty <use_subauthor> element is
    appended to the sub-work's RDF element.
    """
    root_author = ', '.join(
        elem.text
        for elem in doc.findall(
            './' + RDFNS('RDF') + '//' + DCNS('creator_parsed')
        )
    )
    last_author = None
    # If the author differs from both the author of the whole work and
    # the previous sub-work's author, insert a marker into the RDF.
    for subutwor in doc.xpath('/utwor/utwor'):
        author = ', '.join(
            elem.text
            for elem in subutwor.findall('.//' + DCNS('creator_parsed'))
        )
        if author not in (last_author, root_author):
            subutwor.find('.//' + RDFNS('RDF')).append(
                etree.Element('use_subauthor')
            )
        last_author = author
def move_motifs_inside(doc):
    """ moves motifs to be into block elements

    For each `motyw' that is a direct child of a master element, the
    first following sibling whose tag is not one of the listed
    separator/marker tags receives the motif as its first child.
    """
    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
                            '//dramat_wierszowany_l|//dramat_wierszowany_lp|'
                            '//dramat_wspolczesny'):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk',
                                   'separator_linia', 'begin', 'end',
                                   'motyw', 'extra', 'uwaga'):
                    # motif shouldn't have a tail - it would be untagged text
                    motif.tail = None
                    motif.getparent().remove(motif)
                    sib.insert(0, motif)
                    # The motif has been relocated; stop scanning siblings.
                    break
158 def hack_motifs(doc):
160 Dirty hack for the marginpar-creates-orphans LaTeX problem
161 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
163 Moves motifs in stanzas from first verse to second and from next
164 to last to last, then inserts negative vspace before them.
166 for motif in doc.findall('//strofa//motyw'):
167 # find relevant verse-level tag
168 verse, stanza = motif, motif.getparent()
169 while stanza is not None and stanza.tag != 'strofa':
170 verse, stanza = stanza, stanza.getparent()
172 1 for i in verse.itersiblings('br', preceding=True)
174 breaks_after = sum(1 for i in verse.itersiblings('br'))
175 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
177 if breaks_after == 2:
179 moved_motif = deepcopy(motif)
182 moved_motif.tail = None
183 moved_motif.set('moved', str(move_by))
185 for br in verse.itersiblings('br'):
189 br.addnext(moved_motif)
def parse_creator(doc):
    """Generates readable versions of creator and translator tags.

    Finds all dc:creator and dc.contributor.translator tags
    and adds *_parsed versions with forenames first.

    Each *_parsed element is a copy of the source element with its
    text replaced by the readable form of the person and a `sortkey'
    attribute holding the original text. Elements without text are
    skipped.
    """
    # Matches are walked in reverse and each copy is inserted at index 0
    # of its parent, so the copies end up in document order.
    for person in doc.xpath(
            "|".join('//dc:' + tag for tag in (
                'creator', 'contributor.translator'
            )),
            namespaces={'dc': str(DCNS)})[::-1]:
        if not person.text:
            # Nothing to parse for an empty element.
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Look up `name' in STYLESHEETS and resolve it with get_resource.

    Raises KeyError when `name' is not a key of STYLESHEETS.
    """
    stylesheet_path = STYLESHEETS[name]
    return get_resource(stylesheet_path)
218 def package_available(package, args='', verbose=False):
220 Check if a verion of a latex package accepting given args
223 tempdir = mkdtemp('-wl2pdf-test')
224 fpath = os.path.join(tempdir, 'test.tex')
231 """ % (args, package))
234 p = call(['xelatex', '-output-directory', tempdir, fpath])
237 ['xelatex', '-interaction=batchmode', '-output-directory',
239 stdout=PIPE, stderr=PIPE
241 shutil.rmtree(tempdir)
245 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
246 cover=None, flags=None, customizations=None, base_url='file://./',
248 """ produces a PDF file with XeLaTeX
251 verbose: prints all output from LaTeX
252 save_tex: path to save the intermediary LaTeX file to
253 morefloats (old/new/none): force specific morefloats
254 cover: a cover.Cover factory or True for default
255 flags: less-advertising,
256 customizations: user requested customizations regarding various
257 formatting parameters (passed to wl LaTeX class)
262 book_info = wldoc.book_info
263 document = load_including_children(wldoc)
264 root = document.edoc.getroot()
269 bound_cover = cover(book_info, width=1200)
270 root.set('data-cover-width', str(bound_cover.width))
271 root.set('data-cover-height', str(bound_cover.height))
272 if bound_cover.uses_dc_cover:
273 if book_info.cover_by:
274 root.set('data-cover-by', book_info.cover_by)
275 if book_info.cover_source:
276 root.set('data-cover-source', book_info.cover_source)
279 root.set('flag-' + flag, 'yes')
281 # check for LaTeX packages
283 root.set('morefloats', morefloats.lower())
284 elif package_available('morefloats', 'maxfloats=19'):
285 root.set('morefloats', 'new')
288 if customizations is not None:
289 root.set('customizations', u','.join(customizations))
292 editors = document.editors()
294 root.set('editors', u', '.join(sorted(
295 editor.readable() for editor in editors)))
296 if document.book_info.funders:
297 root.set('funders', u', '.join(document.book_info.funders))
298 if document.book_info.thanks:
299 root.set('thanks', document.book_info.thanks)
302 move_motifs_inside(document.edoc)
303 hack_motifs(document.edoc)
304 parse_creator(document.edoc)
305 substitute_hyphens(document.edoc)
306 fix_hanging(document.edoc)
307 fix_tables(document.edoc)
308 mark_subauthors(document.edoc)
311 style_filename = get_stylesheet("wl2tex")
312 style = etree.parse(style_filename)
313 functions.reg_mathml_latex()
316 temp = mkdtemp('-wl2pdf')
318 for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
319 url = six.moves.urllib.parse.urljoin(
323 with six.moves.urllib.request.urlopen(url) as imgfile:
324 img = Image.open(imgfile)
326 th_format, ext, media_type = {
327 'GIF': ('GIF', 'gif', 'image/gif'),
328 'PNG': ('PNG', 'png', 'image/png'),
329 }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
332 if img.size[0] < width:
335 th = img.resize((width, round(width * img.size[1] / img.size[0])))
337 file_name = 'image%d.%s' % (i, ext)
338 th.save(os.path.join(temp, file_name))
339 ilustr.set('src', file_name)
341 for sponsor in book_info.sponsors:
342 ins = etree.Element("data-sponsor", name=sponsor)
343 logo = sponsor_logo(sponsor)
345 fname = 'sponsor-%s' % os.path.basename(logo)
346 shutil.copy(logo, os.path.join(temp, fname))
347 ins.set('src', fname)
350 if book_info.sponsor_note:
351 root.set("sponsor-note", book_info.sponsor_note)
353 texml = document.transform(style)
356 with open(os.path.join(temp, 'cover.png'), 'w') as f:
357 bound_cover.save(f, quality=80)
359 del document # no longer needed large object :)
361 tex_path = os.path.join(temp, 'doc.tex')
362 fout = open(tex_path, 'wb')
363 process(six.BytesIO(texml), fout, 'utf-8')
368 shutil.copy(tex_path, save_tex)
371 shutil.copy(get_resource('pdf/wl.cls'), temp)
372 shutil.copy(get_resource('res/wl-logo.png'), temp)
383 # some things work better when compiled twice
384 # (table of contents, [line numbers - disabled])
387 p = call(['xelatex', tex_path])
390 ['xelatex', '-interaction=batchmode', tex_path],
391 stdout=PIPE, stderr=PIPE
394 raise ParseError("Error parsing .tex file")
399 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf',
401 pdf_path = os.path.join(temp, 'doc.pdf')
402 shutil.move(pdf_path, output_file.name)
404 return OutputFile.from_filename(output_file.name)
406 except (XMLSyntaxError, XSLTApplyError) as e:
410 def load_including_children(wldoc=None, provider=None, uri=None):
411 """ Makes one big xml file with children inserted at end.
413 Either wldoc or provider and URI must be provided.
417 f = provider.by_uri(uri)
418 text = f.read().decode('utf-8')
420 elif wldoc is not None:
421 text = etree.tostring(wldoc.edoc, encoding='unicode')
422 provider = wldoc.provider
425 'Neither a WLDocument, nor provider and URI were provided.'
428 text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)
430 document = WLDocument.from_bytes(text.encode('utf-8'),
431 parse_dublincore=True, provider=provider)
432 document.swap_endlines()
434 for child_uri in document.book_info.parts:
435 child = load_including_children(provider=provider, uri=child_uri)
436 document.edoc.getroot().append(child.edoc.getroot())