# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
"""PDF creation library.

Creates one big XML document from the book and its children, converts it
to LaTeX with TeXML, then runs the result through XeLaTeX.
"""
from __future__ import print_function, unicode_literals

from copy import deepcopy
from itertools import chain
import os
import re
import shutil
from subprocess import call, PIPE
from tempfile import mkdtemp, NamedTemporaryFile

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError
import six
from Texml.processor import process

from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
from librarian import functions
from librarian.cover import make_cover
from librarian.dcparser import Person
from librarian.parser import WLDocument
from .sponsor import sponsor_logo
# Import-time registration of librarian's helper functions (presumably the
# XSLT extension functions called from the wl2tex stylesheet -- confirm).
functions.reg_substitute_entities()
# NOTE(review): the snapshot skips exactly one line between
# reg_substitute_entities() and reg_starts_white(); upstream registers
# reg_strip() there.  Confirm against librarian.functions before merging.
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Logical stylesheet name -> resource path handed to get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
def insert_tags(doc, split_re, tagname, exclude=None):
    """Insert an empty <tagname/> element at every match of `split_re`.

    The text and the tail of every element in `doc` are split on
    `split_re`; each split point is replaced by a new, empty `tagname`
    element.  The tree is modified in place and nothing is returned.

    doc: lxml element or tree whose text nodes are processed
    split_re: compiled regular expression marking the split points
    tagname: tag of the element inserted at each split point
    exclude: optional collection of tags; the text and tail of an element
        carrying one of these tags are left untouched (its descendants
        are still visited)

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print(etree.tostring(t, encoding='unicode'))
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue
        # Empty or missing text has nothing to split (and re.split(None)
        # would raise), hence the truthiness guards.
        if elem.text:
            chunks = split_re.split(elem.text)
            # Work backwards: every new element goes to index 0 and carries
            # the chunk that follows it, so the final order is preserved.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # Same backwards trick, inserting right after `elem`.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
def substitute_hyphens(doc):
    """Replace in-word hyphens in `doc` with marker elements, in place.

    A hyphen qualifies when both neighbouring characters are neither a
    hyphen nor whitespace.  Text directly inside dc:identifier.url,
    dc:rights.license and <meta> elements is left alone.
    """
    insert_tags(
        doc,
        # Raw string: "\s" in a plain literal is an invalid escape sequence.
        re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
        # NOTE(review): the tag-name line is missing from the snapshot;
        # "dywiz" is the upstream value -- confirm against the stylesheet.
        "dywiz",
        exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
    )
def fix_hanging(doc):
    """Mark the whitespace after one-character words in `doc`, in place.

    A whitespace run is replaced by a marker element when it follows a
    single word character that is itself preceded by whitespace.  Text
    directly inside dc:identifier.url and dc:rights.license is left alone.
    """
    insert_tags(
        doc,
        # Raw string: "\s" / "\w" in a plain literal are invalid escapes.
        re.compile(r"(?<=\s\w)\s+"),
        # NOTE(review): the tag-name line is missing from the snapshot;
        # "nbsp" is the upstream value -- confirm against the stylesheet.
        "nbsp",
        exclude=[DCNS("identifier.url"), DCNS("rights.license")]
    )
def fix_tables(doc):
    """Prepare table markup in `doc` for typesetting, in place.

    * Whitespace-only tails of <kol> elements are dropped.
    * Every <tabela>/<tabelka> gets a `_format` attribute with one ``X``
      per child of its first row, separated and surrounded by ``|`` when
      the table has ``ramka="1"`` or ``ramki="1"``.

    A table without any rows is treated as having zero columns instead of
    raising IndexError.
    """
    for kol in doc.iter(tag='kol'):
        if kol.tail is not None and not kol.tail.strip():
            kol.tail = None

    for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
        # Column count comes from the first row only.
        columns = len(table[0]) if len(table) else 0
        if table.get('ramka') == '1' or table.get('ramki') == '1':
            table.set('_format', '|' + 'X|' * columns)
        else:
            table.set('_format', 'X' * columns)
def mark_subauthors(doc):
    """Flag sub-works whose author differs from the book's and the previous one.

    For every /utwor/utwor element the comma-joined dc:creator_parsed
    values are compared with those of the root RDF block and of the
    preceding sub-work.  When they match neither, an empty <use_subauthor/>
    element is appended to the sub-work's RDF element.  The tree is
    modified in place.
    """
    root_author = ', '.join(
        elem.text
        for elem in doc.findall('./' + RDFNS('RDF') + '//' + DCNS('creator_parsed')))
    last_author = None
    # If the author differs both from the author of the whole book and from
    # the previous sub-work's author, leave a marker in that sub-work's RDF.
    for subutwor in doc.xpath('/utwor/utwor'):
        author = ', '.join(
            elem.text for elem in subutwor.findall('.//' + DCNS('creator_parsed')))
        if author not in (last_author, root_author):
            rdf = subutwor.find('.//' + RDFNS('RDF'))
            # A sub-work without an RDF block has nowhere to put the marker.
            if rdf is not None:
                rdf.append(etree.Element('use_subauthor'))
        last_author = author
def move_motifs_inside(doc):
    """Move top-level motifs into the block element that follows them.

    For every <motyw> that is a direct child of one of the master elements
    (powiesc, opowiadanie, liryka_l, ...), the first following sibling that
    is not a separator/marker-like element receives the motif as its first
    child.  A motif with no such sibling stays where it is.  The tree is
    modified in place.
    """
    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
                            '//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
                                   'begin', 'end', 'motyw', 'extra', 'uwaga'):
                    # motif shouldn't have a tail - it would be untagged text
                    motif.tail = None
                    motif.getparent().remove(motif)
                    sib.insert(0, motif)
                    # Only the first suitable sibling receives the motif.
                    break
def hack_motifs(doc):
    """Dirty hack for the marginpar-creates-orphans LaTeX problem.

    see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304

    Moves motifs in stanzas from the first verse to the second and from
    the next-to-last verse to the last.  The moved copy carries a `moved`
    attribute with the number of verses it was shifted by (presumably used
    by the stylesheet to insert the negative vspace -- confirm).
    The tree is modified in place.
    """
    # A relative path behaves the same as the former '//strofa//motyw' on a
    # tree, without lxml's FutureWarning about absolute paths.
    for motif in doc.findall('.//strofa//motyw'):
        # find relevant verse-level tag
        verse, stanza = motif, motif.getparent()
        while stanza is not None and stanza.tag != 'strofa':
            verse, stanza = stanza, stanza.getparent()
        breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
        breaks_after = sum(1 for i in verse.itersiblings('br'))
        if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
            move_by = 1
            if breaks_after == 2:
                move_by += 1
            moved_motif = deepcopy(motif)
            # NOTE(review): these two lines are missing from the snapshot.
            # Upstream keeps the original element as an empty <span> so the
            # text around it stays in place -- confirm.
            motif.tag = 'span'
            motif.text = None
            moved_motif.tail = None
            moved_motif.set('moved', str(move_by))

            # Put the copy right after the move_by-th following line break.
            for br in verse.itersiblings('br'):
                if move_by > 1:
                    move_by -= 1
                    continue
                br.addnext(moved_motif)
                break
def parse_creator(doc):
    """Generates readable versions of creator and translator tags.

    Finds all dc:creator and dc:contributor.translator tags and inserts,
    at the front of the same parent, a *_parsed copy whose text is
    Person.readable() and whose `sortkey` attribute holds the original
    text.  Tags without text are skipped.  The tree is modified in place.
    """
    # Walk in reverse: every copy is inserted at index 0, so reversing
    # keeps the copies in document order.
    for person in doc.xpath("|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
                            namespaces={'dc': str(DCNS)})[::-1]:
        if not person.text:
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Look `name` up in STYLESHEETS and resolve it through get_resource().

    Raises KeyError when no stylesheet is registered under `name`.
    """
    resource = STYLESHEETS[name]
    return get_resource(resource)
def package_available(package, args='', verbose=False):
    """Check if a version of a LaTeX package accepting given args is available.

    Writes a minimal document loading `package` with `args` into a scratch
    directory and compiles it with xelatex.

    package: LaTeX package name
    args: option string placed in the \\usepackage[...] brackets
    verbose: let xelatex print to the terminal instead of silencing it

    Returns True when xelatex exits with status 0.  The scratch directory
    is removed even if writing the file or starting xelatex fails.
    """
    tempdir = mkdtemp('-wl2pdf-test')
    try:
        fpath = os.path.join(tempdir, 'test.tex')
        # NOTE(review): the template lines are missing from the snapshot;
        # this is the upstream text -- confirm the document class.
        with open(fpath, 'w') as f:
            f.write("""
                \\documentclass{wl}
                \\usepackage[%s]{%s}
                \\begin{document}
                \\end{document}
                """ % (args, package))
        if verbose:
            p = call(['xelatex', '-output-directory', tempdir, fpath])
        else:
            # call() never reads from a PIPE, so a chatty child could block
            # on a full pipe buffer; send the output to the null device.
            with open(os.devnull, 'w') as devnull:
                p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath],
                         stdout=devnull, stderr=devnull)
    finally:
        shutil.rmtree(tempdir)
    return p == 0
def _compile_tex(workdir, tex_path, verbose):
    """Run xelatex twice on `tex_path` from inside `workdir`.

    Raises ParseError when a run exits with a non-zero status.  The
    previous working directory is restored afterwards, also on failure.
    """
    try:
        cwd = os.getcwd()
    except OSError:
        # The current directory may have vanished; nothing to go back to.
        cwd = None
    os.chdir(workdir)
    try:
        # some things work better when compiled twice
        # (table of contents, [line numbers - disabled])
        for _run in range(2):
            if verbose:
                p = call(['xelatex', tex_path])
            else:
                # call() never drains a PIPE; a full pipe buffer would
                # block xelatex, so discard the output instead.
                with open(os.devnull, 'w') as devnull:
                    p = call(['xelatex', '-interaction=batchmode', tex_path],
                             stdout=devnull, stderr=devnull)
            if p:
                raise ParseError("Error parsing .tex file")
    finally:
        if cwd is not None:
            os.chdir(cwd)


def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
              cover=None, flags=None, customizations=None, ilustr_path='', latex_dir=False):
    """ produces a PDF file with XeLaTeX

    wldoc: a WLDocument
    verbose: prints all output from LaTeX
    save_tex: path to save the intermediary LaTeX file to
    morefloats (old/new/none): force specific morefloats
    cover: a cover.Cover factory or True for default
    flags: less-advertising,
    customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
    ilustr_path: directory the <ilustr src="..."> files are copied from
    latex_dir: if true, stop before running XeLaTeX and return the path of
        the directory holding the LaTeX sources

    Returns an OutputFile for the generated PDF (or the directory path,
    see latex_dir).  Raises ParseError on XML/XSLT errors and when XeLaTeX
    fails; in the latter case the work directory is not removed.
    """

    # Parse XSLT
    try:
        book_info = wldoc.book_info
        document = load_including_children(wldoc)
        root = document.edoc.getroot()

        if cover:
            if cover is True:
                cover = make_cover
            bound_cover = cover(book_info, width=1200)
            root.set('data-cover-width', str(bound_cover.width))
            root.set('data-cover-height', str(bound_cover.height))
            if bound_cover.uses_dc_cover:
                if book_info.cover_by:
                    root.set('data-cover-by', book_info.cover_by)
                if book_info.cover_source:
                    root.set('data-cover-source', book_info.cover_source)
        if flags:
            for flag in flags:
                root.set('flag-' + flag, 'yes')

        # check for LaTeX packages
        if morefloats:
            root.set('morefloats', morefloats.lower())
        elif package_available('morefloats', 'maxfloats=19'):
            root.set('morefloats', 'new')

        # add customizations
        if customizations is not None:
            root.set('customizations', u','.join(customizations))

        # add editors info
        editors = document.editors()
        if editors:
            root.set('editors', u', '.join(sorted(
                editor.readable() for editor in editors)))
        if document.book_info.funders:
            root.set('funders', u', '.join(document.book_info.funders))
        if document.book_info.thanks:
            root.set('thanks', document.book_info.thanks)

        # hack the tree
        move_motifs_inside(document.edoc)
        hack_motifs(document.edoc)
        parse_creator(document.edoc)
        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)
        fix_tables(document.edoc)
        mark_subauthors(document.edoc)

        # wl -> TeXML
        style_filename = get_stylesheet("wl2tex")
        style = etree.parse(style_filename)
        functions.reg_mathml_latex()

        # TeXML -> LaTeX
        temp = mkdtemp('-wl2pdf')

        # Relative path: same matches as the former "//ilustr" on a tree,
        # without lxml's FutureWarning about absolute paths.
        for ilustr in document.edoc.findall(".//ilustr"):
            shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)

        for sponsor in book_info.sponsors:
            ins = etree.Element("data-sponsor", name=sponsor)
            logo = sponsor_logo(sponsor)
            if logo:
                fname = 'sponsor-%s' % os.path.basename(logo)
                shutil.copy(logo, os.path.join(temp, fname))
                ins.set('src', fname)
            root.insert(0, ins)

        if book_info.sponsor_note:
            root.set("sponsor-note", book_info.sponsor_note)

        texml = document.transform(style)

        if cover:
            # Image data is binary; text mode breaks on Python 3 and can
            # corrupt the file on platforms that translate line endings.
            with open(os.path.join(temp, 'cover.png'), 'wb') as f:
                bound_cover.save(f, quality=80)

        del document  # no longer needed large object :)

        tex_path = os.path.join(temp, 'doc.tex')
        with open(tex_path, 'wb') as fout:
            process(six.BytesIO(texml), fout, 'utf-8')
        del texml

        if save_tex:
            shutil.copy(tex_path, save_tex)

        # LaTeX -> PDF
        shutil.copy(get_resource('pdf/wl.cls'), temp)
        shutil.copy(get_resource('res/wl-logo.png'), temp)

        if latex_dir:
            return temp

        # xelatex is run from inside `temp`, so doc.pdf ends up there.
        _compile_tex(temp, tex_path, verbose)

        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
        pdf_path = os.path.join(temp, 'doc.pdf')
        shutil.move(pdf_path, output_file.name)
        shutil.rmtree(temp)
        return OutputFile.from_filename(output_file.name)

    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
def load_including_children(wldoc=None, provider=None, uri=None):
    """ Makes one big xml file with children inserted at end.

    Either wldoc or provider and URI must be provided.

    wldoc: an already parsed document; its provider is used for children
    provider: object whose by_uri(uri) returns a readable binary file
    uri: URI of the document to load through `provider`

    Returns a WLDocument whose root element has the roots of all parts
    listed in book_info.parts appended (recursively).
    Raises ValueError when neither input form is given.
    """

    if uri and provider:
        f = provider.by_uri(uri)
        # Close the source even if reading or decoding fails.
        try:
            text = f.read().decode('utf-8')
        finally:
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding='unicode')
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    # Wrap runs of characters from the Cyrillic block (U+0400-U+04FF) in
    # <alien> elements.
    text = re.sub(r"([\u0400-\u04ff]+)", r"<alien>\1</alien>", text)

    document = WLDocument.from_bytes(text.encode('utf-8'), parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())
    return document