1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 """PDF creation library.
8 Creates one big XML from the book and its children, converts it to LaTeX
9 with TeXML, then runs it by XeLaTeX.
12 from __future__ import with_statement
16 from StringIO import StringIO
17 from tempfile import mkdtemp, NamedTemporaryFile
19 from copy import deepcopy
20 from subprocess import call, PIPE
21 from itertools import chain
23 from Texml.processor import process
24 from lxml import etree
25 from lxml.etree import XMLSyntaxError, XSLTApplyError
27 from librarian.dcparser import Person
28 from librarian.parser import WLDocument
29 from librarian import ParseError, DCNS, get_resource, OutputFile, RDFNS
30 from librarian import functions
31 from librarian.cover import make_cover
32 from .sponsor import sponsor_logo
# Module-level setup: four registration helpers from librarian.functions are
# called once, at import time.
# NOTE(review): presumably these register extension functions used by the
# XSLT stylesheet - confirm in librarian.functions.
35 functions.reg_substitute_entities()
37 functions.reg_starts_white()
38 functions.reg_ends_white()
39 functions.reg_texcommand()
# NOTE(review): the next line is one entry of a dict literal whose opening
# line is not visible in this listing; get_stylesheet() indexes STYLESHEETS by
# name, so this is presumably that mapping - confirm in the complete file.
42 'wl2tex': 'pdf/wl2tex.xslt',
# Split the text and tail of elements in `doc` on `split_re` and put an empty
# <tagname> element at every split point; the doctest below shows the tree
# being modified in place.
# NOTE(review): this listing carries no indentation and its embedded line
# numbers skip values, so some statements of this function (including what
# happens when `exclude` matches, and how the new elements are attached for
# the text part) are not visible here - consult the complete file before
# changing the logic.
55 def insert_tags(doc, split_re, tagname, exclude=None):
56 """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree
58 >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
59 >>> insert_tags(t, re.compile('-'), 'd')
60 >>> print etree.tostring(t)
61 <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
64 for elem in doc.iter(tag=etree.Element):
65 if exclude and elem.tag in exclude:
68 chunks = split_re.split(elem.text)
# Chunks are taken from the end; each one becomes the tail of a fresh
# <tagname> element, until only the leading chunk is left.
69 while len(chunks) > 1:
70 ins = etree.Element(tagname)
71 ins.tail = chunks.pop()
# The leading chunk stays as the element's own text.
73 elem.text = chunks.pop(0)
75 chunks = split_re.split(elem.tail)
# Pieces of the tail become siblings placed directly after `elem`.
76 parent = elem.getparent()
77 ins_index = parent.index(elem) + 1
78 while len(chunks) > 1:
79 ins = etree.Element(tagname)
80 ins.tail = chunks.pop()
# Inserting at the same index while popping from the end keeps the chunks in
# their original order.
81 parent.insert(ins_index, ins)
82 elem.tail = chunks.pop(0)
# Deals with hyphens that sit between two characters which are neither
# hyphens nor whitespace (see the pattern below).
# NOTE(review): the call that receives these arguments, and the tag name
# passed to it, are on lines missing from this listing - presumably a call to
# insert_tags(); confirm in the complete file.
85 def substitute_hyphens(doc):
# Matches a single '-' with a non-hyphen, non-whitespace character on each
# side (lookbehind / lookahead, so only the hyphen itself is consumed).
87 re.compile("(?<=[^-\s])-(?=[^-\s])"),
# Tags listed here are handed over as the `exclude` argument.
89 exclude=[DCNS("identifier.url"), DCNS("rights.license"), "meta"]
# NOTE(review): the `def` line and the call these arguments belong to are not
# visible in this listing; transform() calls fix_hanging(), so this is
# presumably its body - confirm in the complete file.
# The pattern matches a run of whitespace that follows a single word
# character which is itself preceded by whitespace, i.e. the gap right after
# a one-letter word.
95 re.compile("(?<=\s\w)\s+"),
# Tags listed here are handed over as the `exclude` argument.
97 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
# NOTE(review): the enclosing `def` line is not visible in this listing;
# transform() calls fix_tables(), so this is presumably its body - confirm in
# the complete file.
# First loop: selects every <kol> element whose tail is empty or
# whitespace-only; what is done with such a tail is on a line missing here.
102 for kol in doc.iter(tag='kol'):
103 if kol.tail is not None:
104 if not kol.tail.strip():
# Second loop: every <tabela>/<tabelka> element gets a `_format` attribute
# holding one 'X' per child of its first child element.
106 for table in chain(doc.iter(tag='tabela'), doc.iter(tag='tabelka')):
# With ramka/ramki set to '1' the X's are delimited by '|' on both sides.
107 if table.get('ramka') == '1' or table.get('ramki') == '1':
108 table.set('_format', '|' + 'X|' * len(table[0]))
# NOTE(review): the branch keyword before the next line is missing from this
# listing; presumably this is the `else` case - confirm.
110 table.set('_format', 'X' * len(table[0]))
# Marks sub-works whose author string is neither the root author string nor
# `last_author` by appending an empty <use_subauthor> element to their RDF.
# NOTE(review): the statements that initialise and update `last_author` are
# not visible in this listing - confirm them in the complete file.
113 def mark_subauthors(doc):
# Comma-joined text of the creator_parsed elements found below the RDF
# element matched by the './' path.
114 root_author = ', '.join(elem.text for elem in doc.findall('./' + RDFNS('RDF') + '//' + DCNS('creator_parsed')))
116 # if the author differs from the author of the whole work and from the previous author
117 # then we insert some marker in the rdf?
# Only <utwor> elements directly below the top-level <utwor> are examined.
118 for subutwor in doc.xpath('/utwor/utwor'):
119 author = ', '.join(elem.text for elem in subutwor.findall('.//' + DCNS('creator_parsed')))
120 if author not in (last_author, root_author):
# The marker goes into the first RDF element found inside the sub-work.
121 subutwor.find('.//' + RDFNS('RDF')).append(etree.Element('use_subauthor'))
# NOTE(review): this listing has no indentation and skips some original
# lines; the statements that actually relocate the motif (between the sibling
# test and the removal below) are not visible - confirm in the complete file.
125 def move_motifs_inside(doc):
126 """ moves motifs to be into block elements """
# "Master" elements are the genre-level containers named in this XPath union.
127 for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
128 '//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
# Only <motyw> elements that are direct children of a master are handled.
129 for motif in master.xpath('motyw'):
# Following siblings are scanned for the first one whose tag is not in the
# list below.
130 for sib in motif.itersiblings():
131 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
132 'begin', 'end', 'motyw', 'extra', 'uwaga'):
133 # motif shouldn't have a tail - it would be untagged text
# The motif is detached from its current parent.
135 motif.getparent().remove(motif)
# NOTE(review): this listing has no indentation and skips some original
# lines; the assignments to `move_by`, the changes made to the original
# motif, and the control flow inside the final loop are not visible here -
# confirm them in the complete file before editing.
140 def hack_motifs(doc):
141 """ dirty hack for the marginpar-creates-orphans LaTeX problem
142 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
144 moves motifs in stanzas from first verse to second
145 and from next to last to last, then inserts negative vspace before them
147 for motif in doc.findall('//strofa//motyw'):
148 # find relevant verse-level tag
# Walk upwards until `stanza` is the enclosing <strofa>; `verse` is then the
# child of that stanza which contains (or is) the motif.
149 verse, stanza = motif, motif.getparent()
150 while stanza is not None and stanza.tag != 'strofa':
151 verse, stanza = stanza, stanza.getparent()
# Count the <br> siblings before and after that verse-level node.
152 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
153 breaks_after = sum(1 for i in verse.itersiblings('br'))
# Acts when no <br> precedes the node and at least one follows, or when
# exactly one <br> follows.
154 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
156 if breaks_after == 2:
# A copy of the motif, without tail text, is what gets re-inserted; the
# `moved` attribute records the value of `move_by`.
158 moved_motif = deepcopy(motif)
161 moved_motif.tail = None
162 moved_motif.set('moved', str(move_by))
164 for br in verse.itersiblings('br'):
# The copy is placed immediately after a following <br>.
168 br.addnext(moved_motif)
# NOTE(review): this listing has no indentation and skips some original
# lines; whatever happens between the xpath loop header and the
# Person.from_text() call is not visible here - confirm in the complete file.
172 def parse_creator(doc):
173 """Generates readable versions of creator and translator tags.
175 Finds all dc:creator and dc.contributor.translator tags
176 and adds *_parsed versions with forenames first.
# The matched elements are visited in reverse order ([::-1]); together with
# insert(0, ...) below this keeps the copies in the order of their originals
# when they share a parent.
178 for person in doc.xpath("|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
179 namespaces={'dc': str(DCNS)})[::-1]:
182 p = Person.from_text(person.text)
# The *_parsed element is a copy of the original with a renamed tag, the raw
# text kept in `sortkey`, and the readable form as its text.
183 person_parsed = deepcopy(person)
184 person_parsed.tag = person.tag + '_parsed'
185 person_parsed.set('sortkey', person.text)
186 person_parsed.text = p.readable()
# The copy becomes the first child of the original's parent.
187 person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Look up `name` in STYLESHEETS and return get_resource() of that entry.

    Raises whatever the STYLESHEETS lookup raises for an unknown name.
    """
    resource_name = STYLESHEETS[name]
    return get_resource(resource_name)
# NOTE(review): this listing has no indentation and skips some original
# lines; the code that writes test.tex, the branching on `verbose`, and the
# return statement are not visible here - confirm in the complete file.
194 def package_available(package, args='', verbose=False):
195 """ check if a verion of a latex package accepting given args is available """
# A throw-away directory (suffix '-wl2pdf-test') holds the probe document.
196 tempdir = mkdtemp('-wl2pdf-test')
197 fpath = os.path.join(tempdir, 'test.tex')
204 """ % (args, package))
# Two ways of running xelatex on the probe: plain, or in batch mode with
# stdout/stderr redirected to pipes. `p` receives the exit status of call().
207 p = call(['xelatex', '-output-directory', tempdir, fpath])
209 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
# The probe directory is removed afterwards.
210 shutil.rmtree(tempdir)
# Builds a PDF for `wldoc`: the document tree is annotated and fixed up,
# converted with the wl2tex stylesheet, written out as doc.tex through TeXML,
# compiled with xelatex, and returned as OutputFile.from_filename(...).
# A ParseError("Error parsing .tex file") can be raised around the xelatex
# runs.
# NOTE(review): this listing has no indentation and skips some original
# lines; the `try:` belonging to the final `except`, several `if`/`else`
# headers, the use of `latex_dir`, the closing of `fout` and any cleanup of
# the working directory are not visible here - confirm in the complete file.
214 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
215 cover=None, flags=None, customizations=None, ilustr_path='', latex_dir=False):
216 """ produces a PDF file with XeLaTeX
219 verbose: prints all output from LaTeX
220 save_tex: path to save the intermediary LaTeX file to
221 morefloats (old/new/none): force specific morefloats
222 cover: a cover.Cover factory or True for default
223 flags: less-advertising,
224 customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
229 book_info = wldoc.book_info
# One tree containing the book and its children; everything below works on
# its root element.
230 document = load_including_children(wldoc)
231 root = document.edoc.getroot()
# Cover: the factory is called with the book info, and the cover's size (plus
# attribution data when it uses the DC cover) is recorded on the root.
236 bound_cover = cover(book_info, width=1200)
237 root.set('data-cover-width', str(bound_cover.width))
238 root.set('data-cover-height', str(bound_cover.height))
239 if bound_cover.uses_dc_cover:
240 if book_info.cover_by:
241 root.set('data-cover-by', book_info.cover_by)
242 if book_info.cover_source:
243 root.set('data-cover-source', book_info.cover_source)
# Each flag becomes a flag-<name>="yes" attribute (the loop header is on a
# line missing from this listing).
246 root.set('flag-' + flag, 'yes')
248 # check for LaTeX packages
250 root.set('morefloats', morefloats.lower())
251 elif package_available('morefloats', 'maxfloats=19'):
252 root.set('morefloats', 'new')
# Customizations are passed on as one comma-separated attribute.
255 if customizations is not None:
256 root.set('customizations', u','.join(customizations))
# Editors (sorted readable names), funders and thanks also become root
# attributes.
259 editors = document.editors()
261 root.set('editors', u', '.join(sorted(
262 editor.readable() for editor in editors)))
263 if document.book_info.funders:
264 root.set('funders', u', '.join(document.book_info.funders))
265 if document.book_info.thanks:
266 root.set('thanks', document.book_info.thanks)
# Tree fix-ups applied, in this order, before the stylesheet runs.
269 move_motifs_inside(document.edoc)
270 hack_motifs(document.edoc)
271 parse_creator(document.edoc)
272 substitute_hyphens(document.edoc)
273 fix_hanging(document.edoc)
274 fix_tables(document.edoc)
275 mark_subauthors(document.edoc)
# The wl2tex stylesheet is parsed from its resource file.
278 style_filename = get_stylesheet("wl2tex")
279 style = etree.parse(style_filename)
280 functions.reg_mathml_latex()
# Working directory for the LaTeX run.
283 temp = mkdtemp('-wl2pdf')
# Files referenced by <ilustr src="..."> are copied into it from ilustr_path.
285 for ilustr in document.edoc.findall("//ilustr"):
286 shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
# One <data-sponsor> element per sponsor; a logo file is copied into the
# working directory as sponsor-<basename> and referenced through `src`.
# NOTE(review): the test on `logo` and the place where `ins` is attached to
# the tree are not visible in this listing.
288 for sponsor in book_info.sponsors:
289 ins = etree.Element("data-sponsor", name=sponsor)
290 logo = sponsor_logo(sponsor)
292 fname = 'sponsor-%s' % os.path.basename(logo)
293 shutil.copy(logo, os.path.join(temp, fname))
294 ins.set('src', fname)
297 if book_info.sponsor_note:
298 root.set("sponsor-note", book_info.sponsor_note)
# Apply the stylesheet; the result is fed to TeXML below.
300 texml = document.transform(style)
# NOTE(review): cover.png is opened with mode 'w' rather than 'wb' - worth
# checking on platforms where text and binary modes differ.
303 with open(os.path.join(temp, 'cover.png'), 'w') as f:
304 bound_cover.save(f, quality=80)
306 del document # no longer needed large object :)
# TeXML output is written to doc.tex in the working directory.
308 tex_path = os.path.join(temp, 'doc.tex')
309 fout = open(tex_path, 'w')
310 process(StringIO(texml), fout, 'utf-8')
# Copy of the intermediate LaTeX file for the caller (see `save_tex`).
315 shutil.copy(tex_path, save_tex)
# The document class and the logo are copied next to doc.tex.
318 shutil.copy(get_resource('pdf/wl.cls'), temp)
319 shutil.copy(get_resource('res/wl-logo.png'), temp)
330 # some things work better when compiled twice
331 # (table of contents, [line numbers - disabled])
332 for run in xrange(2):
334 p = call(['xelatex', tex_path])
336 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
338 raise ParseError("Error parsing .tex file")
# doc.pdf is moved out of the working directory into a named temporary file
# created with delete=False, so it is not removed when that file is closed.
343 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
344 pdf_path = os.path.join(temp, 'doc.pdf')
345 shutil.move(pdf_path, output_file.name)
347 return OutputFile.from_filename(output_file.name)
# XML / XSLT failures are caught here; the handler body is not visible in
# this listing.
349 except (XMLSyntaxError, XSLTApplyError), e:
# Recursively builds one document: the text is parsed into a WLDocument and
# the root element of every part listed in book_info.parts is appended to its
# root. Raises ValueError when no usable input is given.
# NOTE(review): this listing has no indentation and skips some original
# lines; the first branch header, the `else:` before the ValueError, the
# closing of `f` and the return statement are not visible here - confirm in
# the complete file.
353 def load_including_children(wldoc=None, provider=None, uri=None):
354 """ Makes one big xml file with children inserted at end.
356 Either wldoc or provider and URI must be provided.
360 f = provider.by_uri(uri)
# Text obtained through the provider is decoded as UTF-8.
361 text = f.read().decode('utf-8')
363 elif wldoc is not None:
# An already parsed document is serialized back to unicode text, and its own
# provider is used for loading the children.
364 text = etree.tostring(wldoc.edoc, encoding=unicode)
365 provider = wldoc.provider
367 raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
# Runs of characters in the range U+0400-U+04FF are wrapped in <alien> tags
# before parsing.
369 text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
371 document = WLDocument.from_string(text, parse_dublincore=True, provider=provider)
372 document.swap_endlines()
# Each part is loaded the same way (by URI through the provider) and its root
# element is appended at the end of this document's root.
374 for child_uri in document.book_info.parts:
375 child = load_including_children(provider=provider, uri=child_uri)
376 document.edoc.getroot().append(child.edoc.getroot())