1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 """PDF creation library.
8 Creates one big XML from the book and its children, converts it to LaTeX
9 with TeXML, then runs it by XeLaTeX.
12 from __future__ import with_statement
16 from StringIO import StringIO
17 from tempfile import mkdtemp, NamedTemporaryFile
19 from copy import deepcopy
20 from subprocess import call, PIPE
22 from Texml.processor import process
23 from lxml import etree
24 from lxml.etree import XMLSyntaxError, XSLTApplyError
26 from librarian.dcparser import Person
27 from librarian.parser import WLDocument
28 from librarian import ParseError, DCNS, get_resource, OutputFile
29 from librarian import functions
30 from librarian.cover import DefaultEbookCover
31 from .sponsor import sponsor_logo
# Register helper functions from librarian.functions at import time.
# NOTE(review): presumably these become extension functions available to the
# wl2tex XSLT stylesheet — confirm in librarian.functions.
34 functions.reg_substitute_entities()
36 functions.reg_starts_white()
37 functions.reg_ends_white()
38 functions.reg_texcommand()
# Maps a stylesheet name to its resource path; presumably an entry of the
# STYLESHEETS table that get_stylesheet() indexes (the opening line of the
# table is not visible here).
41 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """Insert an empty <tagname> element at every match of `split_re`.

    Both the ``text`` and the ``tail`` of each element in the `doc` tree are
    split on `split_re`; with a pattern that has no capturing groups the
    matched substring is dropped and a new `tagname` element takes its place.
    The tree is modified in place and nothing is returned.

    doc: lxml element (or tree) whose elements are rewritten
    split_re: compiled regular expression marking the split points
    tagname: tag of the element inserted at each split point
    exclude: optional collection of tags; the text and tail of elements
        with such a tag are left untouched

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    # Passing the Element factory as `tag` restricts the walk to real
    # elements (no comments or processing instructions).
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        if elem.text:
            chunks = split_re.split(elem.text)
            # Consume chunks from the end: every new element is inserted at
            # index 0, so the last one inserted ends up first and the
            # separators keep document order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # Same back-to-front trick, anchored right after `elem`.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
# The pattern matches a hyphen that sits directly between two characters
# which are neither hyphens nor whitespace.  Elements tagged as the Dublin
# Core identifier.url and rights.license are listed for exclusion.
# NOTE(review): the call these two arguments belong to is not visible here —
# presumably insert_tags(doc, <pattern>, <tag name>, exclude=...); confirm.
84 def substitute_hyphens(doc):
86 re.compile("(?<=[^-\s])-(?=[^-\s])"),
88 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
# The pattern matches the run of whitespace that follows a single word
# character which is itself preceded by whitespace (i.e. the gap after a
# one-letter word).  The same two Dublin Core tags are listed for exclusion.
# NOTE(review): the enclosing definition and call are not visible here —
# presumably the body of fix_hanging(doc), which transform() calls; confirm.
94 re.compile("(?<=\s\w)\s+"),
96 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
def fix_tables(doc):
    """Normalise table markup in `doc` before it is converted.

    * Whitespace-only tails of <kol> elements are removed.
    * Every <tabela> gets a ``_format`` attribute with one ``X`` per child of
      the table's first child; when ``ramka`` or ``ramki`` equals ``'1'`` the
      columns are additionally delimited with ``|``.
      NOTE(review): presumably a column specification consumed by the
      stylesheet — confirm against wl2tex.xslt.

    The tree is modified in place; nothing is returned.
    """
    for kol in doc.iter(tag='kol'):
        # Drop whitespace-only text that follows a cell.
        if kol.tail is not None and not kol.tail.strip():
            kol.tail = None

    for table in doc.iter(tag='tabela'):
        # A table without any child has no columns; avoid indexing table[0].
        columns = len(table[0]) if len(table) else 0
        if table.get('ramka') == '1' or table.get('ramki') == '1':
            table.set('_format', '|' + 'X|' * columns)
        else:
            table.set('_format', 'X' * columns)
def move_motifs_inside(doc):
    """Move motifs that are direct children of a master element into a block.

    For every <motyw> directly under one of the master elements (powiesc,
    opowiadanie, liryka_l, ...), the first following sibling whose tag is not
    in the skip list below becomes the motif's new parent: the motif is
    detached and inserted as that sibling's first child.  A motif with no
    such sibling is left where it is.

    The tree is modified in place; nothing is returned.
    """
    # Siblings that cannot host a motif and are skipped while searching.
    skipped = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
               'begin', 'end', 'motyw', 'extra', 'uwaga')
    masters = doc.xpath(
        '//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
        '//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny')
    for master in masters:
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag in skipped:
                    continue
                # motif shouldn't have a tail - it would be untagged text
                motif.tail = None
                motif.getparent().remove(motif)
                # NOTE(review): insertion as first child reconstructed from
                # the docstring ("into block elements") — confirm.
                sib.insert(0, motif)
                break
# Works on every <motyw> found below a <strofa>; the rationale is in the
# docstring.  The tree is modified in place.
127 def hack_motifs(doc):
128 """ dirty hack for the marginpar-creates-orphans LaTeX problem
129 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
131 moves motifs in stanzas from first verse to second
132 and from next to last to last, then inserts negative vspace before them
134 for motif in doc.findall('//strofa//motyw'):
135 # find relevant verse-level tag
136 verse, stanza = motif, motif.getparent()
# climb until `stanza` is the enclosing <strofa> (or the root is passed);
# `verse` is then the child of it that contains the motif
137 while stanza is not None and stanza.tag != 'strofa':
138 verse, stanza = stanza, stanza.getparent()
# count the <br> siblings before and after that child
# (presumably <br> separates verses — confirm)
139 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
140 breaks_after = sum(1 for i in verse.itersiblings('br'))
# act only when no <br> precedes and at least one follows, or when exactly
# one <br> follows
141 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
143 if breaks_after == 2:
# the copy is the element that gets re-inserted further down
145 moved_motif = deepcopy(motif)
148 moved_motif.tail = None
# record the shift on the copy (move_by is assigned on lines not shown here)
149 moved_motif.set('moved', str(move_by))
151 for br in verse.itersiblings('br'):
# attach the copy directly after a following <br>
155 br.addnext(moved_motif)
def parse_creator(doc):
    """Generate readable versions of creator and translator tags.

    Finds all dc:creator and dc:contributor.translator elements and, for each
    one that has text, inserts a ``*_parsed`` copy as the first child of the
    same parent.  The copy's text is ``Person.from_text(text).readable()``
    and the original text is kept in its ``sortkey`` attribute.

    Elements without text are skipped.  The tree is modified in place;
    nothing is returned.
    """
    query = "|".join('//dc:' + tag for tag in ('creator', 'contributor.translator'))
    persons = doc.xpath(query, namespaces={'dc': str(DCNS)})
    # Walk backwards: every copy is inserted at index 0 of its parent, so
    # reversed iteration leaves the copies in document order.
    for person in reversed(persons):
        if not person.text:
            # nothing to parse for an empty tag
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Look up `name` in STYLESHEETS and resolve the entry with get_resource().

    An unknown `name` propagates the lookup error raised by STYLESHEETS.
    """
    relative_path = STYLESHEETS[name]
    return get_resource(relative_path)
# Writes a throw-away test document into a scratch directory and runs
# xelatex on it.
# NOTE(review): the document template and the return statement are on lines
# not shown here; presumably the result is derived from the xelatex exit
# status `p` — confirm.
181 def package_available(package, args='', verbose=False):
182 """ check if a version of a latex package accepting given args is available """
183 tempdir = mkdtemp('-wl2pdf-test')
184 fpath = os.path.join(tempdir, 'test.tex')
191 """ % (args, package))
# `args` fills the first placeholder of the template above, `package` the second
# verbose run: xelatex output goes straight to the console
194 p = call(['xelatex', '-output-directory', tempdir, fpath])
# quiet run: batch mode, stdout/stderr redirected to pipes that are not read
196 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
# remove the scratch directory together with everything xelatex produced
197 shutil.rmtree(tempdir)
# Builds a PDF from `wldoc`:
#   1. loads the document together with its children (load_including_children),
#   2. annotates the root element with cover, flag, morefloats, customization,
#      editor, funder and thanks attributes,
#   3. rewrites the tree (motifs, creators, hyphens, hanging letters, tables),
#   4. applies the "wl2tex" stylesheet and lets TeXML's process() write
#      doc.tex into a scratch directory,
#   5. runs xelatex on it and moves doc.pdf into a named temporary file.
# Returns OutputFile.from_filename() for that temporary PDF.
# Raises ParseError("Error parsing .tex file") — presumably when xelatex
# reports failure; the guarding condition is not visible here.
# `ilustr_path` is the directory in which the `src` of every <ilustr>
# element is looked up before the file is copied to the scratch directory.
201 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
202 cover=None, flags=None, customizations=None, ilustr_path=''):
203 """ produces a PDF file with XeLaTeX
206 verbose: prints all output from LaTeX
207 save_tex: path to save the intermediary LaTeX file to
208 morefloats (old/new/none): force specific morefloats
209 cover: a cover.Cover factory or True for default
210 flags: less-advertising,
211 customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
216 book_info = wldoc.book_info
217 document = load_including_children(wldoc)
218 root = document.edoc.getroot()
222 cover = DefaultEbookCover
# instantiate the cover at width 1200 and publish its dimensions on the root
223 bound_cover = cover(book_info, width=1200)
224 root.set('data-cover-width', str(bound_cover.width))
225 root.set('data-cover-height', str(bound_cover.height))
# cover credits are copied only when the cover uses the Dublin Core cover data
226 if bound_cover.uses_dc_cover:
227 if book_info.cover_by:
228 root.set('data-cover-by', book_info.cover_by)
229 if book_info.cover_source:
230 root.set('data-cover-source', book_info.cover_source)
# each flag becomes a flag-<name>="yes" attribute on the root
233 root.set('flag-' + flag, 'yes')
235 # check for LaTeX packages
237 root.set('morefloats', morefloats.lower())
# otherwise probe whether the installed morefloats accepts maxfloats=19
238 elif package_available('morefloats', 'maxfloats=19'):
239 root.set('morefloats', 'new')
# customizations are passed on as one comma-separated attribute
242 if customizations is not None:
243 root.set('customizations', u','.join(customizations))
# editors are listed sorted by their readable form
246 editors = document.editors()
248 root.set('editors', u', '.join(sorted(
249 editor.readable() for editor in editors)))
250 if document.book_info.funders:
251 root.set('funders', u', '.join(document.book_info.funders))
252 if document.book_info.thanks:
253 root.set('thanks', document.book_info.thanks)
# tree rewrites, applied in this order
256 move_motifs_inside(document.edoc)
257 hack_motifs(document.edoc)
258 parse_creator(document.edoc)
259 substitute_hyphens(document.edoc)
260 fix_hanging(document.edoc)
261 fix_tables(document.edoc)
# load the stylesheet that turns the document into TeXML
264 style_filename = get_stylesheet("wl2tex")
265 style = etree.parse(style_filename)
266 functions.reg_mathml_latex()
# scratch directory in which everything xelatex needs is gathered
269 temp = mkdtemp('-wl2pdf')
# illustrations referenced by the document are copied next to the .tex file
271 for ilustr in document.edoc.findall("//ilustr"):
272 shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
# one <data-sponsor> element per sponsor; a logo, when copied, is stored as
# sponsor-<basename> and referenced through the element's src attribute
274 for sponsor in book_info.sponsors:
275 ins = etree.Element("data-sponsor", name=sponsor)
276 logo = sponsor_logo(sponsor)
278 fname = 'sponsor-%s' % os.path.basename(logo)
279 shutil.copy(logo, os.path.join(temp, fname))
280 ins.set('src', fname)
283 if book_info.sponsor_note:
284 root.set("sponsor-note", book_info.sponsor_note)
286 texml = document.transform(style)
# the cover image is written into the scratch directory as cover.png
289 with open(os.path.join(temp, 'cover.png'), 'w') as f:
290 bound_cover.save(f, quality=80)
292 del document # no longer needed large object :)
# TeXML -> LaTeX: process() writes the result to doc.tex
294 tex_path = os.path.join(temp, 'doc.tex')
295 fout = open(tex_path, 'w')
296 process(StringIO(texml), fout, 'utf-8')
# optional copy of the intermediate LaTeX file for the caller
301 shutil.copy(tex_path, save_tex)
# the document class and the logo must sit next to the .tex file
304 shutil.copy(get_resource('pdf/wl.cls'), temp)
305 shutil.copy(get_resource('res/wl-logo.png'), temp)
# verbose run: xelatex output goes straight to the console
314 p = call(['xelatex', tex_path])
# quiet run: batch mode, stdout/stderr redirected to pipes that are not read
316 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
318 raise ParseError("Error parsing .tex file")
# the PDF is moved into a named temporary file that survives closing
# (delete=False), so the caller owns the result
323 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
324 pdf_path = os.path.join(temp, 'doc.pdf')
325 shutil.move(pdf_path, output_file.name)
327 return OutputFile.from_filename(output_file.name)
# lxml parse / XSLT failures are caught here; the handler body is not visible
# (presumably re-raised as ParseError — confirm)
329 except (XMLSyntaxError, XSLTApplyError), e:
def load_including_children(wldoc=None, provider=None, uri=None):
    """Make one big XML document with the children's roots appended at the end.

    Either `wldoc`, or both `provider` and `uri`, must be provided.  With
    `provider` and `uri` the source is read through ``provider.by_uri(uri)``
    and decoded as UTF-8; with `wldoc` its tree is serialised and its own
    provider is used for the children.

    Every run of characters in the range U+0400-U+04FF (the Unicode Cyrillic
    block) is wrapped in an <alien> element before parsing.  Each part listed
    in ``book_info.parts`` is then loaded recursively and its root element is
    appended to this document's root.

    Returns the resulting WLDocument.
    Raises ValueError when neither `wldoc` nor `provider` with `uri` is given.
    """
    if uri and provider:
        f = provider.by_uri(uri)
        try:
            text = f.read().decode('utf-8')
        finally:
            # release the handle even if reading or decoding fails
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding=unicode)
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    # Plain u"" literals: same pattern and replacement as the raw-unicode
    # form, but valid syntax on every Python version.
    text = re.sub(u"([\u0400-\u04ff]+)", u"<alien>\\1</alien>", text)

    document = WLDocument.from_string(text, parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())
    return document