1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 """PDF creation library.
8 Creates one big XML from the book and its children, converts it to LaTeX
9 with TeXML, then runs it through XeLaTeX.
12 from __future__ import with_statement
16 from StringIO import StringIO
17 from tempfile import mkdtemp, NamedTemporaryFile
19 from copy import deepcopy
20 from subprocess import call, PIPE
22 from Texml.processor import process
23 from lxml import etree
24 from lxml.etree import XMLSyntaxError, XSLTApplyError
26 from librarian.dcparser import Person
27 from librarian.parser import WLDocument
28 from librarian import ParseError, DCNS, get_resource, OutputFile
29 from librarian import functions
30 from librarian.cover import DefaultEbookCover
31 from .sponsor import sponsor_logo
# Import-time setup: call the reg_* hooks of librarian.functions.
# NOTE(review): presumably these register helper functions used by the
# stylesheet -- confirm in librarian.functions.
34 functions.reg_substitute_entities()
36 functions.reg_starts_white()
37 functions.reg_ends_white()
38 functions.reg_texcommand()
# Entry mapping the short name 'wl2tex' to a stylesheet resource path.
# NOTE(review): the enclosing mapping is not visible here; presumably it is
# STYLESHEETS, which get_stylesheet() indexes -- confirm.
41 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """Insert an empty <tagname/> at every occurrence of `split_re'.

    Walks all elements of the `doc' tree and splits both the text and the
    tail of each one on `split_re'.  Every match is replaced by a new, empty
    `tagname' element; the text pieces around it are kept in order.

    doc: lxml tree or element, modified in place
    split_re: compiled regular expression marking the split points
    tagname: tag of the element inserted at each split point
    exclude: optional collection of tags; text and tail of elements with
        one of these tags are left untouched

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue
        # text/tail may be None (or empty): nothing to split in that case
        if elem.text:
            chunks = split_re.split(elem.text)
            # Take chunks from the end; inserting every new element at
            # index 0 leaves them in document order once the loop is done.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # New elements go right after `elem'; inserting at a fixed index
            # while consuming from the end again preserves the order.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
# Hyphen handling for the tree `doc`.
# Visible arguments: a regex matching a single "-" that has a character other
# than "-" or whitespace on both sides, and an exclude list with the DCNS
# names "identifier.url" and "rights.license".
# NOTE(review): the call receiving these arguments is not visible in this
# view -- presumably insert_tags(); confirm it and the tag name it inserts.
83 def substitute_hyphens(doc):
85 re.compile("(?<=[^-\s])-(?=[^-\s])"),
87 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
# NOTE(review): the def line owning these arguments is not visible in this
# view; presumably this is the body of fix_hanging(), which transform()
# calls -- confirm, together with the call and tag name that belong here.
# The regex matches a run of whitespace that follows a single word character
# which is itself preceded by whitespace (i.e. the gap after a one-letter
# word); the exclude list holds the DCNS names "identifier.url" and
# "rights.license".
93 re.compile("(?<=\s\w)\s+"),
95 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
def fix_tables(doc):
    """Normalise `kol' tails and set a `_format' attribute on every table.

    Whitespace-only tails of `kol' elements are dropped.  Each `tabela'
    element gets a `_format' attribute holding one "X" per child of its
    first child; when the table has ramka="1" or ramki="1" the columns are
    additionally delimited with "|".

    doc: tree or element to modify in place
    """
    for kol in doc.iter(tag='kol'):
        if kol.tail is not None and not kol.tail.strip():
            # NOTE(review): clearing the tail is reconstructed from the
            # surrounding checks -- confirm against the upstream file.
            kol.tail = None
    for table in doc.iter(tag='tabela'):
        # the column count is taken from the table's first child
        columns = len(table[0])
        if table.get('ramka') == '1' or table.get('ramki') == '1':
            table.set('_format', '|' + 'X|' * columns)
        else:
            table.set('_format', 'X' * columns)
def move_motifs_inside(doc):
    """Move motifs that sit directly in a master element into a block.

    For every `motyw' that is a direct child of one of the master elements
    (powiesc, opowiadanie, liryka_l, liryka_lp, dramat_wierszowany_l,
    dramat_wierszowany_lp, dramat_wspolczesny), the first following sibling
    whose tag is not on the skip list becomes its new parent.  A motif with
    no such sibling is left where it is.

    doc: lxml tree, modified in place
    """
    # siblings with these tags are skipped when looking for the target block
    skipped = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
               'begin', 'end', 'motyw', 'extra', 'uwaga')
    masters = doc.xpath(
        '//powiesc|//opowiadanie|//liryka_l|//liryka_lp|'
        '//dramat_wierszowany_l|//dramat_wierszowany_lp|'
        '//dramat_wspolczesny')
    for master in masters:
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag in skipped:
                    continue
                # motif shouldn't have a tail - it would be untagged text
                motif.tail = None
                motif.getparent().remove(motif)
                # NOTE(review): inserting as the first child of the block is
                # reconstructed from context -- confirm against upstream.
                sib.insert(0, motif)
                break
# NOTE(review): several statements of this function (among them every
# assignment to move_by and the loop control around br.addnext) are not
# visible in this view; the comments below cover the visible lines only.
123 def hack_motifs(doc):
124 """ dirty hack for the marginpar-creates-orphans LaTeX problem
125 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
127 moves motifs in stanzas from first verse to second
128 and from next to last to last, then inserts negative vspace before them
130 for motif in doc.findall('//strofa//motyw'):
131 # find relevant verse-level tag
# climb from the motif until the parent is the enclosing `strofa`; `verse`
# ends up as the ancestor (or the motif itself) directly below the stanza
132 verse, stanza = motif, motif.getparent()
133 while stanza is not None and stanza.tag != 'strofa':
134 verse, stanza = stanza, stanza.getparent()
# count the `br` siblings before and after that verse-level node
135 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
136 breaks_after = sum(1 for i in verse.itersiblings('br'))
# act only when no `br` precedes and at least one follows, or when exactly
# one `br` follows
137 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
139 if breaks_after == 2:
# work on a copy of the motif: no tail, and a `moved` attribute recording
# move_by as a string
141 moved_motif = deepcopy(motif)
144 moved_motif.tail = None
145 moved_motif.set('moved', str(move_by))
# the copy is placed directly after a `br` sibling following the verse
147 for br in verse.itersiblings('br'):
151 br.addnext(moved_motif)
def parse_creator(doc):
    """Generates readable versions of creator and translator tags.

    Finds all dc:creator and dc:contributor.translator tags and, for each
    one that has text, adds a *_parsed copy as the first child of the same
    parent.  The copy's text is Person.from_text(...).readable() and its
    `sortkey' attribute keeps the original text.

    doc: lxml tree, modified in place
    """
    query = "|".join(
        '//dc:' + tag for tag in ('creator', 'contributor.translator'))
    persons = doc.xpath(query, namespaces={'dc': str(DCNS)})
    # Every copy is inserted at index 0, so go through the matches backwards
    # to leave the *_parsed elements in their original relative order.
    for person in reversed(persons):
        if not person.text:
            # nothing to parse in an empty tag
            continue
        parsed = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = parsed.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Return get_resource() of the stylesheet registered as `name'.

    `name' is looked up in STYLESHEETS; an unknown name raises whatever
    that lookup raises.
    """
    return get_resource(STYLESHEETS[name])
# NOTE(review): the statements that write the probe .tex file (including the
# LaTeX template formatted with `args` and `package`), the branch choosing
# between the two xelatex calls, and the return statement are not visible in
# this view.
178 def package_available(package, args='', verbose=False):
179 """ check if a version of a latex package accepting given args is available """
180 tempdir = mkdtemp('-wl2pdf-test')
181 fpath = os.path.join(tempdir, 'test.tex')
188 """ % (args, package))
# xelatex is run on the probe file with its output directed to the scratch
# directory; the second form adds batch mode and pipes stdout/stderr.
# NOTE(review): the subprocess documentation warns against stdout=PIPE /
# stderr=PIPE with call(), since nothing reads the pipes and a full pipe
# buffer can block the child.
191 p = call(['xelatex', '-output-directory', tempdir, fpath])
193 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
# the scratch directory is removed after the run
194 shutil.rmtree(tempdir)
# NOTE(review): many control-flow lines of this function (the opening `try`,
# several `if`/`for`/`else` headers, cleanup statements and the body of the
# final `except`) are not visible in this view; the comments below describe
# the visible statements only.
198 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
199 cover=None, flags=None, customizations=None):
200 """ produces a PDF file with XeLaTeX
203 verbose: prints all output from LaTeX
204 save_tex: path to save the intermediary LaTeX file to
205 morefloats (old/new/none): force specific morefloats
206 cover: a cover.Cover factory or True for default
207 flags: less-advertising,
208 customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
# load the document together with its children; settings for the stylesheet
# are passed as attributes on the root element
213 book_info = wldoc.book_info
214 document = load_including_children(wldoc)
215 root = document.edoc.getroot()
# the cover factory (DefaultEbookCover for the default) is bound to the book
# at width 1200 and its dimensions are exposed on the root
219 cover = DefaultEbookCover
220 bound_cover = cover(book_info, width=1200)
221 root.set('data-cover-width', str(bound_cover.width))
222 root.set('data-cover-height', str(bound_cover.height))
# cover credits are added only when the cover uses the DC cover data
223 if bound_cover.uses_dc_cover:
224 if book_info.cover_by:
225 root.set('data-cover-by', book_info.cover_by)
226 if book_info.cover_source:
227 root.set('data-cover-source',
228 book_info.cover_source)
# each flag becomes a flag-<name>="yes" attribute
231 root.set('flag-' + flag, 'yes')
233 # check for LaTeX packages
# an explicit morefloats value is used lower-cased; otherwise 'new' is set
# when a morefloats package accepting maxfloats=19 is available
235 root.set('morefloats', morefloats.lower())
236 elif package_available('morefloats', 'maxfloats=19'):
237 root.set('morefloats', 'new')
# customizations are passed as one comma-separated attribute
240 if customizations is not None:
241 root.set('customizations', u','.join(customizations))
# editors are stored as a sorted, comma-separated list of readable names
244 editors = document.editors()
246 root.set('editors', u', '.join(sorted(
247 editor.readable() for editor in editors)))
248 if document.book_info.funders:
249 root.set('funders', u', '.join(document.book_info.funders))
250 if document.book_info.thanks:
251 root.set('thanks', document.book_info.thanks)
# in-place rewrites of the tree before the stylesheet is applied
254 move_motifs_inside(document.edoc)
255 hack_motifs(document.edoc)
256 parse_creator(document.edoc)
257 substitute_hyphens(document.edoc)
258 fix_hanging(document.edoc)
259 fix_tables(document.edoc)
# parse the wl2tex stylesheet
262 style_filename = get_stylesheet("wl2tex")
263 style = etree.parse(style_filename)
264 functions.reg_mathml_latex()
# scratch directory in which the LaTeX files are assembled
267 temp = mkdtemp('-wl2pdf')
# one data-sponsor element per sponsor; a logo file is copied into the
# scratch directory under a "sponsor-" prefixed name and referenced via src
269 for sponsor in book_info.sponsors:
270 ins = etree.Element("data-sponsor", name=sponsor)
271 logo = sponsor_logo(sponsor)
273 fname = 'sponsor-%s' % os.path.basename(logo)
274 shutil.copy(logo, os.path.join(temp, fname))
275 ins.set('src', fname)
278 if book_info.sponsor_note:
279 root.set("sponsor-note", book_info.sponsor_note)
# apply the stylesheet; the result is fed to TeXML's process() below
281 texml = document.transform(style)
# NOTE(review): cover.png is opened in text mode ('w'); if bound_cover.save
# writes binary image data, 'wb' would be the safer mode -- confirm.
284 with open(os.path.join(temp, 'cover.png'), 'w') as f:
285 bound_cover.save(f, quality=80)
287 del document # no longer needed large object :)
# TeXML is converted to LaTeX source written to doc.tex
# NOTE(review): fout is opened without a with-block and its close() is not
# visible in this view.
289 tex_path = os.path.join(temp, 'doc.tex')
290 fout = open(tex_path, 'w')
291 process(StringIO(texml), fout, 'utf-8')
# keep a copy of the intermediate LaTeX file at save_tex
296 shutil.copy(tex_path, save_tex)
# the document class and the logo are copied next to doc.tex
299 shutil.copy(get_resource('pdf/wl.cls'), temp)
300 shutil.copy(get_resource('res/wl-logo.png'), temp)
# run xelatex on doc.tex; the second form adds batch mode and pipes
# stdout/stderr.
# NOTE(review): the subprocess documentation warns against stdout=PIPE /
# stderr=PIPE with call(), since nothing reads the pipes and a full pipe
# buffer can block the child.
309 p = call(['xelatex', tex_path])
311 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
313 raise ParseError("Error parsing .tex file")
# doc.pdf is moved from the scratch directory to a named temporary file
# created with delete=False, which is returned wrapped in an OutputFile
318 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
319 pdf_path = os.path.join(temp, 'doc.pdf')
320 shutil.move(pdf_path, output_file.name)
322 return OutputFile.from_filename(output_file.name)
# Python 2 `except ..., e` syntax; the handler body is not visible here
324 except (XMLSyntaxError, XSLTApplyError), e:
def load_including_children(wldoc=None, provider=None, uri=None):
    """Make one big XML document with the children inserted at the end.

    Either `wldoc', or both `provider' and `uri', must be given.  The text
    is read from the provider (decoded as UTF-8) or serialized from
    `wldoc.edoc'; the children listed in the resulting document's
    book_info.parts are loaded recursively through the provider and their
    root elements are appended to this document's root.

    Returns the WLDocument built from the text.
    Raises ValueError when neither source is given.
    """
    if uri and provider:
        f = provider.by_uri(uri)
        try:
            text = f.read().decode('utf-8')
        finally:
            # release the handle even when reading or decoding fails
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding=unicode)
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    # Wrap every run of characters from U+0400-U+04FF (the Cyrillic block)
    # in <alien>.  The literals have the same values as the former ur"..."
    # spelling but avoid that Python-2-only prefix.
    text = re.sub(u"([\u0400-\u04ff]+)", u"<alien>\\1</alien>", text)

    document = WLDocument.from_string(
        text, parse_dublincore=True, provider=provider)
    document.swap_endlines()

    root = document.edoc.getroot()
    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        root.append(child.edoc.getroot())
    return document