1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 """PDF creation library.
8 Creates one big XML from the book and its children, converts it to LaTeX
with TeXML, then runs it through XeLaTeX.
12 from __future__ import with_statement
import os
import re
import shutil
from copy import deepcopy
from StringIO import StringIO
from subprocess import call, PIPE
from tempfile import mkdtemp, NamedTemporaryFile

from Texml.processor import process
from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian.dcparser import Person
from librarian.parser import WLDocument
from librarian import ParseError, DCNS, get_resource, OutputFile
from librarian import functions
from librarian.cover import DefaultEbookCover
from .sponsor import sponsor_logo
# Run Librarian's registration hooks once, at import time.
# NOTE(review): presumably these register XPath/XSLT extension functions
# needed by the wl2tex stylesheet -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()
# Maps a stylesheet name to its resource path; get_stylesheet() looks the
# name up here and resolves the path through get_resource().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
def insert_tags(doc, split_re, tagname, exclude=None):
    """Insert an empty <tagname/> element at every match of `split_re`.

    Every occurrence of `split_re` in the text nodes (element text and
    element tails) of the `doc` tree is removed and replaced by a new,
    empty `tagname` element.  Elements whose tag is listed in `exclude`
    are skipped entirely: neither their text nor their tail is processed.

    `split_re` should not contain capturing groups, because `re.split`
    returns captured groups as additional chunks and they would be
    written back into the tree as text.

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        # Guard against None/empty text: re.split() would raise on None.
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are consumed from the end and each new element is
            # inserted at index 0, so the final order matches the text.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # New elements go right after `elem`; inserting repeatedly at
            # the same index (last chunk first) keeps them in text order.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
def substitute_hyphens(doc):
    """Replace intra-word hyphens in `doc` with empty marker elements.

    A hyphen qualifies when the characters directly before and after it
    are neither hyphens nor whitespace.  Text inside dc:identifier.url
    and dc:rights.license elements is left untouched.
    """
    # NOTE(review): the "dywiz" tag name is not visible in this excerpt; it
    # was restored from the upstream Librarian source -- confirm against
    # the wl2tex stylesheet.
    insert_tags(
        doc,
        re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
        "dywiz",
        exclude=[DCNS("identifier.url"), DCNS("rights.license")]
    )
def fix_hanging(doc):
    """Replace whitespace after one-character words with marker elements.

    The pattern matches a run of whitespace that directly follows a single
    word character which is itself preceded by whitespace.  Text inside
    dc:identifier.url and dc:rights.license elements is left untouched.
    """
    # NOTE(review): the "nbsp" tag name is not visible in this excerpt; it
    # was restored from the upstream Librarian source -- confirm against
    # the wl2tex stylesheet.
    insert_tags(
        doc,
        re.compile(r"(?<=\s\w)\s+"),
        "nbsp",
        exclude=[DCNS("identifier.url"), DCNS("rights.license")]
    )
def move_motifs_inside(doc):
    """Move motifs that sit directly under a master element into a block.

    For every `motyw` that is a direct child of one of the master elements,
    the first following sibling that is not one of the skipped tags becomes
    its new parent: the motif is inserted as that sibling's first child.
    A motif with no such sibling is left where it is.
    """
    master_tags = (
        'powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp',
        'dramat_wierszowany_l', 'dramat_wierszowany_lp',
        'dramat_wspolczesny',
    )
    # Siblings that never receive a motif; the search steps over them.
    skipped_tags = (
        'sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
        'begin', 'end', 'motyw', 'extra', 'uwaga',
    )
    masters_xpath = '|'.join('//' + tag for tag in master_tags)

    for master in doc.xpath(masters_xpath):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag in skipped_tags:
                    continue
                # motif shouldn't have a tail - it would be untagged text
                motif.tail = None
                motif.getparent().remove(motif)
                sib.insert(0, motif)
                # Only the first suitable sibling takes the motif.
                break
def hack_motifs(doc):
    """ dirty hack for the marginpar-creates-orphans LaTeX problem
    see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304

    moves motifs in stanzas from first verse to second
    and from next to last to last, then inserts negative vspace before them

    A copy of the motif, marked with a `moved` attribute holding the number
    of verses it was shifted by, is placed after the appropriate `br`.
    """
    # './/' is what lxml's ElementTree.findall() turns a leading '//' into;
    # spelling it out also works when `doc` is a plain element.
    for motif in doc.findall('.//strofa//motyw'):
        # find relevant verse-level tag
        verse, stanza = motif, motif.getparent()
        while stanza is not None and stanza.tag != 'strofa':
            verse, stanza = stanza, stanza.getparent()
        breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
        breaks_after = sum(1 for i in verse.itersiblings('br'))
        if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
            move_by = 1
            if breaks_after == 2:
                # Shifting by one verse would land on the next-to-last
                # verse, which has the same problem - go one further.
                move_by += 1
            moved_motif = deepcopy(motif)
            # NOTE(review): the statements neutralising the original motif
            # are not visible in this excerpt; restored from the upstream
            # Librarian source (turn it into an empty span) -- confirm.
            motif.tag = 'span'
            motif.text = None
            moved_motif.tail = None
            moved_motif.set('moved', str(move_by))

            # Skip move_by - 1 line breaks, then attach after the next one.
            for br in verse.itersiblings('br'):
                if move_by > 1:
                    move_by -= 1
                    continue
                br.addnext(moved_motif)
                break
def parse_creator(doc):
    """Generates readable versions of creator and translator tags.

    Finds all dc:creator and dc:contributor.translator tags
    and adds *_parsed versions with forenames first.  The original text
    is kept on the copy as its `sortkey` attribute.  Tags without any
    text are skipped.
    """
    xpath = "|".join(
        '//dc:' + tag for tag in ('creator', 'contributor.translator'))
    persons = doc.xpath(xpath, namespaces={'dc': str(DCNS)})

    # Walk in reverse: every copy is inserted at index 0 of its parent,
    # so reversing keeps the copies in document order.
    for person in reversed(persons):
        if not person.text:
            # Nothing to parse; Person.from_text() needs actual text.
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Look `name` up in STYLESHEETS and resolve it with get_resource().

    Raises KeyError when no stylesheet is registered under `name`.
    """
    resource = STYLESHEETS[name]
    return get_resource(resource)
def package_available(package, args='', verbose=False):
    """Check if a version of a LaTeX package accepting given args is available.

    Writes a minimal document that loads `package` with the options `args`
    into a temporary directory and compiles it with XeLaTeX.

    package: name of the LaTeX package to test
    args: option string passed to the package
    verbose: when true, XeLaTeX output is shown instead of discarded

    Returns True when XeLaTeX exits with status 0.  An OSError from
    launching `xelatex` (e.g. not installed) is propagated; the temporary
    directory is removed either way.
    """
    tempdir = mkdtemp('-wl2pdf-test')
    try:
        fpath = os.path.join(tempdir, 'test.tex')
        # NOTE(review): the LaTeX preamble is not visible in this excerpt;
        # it was restored from the upstream Librarian source, which tests
        # against the `wl` document class -- confirm.
        with open(fpath, 'w') as f:
            f.write(r"""
                \documentclass{wl}
                \usepackage[%s]{%s}
                \begin{document}
                \end{document}
                """ % (args, package))

        if verbose:
            status = call(['xelatex', '-output-directory', tempdir, fpath])
        else:
            # Discard output via os.devnull: call() never reads from a
            # PIPE, so a chatty child could block forever on a full pipe.
            with open(os.devnull, 'w') as devnull:
                status = call(
                    ['xelatex', '-interaction=batchmode',
                     '-output-directory', tempdir, fpath],
                    stdout=devnull, stderr=devnull)
    finally:
        shutil.rmtree(tempdir, ignore_errors=True)
    return status == 0
def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
              cover=None, flags=None, customizations=None):
    """Produce a PDF file from a WLDocument with XeLaTeX.

    wldoc: a WLDocument
    verbose: prints all output from LaTeX
    save_tex: path to save the intermediary LaTeX file to
    morefloats (old/new/none): force specific morefloats
    cover: a cover.Cover factory or True for default
    flags: less-advertising,
    customizations: user requested customizations regarding various
        formatting parameters (passed to wl LaTeX class)

    Returns the result of OutputFile.from_filename() for the generated PDF.
    Raises ParseError when the XML/XSLT stage fails or when XeLaTeX exits
    with a non-zero status.  The temporary working directory is removed
    in both cases.
    """
    try:
        book_info = wldoc.book_info
        document = load_including_children(wldoc)
        root = document.edoc.getroot()

        if cover:
            if cover is True:
                cover = DefaultEbookCover
            bound_cover = cover(book_info, width=1200)
            root.set('data-cover-width', str(bound_cover.width))
            root.set('data-cover-height', str(bound_cover.height))
            if bound_cover.uses_dc_cover:
                if book_info.cover_by:
                    root.set('data-cover-by', book_info.cover_by)
                if book_info.cover_source:
                    root.set('data-cover-source', book_info.cover_source)
        if flags:
            for flag in flags:
                root.set('flag-' + flag, 'yes')

        # check for LaTeX packages
        if morefloats:
            root.set('morefloats', morefloats.lower())
        elif package_available('morefloats', 'maxfloats=19'):
            root.set('morefloats', 'new')

        # add customizations
        if customizations is not None:
            root.set('customizations', u','.join(customizations))

        # add editors info
        editors = document.editors()
        if editors:
            root.set('editors', u', '.join(sorted(
                editor.readable() for editor in editors)))
        if document.book_info.funders:
            root.set('funders', u', '.join(document.book_info.funders))
        if document.book_info.thanks:
            root.set('thanks', document.book_info.thanks)

        # hack the tree
        move_motifs_inside(document.edoc)
        hack_motifs(document.edoc)
        parse_creator(document.edoc)
        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)

        # wl -> TeXML
        style_filename = get_stylesheet("wl2tex")
        style = etree.parse(style_filename)

        # TeXML -> LaTeX
        temp = mkdtemp('-wl2pdf')
        try:
            for sponsor in book_info.sponsors:
                ins = etree.Element("data-sponsor", name=sponsor)
                logo = sponsor_logo(sponsor)
                if logo:
                    fname = 'sponsor-%s' % os.path.basename(logo)
                    shutil.copy(logo, os.path.join(temp, fname))
                    ins.set('src', fname)
                # NOTE(review): where the sponsor element is attached is
                # not visible in this excerpt; restored from the upstream
                # Librarian source -- confirm.
                root.insert(0, ins)

            if book_info.sponsor_note:
                root.set("sponsor-note", book_info.sponsor_note)

            texml = document.transform(style)

            if cover:
                # Binary mode: the cover is image data, not text.
                with open(os.path.join(temp, 'cover.png'), 'wb') as f:
                    bound_cover.save(f, quality=80)

            del document  # no longer needed large object :)

            tex_path = os.path.join(temp, 'doc.tex')
            with open(tex_path, 'w') as fout:
                process(StringIO(texml), fout, 'utf-8')
            del texml

            if save_tex:
                shutil.copy(tex_path, save_tex)

            # LaTeX -> PDF
            shutil.copy(get_resource('pdf/wl.cls'), temp)
            shutil.copy(get_resource('res/wl-logo.png'), temp)

            # Run XeLaTeX inside the temp dir through `cwd=` rather than
            # os.chdir(), so the process-wide working directory is never
            # changed and cannot be left wrong after an error.
            if verbose:
                status = call(['xelatex', tex_path], cwd=temp)
            else:
                # Discard output via os.devnull: call() never reads from
                # a PIPE, so a full pipe buffer would block XeLaTeX.
                with open(os.devnull, 'w') as devnull:
                    status = call(
                        ['xelatex', '-interaction=batchmode', tex_path],
                        cwd=temp, stdout=devnull, stderr=devnull)
            if status:
                raise ParseError("Error parsing .tex file")

            output_file = NamedTemporaryFile(
                prefix='librarian', suffix='.pdf', delete=False)
            # Only the reserved name is needed; release the handle before
            # the PDF is moved over it.
            output_file.close()
            pdf_path = os.path.join(temp, 'doc.pdf')
            shutil.move(pdf_path, output_file.name)
            return OutputFile.from_filename(output_file.name)
        finally:
            shutil.rmtree(temp, ignore_errors=True)

    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
def load_including_children(wldoc=None, provider=None, uri=None):
    """Make one big XML document with children inserted at the end.

    Either wldoc or provider and URI must be provided.  When both a URI
    and a provider are given, the text is read from the provider;
    otherwise it is serialized from `wldoc`, whose provider is then used
    for loading the children.

    Every part listed in the document's book_info.parts is loaded
    recursively and its root element is appended to this document's root.

    Returns the combined WLDocument.  Raises ValueError when neither
    source is given.
    """
    if uri and provider:
        f = provider.by_uri(uri)
        try:
            text = f.read().decode('utf-8')
        finally:
            # Close the source even when reading or decoding fails.
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding=unicode)
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    # Wrap every run of characters from U+0400-U+04FF in <alien> tags.
    # Plain u"" literals with an escaped backreference yield the same
    # pattern and replacement as the previous ur"" literals.
    text = re.sub(u"([\u0400-\u04ff]+)", u"<alien>\\1</alien>", text)

    document = WLDocument.from_string(
        text, parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())
    return document