1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 """PDF creation library.
8 Creates one big XML from the book and its children, converts it to LaTeX
9 with TeXML, then runs it by XeLaTeX.
12 from __future__ import with_statement
16 from distutils.dir_util import copy_tree # shutil.copytree is so uncapable.
17 from StringIO import StringIO
18 from tempfile import mkdtemp, NamedTemporaryFile
20 from copy import deepcopy
21 from subprocess import call, PIPE
23 from Texml.processor import process
24 from lxml import etree
25 from lxml.etree import XMLSyntaxError, XSLTApplyError
27 from librarian.dcparser import Person
28 from librarian.parser import WLDocument
29 from librarian import ParseError, DCNS, get_resource, OutputFile
30 from librarian import functions
31 from librarian.cover import WLCover
33 import itertools, operator
# Module-level setup: each call below runs a registration helper from
# librarian.functions at import time.
# NOTE(review): judging by the names, these presumably register helper
# functions used by the wl2tex stylesheet -- confirm in librarian/functions.py.
35 functions.reg_substitute_entities()
37 functions.reg_starts_white()
38 functions.reg_ends_white()
39 functions.reg_texcommand()
# Maps the stylesheet name 'wl2tex' to its resource path.
# NOTE(review): presumably an entry of the STYLESHEETS table that
# get_stylesheet() indexes; the enclosing literal's opening and closing lines
# are not visible in this excerpt.
42 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """Insert an empty <tagname> element at every match of `split_re`.

    Every text node in the `doc` tree (element text as well as element tail)
    is split on `split_re`.  The matched separators are dropped and an empty
    `tagname` element is put in the place of each one.  The tree is modified
    in place; nothing is returned.

    doc: lxml element (or tree) to process
    split_re: compiled regular expression marking the split points
    tagname: tag of the element inserted at each split point
    exclude: optional collection of tags; the text and tail of elements with
        these tags are left untouched (their descendants are still visited)

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        # Text and tail may be None (or empty); there is nothing to split then.
        if elem.text:
            chunks = split_re.split(elem.text)
            # Consume chunks from the end and always insert at the front, so
            # the new elements end up in document order before any children.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        if elem.tail:
            parent = elem.getparent()
            if parent is None:
                # A root element has no parent to hold new siblings; leave
                # its tail as it is instead of failing on None.
                continue
            chunks = split_re.split(elem.tail)
            ins_index = parent.index(elem) + 1
            # Same back-to-front trick: each insert lands right after `elem`.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
# Handles in-word hyphens in the document tree `doc`.
# The visible pattern matches a single "-" that has a character other than
# "-" or whitespace on both sides.  Elements tagged dc:identifier.url,
# dc:rights.license and "www" are listed as excluded.
# NOTE(review): the line that opens the call (presumably insert_tags(doc, ...))
# and the tag-name argument are not visible in this excerpt -- confirm against
# the full file before changing anything here.
84 def substitute_hyphens(doc):
86 re.compile("(?<=[^-\s])-(?=[^-\s])"),
88 exclude=[DCNS("identifier.url"), DCNS("rights.license"), "www"]
# NOTE(review): these two lines look like arguments of another call whose
# opening line and enclosing def are not visible in this excerpt (presumably
# fix_hanging(), which transform() invokes) -- confirm against the full file.
# The pattern matches a run of whitespace that is preceded by a whitespace
# character followed by exactly one word character, i.e. the gap after a
# one-letter word.  dc:identifier.url and dc:rights.license are excluded.
94 re.compile("(?<=\s\w)\s+"),
96 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
# NOTE(review): the enclosing def (presumably fake_tables(), which transform()
# calls), the def line of the tag_count reducer and the statements following
# each check are not visible in this excerpt.
100 for tabela in doc.findall("//tabela"):
101 # are we dealing with a table of proper structure?
102 # two levels of same tags, and all tags on second level
103 # must be of same count.
# Reducer step: increment the counter stored in dict m under k's tag.
105 m[k.tag] = m.get(k.tag, 0) + 1
# Tag -> occurrence count over the direct children of the table.
108 child_tags = reduce(tag_count, list(tabela), {})
# More than one distinct child tag (or no children at all) fails the check.
109 if len(child_tags) != 1:
# Tag -> occurrence count over all grandchildren of the table, pooled
# across every child.
111 grandchild_tags = reduce(tag_count, itertools.chain(*[list(c) for c in tabela]), {})
112 if len(grandchild_tags) != 1:
# NOTE(review): if the previous check skips tables with more than one
# grandchild tag (as it appears to), grandchild_tags holds exactly one value
# here and this condition can never be true.  Verifying "same count on the
# second level" would need per-child counts -- confirm the intent.
114 if len(set(grandchild_tags.values())) != 1:
def move_motifs_inside(doc):
    """Move top-level motifs into the block element that follows them.

    For every <motyw> that is a direct child of one of the master elements,
    the first following sibling that is not a separator/marker-type tag
    receives the motif as its first child.  A motif with no such sibling is
    left where it is.  The tree is modified in place.
    """
    masters = '|'.join('//' + tag for tag in (
        'powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp',
        'dramat_wierszowany_l', 'dramat_wierszowany_lp',
        'dramat_wspolczesny'))
    # Siblings with these tags are skipped when looking for a target block.
    not_blocks = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
                  'begin', 'end', 'motyw', 'extra', 'uwaga')

    for master in doc.xpath(masters):
        # xpath() returns a list, so moving motifs while looping is safe.
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag in not_blocks:
                    continue
                # motif shouldn't have a tail - it would be untagged text
                motif.tail = None
                motif.getparent().remove(motif)
                sib.insert(0, motif)
                break
# Workaround applied to every <motyw> found anywhere inside a <strofa>.
# Visible steps:
#   * walk up from the motif until the parent is the <strofa>; `verse` is then
#     the stanza's direct child that contains the motif;
#   * count the <br> siblings before and after that verse;
#   * when the verse has no <br> before it but some after it, or has exactly
#     one <br> after it, deep-copy the motif, clear the copy's tail, store
#     move_by in the copy's "moved" attribute and attach the copy right after
#     one of the following <br> elements.
# NOTE(review): the lines that assign move_by, the handling of the original
# motif and the logic choosing which <br> receives the copy are not visible in
# this excerpt -- confirm against the full file before editing.
136 def hack_motifs(doc):
137 """ dirty hack for the marginpar-creates-orphans LaTeX problem
138 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
140 moves motifs in stanzas from first verse to second
141 and from next to last to last, then inserts negative vspace before them
143 for motif in doc.findall('//strofa//motyw'):
144 # find relevant verse-level tag
145 verse, stanza = motif, motif.getparent()
146 while stanza is not None and stanza.tag != 'strofa':
147 verse, stanza = stanza, stanza.getparent()
148 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
149 breaks_after = sum(1 for i in verse.itersiblings('br'))
150 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
152 if breaks_after == 2:
154 moved_motif = deepcopy(motif)
157 moved_motif.tail = None
158 moved_motif.set('moved', str(move_by))
160 for br in verse.itersiblings('br'):
164 br.addnext(moved_motif)
def parse_creator(doc):
    """Generates readable versions of creator and translator tags.

    Finds all dc:creator and dc:contributor.translator tags
    and adds *_parsed versions with forenames first.

    Each *_parsed element is a copy of the original tag with its text
    replaced by Person.readable() and the original text stored in the
    'sortkey' attribute.  It is inserted as the first child of the original
    tag's parent.  Tags without text are skipped.  `doc` is modified in
    place.
    """
    query = "|".join(
        '//dc:' + tag for tag in ('creator', 'contributor.translator'))
    persons = doc.xpath(query, namespaces={'dc': str(DCNS)})

    # Every copy is inserted at index 0 of its parent, so walk the matches
    # backwards to keep the copies in document order.
    for person in reversed(persons):
        if not person.text:
            # Nothing to parse; Person.from_text() needs an actual string.
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Resolve the stylesheet registered under `name` via get_resource().

    `name` is looked up in STYLESHEETS and the entry found there is handed
    to get_resource(); whatever that call yields is returned unchanged.
    """
    resource_name = STYLESHEETS[name]
    return get_resource(resource_name)
# Probes the local TeX installation for a LaTeX package ("verion" in the
# docstring below is a typo for "version").
# Visible steps: create a temporary directory, build the path of test.tex in
# it, format a template with (args, package), run xelatex on that file with
# the temporary directory as output directory, then delete the directory.
# One xelatex call shows its output; the other runs in batch mode with
# stdout/stderr redirected to pipes.
# NOTE(review): the template text, the lines writing the file, the `verbose`
# branching and the return statement are not visible in this excerpt.
# NOTE(review): the subprocess documentation warns against stdout=PIPE /
# stderr=PIPE with call(): nothing reads the pipes, so a chatty child can
# block once the OS pipe buffer fills.  Redirecting to os.devnull avoids this.
191 def package_available(package, args='', verbose=False):
192 """ check if a verion of a latex package accepting given args is available """
193 tempdir = mkdtemp('-wl2pdf-test')
194 fpath = os.path.join(tempdir, 'test.tex')
201 """ % (args, package))
204 p = call(['xelatex', '-output-directory', tempdir, fpath])
206 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
207 shutil.rmtree(tempdir)
# Entry point of the module: turns a WLDocument into a PDF.
# Visible pipeline:
#   1. load the document together with its children and take the root element;
#   2. annotate the root with attributes: cover size and credits, flag-*,
#      morefloats, customizations, documentclass, editors;
#   3. run the tree fix-ups (motifs, tables, creators, hyphens, hanging);
#   4. apply the wl2tex stylesheet, optionally saving the TeXML;
#   5. write doc.tex into a fresh temporary directory with TeXML's process(),
#      copy LaTeX classes, images and optional resources there;
#   6. run xelatex and move doc.pdf to a NamedTemporaryFile path, returned
#      wrapped by OutputFile.from_filename().
# Raises ParseError("Error parsing .tex file"); XMLSyntaxError and
# XSLTApplyError are caught at the end.
# NOTE(review): many control-flow lines (if/else/try headers, the handler
# body, clean-up of the temporary directory) are not visible in this excerpt;
# the inline notes below mark what should be confirmed against the full file.
211 def transform(wldoc, verbose=False, save_tex=None, save_texml=None, morefloats=None,
212 cover=None, cover_file=None, flags=None, customizations=None, documentclass='wl', resources=None):
213 """ produces a PDF file with XeLaTeX
216 verbose: prints all output from LaTeX
217 save_tex: path to save the intermediary LaTeX file to
218 save_texml: path to save the intermediary TeXML file to
219 morefloats (old/new/none): force specific morefloats
220 cover: a cover.Cover factory or True for default
221 flags: less-advertising,
222 customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
223 documentclass: LaTeX document class, defaults to wl
224 resources: a directory with resources, copied to place where LaTeX compilation is made
229 book_info = wldoc.book_info
230 document = load_including_children(wldoc)
231 root = document.edoc.getroot()
236 bound_cover = cover(book_info)
237 root.set('data-cover-width', str(bound_cover.width))
238 root.set('data-cover-height', str(bound_cover.height))
239 if bound_cover.uses_dc_cover:
240 if book_info.cover_by:
241 root.set('data-cover-by', book_info.cover_by)
242 if book_info.cover_source:
243 root.set('data-cover-source',
244 book_info.cover_source)
247 root.set('flag-' + flag, 'yes')
249 # check for LaTeX packages
251 root.set('morefloats', morefloats.lower())
252 elif package_available('morefloats', 'maxfloats=19'):
253 root.set('morefloats', 'new')
256 if customizations is not None:
257 root.set('customizations', u','.join(customizations))
259 root.set('documentclass', documentclass or 'wl')
262 root.set('editors', u', '.join(sorted(
263 editor.readable() for editor in document.editors())))
266 move_motifs_inside(document.edoc)
267 hack_motifs(document.edoc)
268 fake_tables(document.edoc)
269 parse_creator(document.edoc)
270 substitute_hyphens(document.edoc)
271 fix_hanging(document.edoc)
274 style_filename = get_stylesheet("wl2tex")
275 style = etree.parse(style_filename)
277 texml = document.transform(style)
280 texml.write(save_texml)
283 temp = mkdtemp('-wl2pdf')
# NOTE(review): the file is named cover.png but is opened in text mode ('w');
# 'wb' is the portable mode for image data -- confirm what the (not visible)
# write call emits.
286 with open(os.path.join(temp, 'cover.png'), 'w') as f:
289 del document # no longer needed large object :)
291 tex_path = os.path.join(temp, 'doc.tex')
# NOTE(review): no matching close of fout is visible in this excerpt; confirm
# the file is closed (flushed) before it is copied and compiled.
292 fout = open(tex_path, 'w')
293 process(StringIO(texml), fout, 'utf-8')
298 shutil.copy(tex_path, save_tex)
301 shutil.copy(get_resource('pdf/wl.cls'), temp)
302 shutil.copy(get_resource('pdf/wlpub.cls'), temp)
303 shutil.copy(get_resource('pdf/fnprep.cls'), temp)
304 shutil.copy(get_resource('res/wl-logo.png'), temp)
305 shutil.copy(get_resource('res/cover.jpg'), temp)
307 copy_tree(resources, temp)
# os.putenv() changes the process environment inherited by child processes
# but does not update os.environ.
316 os.putenv("TEXINPUTS", "::.:%s" % resources)
319 p = call(['xelatex', tex_path])
# NOTE(review): the subprocess documentation warns against stdout=PIPE /
# stderr=PIPE with call(); nothing drains the pipes, so xelatex can block once
# the pipe buffer fills.
321 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
323 raise ParseError("Error parsing .tex file")
# delete=False: the temporary file must outlive this function, because its
# name is handed to OutputFile.from_filename() below.
328 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
329 pdf_path = os.path.join(temp, 'doc.pdf')
330 shutil.move(pdf_path, output_file.name)
332 return OutputFile.from_filename(output_file.name)
# NOTE(review): "except X, e" is Python 2-only syntax; "except X as e" works
# from Python 2.6 on.  The handler body is not visible in this excerpt.
334 except (XMLSyntaxError, XSLTApplyError), e:
def load_including_children(wldoc=None, provider=None, uri=None):
    """Makes one big XML document with children inserted at the end.

    Either wldoc or provider and URI must be provided.

    wldoc: an already parsed document; its tree is serialized and re-parsed,
        and its provider is used for loading the children
    provider: object whose by_uri(uri) returns a readable file-like object
    uri: URI of the document to load through `provider`

    Returns the WLDocument for the requested book, with the root element of
    every part listed in book_info.parts (loaded recursively through the
    provider) appended to its root.

    Raises ValueError when neither a wldoc nor a provider/URI pair is given.
    """
    if uri and provider:
        f = provider.by_uri(uri)
        try:
            text = f.read().decode('utf-8')
        finally:
            # Release the provider's file even if reading or decoding fails.
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding=unicode)
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    # Wrap every run of characters from the Cyrillic block (U+0400-U+04FF)
    # in an <alien> element.
    text = re.sub(u"([\u0400-\u04ff]+)", u"<alien>\\1</alien>", text)

    document = WLDocument.from_string(
        text, parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())

    return document