# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
"""PDF creation library.

Creates one big XML from the book and its children, converts it to LaTeX
with TeXML, then runs it through XeLaTeX.

"""
from __future__ import with_statement

import os
import os.path
import re
import shutil
from copy import deepcopy
from StringIO import StringIO
from subprocess import call, PIPE
from tempfile import mkdtemp, NamedTemporaryFile

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError
from Texml.processor import process

from librarian import ParseError, DCNS, get_resource, OutputFile
from librarian import functions
from librarian.cover import WLCover
from librarian.dcparser import Person
from librarian.parser import WLDocument
# Module-level registration calls from librarian.functions; presumably they
# make helper functions available to the XSLT stylesheet -- confirm there.
functions.reg_substitute_entities()
# NOTE(review): reg_strip() is not visible in the damaged source (one line is
# missing between the neighbouring calls) -- confirm it belongs here.
functions.reg_strip()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Stylesheet name -> resource path, looked up by get_stylesheet().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
def _split_into_tags(split_re, text, tagname):
    """Split `text` on `split_re`; return (head, [new elements]).

    `head` is the text before the first match.  Every following chunk
    becomes the tail of a fresh, empty <tagname> element; the elements are
    returned in document order.
    """
    chunks = split_re.split(text)
    inserts = []
    for chunk in chunks[1:]:
        ins = etree.Element(tagname)
        ins.tail = chunk
        inserts.append(ins)
    return chunks[0], inserts


def insert_tags(doc, split_re, tagname, exclude=None):
    """ inserts <tagname> for every occurence of `split_re' in text nodes in the `doc' tree

    doc: lxml tree or element; modified in place
    split_re: compiled regular expression marking the places to replace
    tagname: tag of the empty element inserted for every match
    exclude: optional collection of tags; for elements with these tags
        neither their text nor their tail is processed

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue
        # text/tail are None when absent, so guard before splitting.
        if elem.text:
            elem.text, inserts = _split_into_tags(split_re, elem.text, tagname)
            # New elements go in front of the existing children.
            for offset, ins in enumerate(inserts):
                elem.insert(offset, ins)
        if elem.tail:
            # An element with a tail always has a parent.
            parent = elem.getparent()
            ins_index = parent.index(elem) + 1
            elem.tail, inserts = _split_into_tags(split_re, elem.tail, tagname)
            # New elements follow `elem` directly, keeping their order.
            for offset, ins in enumerate(inserts):
                parent.insert(ins_index + offset, ins)
def substitute_hyphens(doc):
    """Replace hyphens between two non-space, non-hyphen characters with an
    empty element, in place.

    Text and tails of dc:identifier.url and dc:rights.license elements are
    left untouched.
    """
    insert_tags(
        doc,
        # A single hyphen with a non-space, non-hyphen character on each side.
        re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
        # NOTE(review): the tag name is not visible in the damaged source;
        # "dywiz" is assumed -- confirm against the wl2tex stylesheet.
        "dywiz",
        exclude=[DCNS("identifier.url"), DCNS("rights.license")]
    )
def fix_hanging(doc):
    """Replace the whitespace that follows a one-character word with an
    empty element, in place.

    Text and tails of dc:identifier.url and dc:rights.license elements are
    left untouched.
    """
    insert_tags(
        doc,
        # Whitespace preceded by "<whitespace><single word character>".
        re.compile(r"(?<=\s\w)\s+"),
        # NOTE(review): the tag name is not visible in the damaged source;
        # "nbsp" is assumed -- confirm against the wl2tex stylesheet.
        "nbsp",
        exclude=[DCNS("identifier.url"), DCNS("rights.license")]
    )
def move_motifs_inside(doc):
    """ moves motifs to be into block elements

    Every <motyw> that is a direct child of a master element is moved to the
    beginning of the first following sibling whose tag is not one of the
    skipped tags below.  A motif with no such sibling stays where it is.
    The tree is modified in place.
    """
    masters = doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny')
    # Siblings a motif is never moved into.
    skipped_tags = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
                    'begin', 'end', 'motyw', 'extra', 'uwaga')
    for master in masters:
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag in skipped_tags:
                    continue
                # motif shouldn't have a tail - it would be untagged text
                motif.tail = None
                motif.getparent().remove(motif)
                sib.insert(0, motif)
                # Stop: the sibling iterator is no longer valid for this motif.
                break
def hack_motifs(doc):
    """ dirty hack for the marginpar-creates-orphans LaTeX problem
    see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304

    moves motifs in stanzas from first verse to second
    and from next to last to last, then inserts negative vspace before them

    The tree is modified in place: a copy of the motif, carrying a `moved`
    attribute with the number of verses it was moved by, is placed after a
    following <br/>, and the original element is emptied.
    """
    for motif in doc.findall('//strofa//motyw'):
        # find relevant verse-level tag
        verse, stanza = motif, motif.getparent()
        while stanza is not None and stanza.tag != 'strofa':
            verse, stanza = stanza, stanza.getparent()
        breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
        breaks_after = sum(1 for i in verse.itersiblings('br'))
        if not ((breaks_before == 0 and breaks_after > 0) or breaks_after == 1):
            continue

        move_by = 1
        if breaks_after == 2:
            move_by += 1
        moved_motif = deepcopy(motif)
        # Neutralise the original so the motif is not rendered twice.
        # NOTE(review): the replacement tag is not visible in the damaged
        # source; 'span' is assumed -- confirm against the stylesheet.
        motif.tag = 'span'
        motif.text = None
        moved_motif.tail = None
        moved_motif.set('moved', str(move_by))

        # Skip move_by - 1 breaks, then attach the copy after the next one.
        for br in verse.itersiblings('br'):
            if move_by > 1:
                move_by -= 1
                continue
            br.addnext(moved_motif)
            break
def parse_creator(doc):
    """Generates readable versions of creator and translator tags.

    Finds all dc:creator and dc.contributor.translator tags
    and adds *_parsed versions with forenames first.

    Each *_parsed element is a copy of the original with the original text
    stored in its `sortkey` attribute; it is inserted as the first child of
    the original's parent.  Elements without text are skipped.
    """
    tags = ('creator', 'contributor.translator')
    query = "|".join('//dc:' + tag for tag in tags)
    persons = doc.xpath(query, namespaces={'dc': str(DCNS)})
    # Walk in reverse: every copy is inserted at index 0, so the copies end
    # up in the same order as the originals.
    for person in reversed(persons):
        if not person.text:
            # Nothing to parse in an empty element.
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Look up `name` in STYLESHEETS and resolve it with get_resource().

    Raises KeyError for a name that is not in STYLESHEETS.
    """
    resource_path = STYLESHEETS[name]
    return get_resource(resource_path)
def package_available(package, args='', verbose=False):
    """ check if a verion of a latex package accepting given args is available

    package: name of the LaTeX package
    args: options passed to the package in \\usepackage[...]
    verbose: when true, XeLaTeX output is not suppressed

    Compiles a minimal document using the package in a temporary directory
    and returns True when xelatex exits with status 0.  The temporary
    directory is always removed.
    """
    tempdir = mkdtemp('-wl2pdf-test')
    try:
        fpath = os.path.join(tempdir, 'test.tex')
        with open(fpath, 'w') as f:
            # NOTE(review): only the closing `""" % (args, package))` of this
            # template is visible in the damaged source -- confirm the body.
            f.write(r"""
                \documentclass{wl}
                \usepackage[%s]{%s}
                \begin{document}
                \end{document}
                """ % (args, package))
        if verbose:
            p = call(['xelatex', '-output-directory', tempdir, fpath])
        else:
            # Discard the output instead of using PIPE: nobody reads the
            # pipes, so a chatty child could block forever on a full pipe.
            with open(os.devnull, 'w') as devnull:
                p = call(['xelatex', '-interaction=batchmode',
                          '-output-directory', tempdir, fpath],
                         stdout=devnull, stderr=devnull)
    finally:
        shutil.rmtree(tempdir)
    return p == 0
def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
              cover=None, flags=None, customizations=None):
    """ produces a PDF file with XeLaTeX

    wldoc: the document to convert (passed to load_including_children)
    verbose: prints all output from LaTeX
    save_tex: path to save the intermediary LaTeX file to
    morefloats (old/new/none): force specific morefloats
    cover: a cover.Cover factory or True for default
    flags: less-advertising,
    customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)

    Returns OutputFile.from_filename() for a temporary PDF file.
    Raises ParseError when the XML/XSLT processing fails or when xelatex
    exits with a non-zero status; in the latter case the temporary working
    directory is left in place.
    """
    try:
        book_info = wldoc.book_info
        document = load_including_children(wldoc)
        root = document.edoc.getroot()

        if cover:
            if cover is True:
                cover = WLCover
            bound_cover = cover(book_info)
            root.set('data-cover-width', str(bound_cover.width))
            root.set('data-cover-height', str(bound_cover.height))
            if bound_cover.uses_dc_cover:
                if book_info.cover_by:
                    root.set('data-cover-by', book_info.cover_by)
                if book_info.cover_source:
                    root.set('data-cover-source',
                             book_info.cover_source)
        if flags:
            for flag in flags:
                root.set('flag-' + flag, 'yes')

        # check for LaTeX packages
        if morefloats:
            root.set('morefloats', morefloats.lower())
        elif package_available('morefloats', 'maxfloats=19'):
            root.set('morefloats', 'new')

        # add customizations
        if customizations is not None:
            root.set('customizations', u','.join(customizations))

        # add editors info
        root.set('editors', u', '.join(sorted(
            editor.readable() for editor in document.editors())))
        if document.book_info.funders:
            root.set('funders', u', '.join(document.book_info.funders))
        if document.book_info.thanks:
            root.set('thanks', document.book_info.thanks)

        # hack the tree
        move_motifs_inside(document.edoc)
        hack_motifs(document.edoc)
        parse_creator(document.edoc)
        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)

        # wl -> TeXML
        style_filename = get_stylesheet("wl2tex")
        style = etree.parse(style_filename)

        texml = document.transform(style)

        # TeXML -> LaTeX
        temp = mkdtemp('-wl2pdf')

        if cover:
            # Binary mode: the cover is a PNG image.
            with open(os.path.join(temp, 'cover.png'), 'wb') as f:
                # NOTE(review): the statement writing the cover is not
                # visible in the damaged source; save() is assumed.
                bound_cover.save(f)

        del document  # no longer needed large object :)

        tex_path = os.path.join(temp, 'doc.tex')
        with open(tex_path, 'w') as fout:
            process(StringIO(texml), fout, 'utf-8')
        del texml

        if save_tex:
            shutil.copy(tex_path, save_tex)

        # LaTeX -> PDF
        shutil.copy(get_resource('pdf/wl.cls'), temp)
        shutil.copy(get_resource('res/wl-logo.png'), temp)

        # Run xelatex inside the temporary directory via `cwd=` rather than
        # os.chdir(), so the working directory of this process is never
        # changed (and never left changed when an error is raised).
        if verbose:
            p = call(['xelatex', tex_path], cwd=temp)
        else:
            # Discard the output instead of using PIPE: nobody reads the
            # pipes, so a chatty child could block forever on a full pipe.
            with open(os.devnull, 'w') as devnull:
                p = call(['xelatex', '-interaction=batchmode', tex_path],
                         cwd=temp, stdout=devnull, stderr=devnull)
        if p:
            raise ParseError("Error parsing .tex file")

        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
        # Only the reserved name is needed; close the handle before the PDF
        # is moved over it.
        output_file.close()
        pdf_path = os.path.join(temp, 'doc.pdf')
        shutil.move(pdf_path, output_file.name)
        shutil.rmtree(temp)
        return OutputFile.from_filename(output_file.name)

    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
def load_including_children(wldoc=None, provider=None, uri=None):
    """ Makes one big xml file with children inserted at end.

    Either wldoc or provider and URI must be provided.

    wldoc: an already loaded document; its `edoc` tree is serialized and
        its `provider` is used to load the children
    provider: object whose by_uri() returns a readable file-like object
    uri: URI of the document to load through `provider`

    Returns a WLDocument whose root element has the root elements of all
    parts listed in book_info.parts appended (recursively).
    Raises ValueError when neither input variant is given.
    """
    if uri and provider:
        f = provider.by_uri(uri)
        try:
            text = f.read().decode('utf-8')
        finally:
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding=unicode)
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    # Wrap every run of characters from U+0400..U+04FF in <alien>.
    # Same values as the former ur"..." literals, in a spelling that is
    # also valid outside Python 2.
    text = re.sub(u"([\u0400-\u04ff]+)", u"<alien>\\1</alien>", text)

    document = WLDocument.from_string(text,
                                      parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())
    return document