1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 """PDF creation library.
8 Creates one big XML from the book and its children, converts it to LaTeX
9 with TeXML, then runs it by XeLaTeX.
12 from __future__ import with_statement
16 from StringIO import StringIO
17 from tempfile import mkdtemp, NamedTemporaryFile
19 from copy import deepcopy
20 from subprocess import call, PIPE
22 from Texml.processor import process
23 from lxml import etree
24 from lxml.etree import XMLSyntaxError, XSLTApplyError
26 from librarian.dcparser import Person
27 from librarian.parser import WLDocument
28 from librarian import ParseError, DCNS, get_resource, OutputFile
29 from librarian import functions
30 from librarian.cover import WLCover
# Run Librarian's registration hooks once, at import time.
# NOTE(review): presumably these expose helpers to the XSLT stylesheets as
# extension functions -- confirm against librarian.functions.
functions.reg_substitute_entities()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()
functions.reg_urlquote()
functions.reg_breakurl()
42 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """Insert an empty <tagname/> for every occurrence of `split_re` in text.

    Visits every element of the `doc` tree and splits both its leading text
    and its tail on `split_re`.  The matched text is dropped and an empty
    `tagname` element is put in its place.  The tree is modified in place;
    nothing is returned.

    doc      -- lxml element (or tree) to process
    split_re -- compiled regular expression marking the insertion points
    tagname  -- tag of the elements to insert
    exclude  -- optional collection of tags; neither the text nor the tail
                of an element with one of these tags is touched (its
                descendants are still visited)

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        if elem.text:
            chunks = split_re.split(elem.text)
            # Every insert lands at index 0, so feed the chunks in from the
            # last one backwards to end up in document order.
            for chunk in reversed(chunks[1:]):
                ins = etree.Element(tagname)
                ins.tail = chunk
                elem.insert(0, ins)
            elem.text = chunks[0]

        if elem.tail:
            parent = elem.getparent()
            if parent is None:
                # A parentless element has no sibling slot to insert into.
                continue
            chunks = split_re.split(elem.tail)
            ins_index = parent.index(elem) + 1
            # Same trick as above: a fixed insertion index plus reversed
            # chunks keeps the new siblings in document order.
            for chunk in reversed(chunks[1:]):
                ins = etree.Element(tagname)
                ins.tail = chunk
                parent.insert(ins_index, ins)
            elem.tail = chunks[0]
84 def substitute_hyphens(doc):
86 re.compile("(?<=[^-\s])-(?=[^-\s])"),
88 exclude=[DCNS("identifier.url"), DCNS("rights.license"), 'www']
94 re.compile("(?<=\s\w)\s+"),
96 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
def move_motifs_inside(doc):
    """Move motifs that sit directly under a master tag into a block element.

    For every `motyw` that is a direct child of one of the master (genre)
    elements, the first following sibling that is not a separator, marker
    or another motif receives the motif as its first child.  A motif with no
    such sibling is left where it is.  The tree is modified in place.
    """
    master_tags = (
        'powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp',
        'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny',
    )
    # Siblings that cannot host a motif and are stepped over.
    skipped_tags = (
        'sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
        'begin', 'end', 'motyw', 'extra', 'uwaga',
    )
    masters_xpath = '|'.join('//' + tag for tag in master_tags)

    for master in doc.xpath(masters_xpath):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag in skipped_tags:
                    continue
                # motif shouldn't have a tail - it would be untagged text
                motif.tail = None
                motif.getparent().remove(motif)
                sib.insert(0, motif)
                # The motif has found its block; stop scanning siblings.
                break
113 def hack_motifs(doc):
114 """ dirty hack for the marginpar-creates-orphans LaTeX problem
115 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
117 moves motifs in stanzas from first verse to second
118 and from next to last to last, then inserts negative vspace before them
120 for motif in doc.findall('//strofa//motyw'):
121 # find relevant verse-level tag
122 verse, stanza = motif, motif.getparent()
123 while stanza is not None and stanza.tag != 'strofa':
124 verse, stanza = stanza, stanza.getparent()
125 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
126 breaks_after = sum(1 for i in verse.itersiblings('br'))
127 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
129 if breaks_after == 2:
131 moved_motif = deepcopy(motif)
134 moved_motif.tail = None
135 moved_motif.set('moved', str(move_by))
137 for br in verse.itersiblings('br'):
141 br.addnext(moved_motif)
def parse_creator(doc):
    """Generate readable versions of the person metadata tags.

    Finds all dc:creator, dc:contributor.translator, dc:contributor.editor
    and dc:contributor.technical_editor tags and adds *_parsed versions
    with forenames first.  Each *_parsed copy is inserted at the start of
    the original's parent, keeps the original text in its `sortkey`
    attribute and gets Person.readable() as its text.  Tags without text
    are skipped.
    """
    person_tags = (
        'creator', 'contributor.translator',
        'contributor.editor', 'contributor.technical_editor',
    )
    query = "|".join('//dc:' + tag for tag in person_tags)
    persons = doc.xpath(query, namespaces={'dc': str(DCNS)})

    # Copies are inserted at index 0, so process the matches backwards to
    # leave the copies in the same relative order as the originals.
    for person in reversed(persons):
        if not person.text:
            # Nothing to parse; Person.from_text() needs actual text.
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Resolve the stylesheet registered under `name` via get_resource().

    `name` is looked up in STYLESHEETS; an unknown name propagates the
    lookup error unchanged.
    """
    resource_name = STYLESHEETS[name]
    return get_resource(resource_name)
169 def package_available(package, args='', verbose=False):
170 """ check if a verion of a latex package accepting given args is available """
171 tempdir = mkdtemp('-wl2pdf-test')
172 fpath = os.path.join(tempdir, 'test.tex')
179 """ % (args, package))
182 p = call(['xelatex', '-output-directory', tempdir, fpath])
184 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
185 shutil.rmtree(tempdir)
189 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
190 cover=None, flags=None, customizations=None):
191 """ produces a PDF file with XeLaTeX
194 verbose: prints all output from LaTeX
195 save_tex: path to save the intermediary LaTeX file to
196 morefloats (old/new/none): force specific morefloats
197 cover: a cover.Cover factory or True for default
198 flags: less-advertising,
199 customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
204 book_info = wldoc.book_info
205 document = load_including_children(wldoc)
206 root = document.edoc.getroot()
211 bound_cover = cover(book_info)
212 root.set('data-cover-width', str(bound_cover.width))
213 root.set('data-cover-height', str(bound_cover.height))
214 if bound_cover.uses_dc_cover:
215 if book_info.cover_by:
216 root.set('data-cover-by', book_info.cover_by)
217 if book_info.cover_source:
218 root.set('data-cover-source',
219 book_info.cover_source)
222 root.set('flag-' + flag, 'yes')
224 # check for LaTeX packages
226 root.set('morefloats', morefloats.lower())
227 elif package_available('morefloats', 'maxfloats=19'):
228 root.set('morefloats', 'new')
231 if customizations is not None:
232 root.set('customizations', u','.join(customizations))
235 root.set('editors', u', '.join(sorted(
236 editor.readable() for editor in document.editors())))
239 move_motifs_inside(document.edoc)
240 hack_motifs(document.edoc)
241 parse_creator(document.edoc)
242 substitute_hyphens(document.edoc)
243 fix_hanging(document.edoc)
246 style_filename = get_stylesheet("wl2tex")
247 style = etree.parse(style_filename)
249 texml = document.transform(style)
252 temp = mkdtemp('-wl2pdf')
254 for ilustr in document.edoc.findall("//ilustr"):
255 shutil.copy(ilustr.get("src"), temp)
258 with open(os.path.join(temp, 'cover.png'), 'w') as f:
261 del document # no longer needed large object :)
263 tex_path = os.path.join(temp, 'doc.tex')
264 fout = open(tex_path, 'w')
265 process(StringIO(texml), fout, 'utf-8')
270 shutil.copy(tex_path, save_tex)
273 shutil.copy(get_resource('pdf/wl.cls'), temp)
274 shutil.copy(get_resource('res/wl-logo.png'), temp)
275 #shutil.copy(get_resource('res/prawokultury-logo.png'), temp)
276 #shutil.copy(get_resource('res/trust-logo.eps'), temp)
277 shutil.copy(get_resource('res/nowoczesnapolska.org.pl.png'), temp)
278 shutil.copy(get_resource('res/koedlogo.png'), temp)
287 p = call(['xelatex', tex_path])
289 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
291 raise ParseError("Error parsing .tex file")
296 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
297 pdf_path = os.path.join(temp, 'doc.pdf')
298 shutil.move(pdf_path, output_file.name)
300 return OutputFile.from_filename(output_file.name)
302 except (XMLSyntaxError, XSLTApplyError), e:
306 def load_including_children(wldoc=None, provider=None, uri=None):
307 """ Makes one big xml file with children inserted at end.
309 Either wldoc or provider and URI must be provided.
313 f = provider.by_uri(uri)
314 text = f.read().decode('utf-8')
316 elif wldoc is not None:
317 text = etree.tostring(wldoc.edoc, encoding=unicode)
318 provider = wldoc.provider
320 raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
322 text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
324 document = WLDocument.from_string(text,
325 parse_dublincore=True, provider=provider)
326 document.swap_endlines()
328 for child_uri in document.book_info.parts:
329 child = load_including_children(provider=provider, uri=child_uri)
330 document.edoc.getroot().append(child.edoc.getroot())