# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
"""PDF creation library.

Creates one big XML from the book and its children, converts it to LaTeX
with TeXML, then runs it through XeLaTeX.

"""
from __future__ import with_statement

import os
import re
import shutil
from StringIO import StringIO
from copy import deepcopy
from subprocess import call, PIPE
from tempfile import mkdtemp, NamedTemporaryFile

from Texml.processor import process
from lxml import etree

from librarian import ParseError, DCNS, get_resource, IOFile, Format
from librarian import functions
from librarian.dcparser import Person
from librarian.parser import WLDocument
# Module-level registration calls from librarian.functions.
# NOTE(review): presumably these register the XPath/XSLT extension functions
# used by the stylesheet -- confirm in librarian/functions.py.  The listing
# this file was recovered from has gaps here, so a call may be missing.
functions.reg_substitute_entities()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Stylesheet name -> path handed to get_resource() by get_stylesheet().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
def insert_tags(doc, split_re, tagname, exclude=None):
    """Replace every match of `split_re` in text nodes of `doc` with <tagname/>.

    Both the ``text`` and the ``tail`` of each element are processed in place;
    the matched substring itself is dropped and an empty `tagname` element
    takes its position.  An element whose tag is in `exclude` has neither its
    text nor its tail processed (its descendants are still visited).

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue
        # Guard against None: elements without text/tail have nothing to split.
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are taken from the end and always inserted at index 0,
            # so the new elements end up in document order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # Same trick as above, but the insertion point is right after
            # `elem` among its siblings.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
def substitute_hyphens(doc):
    """Replace in-word hyphens in `doc` with <dywiz/> elements, in place.

    A hyphen qualifies when the characters on both sides of it are neither
    a hyphen nor whitespace.  Text and tails of the excluded Dublin Core
    elements are left alone.
    """
    # NOTE(review): the call target is inferred from the argument list, which
    # matches insert_tags(doc, split_re, tagname, exclude).  The exclude list
    # holds only the entries that survive in the recovered listing; the gaps
    # in that listing suggest more entries existed -- check version control.
    insert_tags(
        doc, re.compile(r"(?<=[^-\s])-(?=[^-\s])"), "dywiz",
        exclude=[
            DCNS("identifier.url"),
            DCNS("rights.license"),
            DCNS("subject.curriculum"),
        ]
    )
# NOTE(review): orphaned fragment.  These lines are the arguments of a call
# whose opening line -- and the `def` enclosing it -- are absent from the file
# as received, so the function cannot be restored from what is visible.
# What the arguments show: the pattern matches a run of whitespace that
# follows a single word character which is itself preceded by whitespace,
# and the element name is "nbsp".  Presumably this is an insert_tags() call
# parallel to the one in substitute_hyphens -- restore from version control.
88 doc, re.compile("(?<=\s\w)\s+"), "nbsp",
90 DCNS("identifier.url"),
91 DCNS("rights.license"),
94 DCNS("subject.curriculum"),
def move_motifs_inside(doc):
    """Move each top-level motif into the next block element that follows it.

    For every <motyw> that is a direct child of one of the main text
    containers, the first following sibling that is not a "special" tag
    receives the motif as its first child.  A motif with no such sibling is
    left where it is.
    """
    main_tags = ('powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp',
                 'dramat_wierszowany_l', 'dramat_wierszowany_lp',
                 'dramat_wspolczesny')
    # Siblings that cannot host a motif and are skipped over.
    special_tags = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
                    'begin', 'end', 'motyw', 'extra', 'uwaga')
    for master in doc.xpath('|'.join('//' + tag for tag in main_tags)):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag in special_tags:
                    continue
                # NOTE(review): the tail reset, the re-insertion and the
                # `break` are missing from the recovered listing; they are
                # restored from the surviving comment and the docstring.
                # motif shouldn't have a tail - it would be untagged text
                motif.tail = None
                motif.getparent().remove(motif)
                sib.insert(0, motif)
                break
def hack_motifs(doc):
    """Dirty hack for the marginpar-creates-orphans LaTeX problem.

    See http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304

    Moves motifs in stanzas from the first verse to the second, and from the
    next-to-last verse to the last.  The relocated copy carries a ``moved``
    attribute holding the number of verses it was shifted by.
    NOTE(review): the original docstring also says a negative vspace is
    inserted before moved motifs; nothing here does that, so presumably the
    stylesheet acts on the ``moved`` attribute -- confirm.
    """
    for motif in doc.findall('//strofa//motyw'):
        # Find the verse-level node: the motif itself or its ancestor that is
        # a direct child of <strofa>.
        verse, stanza = motif, motif.getparent()
        while stanza is not None and stanza.tag != 'strofa':
            verse, stanza = stanza, stanza.getparent()
        # <br> siblings before/after the verse-level node
        # (presumably the verse separators -- confirm).
        breaks_before = sum(1 for _ in verse.itersiblings('br', preceding=True))
        breaks_after = sum(1 for _ in verse.itersiblings('br'))
        on_first_verse = breaks_before == 0 and breaks_after > 0
        on_next_to_last_verse = breaks_after == 1
        if not (on_first_verse or on_next_to_last_verse):
            continue

        # NOTE(review): `move_by` is used but never assigned in the recovered
        # listing.  Reconstructed from the docstring: shift by one verse,
        # or by two when the target would itself be the next-to-last verse.
        move_by = 1
        if breaks_after == 2:
            move_by += 1
        moved_motif = deepcopy(motif)
        # NOTE(review): the next two statements are NOT derivable from the
        # visible code.  Something has to neutralise the original motif while
        # keeping its tail text in place; the tag name used here must be
        # verified against version control.
        motif.tag = 'span'
        motif.text = None
        moved_motif.tail = None
        moved_motif.set('moved', str(move_by))

        # Skip move_by - 1 breaks, then attach the copy after the next one.
        to_skip = move_by - 1
        for br in verse.itersiblings('br'):
            if to_skip:
                to_skip -= 1
                continue
            br.addnext(moved_motif)
            break
def parse_creator(doc):
    """Generate readable versions of creator and translator tags.

    Finds all dc:creator and dc:contributor.translator elements and, for each
    one with text, inserts a ``*_parsed`` copy at the start of its parent.
    The copy's text is ``Person.from_text(text).readable()`` and its
    ``sortkey`` attribute keeps the original text.
    """
    # NOTE(review): the `persons = doc.xpath(` opener is missing from the
    # recovered listing; it is restored from the surviving arguments.
    # The result is reversed because every copy is inserted at index 0:
    # processing last-to-first leaves the copies in document order.
    persons = doc.xpath(
        "|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
        namespaces={'dc': str(DCNS)})[::-1]
    for person in persons:
        # Nothing to parse in an empty element.
        # NOTE(review): this guard is a restored/added statement -- confirm.
        if not person.text:
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Return ``get_resource()`` of the stylesheet path registered as `name`.

    Raises KeyError when `name` is not a key of STYLESHEETS.
    """
    return get_resource(STYLESHEETS[name])
def package_available(package, args='', verbose=False):
    """Check whether a version of a LaTeX package accepting `args` is available.

    Compiles a minimal document loading ``\\usepackage[args]{package}`` with
    XeLaTeX in a throw-away directory.

    package: LaTeX package name.
    args: option string placed in the square brackets of \\usepackage.
    verbose: when true, XeLaTeX runs interactively with its output shown;
        otherwise it runs in batch mode with output discarded.

    Returns True when XeLaTeX exits with status 0, False otherwise --
    including when the ``xelatex`` executable cannot be started.
    """
    tempdir = mkdtemp('-wl2pdf-test')
    try:
        fpath = os.path.join(tempdir, 'test.tex')
        # NOTE(review): the body of the probe document is missing from the
        # recovered listing; only the `% (args, package)` substitution
        # survives.  The document class used here is a stand-in -- confirm.
        with open(fpath, 'w') as f:
            f.write(r"""
                \documentclass{article}
                \usepackage[%s]{%s}
                \begin{document}
                \end{document}
                """ % (args, package))
        command = ['xelatex', '-output-directory', tempdir, fpath]
        try:
            if verbose:
                status = call(command)
            else:
                command.insert(1, '-interaction=batchmode')
                # Discard output through os.devnull: call() with PIPE never
                # drains the pipe and can block once its buffer fills up.
                with open(os.devnull, 'w') as devnull:
                    status = call(command, stdout=devnull, stderr=devnull)
        except OSError:
            # xelatex itself could not be executed.
            return False
    finally:
        # Remove the scratch directory on every path, including errors.
        shutil.rmtree(tempdir)
    return status == 0
def load_including_children(wldoc=None, provider=None, uri=None):
    """Make one big XML document with the children appended at the end.

    Either `wldoc`, or both `provider` and `uri`, must be given; when all are
    given, `provider`/`uri` take precedence.  Children listed in
    ``book_info.parts`` are loaded recursively through the provider and their
    root elements appended to this document's root.

    Returns the resulting WLDocument.
    Raises ValueError when neither input combination is supplied.
    """
    # NOTE(review): the `if`/`else` headers and the final `return` are
    # missing from the recovered listing; they are restored from the
    # surviving `elif`, the error message and the recursive use of `child`.
    if uri and provider:
        f = provider.by_uri(uri)
        # WTF DocProvider.by_uri() returns IOFile, so no .read() there
        text = f.read().decode('utf-8')
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding=unicode)
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    # Wrap every run of characters from U+0400..U+04FF (the Cyrillic block)
    # in an <alien> element.
    text = re.sub(u"([\u0400-\u04ff]+)", u"<alien>\\1</alien>", text)

    document = WLDocument.from_string(text, parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())
    return document
class PDFFormat(Format):
    """Base class for PDF output: TeXML -> LaTeX -> XeLaTeX -> PDF.

    Subclasses must implement get_texml().

    Available customization:
        nofootnotes: Doesn't do footnotes.
        nothemes: Doesn't do themes.
        defaultleading: Default leading.
        onehalfleading: Bigger leading.
        doubleleading: Big leading.
        nowlfont: Uses standard TeX font instead of JUnicodeWL.

    NOTE(review): this class was restored from a damaged listing.  The
    headers of has_cover, get_texml and get_pdf, the two @property
    decorators, the class-level defaults other than `style`, and several
    one-line guards were missing; each is marked below and must be checked
    against version control.
    """

    # NOTE(review): restored defaults; only `style` is visible in the
    # recovered listing.  The value of tex_passes is an assumption.
    cover_class = None
    tex_passes = 1
    style = get_resource('pdf/default.sty')
    cover = None

    # NOTE(review): method name and @property are restored, not visible.
    @property
    def has_cover(self):
        """ For use in XSLT. """
        return self.cover is not None

    # NOTE(review): @property is restored, not visible.
    @property
    def customization_str(self):
        """ For use in XSLT. """
        # Comma-separated names of the customization options that are set.
        return u','.join(k for k, v in self.customization.items() if v)

    def get_texml(self):
        """Return the TeXML source of the document; subclasses implement it."""
        raise NotImplementedError

    def get_tex_dir(self):
        """Write the LaTeX sources into a fresh temp directory; return its path.

        The directory receives doc.tex (converted from get_texml()), wl.cls,
        the selected style as style.sty and, when a cover is set,
        makecover.sty.  The caller is responsible for removing it.
        """
        texml = self.get_texml()
        temp = mkdtemp('-wl2pdf')
        # Save TeX file
        tex_path = os.path.join(temp, 'doc.tex')
        with open(tex_path, 'w') as fout:
            process(StringIO(texml), fout, 'utf-8')
        # NOTE(review): guard restored; build() defaults save_tex to None.
        if self.save_tex:
            shutil.copy(tex_path, self.save_tex)
        # Copy style
        shutil.copy(get_resource('pdf/wl.cls'), temp)
        shutil.copy(self.style, os.path.join(temp, 'style.sty'))
        # for sfile in ['wasysym.sty', 'uwasyvar.fd', 'uwasy.fd']:
        #     shutil.copy(get_resource(os.path.join('res/wasysym', sfile)), temp)

        # Save attachments
        # NOTE(review): guard restored; `cover` stays None without a
        # cover_class.
        if self.cover is not None:
            self.cover.for_pdf().dump_to(os.path.join(temp, 'makecover.sty'))
        return temp

    def get_pdf(self):
        """Run XeLaTeX over the generated sources and return the PDF.

        Returns an IOFile for a temporary .pdf file.
        Raises ParseError when the last XeLaTeX pass exits with a non-zero
        status; the working directory is then left in place, since the
        error message points at the .tex file inside it.
        """
        temp = self.get_tex_dir()
        tex_path = os.path.join(temp, 'doc.tex')

        command = ['xelatex', tex_path]
        if not self.verbose:
            command.insert(1, '-interaction=batchmode')
        # In quiet mode discard the output through os.devnull: call() with
        # PIPE never drains the pipe and can block once its buffer fills up.
        # In verbose mode None keeps the parent's stdout/stderr.
        sink = None if self.verbose else open(os.devnull, 'w')
        status = 0
        try:
            for _ in xrange(self.tex_passes):
                # XeLaTeX must run inside `temp`: doc.pdf is written to its
                # working directory and wl.cls / style.sty are found there.
                # NOTE(review): the recovered listing shows no
                # working-directory handling at all; cwd= is used here
                # instead of changing the process-wide cwd.
                status = call(command, cwd=temp, stdout=sink, stderr=sink)
        finally:
            if sink is not None:
                sink.close()
        # Only the status of the last pass decides, as in the original loop.
        if status:
            raise ParseError("Error parsing .tex file: %s" % tex_path)

        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
        # Only the reserved name is needed; close the handle so it is not
        # leaked and does not block the move.
        output_file.close()
        pdf_path = os.path.join(temp, 'doc.pdf')
        shutil.move(pdf_path, output_file.name)
        # NOTE(review): cleanup restored; the working directory is no longer
        # needed once the PDF has been moved out.
        shutil.rmtree(temp)
        return IOFile.from_filename(output_file.name)

    def build(self, verbose=False, save_tex=None, morefloats=None):
        """Build the PDF and return it as an IOFile.

        verbose: show XeLaTeX output and run it interactively.
        save_tex: optional path; the generated .tex file is copied there.
        morefloats: new/old/none
        """
        self.verbose = verbose
        self.save_tex = save_tex

        # NOTE(review): the assigned value is restored from the docstring's
        # "new/old/none"; the probe checks for the maxfloats option.
        if morefloats is None and package_available('morefloats', 'maxfloats=19'):
            morefloats = 'new'
        self.morefloats = morefloats

        book_info = self.wldoc.book_info
        # NOTE(review): guard restored; cover_class defaults to None.
        if self.cover_class:
            self.cover = self.cover_class(book_info)

        return self.get_pdf()