1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 """PDF creation library.
8 Creates one big XML from the book and its children, converts it to LaTeX
9 with TeXML, then runs it by XeLaTeX.
12 from __future__ import with_statement
16 from StringIO import StringIO
17 from tempfile import mkdtemp, NamedTemporaryFile
19 from copy import deepcopy
20 from subprocess import call, PIPE
22 from Texml.processor import process
23 from lxml import etree
25 from librarian.dcparser import Person
26 from librarian.parser import WLDocument
27 from librarian import ParseError, DCNS, get_resource, IOFile, Format
28 from librarian import functions
# Import-time side effect: call the registration helpers of librarian.functions.
# NOTE(review): presumably these make helper functions available to the XSLT
# stylesheets -- confirm in librarian.functions.
31 functions.reg_substitute_entities()
33 functions.reg_starts_white()
34 functions.reg_ends_white()
35 functions.reg_texcommand()
# Entry mapping the stylesheet name 'wl2tex' to its resource path; names are
# looked up in STYLESHEETS by get_stylesheet(). The opening line of the
# mapping is not visible in this view.
38 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """Insert an empty <tagname/> element at every match of `split_re`.

    Every element of the `doc` tree is visited and both its text and its
    tail are split on `split_re`.  The matched separator is dropped and an
    empty `tagname` element is put in its place.  The tree is modified in
    place and nothing is returned.

    doc -- lxml element (or tree) to process
    split_re -- compiled regular expression marking the split points
    tagname -- tag of the elements to insert
    exclude -- optional collection of tags; elements carrying one of these
        tags are skipped (neither their text nor their tail is split)

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        # text/tail are None when absent, and re.split() rejects None.
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are taken from the end and each new element is put at
            # index 0, so the inserted elements end up in reading order and
            # ahead of any children the element already had.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            # Tail text belongs to the parent, so the new elements become
            # siblings placed directly after `elem`.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
# Replaces every hyphen that has a non-hyphen, non-whitespace character on
# both sides with a "dywiz" element.
# NOTE(review): the lines below look like the arguments of an insert_tags()
# call, with the DCNS(...) names forming its `exclude` list; the call line
# itself and part of the list are not visible here -- confirm.
72 def substitute_hyphens(doc):
74 doc, re.compile("(?<=[^-\s])-(?=[^-\s])"), "dywiz",
76 DCNS("identifier.url"),
77 DCNS("rights.license"),
80 DCNS("subject.curriculum"),
81 DCNS("subject.curriculum.new"),
# NOTE(review): from here on the arguments belong to a second call: the
# pattern matches the whitespace that follows a single word character which
# is itself preceded by whitespace, and the tag inserted is "nbsp".  The
# `def` line of the function owning this call is not visible -- confirm
# against the full file.
89 doc, re.compile("(?<=\s\w)\s+"), "nbsp",
91 DCNS("identifier.url"),
92 DCNS("rights.license"),
95 DCNS("subject.curriculum"),
96 DCNS("subject.curriculum.new"),
def move_motifs_inside(doc):
    """Move motifs into the block element that follows them.

    For each `motyw` element that is a direct child of one of the main text
    containers, the following siblings are scanned.  The first sibling that
    is not itself a marker/special element receives the motif as a child.
    A motif with no such following sibling is left where it is.

    doc -- lxml element or tree; modified in place, nothing is returned
    """
    main_tags = ('powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp',
                 'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny')
    # Siblings that cannot host a motif; loop-invariant, so built once.
    special_tags = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
                    'begin', 'end', 'motyw', 'extra', 'uwaga')

    for master in doc.xpath('|'.join('//' + tag for tag in main_tags)):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in special_tags:
                    # motif shouldn't have a tail - it would be untagged text
                    motif.tail = None
                    motif.getparent().remove(motif)
                    # NOTE(review): the motif is re-attached as the first
                    # child of the block; the exact insertion point is not
                    # visible in the reviewed listing -- confirm.
                    sib.insert(0, motif)
                    # The motif now has a new parent; stop scanning siblings.
                    break
# For every motif found below a 'strofa', locates the node directly under the
# stanza that contains it and, depending on how many 'br' siblings surround
# that node, attaches a deep copy of the motif after a following 'br'.
# NOTE(review): several statements are not visible in this view (where
# `move_by` is assigned, what happens to the original motif, and how the
# final loop picks the 'br'); confirm against the full file.
118 def hack_motifs(doc):
119 """ dirty hack for the marginpar-creates-orphans LaTeX problem
120 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
122 moves motifs in stanzas from first verse to second
123 and from next to last to last, then inserts negative vspace before them
125 for motif in doc.findall('//strofa//motyw'):
126 # find relevant verse-level tag
# Climb until `stanza` is the enclosing 'strofa'; `verse` is then its child
# on the path down to the motif.
127 verse, stanza = motif, motif.getparent()
128 while stanza is not None and stanza.tag != 'strofa':
129 verse, stanza = stanza, stanza.getparent()
# Count the 'br' siblings before and after the verse-level node.
130 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
131 breaks_after = sum(1 for i in verse.itersiblings('br'))
# Holds when no break precedes the node but at least one follows, or when
# exactly one break follows.
132 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
134 if breaks_after == 2:
# The copy carries no tail text and records `move_by` in its 'moved' attribute.
136 moved_motif = deepcopy(motif)
139 moved_motif.tail = None
140 moved_motif.set('moved', str(move_by))
# The copy is placed directly after a following 'br' sibling.
142 for br in verse.itersiblings('br'):
146 br.addnext(moved_motif)
# Adds a "<tag>_parsed" element for every dc:creator and
# dc:contributor.translator element matched in `doc`.
# NOTE(review): the assignment of `persons` (presumably `persons = doc.xpath(`)
# and any statements between the loop header and Person.from_text() are not
# visible in this view -- confirm.
150 def parse_creator(doc):
151 """Generates readable versions of creator and translator tags.
153 Finds all dc:creator and dc.contributor.translator tags
154 and adds *_parsed versions with forenames first.
157 "|".join('//dc:' + tag for tag in ('creator', 'contributor.translator')),
158 namespaces={'dc': str(DCNS)})[::-1]
# The matches are walked in reverse ([::-1] above) and every copy is inserted
# at index 0 of its parent, so copies sharing a parent end up in the same
# relative order as their originals.
159 for person in persons:
162 p = Person.from_text(person.text)
# Copy the element, rename it to "<tag>_parsed", keep the original text in
# the 'sortkey' attribute and replace the text with p.readable().
163 person_parsed = deepcopy(person)
164 person_parsed.tag = person.tag + '_parsed'
165 person_parsed.set('sortkey', person.text)
166 person_parsed.text = p.readable()
167 person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Return the resource registered in STYLESHEETS under `name`.

    The entry stored for `name` is handed to get_resource() and its result
    is returned unchanged.  An unknown `name` fails in the STYLESHEETS
    lookup itself.
    """
    resource_path = STYLESHEETS[name]
    return get_resource(resource_path)
# Probes the local TeX installation by running xelatex on a small test
# document built from `args` and `package` inside a fresh temporary directory.
# NOTE(review): the code writing the test file, the branch choosing between
# the two xelatex invocations (presumably on `verbose`) and the return
# statement are not visible in this view -- confirm against the full file.
174 def package_available(package, args='', verbose=False):
175 """ check if a verion of a latex package accepting given args is available """
176 tempdir = mkdtemp('-wl2pdf-test')
177 fpath = os.path.join(tempdir, 'test.tex')
184 """ % (args, package))
# Variant letting xelatex write to the inherited stdout/stderr.
187 p = call(['xelatex', '-output-directory', tempdir, fpath])
# Quiet variant: batch mode with both output streams redirected to pipes.
# NOTE(review): the subprocess documentation warns against stdout=PIPE /
# stderr=PIPE with call() -- nothing reads the pipes, so a chatty child can
# block once a pipe buffer fills.  Redirecting to os.devnull avoids that.
189 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
# NOTE(review): the directory is not removed if call() raises (e.g. xelatex
# missing); no try/finally is visible around it.
190 shutil.rmtree(tempdir)
# Parses a document (given directly as `wldoc`, or fetched as `uri` from
# `provider`) and recursively appends the root element of every child part
# to the root of the parent document.
# NOTE(review): the first branch header and the return statement are not
# visible in this view; the recursive call reads `.edoc` from the result, so
# the function presumably returns the parsed document -- confirm.
195 def load_including_children(wldoc=None, provider=None, uri=None):
196 """ Makes one big xml file with children inserted at end.
198 Either wldoc or provider and URI must be provided.
# Provider path: fetch the file for `uri` and decode its bytes as UTF-8.
202 f = provider.by_uri(uri)
203 # WTF DocProvider.by_uri() returns IOFile, so no .read() there
204 text = f.read().decode('utf-8')
# Document path: serialise the already parsed tree back to a unicode string
# and reuse the document's own provider for the children.
206 elif wldoc is not None:
207 text = etree.tostring(wldoc.edoc, encoding=unicode)
208 provider = wldoc.provider
210 raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
# Wrap every run of characters from U+0400..U+04FF in an <alien> element.
# This is done on the raw text, before the document is parsed.
212 text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
214 document = WLDocument.from_string(text, parse_dublincore=True, provider=provider)
215 document.swap_endlines()
# Recurse into every part listed in the book metadata and attach its root
# element as the last child of this document's root.
217 for child_uri in document.book_info.parts:
218 child = load_including_children(provider=provider, uri=child_uri)
219 document.edoc.getroot().append(child.edoc.getroot())
# Format subclass producing a PDF: the TeXML is converted to doc.tex inside a
# temporary directory, xelatex is run over it and the resulting doc.pdf is
# handed back as an IOFile.
# NOTE(review): several method headers, conditions and statements are not
# visible in this view; the comments below describe only the lines shown.
223 class PDFFormat(Format):
226 Available customization:
227 nofootnotes: Doesn't do footnotes.
228 nothemes: Doesn't do themes.
229 defaultleading: Default leading.
230 onehalfleading: Bigger leading.
231 doubleleading: Big leading.
232 nowlfont: Uses standard TeX font instead of JUnicodeWL.
# Default style resource; get_tex_dir() copies self.style into the build
# directory under the name style.sty.
238 style = get_resource('pdf/default.sty')
# Helper (header not visible): reports whether self.cover is set.
243 """ For use in XSLT. """
244 return self.cover is not None
# Comma-joined names of the customization options whose value is truthy.
247 def customization_str(self):
248 """ For use in XSLT. """
249 return u','.join(k for k, v in self.customization.items() if v)
# NOTE(review): presumably the body of get_texml(), which get_tex_dir()
# calls; its header is not visible -- confirm.
252 raise NotImplementedError
# Prepares a temporary directory with everything xelatex needs.  get_pdf()
# uses the result as a directory path, so the method presumably returns
# `temp` (the return statement is not visible).
254 def get_tex_dir(self):
255 texml = self.get_texml()
256 temp = mkdtemp('-wl2pdf')
258 tex_path = os.path.join(temp, 'doc.tex')
# Run the TeXML processor over the in-memory TeXML, writing doc.tex.
259 with open(tex_path, 'w') as fout:
260 process(StringIO(texml), fout, 'utf-8')
# Keep a copy of the generated .tex at self.save_tex.
# NOTE(review): presumably guarded by a check that save_tex is set -- confirm.
262 shutil.copy(tex_path, self.save_tex)
# Document class and style file go next to doc.tex.
264 shutil.copy(get_resource('pdf/wl.cls'), temp)
265 shutil.copy(self.style, os.path.join(temp, 'style.sty'))
266 # for sfile in ['wasysym.sty', 'uwasyvar.fd', 'uwasy.fd']:
267 # shutil.copy(get_resource(os.path.join('res/wasysym', sfile)), temp)
# The cover object writes makecover.sty into the build directory.
271 self.cover.for_pdf().dump_to(os.path.join(temp, 'makecover.sty'))
# NOTE(review): the lines from here to the IOFile return presumably form
# get_pdf(), which build() calls; its header is not visible -- confirm.
275 temp = self.get_tex_dir()
276 tex_path = os.path.join(temp, 'doc.tex')
# xelatex is run self.tex_passes times.  The first loop leaves xelatex's
# output on the inherited streams, the second runs in batch mode with the
# output piped (presumably selected by self.verbose; the branch is not visible).
285 for i in xrange(self.tex_passes):
286 p = call(['xelatex', tex_path])
# NOTE(review): the subprocess documentation warns against stdout=PIPE /
# stderr=PIPE with call(); nothing reads the pipes, so xelatex can block once
# a pipe buffer fills.
288 for i in xrange(self.tex_passes):
289 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
# NOTE(review): presumably raised when the exit status `p` is non-zero; the
# condition is not visible.
291 raise ParseError("Error parsing .tex file: %s" % tex_path)
# The PDF is moved out of the build directory onto a named temporary file
# created with delete=False, so it is not removed automatically on close.
296 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
297 pdf_path = os.path.join(temp, 'doc.pdf')
298 shutil.move(pdf_path, output_file.name)
300 return IOFile.from_filename(output_file.name)
# Entry point: stores the options on the instance and returns the result of
# get_pdf().
302 def build(self, verbose=False, save_tex=None, morefloats=None):
303 """ morefloats: new/old/none
305 self.verbose = verbose
306 self.save_tex = save_tex
# When the caller made no choice, probe whether the installed morefloats
# package accepts the maxfloats=19 option (the value assigned in that case
# is not visible).
308 if morefloats is None and package_available('morefloats', 'maxfloats=19'):
310 self.morefloats = morefloats
312 book_info = self.wldoc.book_info
# Instantiate the cover from the book metadata.
314 self.cover = self.cover_class(book_info)
316 return self.get_pdf()