1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
"""PDF creation library.

Creates one big XML from the book and its children, converts it to LaTeX
with TeXML, then runs it through XeLaTeX.
"""
from __future__ import with_statement

import os
import re
import shutil
from StringIO import StringIO
from tempfile import mkdtemp, NamedTemporaryFile
from copy import deepcopy
from subprocess import call, PIPE

from Texml.processor import process
from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian.dcparser import Person
from librarian.parser import WLDocument
from librarian import ParseError, DCNS, get_resource, IOFile, Format
from librarian import functions
# Registration hooks from librarian.functions, run once at import time.
# NOTE(review): presumably these expose helper functions to the XSLT
# stylesheet -- confirm in librarian.functions.
functions.reg_substitute_entities()
functions.reg_starts_white()
functions.reg_ends_white()
functions.reg_texcommand()

# Maps a symbolic stylesheet name to its resource path; looked up by
# get_stylesheet().
STYLESHEETS = {
    'wl2tex': 'pdf/wl2tex.xslt',
}
def insert_tags(doc, split_re, tagname, exclude=None):
    """Insert an empty <tagname/> element at every match of `split_re`.

    The text and the tail of every element in `doc` are scanned.  Each match
    of `split_re` is removed from the character data and an empty `tagname`
    element is put in its place.  The tree is modified in place; nothing is
    returned.

    doc      -- lxml element or tree to process.
    split_re -- compiled regular expression marking the split points.  It
                should not contain capturing groups: re.split would return
                the captured text as additional chunks.
    tagname  -- tag of the elements to insert.
    exclude  -- optional collection of tags; the text and tail of elements
                carrying one of these tags are left untouched (their
                descendants are still processed).

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        # text/tail are None when there is no character data; split() would
        # raise on None, so guard both.
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are popped from the end and each new element is put at
            # the same front position, so they end up in document order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
def substitute_hyphens(doc):
    """Replace intra-word hyphens in `doc` with empty marker elements.

    A hyphen is replaced only when the characters on both sides are neither
    a hyphen nor whitespace.  URL and license metadata elements and `www`
    elements are skipped.  The tree is modified in place.
    """
    # NOTE(review): the tag-name argument is missing from the damaged
    # source; "dywiz" is restored from the upstream project -- confirm it
    # matches what the wl2tex stylesheet expects.
    insert_tags(doc,
                re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
                "dywiz",
                exclude=[DCNS("identifier.url"), DCNS("rights.license"), 'www']
                )


def fix_hanging(doc):
    """Replace whitespace after a single-character word with a marker element.

    The pattern matches a whitespace run preceded by whitespace plus exactly
    one word character.  URL and license metadata elements are skipped.  The
    tree is modified in place.
    """
    # NOTE(review): this function header and the tag-name argument are
    # missing from the damaged source; "nbsp" is restored from the upstream
    # project -- confirm against the wl2tex stylesheet.
    insert_tags(doc,
                re.compile(r"(?<=\s\w)\s+"),
                "nbsp",
                exclude=[DCNS("identifier.url"), DCNS("rights.license")]
                )
def move_motifs_inside(doc):
    """Move motifs that sit directly under a master element into a block.

    For every `motyw` that is a direct child of one of the master elements,
    the first following sibling that is not a separator, marker or another
    motif becomes its new parent.  A motif with no such sibling is left where
    it is.  The tree is modified in place.
    """
    # Siblings that cannot host a motif; the search continues past them.
    skipped_tags = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
                    'begin', 'end', 'motyw', 'extra', 'uwaga')
    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in skipped_tags:
                    # motif shouldn't have a tail - it would be untagged text
                    motif.tail = None
                    motif.getparent().remove(motif)
                    # NOTE(review): the re-insertion statement is missing
                    # from the damaged source; placing the motif first in
                    # the block follows the upstream project -- confirm.
                    sib.insert(0, motif)
                    break
def hack_motifs(doc):
    """ dirty hack for the marginpar-creates-orphans LaTeX problem
    see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304

    moves motifs in stanzas from first verse to second
    and from next to last to last, then inserts negative vspace before them

    The tree is modified in place.  The number of verses a motif was moved
    by is stored in the `moved` attribute of the relocated copy.
    """
    for motif in doc.findall('//strofa//motyw'):
        # find relevant verse-level tag
        verse, stanza = motif, motif.getparent()
        while stanza is not None and stanza.tag != 'strofa':
            verse, stanza = stanza, stanza.getparent()

        # Verses are separated by <br/>; the counts locate this verse
        # within its stanza.
        breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
        breaks_after = sum(1 for i in verse.itersiblings('br'))
        if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
            move_by = 1
            if breaks_after == 2:
                move_by += 1

            moved_motif = deepcopy(motif)
            # NOTE(review): the two statements neutralising the original
            # motif are missing from the damaged source; they are restored
            # from the upstream project (the node stays as an empty <span>
            # so its tail text keeps its position) -- confirm.
            motif.tag = 'span'
            motif.text = None
            moved_motif.tail = None
            moved_motif.set('moved', str(move_by))

            # Skip move_by - 1 line breaks, then put the copy right after
            # the next one.
            for br in verse.itersiblings('br'):
                if move_by > 1:
                    move_by -= 1
                    continue
                br.addnext(moved_motif)
                break
def parse_creator(doc):
    """Generates readable versions of creator and translator tags.

    Finds all dc:creator and dc.contributor.translator tags
    and adds *_parsed versions with forenames first.

    Each *_parsed element is a copy of the original with the original text
    kept in its `sortkey` attribute, inserted at the start of the original's
    parent.  Tags without text are skipped.  The tree is modified in place.
    """
    query = "|".join('//dc:' + tag for tag in (
        'creator', 'contributor.translator'))
    people = doc.xpath(query, namespaces={'dc': str(DCNS)})

    # Every copy is inserted at index 0, so walk the matches backwards to
    # keep the copies in the same relative order as the originals.
    for person in reversed(people):
        if not person.text:
            # Nothing to parse for an empty tag.
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Return what get_resource() yields for the stylesheet named `name`.

    Raises KeyError when `name` is not a key of STYLESHEETS.
    """
    resource_path = STYLESHEETS[name]
    return get_resource(resource_path)
def package_available(package, args='', verbose=False):
    """Check if a version of a LaTeX package accepting given args is available.

    Writes a minimal document that loads `package` with options `args` into
    a temporary directory and runs XeLaTeX on it.

    package -- name of the LaTeX package.
    args    -- option string passed to the package.
    verbose -- when true, XeLaTeX output goes to the console; otherwise it
               runs in batch mode with its output discarded.

    Returns True when XeLaTeX exits with status 0.
    """
    # NOTE(review): the body of the test document is missing from the
    # damaged source.  The two placeholders follow from `% (args, package)`;
    # the `wl` document class is restored from the upstream project --
    # confirm.
    source = r"""
        \documentclass{wl}
        \usepackage[%s]{%s}
        \begin{document}
        \end{document}
        """ % (args, package)

    tempdir = mkdtemp('-wl2pdf-test')
    try:
        fpath = os.path.join(tempdir, 'test.tex')
        with open(fpath, 'w') as f:
            f.write(source)

        if verbose:
            p = call(['xelatex', '-output-directory', tempdir, fpath])
        else:
            # Discard the output rather than using PIPE: nobody reads the
            # pipe, so a chatty child could block forever on a full buffer.
            with open(os.devnull, 'w') as devnull:
                p = call(['xelatex', '-interaction=batchmode',
                          '-output-directory', tempdir, fpath],
                         stdout=devnull, stderr=devnull)
    finally:
        # Remove the scratch directory even when writing or running failed.
        shutil.rmtree(tempdir)
    return p == 0
def load_including_children(wldoc=None, provider=None, uri=None):
    """ Makes one big xml file with children inserted at end.

    Either wldoc or provider and URI must be provided.

    wldoc    -- an already loaded WLDocument; its provider is reused.
    provider -- object whose by_uri(uri) returns a readable file.
    uri      -- URI of the document to load through `provider`.

    Returns a WLDocument whose root has the root elements of all parts
    (loaded recursively through the provider) appended.
    Raises ValueError when neither form of input is given.
    """
    if uri and provider:
        f = provider.by_uri(uri)
        try:
            text = f.read().decode('utf-8')
        finally:
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding=unicode)
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    # Wrap every run of characters from the Cyrillic block (U+0400-U+04FF)
    # in an <alien> element.
    text = re.sub(u"([\u0400-\u04ff]+)", u"<alien>\\1</alien>", text)

    document = WLDocument.from_string(text,
                                      parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())
    return document
class PDFFormat(Format):
    """ PDF output format, built by running XeLaTeX on the book's TeXML.

    Available customization:
        nofootnotes: Doesn't do footnotes.
        nothemes: Doesn't do themes.
        defaultleading: Default leading.
        onehalfleading: Bigger leading.
        doubleleading: Big leading.
        nowlfont: Uses standard TeX font instead of JUnicodeWL.

    """

    # NOTE(review): the class attributes other than `style` are missing from
    # the damaged source; the methods below read all of them.  The defaults
    # are restored from the upstream project -- confirm.
    # Callable building a cover from book_info, or None for no cover.
    cover_class = None
    # Number of XeLaTeX runs over the document.
    tex_passes = 1
    # LaTeX style file copied next to the document as style.sty.
    style = get_resource('pdf/default.sty')
    # Cover instance; set by build() when cover_class is given.
    cover = None

    # NOTE(review): the decorators on the two accessors below are restored
    # from the upstream project -- confirm the stylesheet reads them as
    # attributes.
    @property
    def has_cover(self):
        """ For use in XSLT. """
        return self.cover is not None

    @property
    def customization_str(self):
        """ For use in XSLT. """
        return u','.join(k for k, v in self.customization.items() if v)

    def get_document(self):
        """Return the merged document, with its tree prepared for TeX."""
        document = load_including_children(self.wldoc)
        root = document.edoc.getroot()
        root.set('editors', u', '.join(sorted(
            editor.readable() for editor in document.editors())))

        # hack the tree
        move_motifs_inside(document.edoc)
        hack_motifs(document.edoc)
        parse_creator(document.edoc)
        substitute_hyphens(document.edoc)
        fix_hanging(document.edoc)
        return document

    def get_texml(self):
        """Return the document transformed with the wl2tex stylesheet.

        Raises ParseError when the stylesheet cannot be parsed or applied.
        """
        style_filename = get_stylesheet("wl2tex")
        functions.reg_get(self)
        try:
            style = etree.parse(style_filename)
            texml = self.get_document().transform(style)
            return texml
        except (XMLSyntaxError, XSLTApplyError) as e:
            raise ParseError(e)

    def get_tex_dir(self):
        """Create a temporary directory holding everything XeLaTeX needs.

        Writes doc.tex, the document class, the style file and, when there
        is a cover, makecover.sty.  Returns the directory path; the caller
        is responsible for removing it.
        """
        texml = self.get_texml()
        temp = mkdtemp('-wl2pdf')

        # Save TeX file
        tex_path = os.path.join(temp, 'doc.tex')
        with open(tex_path, 'w') as fout:
            process(StringIO(texml), fout, 'utf-8')
        if self.save_tex:
            shutil.copy(tex_path, self.save_tex)

        # Copy style
        shutil.copy(get_resource('pdf/wl.cls'), temp)
        shutil.copy(self.style, os.path.join(temp, 'style.sty'))

        # Save attachments
        if self.cover:
            self.cover.for_pdf().dump_to(os.path.join(temp, 'makecover.sty'))
        return temp

    def get_pdf(self):
        """Run XeLaTeX and return the resulting PDF as an IOFile.

        Raises ParseError when the last XeLaTeX pass exits with a non-zero
        status.  The temporary TeX directory is always removed.
        """
        temp = self.get_tex_dir()
        try:
            tex_path = os.path.join(temp, 'doc.tex')

            # XeLaTeX runs with the TeX directory as its working directory,
            # so doc.pdf is written there.  Passing cwd= to call() avoids
            # changing (and having to restore) the process-wide cwd.
            p = 0
            if self.verbose:
                for i in range(self.tex_passes):
                    p = call(['xelatex', tex_path], cwd=temp)
            else:
                # Discard the output rather than using PIPE: nobody reads
                # the pipe, so the child could block on a full buffer.
                with open(os.devnull, 'w') as devnull:
                    for i in range(self.tex_passes):
                        p = call(['xelatex', '-interaction=batchmode', tex_path],
                                 cwd=temp, stdout=devnull, stderr=devnull)
            if p:
                raise ParseError("Error parsing .tex file")

            output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
            # Only the reserved name is needed; close the handle before the
            # PDF is moved over it.
            output_file.close()
            pdf_path = os.path.join(temp, 'doc.pdf')
            shutil.move(pdf_path, output_file.name)
        finally:
            shutil.rmtree(temp)
        return IOFile.from_filename(output_file.name)

    def build(self, verbose=False, save_tex=None, morefloats=None):
        """ Build the PDF and return it as an IOFile.

        verbose: show XeLaTeX output instead of running in batch mode.
        save_tex: optional path to copy the generated .tex file to.
        morefloats: new/old/none
        """
        self.verbose = verbose
        self.save_tex = save_tex

        # Pick the new interface when the installed package accepts it.
        if morefloats is None and package_available('morefloats', 'maxfloats=19'):
            morefloats = 'new'
        self.morefloats = morefloats

        book_info = self.wldoc.book_info
        if self.cover_class:
            self.cover = self.cover_class(book_info)

        return self.get_pdf()