1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 """PDF creation library.
8 Creates one big XML from the book and its children, converts it to LaTeX
9 with TeXML, then runs it by XeLaTeX.
12 from __future__ import with_statement
16 from StringIO import StringIO
17 from tempfile import mkdtemp, NamedTemporaryFile
19 from copy import deepcopy
20 from subprocess import call, PIPE
22 from Texml.processor import process
23 from lxml import etree
24 from lxml.etree import XMLSyntaxError, XSLTApplyError
26 from librarian.dcparser import Person
27 from librarian.parser import WLDocument
28 from librarian import ParseError, DCNS, get_resource, OutputFile
29 from librarian import functions
30 from librarian.cover import WLCover
33 functions.reg_substitute_entities()
35 functions.reg_starts_white()
36 functions.reg_ends_white()
37 functions.reg_texcommand()
38 functions.reg_urlquote()
39 functions.reg_breakurl()
42 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """Insert an empty <tagname/> for every occurrence of `split_re' in text nodes.

    Every element of the `doc' tree is visited.  Both its text and its tail
    are split on `split_re'; each match is replaced by a new `tagname'
    element whose tail is the text that followed the match.

    doc      -- lxml tree or element to modify in place
    split_re -- compiled regular expression marking the split points
    tagname  -- tag of the elements inserted at the split points
    exclude  -- optional collection of tags; text and tail of elements
                with these tags are left untouched (their descendants are
                still visited)

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are taken from the end and always inserted at index 0,
            # so the new elements end up in document order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        if elem.tail:
            parent = elem.getparent()
            if parent is None:
                # A parentless element has no sibling slot to insert into;
                # leave its tail alone instead of failing on parent.index().
                continue
            chunks = split_re.split(elem.tail)
            # Same trick as above: a fixed insertion index right after
            # `elem' combined with popping from the end keeps the order.
            ins_index = parent.index(elem) + 1
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
84 def substitute_hyphens(doc):
86 re.compile("(?<=[^-\s])-(?=[^-\s])"),
88 exclude=[DCNS("identifier.url"), DCNS("rights.license"), 'www']
94 re.compile("(?<=\s\w)\s+"),
96 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
100 for kol in doc.iter(tag='kol'):
101 if kol.tail is not None:
102 if not kol.tail.strip():
104 for table in doc.iter(tag='tabela'):
105 if table.get('ramka') == '1' or table.get('ramki') == '1':
106 table.set('_format', '|' + 'X|' * len(table[0]))
108 table.set('_format', 'X' * len(table[0]))
def move_motifs_inside(doc):
    """Move motifs that are direct children of a master tag into a block element.

    For every `motyw' element placed directly under one of the master
    tags, the following siblings are scanned and the motif is re-attached
    as the first child of the first sibling that is not one of the
    separator/marker tags listed below.  A motif with no such sibling is
    left where it is.

    doc -- lxml tree to modify in place
    """
    masters = doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny')
    # Siblings with these tags are skipped when looking for a new home.
    skipped = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga')

    for master in masters:
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in skipped:
                    # motif shouldn't have a tail - it would be untagged text
                    # NOTE(review): the tail reset, the insert and the break
                    # were reconstructed from this comment and the docstring;
                    # confirm against the canonical file.
                    motif.tail = None
                    motif.getparent().remove(motif)
                    sib.insert(0, motif)
                    break
125 def hack_motifs(doc):
126 """ dirty hack for the marginpar-creates-orphans LaTeX problem
127 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
129 moves motifs in stanzas from first verse to second
130 and from next to last to last, then inserts negative vspace before them
132 for motif in doc.findall('//strofa//motyw'):
133 # find relevant verse-level tag
134 verse, stanza = motif, motif.getparent()
135 while stanza is not None and stanza.tag != 'strofa':
136 verse, stanza = stanza, stanza.getparent()
137 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
138 breaks_after = sum(1 for i in verse.itersiblings('br'))
139 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
141 if breaks_after == 2:
143 moved_motif = deepcopy(motif)
146 moved_motif.tail = None
147 moved_motif.set('moved', str(move_by))
149 for br in verse.itersiblings('br'):
153 br.addnext(moved_motif)
def parse_creator(doc):
    """Generate readable versions of the person tags in the metadata.

    Finds all dc:creator, dc:contributor.translator, dc:contributor.editor
    and dc:contributor.technical_editor tags and adds a *_parsed copy of
    each at the beginning of its parent.  The copy's text is the result of
    Person.readable() and its `sortkey' attribute holds the original text.

    doc -- lxml tree to modify in place
    """
    person_tags = (
        'creator', 'contributor.translator',
        'contributor.editor', 'contributor.technical_editor')
    query = "|".join('//dc:' + tag for tag in person_tags)
    people = doc.xpath(query, namespaces={'dc': str(DCNS)})

    # Each copy is inserted at index 0, so walking the matches backwards
    # keeps the copies in their original relative order.
    for person in people[::-1]:
        if not person.text:
            # Nothing to parse; an empty tag gets no *_parsed twin.
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Look `name' up in STYLESHEETS and return get_resource() of the entry."""
    stylesheet = STYLESHEETS[name]
    return get_resource(stylesheet)
181 def package_available(package, args='', verbose=False):
182 """ check if a verion of a latex package accepting given args is available """
183 tempdir = mkdtemp('-wl2pdf-test')
184 fpath = os.path.join(tempdir, 'test.tex')
191 """ % (args, package))
194 p = call(['xelatex', '-output-directory', tempdir, fpath])
196 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
197 shutil.rmtree(tempdir)
201 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
202 cover=None, flags=None, customizations=None, ilustr_path=''):
203 """ produces a PDF file with XeLaTeX
206 verbose: prints all output from LaTeX
207 save_tex: path to save the intermediary LaTeX file to
208 morefloats (old/new/none): force specific morefloats
209 cover: a cover.Cover factory or True for default
210 flags: less-advertising,
211 customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
216 book_info = wldoc.book_info
217 document = load_including_children(wldoc)
218 root = document.edoc.getroot()
223 bound_cover = cover(book_info, width=2400)
224 root.set('data-cover-width', str(bound_cover.width))
225 root.set('data-cover-height', str(bound_cover.height))
226 if bound_cover.uses_dc_cover:
227 if book_info.cover_by:
228 root.set('data-cover-by', book_info.cover_by)
229 if book_info.cover_source:
230 root.set('data-cover-source',
231 book_info.cover_source)
234 root.set('flag-' + flag, 'yes')
236 # check for LaTeX packages
238 root.set('morefloats', morefloats.lower())
239 elif package_available('morefloats', 'maxfloats=19'):
240 root.set('morefloats', 'new')
243 if customizations is not None:
244 root.set('customizations', u','.join(customizations))
247 root.set('editors', u', '.join(sorted(
248 editor.readable() for editor in document.editors())))
251 move_motifs_inside(document.edoc)
252 hack_motifs(document.edoc)
253 parse_creator(document.edoc)
254 substitute_hyphens(document.edoc)
255 fix_hanging(document.edoc)
256 fix_tables(document.edoc)
259 style_filename = get_stylesheet("wl2tex")
260 style = etree.parse(style_filename)
262 texml = document.transform(style)
265 temp = mkdtemp('-wl2pdf')
267 for ilustr in document.edoc.findall("//ilustr"):
268 shutil.copy(os.path.join(ilustr_path, ilustr.get("src")), temp)
271 with open(os.path.join(temp, 'cover.png'), 'w') as f:
274 del document # no longer needed large object :)
276 tex_path = os.path.join(temp, 'doc.tex')
277 fout = open(tex_path, 'w')
278 process(StringIO(texml), fout, 'utf-8')
283 shutil.copy(tex_path, save_tex)
286 shutil.copy(get_resource('pdf/wl.cls'), temp)
287 shutil.copy(get_resource('res/wl-logo.png'), temp)
288 #shutil.copy(get_resource('res/prawokultury-logo.png'), temp)
289 #shutil.copy(get_resource('res/trust-logo.eps'), temp)
290 shutil.copy(get_resource('res/fnp-logo.eps'), temp)
291 shutil.copy(get_resource('res/koed-logo.eps'), temp)
300 p = call(['xelatex', tex_path])
302 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
304 raise ParseError("Error parsing .tex file")
309 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
310 pdf_path = os.path.join(temp, 'doc.pdf')
311 shutil.move(pdf_path, output_file.name)
313 return OutputFile.from_filename(output_file.name)
315 except (XMLSyntaxError, XSLTApplyError), e:
def load_including_children(wldoc=None, provider=None, uri=None):
    """Make one big XML document with the children inserted at the end.

    Either wldoc or provider and URI must be provided.

    wldoc    -- document whose `edoc' tree is serialized and re-parsed;
                its `provider' is then used for loading the children
    provider -- object whose by_uri(uri) returns a readable file-like
                object with UTF-8 encoded XML
    uri      -- URI of the document to load through `provider'

    Returns the WLDocument built from the text, with the root element of
    every part listed in book_info.parts (loaded recursively) appended to
    its root.  Raises ValueError when no usable source was given.
    """
    # NOTE(review): the branch heads, f.close() and the final return were
    # dropped from the listing and are reconstructed from the docstring and
    # from how callers use the result; confirm against the canonical file.
    if uri and provider:
        f = provider.by_uri(uri)
        try:
            text = f.read().decode('utf-8')
        finally:
            # Release the provider's file even if reading or decoding fails.
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding=unicode)
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    # Wrap every run of characters from U+0400-U+04FF (the Cyrillic block)
    # in <alien> tags.  Same pattern and replacement as the former ur""
    # literals, spelled so that the line also parses outside Python 2.
    text = re.sub(u"([\u0400-\u04ff]+)", u"<alien>\\1</alien>", text)

    document = WLDocument.from_string(text,
            parse_dublincore=True, provider=provider)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())

    return document