1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
10 from StringIO import StringIO
11 from tempfile import mkdtemp
13 from copy import deepcopy
14 from subprocess import call, PIPE
18 from Texml.processor import process
19 from lxml import etree
20 from lxml.etree import XMLSyntaxError, XSLTApplyError
22 from librarian.dcparser import Person
23 from librarian.parser import WLDocument
24 from librarian import ParseError, DCNS, get_resource
25 from librarian import functions
# Import-time setup: each call below runs a registration hook from
# librarian.functions (presumably making helper functions available to the
# XSLT stylesheets -- confirm in librarian.functions).
28 functions.reg_substitute_entities()
30 functions.reg_starts_white()
31 functions.reg_ends_white()
32 functions.reg_texcommand()
# Maps the stylesheet name 'wl2tex' to its resource path; get_stylesheet()
# looks names up in STYLESHEETS and hands the path to get_resource().
# NOTE(review): the enclosing dict literal is not visible in this view.
35 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """ inserts <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree

    Both the text and the tail of every element are split on `split_re';
    each match is replaced by an empty <tagname/> element whose tail holds
    the text that followed the match. The tree is modified in place and
    nothing is returned.

    doc: lxml element (or tree) to process
    split_re: compiled regular expression marking the places to cut
    tagname: tag of the elements inserted at the cuts
    exclude: optional collection of tags; an element carrying one of them
        is skipped entirely (neither its text nor its tail is split)

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        # text/tail are None when absent, and split() only accepts strings
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are taken from the end and each new element becomes the
            # first child, so the inserted elements end up in reading order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        parent = elem.getparent()
        # The pieces of a tail become siblings of `elem`; an element without
        # a parent has nowhere to put them, so its tail is left alone.
        if elem.tail and parent is not None:
            chunks = split_re.split(elem.tail)
            ins_index = parent.index(elem) + 1
            # Same back-to-front trick, inserting right after `elem`.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
# substitute_hyphens(doc): hyphen handling for the document tree.
# The pattern below matches a single '-' that has a character other than '-'
# or whitespace on each side, i.e. a hyphen between two word-like characters
# and not part of a run of dashes. Elements tagged DCNS("identifier.url") or
# DCNS("rights.license") are listed for exclusion.
# NOTE(review): the call that receives these arguments (presumably
# insert_tags) and the tag name it inserts are not visible in this view.
76 def substitute_hyphens(doc):
78 re.compile("(?<=[^-\s])-(?=[^-\s])"),
80 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
# NOTE(review): the two lines below look like arguments of a second, similar
# call whose `def` line is not visible -- presumably fix_hanging(), which
# transform() invokes. The pattern matches the whitespace run that follows a
# one-character word (a word character that is itself preceded by
# whitespace); the same two Dublin Core tags are excluded.
86 re.compile("(?<=\s\w)\s+"),
88 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
# move_motifs_inside(doc): for every `motyw` element that is a direct child
# of one of the master elements selected below, look at the siblings that
# follow it and detach the motif from its parent once a sibling with a tag
# outside the listed separator/marker tags is seen.
# NOTE(review): the statements that clear the motif's tail and re-attach the
# motif (presumably inside that sibling, per the docstring) are not visible
# in this view; neither is whatever stops the sibling scan.
92 def move_motifs_inside(doc):
93 """ moves motifs to be into block elements """
# master elements: the top-level text containers, wherever they occur
94 for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
# only motifs that are direct children of the master element
95 for motif in master.xpath('motyw'):
# itersiblings() without arguments walks the following siblings
96 for sib in motif.itersiblings():
# siblings with these tags are passed over as possible targets
97 if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
98 # motif shouldn't have a tail - it would be untagged text
100 motif.getparent().remove(motif)
# hack_motifs(doc): relocate motif markers inside stanzas; the docstring
# names the LaTeX marginpar problem this works around.
#
# Steps visible in this block, for every `motyw` found under a `strofa`:
#   * climb from the motif until the parent is the `strofa`; `verse` is then
#     the stanza's direct child that contains the motif (or the motif itself)
#   * count the `br` siblings that precede and that follow `verse`
#   * when `verse` has no `br` before it but at least one after it, or has
#     exactly one `br` after it, a deep copy of the motif is made, the copy's
#     tail is cleared, its `moved` attribute is set to str(move_by), and the
#     copy is added right after a `br` sibling that follows `verse`
#
# NOTE(review): the assignments to `move_by`, whatever is done to the
# original motif, and the logic choosing which `br` receives the copy are
# not visible in this view.
105 def hack_motifs(doc):
106 """ dirty hack for the marginpar-creates-orphans LaTeX problem
107 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
109 moves motifs in stanzas from first verse to second
110 and from next to last to last, then inserts negative vspace before them
112 for motif in doc.findall('//strofa//motyw'):
113 # find relevant verse-level tag
114 verse, stanza = motif, motif.getparent()
115 while stanza is not None and stanza.tag != 'strofa':
116 verse, stanza = stanza, stanza.getparent()
117 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
118 breaks_after = sum(1 for i in verse.itersiblings('br'))
119 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
121 if breaks_after == 2:
123 moved_motif = deepcopy(motif)
126 moved_motif.tail = None
127 moved_motif.set('moved', str(move_by))
129 for br in verse.itersiblings('br'):
133 br.addnext(moved_motif)
def parse_creator(doc):
    """ find all dc:creator and dc:contributor tags and add *_parsed versions with forenames first

    Handles dc:creator, dc:contributor.translator, dc:contributor.editor and
    dc:contributor.technical_editor. For each such element a deep copy is
    inserted as the first child of the element's parent, with:

    * the tag suffixed with `_parsed',
    * the original text kept in the `sortkey' attribute,
    * the text replaced by Person.from_text(<original text>).readable().

    Elements without any text are skipped. The tree is modified in place.
    """
    tags = ('creator', 'contributor.translator', 'contributor.editor',
            'contributor.technical_editor')
    persons = doc.xpath("|".join('//dc:' + tag for tag in tags),
                        namespaces={'dc': str(DCNS)})

    # Every copy is inserted at index 0 of its parent, so the matches are
    # walked backwards to keep the copies in document order.
    for person in reversed(persons):
        if not person.text:
            # nothing to parse, and an attribute value cannot be None
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """ looks `name' up in STYLESHEETS and resolves the entry with get_resource

    Raises KeyError when no stylesheet is registered under `name'.
    """
    resource_name = STYLESHEETS[name]
    return get_resource(resource_name)
# package_available(package, args='', verbose=False): probe the local TeX
# installation for a LaTeX package.
#
# Steps visible in this block:
#   * create a scratch directory with mkdtemp('-wl2pdf-test') and build the
#     path of a test.tex file inside it
#   * a template is filled in with (args, package) -- the template text and
#     the code writing it out are not visible in this view
#   * run xelatex on test.tex with -output-directory set to the scratch
#     directory, either plainly or with -interaction=batchmode and both
#     output streams set to PIPE (presumably chosen by `verbose`)
#   * remove the scratch directory with shutil.rmtree
#
# NOTE(review): the return statement is not visible; transform() uses the
# result as a truth value.
# NOTE(review): the subprocess documentation warns against passing
# stdout=PIPE / stderr=PIPE to call(): nothing reads the pipes, so the child
# can block once it has produced enough output to fill the pipe buffer.
# NOTE(review): no try/finally is visible around the xelatex call, so the
# scratch directory would be left behind if call() raises (e.g. xelatex is
# not installed) -- confirm against the full file.
156 def package_available(package, args='', verbose=False):
157 """ check if a verion of a latex package accepting given args is available """
158 tempdir = mkdtemp('-wl2pdf-test')
159 fpath = os.path.join(tempdir, 'test.tex')
166 """ % (args, package))
169 p = call(['xelatex', '-output-directory', tempdir, fpath])
171 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
172 shutil.rmtree(tempdir)
# transform(): render a document to PDF with XeLaTeX (parameters are
# described in the docstring below).
#
# Steps visible in this block:
#   * load the document through load_including_children(), by file_path or
#     by slug; ValueError is raised when both or neither are supplied (the
#     conditions guarding those raises are not visible in this view)
#   * record options as attributes on the root element: data-cover-width and
#     data-cover-height from `cover`, flag-<name>="yes" for flags, morefloats
#     (the lower-cased argument, or 'new' when package_available() accepts
#     morefloats with maxfloats=19), customizations joined with ','
#   * rewrite the tree: move_motifs_inside, hack_motifs, parse_creator,
#     substitute_hyphens, fix_hanging
#   * with make_dir and an output_dir, the author's name is appended to
#     output_dir
#   * apply the wl2tex stylesheet, feed the result through Texml's process()
#     into doc.tex inside a mkdtemp('-wl2pdf') directory, optionally copy
#     doc.tex to save_tex, copy wl.cls and wl-logo.png next to it and call
#     xelatex on doc.tex
#   * deliver doc.pdf: moved to <output_dir>/<slug>.pdf or
#     <output_dir>/<file_path stem>.pdf, or written to output_file when that
#     object has a write attribute, or copied to the path output_file
#   * ParseError("Error parsing .tex file") is raised after the xelatex call
#     (presumably on a non-zero exit status; the test is not visible)
#   * XMLSyntaxError / XSLTApplyError are caught at the end (the handler body
#     is not visible)
#
# NOTE(review): the subprocess documentation warns against stdout=PIPE /
# stderr=PIPE with call(): the pipes are never read, so xelatex can block
# once its output fills the pipe buffer.
# NOTE(review): cover.png is opened with mode 'w' and doc.pdf with the
# default text mode although both hold binary data; 'wb' / 'rb' look
# intended -- matters wherever text mode translates newlines.
# NOTE(review): `fout` is opened without a `with` block and no close() is
# visible here; likewise no cleanup of the temp directory is visible --
# confirm against the full file.
# NOTE(review): `except (...), e:` is Python 2-only syntax.
176 def transform(provider, slug=None, file_path=None,
177 output_file=None, output_dir=None, make_dir=False, verbose=False, save_tex=None, morefloats=None,
178 cover=None, flags=None, customizations=None):
179 """ produces a PDF file with XeLaTeX
181 provider: a DocProvider
182 slug: slug of file to process, available by provider
183 file_path can be provided instead of a slug
184 output_file: file-like object or path to output file
185 output_dir: path to directory to save output file to; either this or output_file must be present
186 make_dir: writes output to <output_dir>/<author>/<slug>.pdf istead of <output_dir>/<slug>.pdf
187 verbose: prints all output from LaTeX
188 save_tex: path to save the intermediary LaTeX file to
189 morefloats (old/new/none): force specific morefloats
190 cover: a cover.Cover object
191 flags: less-advertising,
192 customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
199 raise ValueError('slug or file_path should be specified, not both')
200 document = load_including_children(provider, file_path=file_path)
203 raise ValueError('either slug or file_path should be specified')
204 document = load_including_children(provider, slug=slug)
207 document.edoc.getroot().set('data-cover-width', str(cover.width))
208 document.edoc.getroot().set('data-cover-height', str(cover.height))
211 document.edoc.getroot().set('flag-' + flag, 'yes')
213 # check for LaTeX packages
215 document.edoc.getroot().set('morefloats', morefloats.lower())
216 elif package_available('morefloats', 'maxfloats=19'):
217 document.edoc.getroot().set('morefloats', 'new')
220 if customizations is not None:
221 document.edoc.getroot().set('customizations', u','.join(customizations))
224 move_motifs_inside(document.edoc)
225 hack_motifs(document.edoc)
226 parse_creator(document.edoc)
227 substitute_hyphens(document.edoc)
228 fix_hanging(document.edoc)
231 if make_dir and output_dir is not None:
232 author = unicode(document.book_info.author)
233 output_dir = os.path.join(output_dir, author)
236 style_filename = get_stylesheet("wl2tex")
237 style = etree.parse(style_filename)
239 texml = document.transform(style)
242 temp = mkdtemp('-wl2pdf')
245 c = cover(document.book_info.author.readable(), document.book_info.title)
246 with open(os.path.join(temp, 'cover.png'), 'w') as f:
249 del document # no longer needed large object :)
251 tex_path = os.path.join(temp, 'doc.tex')
252 fout = open(tex_path, 'w')
253 process(StringIO(texml), fout, 'utf-8')
258 shutil.copy(tex_path, save_tex)
261 shutil.copy(get_resource('pdf/wl.cls'), temp)
262 shutil.copy(get_resource('res/wl-logo.png'), temp)
268 p = call(['xelatex', tex_path])
270 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
272 raise ParseError("Error parsing .tex file")
277 pdf_path = os.path.join(temp, 'doc.pdf')
278 if output_dir is not None:
280 os.makedirs(output_dir)
284 output_path = os.path.join(output_dir, '%s.pdf' % slug)
286 output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.pdf')
287 shutil.move(pdf_path, output_path)
289 if hasattr(output_file, 'write'):
291 with open(pdf_path) as f:
292 output_file.write(f.read())
295 # path to output file
296 shutil.copy(pdf_path, output_file)
299 except (XMLSyntaxError, XSLTApplyError), e:
# load_including_children(): build one document out of a book and all of
# its parts.
#
# Steps visible in this block:
#   * obtain a file object from provider.by_uri(uri) or open(file_path, 'r');
#     ValueError is raised when no source was given (the branch handling
#     `slug` is not visible in this view)
#   * read the content and decode it as UTF-8
#   * wrap every run of characters in U+0400-U+04FF (the Unicode Cyrillic
#     block) in <alien>...</alien>
#   * parse the text with WLDocument.from_string(text, True,
#     parse_dublincore=True)
#   * for every URI in document.book_info.parts, load that child recursively
#     and append its root element to this document's root element
#
# NOTE(review): the docstring says "either slug or uri must be provided",
# but the code also accepts file_path.
# NOTE(review): no close() of `f` and no return statement are visible here;
# transform() uses the return value as the document -- confirm against the
# full file.
# NOTE(review): the ur"..." literals are Python 2-only syntax.
303 def load_including_children(provider, slug=None, uri=None, file_path=None):
304 """ makes one big xml file with children inserted at end
305 either slug or uri must be provided
309 f = provider.by_uri(uri)
313 f = open(file_path, 'r')
315 raise ValueError('Neither slug, URI nor file path provided for a book.')
317 text = f.read().decode('utf-8')
318 text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
320 document = WLDocument.from_string(text, True,
321 parse_dublincore=True)
324 for child_uri in document.book_info.parts:
326 child = load_including_children(provider, uri=child_uri)
327 document.edoc.getroot().append(child.edoc.getroot())