1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
10 from StringIO import StringIO
11 from tempfile import mkdtemp, NamedTemporaryFile
13 from copy import deepcopy
14 from subprocess import call, PIPE
16 from Texml.processor import process
17 from lxml import etree
18 from lxml.etree import XMLSyntaxError, XSLTApplyError
20 from librarian.dcparser import Person
21 from librarian.parser import WLDocument
22 from librarian import ParseError, DCNS, get_resource, OutputFile
23 from librarian import functions
24 from librarian.cover import ImageCover as WLCover
# Module-level setup, executed once at import time: call the reg_* helpers
# from librarian.functions.
# NOTE(review): presumably these register extension functions used by the
# wl2tex stylesheet -- confirm in librarian.functions.
27 functions.reg_substitute_entities()
29 functions.reg_starts_white()
30 functions.reg_ends_white()
31 functions.reg_texcommand()
# Entry of the STYLESHEETS mapping read by get_stylesheet(): stylesheet name ->
# resource path (the opening line of the mapping is not visible in this extract).
34 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """Insert an empty <tagname> at every occurrence of `split_re` in text nodes.

    The text and the tail of every element in the `doc` tree are split on
    `split_re`; each matched separator is replaced by an empty `tagname`
    element.  The tree is modified in place and nothing is returned.

    doc: lxml element (or element tree) to process
    split_re: compiled regular expression marking the split points
    tagname: tag of the elements to insert
    exclude: optional collection of tags; the text and tail of elements
        carrying one of these tags are skipped (their descendants are
        still visited)

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>')
    >>> insert_tags(t, re.compile('-'), 'd')
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are consumed from the end and always inserted at index
            # 0, so the separators end up in document order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        if elem.tail:
            parent = elem.getparent()
            if parent is None:
                # A parentless element has no place for following siblings;
                # leave its tail alone instead of failing on None.
                continue
            chunks = split_re.split(elem.tail)
            ins_index = parent.index(elem) + 1
            # Same trick as above: inserting right after `elem` from the
            # last chunk to the first keeps the original order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
def substitute_hyphens(doc):
    """Replace hyphens joining two characters with a dedicated element.

    A hyphen is replaced only when the characters on both sides of it are
    neither whitespace nor another hyphen.  The dc:identifier.url and
    dc:rights.license elements are excluded.  `doc` is modified in place.
    """
    insert_tags(
        doc,
        re.compile(r"(?<=[^-\s])-(?=[^-\s])"),
        # NOTE(review): the tag name is not visible in this extract; "dywiz"
        # is assumed -- confirm against the wl2tex stylesheet.
        "dywiz",
        exclude=[DCNS("identifier.url"), DCNS("rights.license")]
    )
def fix_hanging(doc):
    """Replace the whitespace after a one-character word with an element.

    The pattern matches a run of whitespace that follows a single word
    character which is itself preceded by whitespace.  The
    dc:identifier.url and dc:rights.license elements are excluded.
    `doc` is modified in place.
    """
    insert_tags(
        doc,
        re.compile(r"(?<=\s\w)\s+"),
        # NOTE(review): the tag name is not visible in this extract; "nbsp"
        # is assumed -- confirm against the wl2tex stylesheet.
        "nbsp",
        exclude=[DCNS("identifier.url"), DCNS("rights.license")]
    )
def move_motifs_inside(doc):
    """Move motifs that are direct children of a master element into a block.

    Every `motyw` found directly under one of the master elements is moved
    to the beginning of its first following sibling that is not one of the
    non-block tags listed below.  A motif with no such sibling stays where
    it is.  `doc` is modified in place.
    """
    masters = doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny')
    # Siblings that must not receive a motif.
    non_blocks = ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga')

    for master in masters:
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag in non_blocks:
                    continue
                # motif shouldn't have a tail - it would be untagged text
                motif.tail = None
                motif.getparent().remove(motif)
                sib.insert(0, motif)
                break
def hack_motifs(doc):
    """Dirty hack for the marginpar-creates-orphans LaTeX problem.

    See http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304

    Moves motifs in stanzas from the first verse to the second and from
    the next-to-last verse to the last.  The moved copy records how many
    verses it was moved by in its `moved` attribute (presumably used by the
    stylesheet to insert the negative vspace -- confirm).  `doc` is
    modified in place.
    """
    # A relative path works for both an Element and an ElementTree; the
    # absolute form '//strofa//motyw' is rejected on elements and only
    # tolerated (with a warning) on trees.
    for motif in doc.findall('.//strofa//motyw'):
        # find relevant verse-level tag: the ancestor-or-self that is a
        # direct child of the stanza
        verse, stanza = motif, motif.getparent()
        while stanza is not None and stanza.tag != 'strofa':
            verse, stanza = stanza, stanza.getparent()

        # <br> siblings separate verses, so counting them locates the verse
        breaks_before = sum(1 for _ in verse.itersiblings('br', preceding=True))
        breaks_after = sum(1 for _ in verse.itersiblings('br'))
        if not ((breaks_before == 0 and breaks_after > 0) or breaks_after == 1):
            continue

        move_by = 1
        if breaks_after == 2:
            move_by += 1

        moved_motif = deepcopy(motif)
        # NOTE(review): the two statements that neutralise the original
        # motif are not visible in this extract; turning it into an empty
        # <span> is assumed -- confirm.
        motif.tag = 'span'
        motif.text = None
        moved_motif.tail = None
        moved_motif.set('moved', str(move_by))

        # Skip `move_by - 1` line breaks, then attach the copy right after
        # the next one.
        for br in verse.itersiblings('br'):
            if move_by > 1:
                move_by -= 1
                continue
            br.addnext(moved_motif)
            break
def parse_creator(doc):
    """Add *_parsed copies of dc:creator and dc:contributor.* elements.

    For every dc:creator, dc:contributor.translator, dc:contributor.editor
    and dc:contributor.technical_editor element with non-empty text, a copy
    tagged `<original tag>_parsed` is inserted at the beginning of the
    parent.  The copy keeps the original text in its `sortkey` attribute and
    carries the readable form of the name (forenames first) as its text.
    `doc` is modified in place.
    """
    person_tags = (
        'creator', 'contributor.translator', 'contributor.editor', 'contributor.technical_editor')
    query = "|".join('//dc:' + tag for tag in person_tags)
    persons = doc.xpath(query, namespaces={'dc': str(DCNS)})

    # Walk backwards: every copy is inserted at index 0, so reversed
    # iteration leaves the copies in document order.
    for person in persons[::-1]:
        if not person.text:
            # nothing to parse
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """Return get_resource() of the path registered under `name` in STYLESHEETS.

    Raises KeyError when `name` is not a known stylesheet.
    """
    resource_path = STYLESHEETS[name]
    return get_resource(resource_path)
def package_available(package, args='', verbose=False):
    """Check if a version of a LaTeX package accepting given args is available.

    Compiles a minimal document loading `package` with options `args` by
    running xelatex in a throw-away directory.

    package: name of the LaTeX package
    args: option string passed to the package
    verbose: if true, let xelatex print its output; otherwise run it in
        batch mode with the output discarded

    Returns True when xelatex exits with status 0.
    """
    tempdir = mkdtemp('-wl2pdf-test')
    try:
        fpath = os.path.join(tempdir, 'test.tex')
        # NOTE(review): the body of the test document is not visible in this
        # extract; only the (args, package) substitution order is.  The
        # document class below is assumed -- confirm.
        with open(fpath, 'w') as f:
            f.write(r"""
        \documentclass{book}
        \usepackage[%s]{%s}
        \begin{document}
        \end{document}
        """ % (args, package))

        if verbose:
            p = call(['xelatex', '-output-directory', tempdir, fpath])
        else:
            # Discard the output instead of piping it: call() never reads
            # from a PIPE, so a chatty child could block forever.
            with open(os.devnull, 'w') as devnull:
                p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath],
                         stdout=devnull, stderr=devnull)
    finally:
        # Always clean up, also when xelatex cannot be started.
        shutil.rmtree(tempdir)
    return p == 0
# Produce a PDF from `wldoc`: build one XML tree, convert it to TeXML with the
# wl2tex stylesheet, turn that into LaTeX and run XeLaTeX on it.
# NOTE(review): this extract has lost the function's indentation and a number
# of its lines (including the end of the signature), so the comments below
# describe only the statements that are visible.
176 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
177 cover=None, flags=None, customizations=None,
179 """ produces a PDF file with XeLaTeX
182 verbose: prints all output from LaTeX
183 save_tex: path to save the intermediary LaTeX file to
184 morefloats (old/new/none): force specific morefloats
185 cover: a cover.Cover object
186 flags: less-advertising,
187 customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
# Merge the document with all of its child parts into a single tree.
192 document = load_including_children(wldoc)
# Instantiate the cover for this book and record its dimensions as attributes
# of the root element.
197 the_cover = cover(document.book_info)
198 document.edoc.getroot().set('data-cover-width', str(the_cover.width))
199 document.edoc.getroot().set('data-cover-height', str(the_cover.height))
# Cover attribution is copied from book_info only when the cover reports
# uses_dc_cover, and only for the fields that are set.
200 if the_cover.uses_dc_cover:
201 if document.book_info.cover_by:
202 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
203 if document.book_info.cover_source:
204 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
# Each flag becomes a flag-<name>="yes" attribute on the root element.
207 document.edoc.getroot().set('flag-' + flag, 'yes')
209 # check for LaTeX packages
# An explicitly requested morefloats value is stored lower-cased; otherwise
# 'new' is stored when the package accepts the maxfloats=19 option.
211 document.edoc.getroot().set('morefloats', morefloats.lower())
212 elif package_available('morefloats', 'maxfloats=19'):
213 document.edoc.getroot().set('morefloats', 'new')
# Customizations are stored as one comma-separated root attribute.
216 if customizations is not None:
217 document.edoc.getroot().set('customizations', u','.join(customizations))
# NOTE(review): the two motif passes below are commented out.
220 #move_motifs_inside(document.edoc)
221 #hack_motifs(document.edoc)
222 parse_creator(document.edoc)
# Hyphen substitution is applied to books whose language is 'pol'.
# NOTE(review): with the indentation lost it is unclear whether fix_hanging is
# also inside this condition -- confirm.
223 if document.book_info.language == 'pol':
224 substitute_hyphens(document.edoc)
225 fix_hanging(document.edoc)
# Load the wl2tex XSLT stylesheet and apply it to the document to get TeXML.
228 style_filename = get_stylesheet("wl2tex")
229 style = etree.parse(style_filename)
231 texml = document.transform(style)
# NOTE(review): etree.dump() prints the whole TeXML tree to stdout -- looks
# like leftover debug output.
232 etree.dump(texml.getroot())
# Working directory for the LaTeX run.
234 temp = mkdtemp('-wl2pdf')
# NOTE(review): presumably the cover image is written here; it is binary data,
# so mode 'wb' would be safer than 'w' -- confirm.
237 with open(os.path.join(temp, 'cover.jpg'), 'w') as f:
# NOTE(review): these EPS files are looked up relative to the current working
# directory rather than through get_resource().
240 shutil.copy("cce_trust.eps", temp)
241 shutil.copy("logo.eps", temp)
# Copy the file named by each <ilustr> element's src attribute from imgdir
# into the working directory.
# NOTE(review): `imgdir` is not defined in the visible part of the signature,
# and the print statement looks like leftover debug output.
242 for img in document.edoc.findall('//ilustr'):
243 print "--->> %s %s %s" % (imgdir, img, img.get('src'))
244 shutil.copy(os.path.join(imgdir, img.get('src')), temp)
247 del document # no longer needed large object :)
# Convert the TeXML to LaTeX source in <temp>/doc.tex.
249 tex_path = os.path.join(temp, 'doc.tex')
250 fout = open(tex_path, 'w')
251 process(StringIO(texml), fout, 'utf-8')
# Keep a copy of the intermediate .tex file (see save_tex above).
256 shutil.copy(tex_path, save_tex)
# Files copied next to doc.tex for the LaTeX run.
# NOTE(review): logo.eps was already copied above.
259 shutil.copy(get_resource('pdf/wl.cls'), temp)
260 shutil.copy(get_resource('res/wl-logo.png'), temp)
261 shutil.copy('logo.eps', temp)
# NOTE(review): the subprocess documentation warns that call() with
# stdout=PIPE/stderr=PIPE can deadlock when the child produces a lot of output.
267 p = call(['xelatex', tex_path])
269 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
271 raise ParseError("Error parsing .tex file")
# Move the resulting PDF out of the working directory into a named temporary
# file that is not deleted on close, and return it wrapped in an OutputFile.
275 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
276 pdf_path = os.path.join(temp, 'doc.pdf')
277 shutil.move(pdf_path, output_file.name)
279 return OutputFile.from_filename(output_file.name)
# Handler for XML/XSLT errors (its body is not visible in this extract).
281 except (XMLSyntaxError, XSLTApplyError), e:
def load_including_children(wldoc=None, provider=None, uri=None):
    """Make one big XML document with children inserted at the end.

    Either `wldoc`, or both `provider` and `uri`, must be provided.

    wldoc: a WLDocument to start from; its provider is used for children
    provider: object whose by_uri(uri) returns a readable file
    uri: URI of the document to load through `provider`

    Returns a WLDocument whose root element has the root elements of all
    parts listed in book_info.parts appended (recursively).

    Raises ValueError when neither a document nor a provider/URI pair is
    given.
    """
    if uri and provider:
        f = provider.by_uri(uri)
        try:
            text = f.read().decode('utf-8')
        finally:
            # release the handle even if reading or decoding fails
            f.close()
    elif wldoc is not None:
        text = etree.tostring(wldoc.edoc, encoding=unicode)
        provider = wldoc.provider
    else:
        raise ValueError('Neither a WLDocument, nor provider and URI were provided.')

    # Wrap every run of characters from U+0400-U+04FF in an <alien> element.
    # NOTE(review): this runs on the serialized text, so it also affects
    # such characters inside attribute values and metadata -- confirm that
    # is intended.
    text = re.sub(u"([\u0400-\u04ff]+)", u"<alien>\\1</alien>", text)

    document = WLDocument.from_string(text, parse_dublincore=True)
    document.swap_endlines()

    for child_uri in document.book_info.parts:
        child = load_including_children(provider=provider, uri=child_uri)
        document.edoc.getroot().append(child.edoc.getroot())
    return document