1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
10 from StringIO import StringIO
11 from tempfile import mkdtemp, NamedTemporaryFile
13 from copy import deepcopy
14 from subprocess import call, PIPE
16 from Texml.processor import process
17 from lxml import etree
18 from lxml.etree import XMLSyntaxError, XSLTApplyError
20 from librarian.dcparser import Person
21 from librarian.parser import WLDocument
22 from librarian import ParseError, DCNS, get_resource, OutputFile
23 from librarian import functions
26 functions.reg_substitute_entities()
28 functions.reg_starts_white()
29 functions.reg_ends_white()
30 functions.reg_texcommand()
33 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """ inserts <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree

    doc: lxml element or tree; it is modified in place
    split_re: compiled regular expression; every match found in an element's
        text or tail is cut out and an empty <tagname/> element takes its place
    tagname: tag given to the inserted elements
    exclude: optional collection of tags; the text and tail of an element
        with one of these tags are left untouched (its descendants are
        still visited)

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """

    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        # Text before the first child: new elements become leading children.
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are taken from the end and always inserted at index 0,
            # so the inserted elements end up in document order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        # Text after the element: new elements become following siblings.
        if elem.tail:
            parent = elem.getparent()
            if parent is None:
                # No parent to hold the new siblings; leave the tail as is.
                continue
            chunks = split_re.split(elem.tail)
            ins_index = parent.index(elem) + 1
            # Same back-to-front trick, anchored right after `elem'.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
74 def substitute_hyphens(doc):
76 re.compile("(?<=[^-\s])-(?=[^-\s])"),
78 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
84 re.compile("(?<=\s\w)\s+"),
86 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
def move_motifs_inside(doc):
    """ moves motifs to be into block elements

    Every <motyw> that is a direct child of one of the master elements is
    detached from it and re-attached as the first child of the nearest
    following sibling whose tag is not on the skip list below. A motif with
    no such sibling is left where it is.

    doc: lxml element or tree; it is modified in place
    """
    master_tags = (
        'powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp',
        'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny',
    )
    # Siblings with these tags are passed over when looking for a new home.
    skipped_tags = frozenset((
        'sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia',
        'begin', 'end', 'motyw', 'extra', 'uwaga',
    ))

    for master in doc.xpath('|'.join('//' + tag for tag in master_tags)):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag in skipped_tags:
                    continue
                # motif shouldn't have a tail - it would be untagged text
                motif.tail = None
                motif.getparent().remove(motif)
                sib.insert(0, motif)
                # The motif has been moved; stop scanning its old siblings.
                break
103 def hack_motifs(doc):
104 """ dirty hack for the marginpar-creates-orphans LaTeX problem
105 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
107 moves motifs in stanzas from first verse to second
108 and from next to last to last, then inserts negative vspace before them
110 for motif in doc.findall('//strofa//motyw'):
111 # find relevant verse-level tag
112 verse, stanza = motif, motif.getparent()
113 while stanza is not None and stanza.tag != 'strofa':
114 verse, stanza = stanza, stanza.getparent()
115 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
116 breaks_after = sum(1 for i in verse.itersiblings('br'))
117 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
119 if breaks_after == 2:
121 moved_motif = deepcopy(motif)
124 moved_motif.tail = None
125 moved_motif.set('moved', str(move_by))
127 for br in verse.itersiblings('br'):
131 br.addnext(moved_motif)
def parse_creator(doc):
    """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first

    For each matching element that has text, a copy tagged `<tag>_parsed' is
    inserted as the first child of the same parent. The copy's text is
    Person.from_text(...).readable() and its `sortkey' attribute keeps the
    original text. Elements without text are skipped.

    doc: lxml element or tree; it is modified in place
    """
    tags = ('creator', 'contributor.translator',
            'contributor.editor', 'contributor.technical_editor')
    query = "|".join('//dc:' + tag for tag in tags)
    people = doc.xpath(query, namespaces={'dc': str(DCNS)})

    # Walk the matches backwards: every copy is inserted at index 0, so
    # reversing keeps the copies in the same relative order as the originals.
    for person in people[::-1]:
        if not person.text:
            # nothing to parse
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """ returns get_resource() for the STYLESHEETS entry registered under `name'

    Raises KeyError when `name' is not a key of STYLESHEETS.
    """
    relative_path = STYLESHEETS[name]
    return get_resource(relative_path)
154 def package_available(package, args='', verbose=False):
155 """ check if a verion of a latex package accepting given args is available """
156 tempdir = mkdtemp('-wl2pdf-test')
157 fpath = os.path.join(tempdir, 'test.tex')
164 """ % (args, package))
167 p = call(['xelatex', '-output-directory', tempdir, fpath])
169 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
170 shutil.rmtree(tempdir)
174 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
175 cover=None, flags=None, customizations=None):
176 """ produces a PDF file with XeLaTeX
179 verbose: prints all output from LaTeX
180 save_tex: path to save the intermediary LaTeX file to
181 morefloats (old/new/none): force specific morefloats
182 cover: a cover.Cover object
183 flags: less-advertising,
184 customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
189 document = load_including_children(wldoc)
192 document.edoc.getroot().set('data-cover-width', str(cover.width))
193 document.edoc.getroot().set('data-cover-height', str(cover.height))
196 document.edoc.getroot().set('flag-' + flag, 'yes')
198 # check for LaTeX packages
200 document.edoc.getroot().set('morefloats', morefloats.lower())
201 elif package_available('morefloats', 'maxfloats=19'):
202 document.edoc.getroot().set('morefloats', 'new')
205 if customizations is not None:
206 document.edoc.getroot().set('customizations', u','.join(customizations))
209 move_motifs_inside(document.edoc)
210 hack_motifs(document.edoc)
211 parse_creator(document.edoc)
212 substitute_hyphens(document.edoc)
213 fix_hanging(document.edoc)
216 style_filename = get_stylesheet("wl2tex")
217 style = etree.parse(style_filename)
219 texml = document.transform(style)
222 temp = mkdtemp('-wl2pdf')
225 c = cover(document.book_info.author.readable(), document.book_info.title)
226 with open(os.path.join(temp, 'cover.png'), 'w') as f:
229 del document # no longer needed large object :)
231 tex_path = os.path.join(temp, 'doc.tex')
232 fout = open(tex_path, 'w')
233 process(StringIO(texml), fout, 'utf-8')
238 shutil.copy(tex_path, save_tex)
241 shutil.copy(get_resource('pdf/wl.cls'), temp)
242 shutil.copy(get_resource('res/wl-logo.png'), temp)
248 p = call(['xelatex', tex_path])
250 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
252 raise ParseError("Error parsing .tex file")
256 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
257 pdf_path = os.path.join(temp, 'doc.pdf')
258 shutil.move(pdf_path, output_file.name)
260 return OutputFile.from_filename(output_file.name)
262 except (XMLSyntaxError, XSLTApplyError), e:
266 def load_including_children(wldoc=None, provider=None, uri=None):
267 """ Makes one big xml file with children inserted at end.
269 Either wldoc or provider and URI must be provided.
273 f = provider.by_uri(uri)
274 text = f.read().decode('utf-8')
276 elif wldoc is not None:
277 text = etree.tostring(wldoc.edoc, encoding=unicode)
278 provider = wldoc.provider
280 raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
282 text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
284 document = WLDocument.from_string(text, parse_dublincore=True)
285 document.swap_endlines()
287 for child_uri in document.book_info.parts:
288 child = load_including_children(provider=provider, uri=child_uri)
289 document.edoc.getroot().append(child.edoc.getroot())