1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
10 from StringIO import StringIO
11 from tempfile import mkdtemp, NamedTemporaryFile
13 from copy import deepcopy
14 from subprocess import call, PIPE
16 from Texml.processor import process
17 from lxml import etree
18 from lxml.etree import XMLSyntaxError, XSLTApplyError
20 from librarian.dcparser import Person
21 from librarian.parser import WLDocument
22 from librarian import ParseError, DCNS, get_resource, OutputFile
23 from librarian import functions
24 from librarian.cover import WLCover
# Module-level calls into librarian.functions.
# Presumably each one registers a helper (entity substitution, whitespace
# tests, TeX commands) for later use by the stylesheet - TODO confirm in
# librarian.functions.
# NOTE(review): the last line of this span is a single entry of a mapping
# whose opening and closing lines are not visible here; presumably it is the
# STYLESHEETS table that get_stylesheet() indexes by name (transform() asks
# for the name "wl2tex").
27 functions.reg_substitute_entities()
29 functions.reg_starts_white()
30 functions.reg_ends_white()
31 functions.reg_texcommand()
34 'wl2tex': 'pdf/wl2tex.xslt',
def insert_tags(doc, split_re, tagname, exclude=None):
    """ inserts <tagname> for every occurrence of `split_re' in text nodes in the `doc' tree

    Every match of `split_re' is removed from the text and an empty
    <tagname/> element is put in its place.  Both the text and the tail of
    each element are processed; elements whose tag is listed in `exclude'
    are skipped entirely (text and tail).

    doc: lxml element or tree, modified in place
    split_re: compiled regular expression matching the separator
    tagname: tag of the elements to insert
    exclude: optional collection of tags to leave untouched

    >>> t = etree.fromstring('<a><b>A-B-C</b>X-Y-Z</a>');
    >>> insert_tags(t, re.compile('-'), 'd');
    >>> print etree.tostring(t)
    <a><b>A<d/>B<d/>C</b>X<d/>Y<d/>Z</a>
    """
    for elem in doc.iter(tag=etree.Element):
        if exclude and elem.tag in exclude:
            continue

        # Elements without text have nothing to split (text is None).
        if elem.text:
            chunks = split_re.split(elem.text)
            # Chunks are taken from the end and each new element goes to
            # index 0, so the inserted elements end up in document order.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                elem.insert(0, ins)
            elem.text = chunks.pop(0)

        # The tail lives in the parent, right after `elem'.
        if elem.tail:
            chunks = split_re.split(elem.tail)
            parent = elem.getparent()
            ins_index = parent.index(elem) + 1
            # Same trick as above: fixed insertion point, chunks from the end.
            while len(chunks) > 1:
                ins = etree.Element(tagname)
                ins.tail = chunks.pop()
                parent.insert(ins_index, ins)
            elem.tail = chunks.pop(0)
# The pattern matches a single "-" that has a character other than "-" or
# whitespace directly before it and directly after it (look-behind and
# look-ahead), i.e. a hyphen between two such characters; runs of hyphens and
# hyphens next to whitespace do not match.
# The exclude list names two tags built with DCNS (identifier.url and
# rights.license) - presumably Dublin Core elements whose text must stay
# verbatim; TODO confirm.
# NOTE(review): the call that receives these arguments, and the name of the
# tag that is inserted, are on lines not visible in this span; presumably it
# is a call to insert_tags(doc, <pattern>, <tag>, exclude=...) - confirm.
75 def substitute_hyphens(doc):
77 re.compile("(?<=[^-\s])-(?=[^-\s])"),
79 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
# NOTE(review): the enclosing definition's header is not visible in this span;
# presumably these are arguments of fix_hanging(), which transform() calls.
# The pattern matches the run of whitespace that follows a single word
# character which is itself preceded by whitespace, i.e. the gap after a
# one-character word. A one-character word at the very start of a text node
# is not matched, because the look-behind requires preceding whitespace.
# The same two DCNS-built tags as in substitute_hyphens() are excluded.
85 re.compile("(?<=\s\w)\s+"),
87 exclude=[DCNS("identifier.url"), DCNS("rights.license")]
def move_motifs_inside(doc):
    """ moves motifs to be into block elements

    A <motyw> that is a direct child of one of the master elements is moved
    to the beginning of its first following sibling that is not one of the
    separator/marker tags listed below.  A motif with no such following
    sibling is left where it is.

    doc: lxml tree, modified in place
    """
    for master in doc.xpath('//powiesc|//opowiadanie|//liryka_l|//liryka_lp|//dramat_wierszowany_l|//dramat_wierszowany_lp|//dramat_wspolczesny'):
        for motif in master.xpath('motyw'):
            for sib in motif.itersiblings():
                if sib.tag not in ('sekcja_swiatlo', 'sekcja_asterysk', 'separator_linia', 'begin', 'end', 'motyw', 'extra', 'uwaga'):
                    # motif shouldn't have a tail - it would be untagged text
                    motif.tail = None
                    motif.getparent().remove(motif)
                    sib.insert(0, motif)
                    # the motif has found its block; stop scanning siblings
                    break
# Outline of the steps that can be read in this span (several lines of the
# function are not visible here, so nothing beyond them is described):
#  - every <motyw> found below a <strofa> is examined;
#  - starting at the motif, parents are climbed until the parent is the
#    <strofa>; `verse` is then the stanza's direct child containing the motif;
#  - the <br> siblings before and after `verse` are counted;
#  - the motif is handled when `verse` has no <br> before it and at least one
#    after it, or when exactly one <br> follows it (per the docstring: first
#    verse or next-to-last verse - presumably <br> separates verses; confirm);
#  - a deep copy of the motif gets its tail dropped and a `moved` attribute
#    set to str(move_by), and is attached directly after a following <br>.
# NOTE(review): `move_by` is assigned on lines not visible here, as is
# whatever happens to the original motif once it has been copied.
# NOTE(review): findall() is given a path starting with '//'; check the lxml
# documentation for how such a path is treated on a tree versus an element.
104 def hack_motifs(doc):
105 """ dirty hack for the marginpar-creates-orphans LaTeX problem
106 see http://www.latex-project.org/cgi-bin/ltxbugs2html?pr=latex/2304
108 moves motifs in stanzas from first verse to second
109 and from next to last to last, then inserts negative vspace before them
111 for motif in doc.findall('//strofa//motyw'):
112 # find relevant verse-level tag
113 verse, stanza = motif, motif.getparent()
114 while stanza is not None and stanza.tag != 'strofa':
115 verse, stanza = stanza, stanza.getparent()
116 breaks_before = sum(1 for i in verse.itersiblings('br', preceding=True))
117 breaks_after = sum(1 for i in verse.itersiblings('br'))
118 if (breaks_before == 0 and breaks_after > 0) or breaks_after == 1:
120 if breaks_after == 2:
122 moved_motif = deepcopy(motif)
125 moved_motif.tail = None
126 moved_motif.set('moved', str(move_by))
128 for br in verse.itersiblings('br'):
132 br.addnext(moved_motif)
def parse_creator(doc):
    """ find all dc:creator and dc.contributor tags and add *_parsed versions with forenames first

    For every matching element a copy is made whose tag has `_parsed'
    appended, whose text is the result of Person.readable() and whose
    `sortkey' attribute keeps the original text.  The copy is inserted as
    the first child of the original element's parent.  Elements with no
    text are skipped, as there is no name to parse.

    doc: lxml tree, modified in place
    """
    person_tags = ('creator', 'contributor.translator', 'contributor.editor',
                   'contributor.technical_editor')
    query = "|".join('//dc:' + tag for tag in person_tags)
    matches = doc.xpath(query, namespaces={'dc': str(DCNS)})

    # Walk the matches backwards: each copy goes to index 0 of its parent,
    # so for siblings the copies end up in the original document order.
    for person in matches[::-1]:
        if not person.text:
            continue
        p = Person.from_text(person.text)
        person_parsed = deepcopy(person)
        person_parsed.tag = person.tag + '_parsed'
        person_parsed.set('sortkey', person.text)
        person_parsed.text = p.readable()
        person.getparent().insert(0, person_parsed)
def get_stylesheet(name):
    """ returns the resource location of the stylesheet registered as `name' in STYLESHEETS """
    resource_name = STYLESHEETS[name]
    return get_resource(resource_name)
# Probes a LaTeX package by running xelatex on a throw-away test.tex placed in
# a fresh temporary directory (suffix '-wl2pdf-test'), which is removed again
# after xelatex returns.
# The two call() lines differ only in verbosity: the first leaves xelatex
# attached to the caller's output, the second runs it in batch mode with
# stdout and stderr redirected to pipes (presumably selected by `verbose`;
# the branch lines are not visible here).
# NOTE(review): the lines writing test.tex are not visible in this span; what
# can be seen is the end of a template filled with (args, package), in that
# order. The return statement is not visible either.
# NOTE(review): the subprocess documentation warns against stdout=PIPE /
# stderr=PIPE with call(), since nothing reads the pipes and a chatty child
# can block; consider redirecting to os.devnull instead.
# NOTE(review): as far as visible, rmtree() is not in a finally clause, so
# the directory would be left behind if call() raises (e.g. no xelatex).
155 def package_available(package, args='', verbose=False):
156 """ check if a verion of a latex package accepting given args is available """
157 tempdir = mkdtemp('-wl2pdf-test')
158 fpath = os.path.join(tempdir, 'test.tex')
165 """ % (args, package))
168 p = call(['xelatex', '-output-directory', tempdir, fpath])
170 p = call(['xelatex', '-interaction=batchmode', '-output-directory', tempdir, fpath], stdout=PIPE, stderr=PIPE)
171 shutil.rmtree(tempdir)
# Pipeline as far as it can be read in this span (many lines of the function
# are not visible here; the conditions guarding several steps are among them):
#  1. load_including_children() builds one document from wldoc and its parts.
#  2. Options are recorded as attributes on the root element: cover width and
#     height, cover_by / cover_source from book_info when cover.uses_dc_cover
#     is set, one flag-<name>="yes" per flag, the morefloats mode, and the
#     customizations joined with commas. An explicit `morefloats` value is
#     stored lower-cased; otherwise "new" is stored when package_available()
#     reports that the morefloats package accepts maxfloats=19.
#  3. The tree is rewritten in place by move_motifs_inside, hack_motifs,
#     parse_creator, substitute_hyphens and fix_hanging, in that order.
#  4. The tree is transformed with the "wl2tex" stylesheet; the result is
#     passed to Texml's process(), which writes doc.tex (utf-8) into a fresh
#     temporary directory (suffix '-wl2pdf'). The cover image, wl.cls and the
#     logo files are placed in the same directory.
#  5. doc.tex is copied to `save_tex` (per the docstring: when requested).
#  6. xelatex is run on doc.tex, either attached to the caller's output or in
#     batch mode with output redirected to pipes; ParseError("Error parsing
#     .tex file") is raised on failure (the test itself is not visible).
#  7. doc.pdf is moved over a NamedTemporaryFile created with delete=False,
#     so the file outlives this function, and is returned as an OutputFile.
#  8. XMLSyntaxError / XSLTApplyError are caught at the end; the handler body
#     is not visible here.
# NOTE(review): cover.png is opened with mode 'w' (text); PNG data is binary,
# so 'wb' would be the safe choice on platforms that translate newlines.
# NOTE(review): xelatex is called without -output-directory while doc.pdf is
# expected in the temp dir - presumably the working directory is changed on
# lines not visible here; confirm.
# NOTE(review): the subprocess documentation warns against stdout=PIPE /
# stderr=PIPE with call(): nothing reads the pipes, so a chatty child can
# block.
# NOTE(review): closing `fout` and removing the temp dir are not visible in
# this span; confirm both happen, also on the error paths.
# NOTE(review): "except (...), e" is Python 2 only; "except (...) as e" works
# on Python 2.6+ as well.
175 def transform(wldoc, verbose=False, save_tex=None, morefloats=None,
176 cover=None, flags=None, customizations=None):
177 """ produces a PDF file with XeLaTeX
180 verbose: prints all output from LaTeX
181 save_tex: path to save the intermediary LaTeX file to
182 morefloats (old/new/none): force specific morefloats
183 cover: a cover.Cover object or True for default
184 flags: less-advertising, not-wl, images
185 customizations: user requested customizations regarding various formatting parameters (passed to wl LaTeX class)
191 document = load_including_children(wldoc)
196 document.edoc.getroot().set('data-cover-width', str(cover.width))
197 document.edoc.getroot().set('data-cover-height', str(cover.height))
198 if cover.uses_dc_cover:
199 if document.book_info.cover_by:
200 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
201 if document.book_info.cover_source:
202 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
205 document.edoc.getroot().set('flag-' + flag, 'yes')
207 # check for LaTeX packages
209 document.edoc.getroot().set('morefloats', morefloats.lower())
210 elif package_available('morefloats', 'maxfloats=19'):
211 document.edoc.getroot().set('morefloats', 'new')
214 if customizations is not None:
215 document.edoc.getroot().set('customizations', u','.join(customizations))
218 move_motifs_inside(document.edoc)
219 hack_motifs(document.edoc)
220 parse_creator(document.edoc)
221 substitute_hyphens(document.edoc)
222 fix_hanging(document.edoc)
225 style_filename = get_stylesheet("wl2tex")
226 style = etree.parse(style_filename)
228 texml = document.transform(style)
231 temp = mkdtemp('-wl2pdf')
234 c = cover(document.book_info)
235 with open(os.path.join(temp, 'cover.png'), 'w') as f:
238 del document # no longer needed large object :)
240 tex_path = os.path.join(temp, 'doc.tex')
241 fout = open(tex_path, 'w')
242 process(StringIO(texml), fout, 'utf-8')
247 shutil.copy(tex_path, save_tex)
250 shutil.copy(get_resource('pdf/wl.cls'), temp)
251 shutil.copy(get_resource('res/wl-logo.png'), temp)
254 shutil.copy(get_resource('res/ofop-logo.png'), temp)
255 shutil.copy(get_resource('res/logo-fio.jpg'), temp)
261 p = call(['xelatex', tex_path])
263 p = call(['xelatex', '-interaction=batchmode', tex_path], stdout=PIPE, stderr=PIPE)
265 raise ParseError("Error parsing .tex file")
269 output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
270 pdf_path = os.path.join(temp, 'doc.pdf')
271 shutil.move(pdf_path, output_file.name)
273 return OutputFile.from_filename(output_file.name)
275 except (XMLSyntaxError, XSLTApplyError), e:
# Steps that can be read in this span (the branch headers and the end of the
# function are not visible here):
#  - the source text comes either from provider.by_uri(uri), read and decoded
#    as UTF-8, or from serialising wldoc.edoc to a unicode string, in which
#    case wldoc.provider is used for loading the children;
#  - ValueError is raised when neither source is available;
#  - every run of characters in U+0400..U+04FF (the Cyrillic block) is
#    wrapped in <alien>...</alien> before parsing;
#  - the text is parsed with WLDocument.from_string(parse_dublincore=True)
#    and swap_endlines() is applied;
#  - each URI in book_info.parts is loaded recursively through the same
#    provider and the child's root element is appended to this root.
# NOTE(review): the <alien> substitution works on the raw text, so it would
# also hit such characters inside attribute values or comments - confirm that
# this cannot occur in the input.
# NOTE(review): the object returned by provider.by_uri() is not closed in the
# visible lines.
# NOTE(review): transform() uses the return value as the assembled document;
# the return statement itself is not visible in this span.
279 def load_including_children(wldoc=None, provider=None, uri=None):
280 """ Makes one big xml file with children inserted at end.
282 Either wldoc or provider and URI must be provided.
286 f = provider.by_uri(uri)
287 text = f.read().decode('utf-8')
289 elif wldoc is not None:
290 text = etree.tostring(wldoc.edoc, encoding=unicode)
291 provider = wldoc.provider
293 raise ValueError('Neither a WLDocument, nor provider and URI were provided.')
295 text = re.sub(ur"([\u0400-\u04ff]+)", ur"<alien>\1</alien>", text)
297 document = WLDocument.from_string(text, parse_dublincore=True)
298 document.swap_endlines()
300 for child_uri in document.book_info.parts:
301 child = load_including_children(provider=provider, uri=child_uri)
302 document.edoc.getroot().append(child.edoc.getroot())