move validation outside librarian
[librarian.git] / librarian / formats / pdf / __init__.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 import os
7 import shutil
8 from subprocess import call, PIPE
9 from tempfile import NamedTemporaryFile, mkdtemp
10 from lxml import etree
11 from urllib import urlretrieve
12 from StringIO import StringIO
13 from Texml.processor import process
14 from librarian import DCNS, XMLNamespace, BuildError
15 from librarian.formats import Format
16 from librarian.output import OutputFile
17 from librarian.renderers import Register, TreeRenderer
18 from librarian.utils import Context, get_resource
19 from librarian import core
20 from PIL import Image
21 from ..html import Silent
22
23
# Namespace helper used to qualify every TeXML element name built in this module.
TexmlNS = XMLNamespace('http://getfo.sourceforge.net/texml/ns1')


def texml_cmd(name, *parms, **kwargs):
    """Build a TeXML ``cmd`` element named `name`.

    Each entry of the optional ``opts`` keyword becomes an ``opt`` child
    and each positional argument becomes a ``parm`` child; options are
    added before parameters.  The argument value is stored as the text of
    the child element.  Returns the new ``cmd`` element.
    """
    command = etree.Element(TexmlNS('cmd'), name=name)
    children = [('opt', text) for text in kwargs.get('opts', [])]
    children.extend(('parm', text) for text in parms)
    for tag, text in children:
        child = etree.SubElement(command, TexmlNS(tag))
        child.text = text
    return command
34
35
class PdfFormat(Format):
    """Output format that renders a document to PDF.

    The document is first turned into a TeXML tree (`get_texml`), which is
    converted to a ``doc.tex`` file in a temporary working directory
    (`get_tex_dir`) and finally compiled with the external ``xelatex``
    program (`build`).  Images are normalised with the external
    ``convert`` program.
    """
    format_name = 'PDF'
    format_ext = 'pdf'
    # Number of times build() runs xelatex over the generated file.
    tex_passes = 1
    # Style file copied into the working directory as 'style.sty'.
    style = get_resource('formats/pdf/res/default.sty')

    # Extra packages copied in as librarianlocalpackage<N>.sty and loaded
    # in this order.
    local_packages = [
        get_resource('formats/pdf/res/coverimage.sty'),
        get_resource('formats/pdf/res/insertimage.sty'),
    ]

    renderers = Register()

    def retrieve_file(self, url, save_as):
        """Hook for fetching `url` into `save_as` by other means.

        Returns a false value to tell `add_file` that nothing was
        retrieved and that it should download the file itself.  This
        implementation never retrieves anything.
        """
        # TODO: local scheme
        return False

    def add_file(self, ctx, filename, url=None, path=None, image=False):
        """Place a file in the working directory under the name `filename`.

        The file comes from the local `path` if given, otherwise from
        `url` (unless `retrieve_file` handles it).  When `image` is true
        the file is passed through ``convert``: GIFs are converted to the
        target name, anything else gets its density rewritten.  If the
        density rewrite fails, the unconverted file is used instead.

        Raises BuildError if neither `url` nor `path` is given, or if a
        URL that has to be downloaded contains no dot (no extension).
        """
        if not url and not path:
            raise BuildError('No URL or path for image')
        save_as = os.path.join(ctx.workdir, filename)
        if path is not None:
            ext = path.rsplit('.', 1)[-1]
            if not image:
                shutil.copy(path, save_as)
            elif ext == 'gif':
                call(['convert', path, save_as])
            else:
                # JPEGs with bad density will break LaTeX with 'Dimension too large'.
                converted = save_as + '_.' + ext
                if call(['convert', '-units', 'PixelsPerInch', path, '-density', '300', converted]):
                    # Conversion failed: fall back to the original file,
                    # mirroring what is done for downloaded images below.
                    shutil.copy(path, save_as)
                else:
                    shutil.move(converted, save_as)
        elif not self.retrieve_file(url, save_as):
            if url.startswith('file://'):
                url = ctx.files_path + url[7:]

            if url.startswith('/'):
                url = 'http://milpeer.eu' + url

            if '.' not in url:
                raise BuildError('Linked file without extension: %s' % url)
            ext = url.rsplit('.', 1)[-1]
            if image:
                downloaded = save_as + '_.' + ext
                urlretrieve(url, downloaded)
                if ext == 'gif':
                    call(['convert', downloaded, save_as])
                else:
                    # JPEGs with bad density will break LaTeX with 'Dimension too large'.
                    converted = save_as + '_2.' + ext
                    r = call(['convert', '-units', 'PixelsPerInch', downloaded, '-density', '300',
                              converted])
                    if r:
                        # Conversion failed: use the file as downloaded.
                        shutil.move(downloaded, save_as)
                    else:
                        shutil.move(converted, save_as)
            else:
                urlretrieve(url, save_as)

    def get_file(self, ctx, filename):
        """Return the path of `filename` inside the working directory."""
        return os.path.join(ctx.workdir, filename)

    def get_texml(self, build_ctx):
        """Build and return the TeXML tree for the whole document.

        The tree consists of the preamble, a title page (optional cover
        image, author, title, optional cover logo), the rendered document
        body and a closing page with information about the resource.
        The class, style and package files, as well as any cover images,
        are added to the working directory along the way.
        """
        t = etree.Element(TexmlNS('TeXML'))

        self.add_file(build_ctx, 'wl.cls', path=get_resource('formats/pdf/res/wl.cls'))
        t.append(texml_cmd("documentclass", "wl"))

        # global packages
        self.add_file(build_ctx, 'style.sty', path=self.style)
        t.append(texml_cmd("usepackage", "style"))
        t.append(texml_cmd("usepackage", "hyphenat"))

        # local packages
        for i, package in enumerate(self.local_packages):
            self.add_file(build_ctx, "librarianlocalpackage%s.sty" % i, path=package)
            t.append(texml_cmd("usepackage", "librarianlocalpackage%s" % i))

        author = ", ".join(self.doc.meta.get(DCNS('creator')) or '')
        title = self.doc.meta.title()
        t.append(texml_cmd("author", author))
        t.append(texml_cmd("title", title))

        doc = etree.SubElement(t, TexmlNS('env'), name="document")
        doc.append(texml_cmd("thispagestyle", "empty"))

        # title page
        cover_url = self.doc.meta.get_one(DCNS('relation.coverimage.url'))
        if cover_url:
            self.add_file(build_ctx, 'cover.png', cover_url, image=True)

            img = Image.open(self.get_file(build_ctx, 'cover.png'))
            size = img.size

            if size[1] > size[0]:
                # Taller than wide: keep only the top square part.
                img = img.crop((0, 0, size[0], size[0]))
                img.save(self.get_file(build_ctx, 'cover.png'), format=img.format, quality=90)
            size = img.size

            # TODO: hardcoded paper size here
            height = 210.0 * size[1] / size[0]
            doc.append(texml_cmd("makecover", "%fmm" % height))
        else:
            doc.append(texml_cmd("vfill*"))

        # Sizes!
        grp = etree.SubElement(doc, 'group')
        grp.append(texml_cmd("raggedright"))
        grp.append(texml_cmd("vfill"))
        if author:
            p = texml_cmd("par", "")
            grp.append(p)
            # p[0] is the single parm of the par command.
            p[0].append(texml_cmd("Large"))
            p[0].append(texml_cmd("noindent"))
            p[0].append(texml_cmd("nohyphens", author))
            p[0].append(texml_cmd("vspace", "1em"))
        if title:
            p = texml_cmd("par", "")
            grp.append(p)
            p[0].append(texml_cmd("Huge"))
            p[0].append(texml_cmd("noindent"))
            p[0].append(texml_cmd("nohyphens", title))
        doc.append(texml_cmd("vfill"))
        doc.append(texml_cmd("vfill"))

        # IOFile probably would be better
        cover_logo_url = getattr(build_ctx, 'cover_logo', None)
        # TODO: convert
        if cover_logo_url:
            self.add_file(build_ctx, 'coverlogo.png', cover_logo_url, image=True)
            size = Image.open(self.get_file(build_ctx, 'coverlogo.png')).size
            p = texml_cmd("par", "")
            doc.append(p)
            p[0].append(texml_cmd("noindent"))
            # Logo is 1cm high; the width keeps the aspect ratio.
            p[0].append(texml_cmd("insertimage", 'coverlogo.png', "%fcm" % (1.0 * size[0] / size[1]), "1cm"))

        # organization logo!
        doc.append(texml_cmd("clearpage"))

        ctx = Context(build_ctx, format=self, img=1)
        doc.extend(self.render(self.doc.edoc.getroot(), ctx))

        # Editorial page at the end.
        doc.append(texml_cmd("clearpage"))

        doc.append(texml_cmd("section*", "Information about the resource"))
        doc.append(texml_cmd("vspace", "1em"))

        for m, f in (
                ('Publisher: ', DCNS('publisher')),
                ('Rights: ', DCNS('rights')),
                ('Intended audience: ', DCNS('audience')),
                ('', DCNS('description'))):
            v = self.doc.meta.get_one(f)
            if v:
                e = texml_cmd("par", "")
                e[0].append(texml_cmd("noindent"))
                e[0][0].tail = "%s%s" % (m, v)
                doc.append(e)
                doc.append(texml_cmd("vspace", "1em"))

        e = texml_cmd("par", "")
        e[0].append(texml_cmd("noindent"))
        e[0][0].tail = "Resource prepared using "
        e[0].append(texml_cmd("href", "http://milpeer.eu", "MIL/PEER"))
        e[0][-1].tail = " editing platform. "
        doc.append(e)

        source_url = getattr(build_ctx, 'source_url', None)
        if source_url:
            e = texml_cmd("par", "")
            doc.append(e)
            e[0].append(texml_cmd("noindent"))
            e[0][0].tail = "Source available at "
            e[0].append(texml_cmd("href", source_url, source_url))

        return t

    def get_tex_dir(self, ctx):
        """Create the working directory and write ``doc.tex`` into it.

        The new temporary directory is stored as ``ctx.workdir`` and also
        returned.
        """
        ctx.workdir = mkdtemp('-wl2pdf')
        texml = self.get_texml(ctx)
        tex_path = os.path.join(ctx.workdir, 'doc.tex')
        with open(tex_path, 'w') as fout:
            process(StringIO(etree.tostring(texml)), fout, 'utf-8')
        return ctx.workdir

    def build(self, ctx=None, verbose=False):
        """Compile the document with xelatex and return it as an OutputFile.

        xelatex is run `tex_passes` times inside the working directory;
        with `verbose` false it runs in batch mode with its output
        discarded.  Raises RuntimeError if the last pass exits with a
        non-zero status; in that case the working directory is left in
        place so that the file named in the error can be inspected.
        """
        temp = self.get_tex_dir(ctx)
        tex_path = os.path.join(temp, 'doc.tex')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None
        os.chdir(temp)

        # Stays 0 (success) when tex_passes is 0.
        status = 0
        try:
            if verbose:
                for _ in range(self.tex_passes):
                    status = call(['xelatex', tex_path])
            else:
                # Discard the output instead of sending it to pipes that
                # nobody reads: a full pipe would block xelatex forever.
                with open(os.devnull, 'w') as devnull:
                    for _ in range(self.tex_passes):
                        status = call(['xelatex', '-interaction=batchmode', tex_path],
                                      stdout=devnull, stderr=devnull)
        finally:
            # Restore the caller's working directory even on failure.
            if cwd is not None:
                os.chdir(cwd)

        if status:
            raise RuntimeError("Error parsing .tex file: %s" % tex_path)

        # Only the unique name is needed; close the handle right away.
        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
        output_file.close()
        pdf_path = os.path.join(temp, 'doc.pdf')
        shutil.move(pdf_path, output_file.name)
        shutil.rmtree(temp)
        return OutputFile.from_filename(output_file.name)

    def render(self, element, ctx):
        """Render `element` with the renderer registered for it."""
        return self.renderers.get_for(element).render(element, ctx)
266
267
class CmdRenderer(TreeRenderer):
    """Renderer that wraps content in a TeXML command named `tag_name`."""

    def parms(self):
        """Return the texts of the parameters preceding the content one."""
        return []

    def container(self):
        """Return ``(root, inner)``: a root element holding the command,
        and the command's last parameter, which receives the content."""
        command = texml_cmd(self.tag_name, *(self.parms() + [""]))
        wrapper = etree.Element(self.root_name)
        wrapper.append(command)
        return wrapper, command[-1]
277
278
class EnvRenderer(TreeRenderer):
    """Renderer that wraps content in an ``env`` element named `tag_name`."""

    def container(self):
        """Return ``(root, inner)`` where `inner` is the ``env`` element
        placed directly under the root."""
        wrapper = etree.Element(self.root_name)
        environment = etree.Element('env', name=self.tag_name)
        wrapper.append(environment)
        return wrapper, environment
284
285
class GroupRenderer(CmdRenderer):
    """Renderer that wraps content in a ``group`` element, optionally
    starting the group with a command named `tag_name`."""

    def container(self):
        """Return ``(root, inner)`` where `inner` is the ``group`` element."""
        wrapper = etree.Element(self.root_name)
        group = etree.SubElement(wrapper, 'group')
        if not self.tag_name:
            return wrapper, group
        group.append(texml_cmd(self.tag_name, *(self.parms() + [""])))
        return wrapper, group
293
294
class SectionRenderer(CmdRenderer):
    """Command renderer for sections: emits a ``pagebreak`` command with
    option ``1`` in front of the section's own command."""

    def subcontext(self, element, ctx):
        """Return a child context whose ``toc_level`` is two more than the
        parent's (the parent's level defaults to 1 when unset)."""
        # here?
        parent_level = getattr(ctx, 'toc_level', 1)
        return Context(ctx, toc_level=parent_level + 2)

    def container(self):
        """Return ``(root, inner)`` where `inner` is the first child of
        the section command, which follows the page break in the root."""
        page_break = texml_cmd('pagebreak', opts=['1'])
        command = texml_cmd(self.tag_name, *(self.parms() + [""]))
        wrapper = etree.Element(self.root_name)
        wrapper.extend([page_break, command])
        return wrapper, command[0]


PdfFormat.renderers.register(core.Section, None, SectionRenderer('par'))
308
# Default renderers for headers and plain divs.
# TODO: levels
for _registration in (
        (core.Header, None, CmdRenderer('section*')),
        (core.Div, None, CmdRenderer('par')),
):
    PdfFormat.renderers.register(*_registration)
del _registration
313
314
class ImgRenderer(CmdRenderer):
    """Renderer for image divs: fetches the image into the working
    directory and emits a command with file name, width and height."""

    def parms(self):
        """Two placeholders (file name, width); the content parameter
        added by the container ends up holding the height."""
        return ["", ""]

    def render(self, element, ctx):
        """Render `element`, numbering the image from ``ctx.img``.

        If the fetched file cannot be opened as an image, the command is
        removed and the bare root is returned.
        """
        root = super(ImgRenderer, self).render(element, ctx)
        source = element.get('src')
        number = getattr(ctx, 'img', 0)
        ctx.img = number + 1
        image_name = 'f%d.png' % number
        ctx.format.add_file(ctx, image_name, source, image=True)
        command = root[0]
        command[0].text = image_name
        try:
            width, height = Image.open(ctx.format.get_file(ctx, image_name)).size
        except IOError:  # not an image
            del root[0]
            return root
        # Fixed 15cm width; height follows the aspect ratio.
        command[1].text = '15cm'
        command[2].text = '%fcm' % (15.0 * height / width)
        return root


PdfFormat.renderers.register(core.Div, 'img', ImgRenderer('insertimage'))
336
337
class VideoRenderer(CmdRenderer):
    """Renderer for video divs: replaces the content with a link to the
    video's YouTube page."""

    def render(self, element, ctx):
        """Render `element` as a command whose first parameter holds only
        an ``href`` to the YouTube URL built from the ``videoid``
        attribute."""
        root = super(VideoRenderer, self).render(element, ctx)
        video_id = element.attrib.get('videoid')
        address = 'https://www.youtube.com/watch?v=%s' % video_id
        target = root[0][0]
        target.text = None
        target.append(texml_cmd('href', address, address))
        return root


PdfFormat.renderers.register(core.Div, 'video', VideoRenderer('par'))
348
349
# Renderers for the remaining div kinds and for inline spans, registered
# in the order listed.
for _registration in (
        (core.Div, 'defined', CmdRenderer('textbf')),
        (core.Div, 'item', CmdRenderer('item')),
        (core.Span, 'item', CmdRenderer('item')),
        (core.Div, 'list', EnvRenderer('itemize')),
        (core.Div, 'list.enum', EnvRenderer('enumerate')),
        (core.Span, None, TreeRenderer()),
        (core.Span, 'cite', CmdRenderer('emph')),
        (core.Span, 'cite.code', CmdRenderer('texttt')),
        (core.Span, 'emp', CmdRenderer('textbf')),
        (core.Span, 'emph', CmdRenderer('emph')),
):
    PdfFormat.renderers.register(*_registration)
del _registration
362
363
class SpanUri(CmdRenderer):
    """Renderer for URI spans: emits a command whose first parameter is
    the link target taken from the element's text."""

    def parms(self):
        """One placeholder parameter, filled with the target in render()."""
        return [""]

    def render(self, element, ctx):
        """Render `element`, using its text as the link target.

        A ``file://`` prefix is replaced with ``ctx.files_path``.
        """
        root = super(SpanUri, self).render(element, ctx)
        # element.text is None for an element without leading text; fall
        # back to an empty target, as SpanLink does for a missing href.
        src = element.text or ''
        if src.startswith('file://'):
            src = ctx.files_path + src[7:]
        root[0][0].text = src
        return root


PdfFormat.renderers.register(core.Span, 'uri', SpanUri('href'))
376
377
class SpanLink(CmdRenderer):
    """Renderer for link spans: emits a command whose first parameter is
    the element's ``href`` attribute."""

    def parms(self):
        """One placeholder parameter, filled with the target in render()."""
        return [""]

    def render(self, element, ctx):
        """Render `element`, using its ``href`` attribute (empty when
        missing) as the link target.

        A ``file://`` prefix is replaced with ``ctx.files_path``.
        """
        root = super(SpanLink, self).render(element, ctx)
        local_prefix = 'file://'
        target = element.attrib.get('href', '')
        if target.startswith(local_prefix):
            target = ctx.files_path + target[len(local_prefix):]
        root[0][0].text = target
        return root


PdfFormat.renderers.register(core.Span, 'link', SpanLink('href'))
390
391
# Renderers for asides, registered in the order listed.
for _registration in (
        (core.Aside, None, TreeRenderer()),
        (core.Aside, 'editorial', CmdRenderer('editorialpage')),
        (core.Aside, 'comment', Silent()),
):
    PdfFormat.renderers.register(*_registration)
del _registration