changes in pdf: footer on first page, logo next to title, no duplicate title
[librarian.git] / librarian / formats / pdf / __init__.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 import os
7 import shutil
8 from subprocess import call, PIPE
9 from tempfile import NamedTemporaryFile, mkdtemp
10 from lxml import etree
11 from urllib import urlretrieve
12 from StringIO import StringIO
13 from Texml.processor import process
14 from librarian import DCNS, XMLNamespace, BuildError
15 from librarian.formats import Format
16 from librarian.output import OutputFile
17 from librarian.renderers import Register, TreeRenderer
18 from librarian.utils import Context, get_resource
19 from librarian import core
20 from PIL import Image
21 from ..html import Silent
22
23
# Namespace helper used to qualify every TeXML element name built here.
TexmlNS = XMLNamespace('http://getfo.sourceforge.net/texml/ns1')


def texml_cmd(name, *parms, **kwargs):
    """Build a TeXML ``cmd`` element named ``name``.

    Every item of the optional ``opts`` keyword argument becomes an ``opt``
    child and every positional value in ``parms`` becomes a ``parm`` child;
    options come first, and each child's text is the given value.

    Returns the new ``cmd`` element.
    """
    command = etree.Element(TexmlNS('cmd'), name=name)
    children = [('opt', value) for value in kwargs.get('opts', [])]
    children.extend(('parm', value) for value in parms)
    for tag, value in children:
        etree.SubElement(command, TexmlNS(tag)).text = value
    return command
34
35
class PdfFormat(Format):
    """Document format that produces a PDF via TeXML and XeLaTeX.

    ``get_texml()`` builds a TeXML tree, ``get_tex_dir()`` writes it out as
    ``doc.tex`` in a fresh work directory and ``build()`` runs ``xelatex``
    on it and hands back the resulting PDF.
    """
    format_name = 'PDF'
    format_ext = 'pdf'
    # Number of times build() runs xelatex over the document.
    tex_passes = 1
    # Style sheet copied into the work directory as ``style.sty``.
    style = get_resource('formats/pdf/res/default.sty')

    # Extra packages copied into the work directory and loaded in this order.
    local_packages = [
        get_resource('formats/pdf/res/coverimage.sty'),
        get_resource('formats/pdf/res/insertimage.sty'),
    ]

    # Registry consulted by render() to find the renderer for an element.
    renderers = Register()

    def retrieve_file(self, url, save_as):
        """Hook for fetching ``url`` into ``save_as`` by other means.

        A true return value tells add_file() that the file is already in
        place.  This implementation handles nothing and returns False, so
        add_file() always downloads the URL itself.
        """
        # TODO: local scheme
        return False

    def add_file(self, ctx, filename, url=None, path=None, image=False):
        """Put a file into the work directory ``ctx.workdir`` as ``filename``.

        The source is the local ``path`` when it is given, otherwise ``url``.
        With ``image=True`` the file is passed through ImageMagick's
        ``convert``: GIFs are converted to the target file name, other
        images get their density set to 300 pixels per inch.  If that
        density conversion fails, the unconverted file is used instead.

        Raises:
            BuildError: if neither ``url`` nor ``path`` is given, or if the
                URL has no usable file extension.
        """
        if not url and not path:
            raise BuildError('No URL or path for image')
        save_as = os.path.join(ctx.workdir, filename)
        if path is not None:
            # splitext() only looks at the last path component, so a dot in
            # a directory name is never mistaken for an extension.
            ext = os.path.splitext(path)[1]
            if not image:
                shutil.copy(path, save_as)
            elif ext == '.gif':
                call(['convert', path, save_as])
            else:
                # JPEGs with bad density will break LaTeX with 'Dimension too large'.
                converted = save_as + '_' + ext
                status = call(['convert', '-units', 'PixelsPerInch', path,
                               '-density', '300', converted])
                if status:
                    # Conversion failed: use the file as it is, mirroring
                    # the fallback of the URL branch below.
                    shutil.copy(path, save_as)
                else:
                    shutil.move(converted, save_as)
        elif not self.retrieve_file(url, save_as):
            if url.startswith('file://'):
                url = ctx.files_path + url[7:]

            if url.startswith('/'):
                url = 'http://milpeer.eu' + url

            if '.' not in url:
                raise BuildError('Linked file without extension: %s' % url)
            if not image:
                urlretrieve(url, save_as)
            else:
                ext = url.rsplit('.', 1)[-1]
                if '/' in ext:
                    # The last dot belongs to the host or a directory name,
                    # so there is no extension to build a file name from.
                    raise BuildError('Linked file without extension: %s' % url)
                downloaded = save_as + '_.' + ext
                urlretrieve(url, downloaded)
                if ext == 'gif':
                    call(['convert', downloaded, save_as])
                else:
                    # JPEGs with bad density will break LaTeX with 'Dimension too large'.
                    converted = save_as + '_2.' + ext
                    status = call(['convert', '-units', 'PixelsPerInch', downloaded,
                                   '-density', '300', converted])
                    if status:
                        shutil.move(downloaded, save_as)
                    else:
                        shutil.move(converted, save_as)

    def get_file(self, ctx, filename):
        """Return the path of ``filename`` inside the work directory."""
        return os.path.join(ctx.workdir, filename)

    def get_texml(self, build_ctx):
        """Build and return the TeXML tree for ``self.doc``.

        The tree holds the preamble (document class, style and local
        packages, author, title) and a ``document`` environment with the
        title block, an optional cover logo, the rendered document body and
        a closing section with information about the resource.

        Side effects: the class and style files are copied into
        ``build_ctx.workdir``, and the second child of the document's root
        element is removed from the document tree.
        """
        t = etree.Element(TexmlNS('TeXML'))

        self.add_file(build_ctx, 'wl.cls', path=get_resource('formats/pdf/res/wl.cls'))
        t.append(texml_cmd("documentclass", "wl"))

        # global packages
        self.add_file(build_ctx, 'style.sty', path=self.style)
        t.append(texml_cmd("usepackage", "style"))
        t.append(texml_cmd("usepackage", "hyphenat"))

        # local packages
        for i, package in enumerate(self.local_packages):
            self.add_file(build_ctx, "librarianlocalpackage%s.sty" % i, path=package)
            t.append(texml_cmd("usepackage", "librarianlocalpackage%s" % i))

        author = ", ".join(self.doc.meta.get(DCNS('creator')) or [])
        title = self.doc.meta.title()
        t.append(texml_cmd("author", author))
        t.append(texml_cmd("title", title))

        doc = etree.SubElement(t, TexmlNS('env'), name="document")

        # Sizes! (the font sizes of the title block are set by hand below)
        title_field = texml_cmd("titlefield", "")
        doc.append(title_field)
        grp = title_field[0]
        grp.append(texml_cmd("raggedright"))
        grp.append(texml_cmd("vfill"))
        if author:
            p = texml_cmd("par", "")
            grp.append(p)
            p[0].append(texml_cmd("Large"))
            p[0].append(texml_cmd("noindent"))
            p[0].append(texml_cmd("nohyphens", author))
            p[0].append(texml_cmd("vspace", "1em"))
        if title:
            p = texml_cmd("par", "")
            grp.append(p)
            p[0].append(texml_cmd("Huge"))
            p[0].append(texml_cmd("noindent"))
            p[0].append(texml_cmd("nohyphens", title))

        # IOFile probably would be better
        cover_logo_url = getattr(build_ctx, 'cover_logo', None)
        # TODO: convert
        if cover_logo_url:
            self.add_file(build_ctx, 'coverlogo.png', cover_logo_url, image=True)
            size = Image.open(self.get_file(build_ctx, 'coverlogo.png')).size
            # The logo is 2cm high; the width keeps the image's aspect ratio.
            doc.append(texml_cmd("toplogo", 'coverlogo.png', "%fcm" % (2.0 * size[0] / size[1]), "2cm"))

        doc.append(texml_cmd("vspace", "2em"))

        ctx = Context(build_ctx, format=self, img=1)
        root = self.doc.edoc.getroot()
        # NOTE(review): this drops the root's second child from the document
        # itself, so a second call on the same document removes another
        # element -- presumably the header already shown in the title block;
        # confirm against the document structure.
        root.remove(root[1])
        doc.extend(self.render(root, ctx))

        # Editorial information at the end.
        doc.append(texml_cmd("section*", "Information about the resource"))
        doc.append(texml_cmd("vspace", "1em"))

        for m, f, multiple in (
                ('Publisher: ', DCNS('publisher'), False),
                ('Rights: ', DCNS('rights'), False),
                ('', DCNS('description'), False)):
            if multiple:
                v = ', '.join(self.doc.meta.get(f))
            else:
                v = self.doc.meta.get_one(f)
            if v:
                e = texml_cmd("par", "")
                e[0].append(texml_cmd("noindent"))
                e[0][0].tail = "%s%s" % (m, v)
                doc.append(e)
                doc.append(texml_cmd("vspace", "1em"))

        e = texml_cmd("par", "")
        e[0].append(texml_cmd("noindent"))
        e[0][0].tail = "Resource prepared using "
        e[0].append(texml_cmd("href", "http://milpeer.eu", "MIL/PEER"))
        e[0][-1].tail = " editing platform. "
        doc.append(e)

        source_url = getattr(build_ctx, 'source_url', None)
        if source_url:
            e = texml_cmd("par", "")
            doc.append(e)
            e[0].append(texml_cmd("noindent"))
            e[0][0].tail = "Source available at "
            e[0].append(texml_cmd("href", source_url, source_url))

        return t

    def get_tex_dir(self, ctx):
        """Create a work directory containing ``doc.tex`` and return its path.

        The directory is made with mkdtemp() and stored on ``ctx.workdir``
        before the TeXML is generated, because get_texml() copies files
        into it.
        """
        ctx.workdir = mkdtemp('-wl2pdf')
        texml = self.get_texml(ctx)
        tex_path = os.path.join(ctx.workdir, 'doc.tex')
        with open(tex_path, 'w') as fout:
            process(StringIO(etree.tostring(texml)), fout, 'utf-8')
        return ctx.workdir

    def build(self, ctx=None, verbose=False):
        """Run XeLaTeX over the document and return the PDF as an OutputFile.

        ``xelatex`` is run ``tex_passes`` times inside the work directory.
        With ``verbose`` its output is shown and it may prompt on errors;
        otherwise it runs in batch mode with its output discarded.  On
        success the PDF is moved to a new temporary file and the work
        directory is deleted.

        Raises:
            RuntimeError: if any xelatex pass exits with a non-zero status.
                The work directory is kept in that case so the ``.tex``
                file named in the message can be inspected.
        """
        temp = self.get_tex_dir(ctx)
        tex_path = os.path.join(temp, 'doc.tex')

        for _ in range(self.tex_passes):
            # Run in the work directory through ``cwd`` rather than
            # os.chdir(), so the working directory of this process is never
            # changed and needs no restoring when a pass fails.
            if verbose:
                status = call(['xelatex', tex_path], cwd=temp)
            else:
                # Discard the output instead of using PIPE: call() never
                # reads the pipes, so a full pipe buffer would block
                # xelatex forever.
                with open(os.devnull, 'w') as devnull:
                    status = call(['xelatex', '-interaction=batchmode', tex_path],
                                  cwd=temp, stdout=devnull, stderr=devnull)
            if status:
                raise RuntimeError("Error parsing .tex file: %s" % tex_path)

        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
        # Only the reserved name is needed; close the handle so it does not
        # leak once the PDF is moved onto that name.
        output_file.close()
        pdf_path = os.path.join(temp, 'doc.pdf')
        shutil.move(pdf_path, output_file.name)
        shutil.rmtree(temp)
        return OutputFile.from_filename(output_file.name)

    def render(self, element, ctx):
        """Render ``element`` with the renderer registered for it."""
        return self.renderers.get_for(element).render(element, ctx)
243
244
class CmdRenderer(TreeRenderer):
    """Renderer whose container is a single TeXML command.

    The command is named after ``tag_name``; its last parameter is the
    element handed back as the inner container.
    """

    def parms(self):
        """Return the parameter values that precede the content parameter."""
        return []

    def container(self):
        """Return ``(root, inner)``: the wrapper element and the command's
        last parameter."""
        arguments = self.parms() + [""]
        command = texml_cmd(self.tag_name, *arguments)
        wrapper = etree.Element(self.root_name)
        wrapper.append(command)
        return wrapper, command[-1]
254
255
class EnvRenderer(TreeRenderer):
    """Renderer whose container is an ``env`` element named ``tag_name``."""

    def container(self):
        """Return ``(root, inner)``: the wrapper and the ``env`` element
        created inside it."""
        wrapper = etree.Element(self.root_name)
        environment = etree.SubElement(wrapper, 'env', name=self.tag_name)
        return wrapper, environment
261
262
class GroupRenderer(CmdRenderer):
    """Renderer whose container is a ``group`` element.

    When ``tag_name`` is set, the group starts with that command.
    """

    def container(self):
        """Return ``(root, inner)``: the wrapper and the ``group`` element
        created inside it."""
        wrapper = etree.Element(self.root_name)
        group = etree.SubElement(wrapper, 'group')
        if self.tag_name:
            arguments = self.parms() + [""]
            group.append(texml_cmd(self.tag_name, *arguments))
        return wrapper, group
270
271
class SectionRenderer(CmdRenderer):
    """Command renderer that puts a ``pagebreak`` command before its command."""

    def subcontext(self, element, ctx):
        """Return a child context whose ``toc_level`` is two higher than the
        one in ``ctx`` (taken as 1 when ``ctx`` has none)."""
        # here?
        current_level = getattr(ctx, 'toc_level', 1)
        return Context(ctx, toc_level=current_level + 2)

    def container(self):
        """Return ``(root, inner)``: the wrapper holding the page break and
        the command, and the command's first parameter."""
        wrapper = etree.Element(self.root_name)
        wrapper.append(texml_cmd('pagebreak', opts=['1']))
        arguments = self.parms() + [""]
        command = texml_cmd(self.tag_name, *arguments)
        wrapper.append(command)
        return wrapper, command[0]

PdfFormat.renderers.register(core.Section, None, SectionRenderer('par'))

# TODO: heading levels
PdfFormat.renderers.register(core.Header, None, CmdRenderer('section*'))

PdfFormat.renderers.register(core.Div, None, CmdRenderer('par'))
290
291
class ImgRenderer(CmdRenderer):
    """Renders an image element as a command with file name, width and height."""

    def parms(self):
        """Return two empty parameters; with the one container() appends,
        render() fills them with file name, width and height."""
        return ["", ""]

    def render(self, element, ctx):
        """Fetch the image named by ``src`` and emit the command for it.

        The image is stored in the work directory as ``f<N>.png``, where N
        comes from ``ctx.img`` (which is then incremented).  The width is
        fixed at 15cm and the height follows the image's aspect ratio.  If
        the stored file cannot be opened as an image, the command is
        dropped from the result.
        """
        rendered = super(ImgRenderer, self).render(element, ctx)
        source = element.get('src')
        index = getattr(ctx, 'img', 0)
        ctx.img = index + 1
        file_name = 'f%d.png' % index
        ctx.format.add_file(ctx, file_name, source, image=True)
        command = rendered[0]
        command[0].text = file_name
        try:
            width, height = Image.open(ctx.format.get_file(ctx, file_name)).size
        except IOError:  # not an image
            del rendered[0]
            return rendered
        command[1].text = '15cm'
        command[2].text = '%fcm' % (15.0 * height / width)
        return rendered

PdfFormat.renderers.register(core.Div, 'img', ImgRenderer('insertimage'))
313
314
class VideoRenderer(CmdRenderer):
    """Renders a video element as a link to its YouTube page."""

    def render(self, element, ctx):
        """Replace the text of the command's first parameter with an
        ``href`` to the video address built from ``videoid``."""
        rendered = super(VideoRenderer, self).render(element, ctx)
        video_id = element.attrib.get('videoid')
        address = 'https://www.youtube.com/watch?v=%s' % video_id
        slot = rendered[0][0]
        slot.text = None
        slot.append(texml_cmd('href', address, address))
        return rendered

PdfFormat.renderers.register(core.Div, 'video', VideoRenderer('par'))
325
326
# Div and Span variants, registered in the listed order.
for _element_class, _variant, _renderer in (
        (core.Div, 'defined', CmdRenderer('textbf')),
        (core.Div, 'item', CmdRenderer('item')),
        (core.Span, 'item', CmdRenderer('item')),
        (core.Div, 'list', EnvRenderer('itemize')),
        (core.Div, 'list.enum', EnvRenderer('enumerate')),

        (core.Span, None, TreeRenderer()),
        (core.Span, 'cite', CmdRenderer('emph')),
        (core.Span, 'cite.code', CmdRenderer('texttt')),
        (core.Span, 'emp', CmdRenderer('textbf')),
        (core.Span, 'emph', CmdRenderer('emph'))):
    PdfFormat.renderers.register(_element_class, _variant, _renderer)
# Keep the loop variables out of the module namespace.
del _element_class, _variant, _renderer
339
340
class SpanUri(CmdRenderer):
    """Renders a URI span as an ``href`` whose target is the span's own text."""

    def parms(self):
        """Return one empty parameter, filled with the link target by render()."""
        return [""]

    def render(self, element, ctx):
        """Render ``element`` and set the link target from its text.

        A ``file://`` target is rewritten to start with ``ctx.files_path``.
        """
        root = super(SpanUri, self).render(element, ctx)
        # An element without text has ``text`` set to None; fall back to ''
        # so startswith() does not fail, as SpanLink does for a missing href.
        src = element.text or ''
        if src.startswith('file://'):
            src = ctx.files_path + src[7:]
        root[0][0].text = src
        return root
PdfFormat.renderers.register(core.Span, 'uri', SpanUri('href'))
353
354
class SpanLink(CmdRenderer):
    """Renders a link span as an ``href`` pointing at its ``href`` attribute."""

    def parms(self):
        """Return one empty parameter, filled with the link target by render()."""
        return [""]

    def render(self, element, ctx):
        """Render ``element`` and set the link target from its ``href``
        attribute (empty when absent).

        A ``file://`` target is rewritten to start with ``ctx.files_path``.
        """
        rendered = super(SpanLink, self).render(element, ctx)
        target = element.attrib.get('href', '')
        local_prefix = 'file://'
        if target.startswith(local_prefix):
            target = ctx.files_path + target[len(local_prefix):]
        rendered[0][0].text = target
        return rendered
PdfFormat.renderers.register(core.Span, 'link', SpanLink('href'))
367
368
# Aside variants, registered in the listed order.
for _variant, _renderer in (
        (None, TreeRenderer()),
        ('editorial', CmdRenderer('editorialpage')),
        ('comment', Silent())):
    PdfFormat.renderers.register(core.Aside, _variant, _renderer)
# Keep the loop variables out of the module namespace.
del _variant, _renderer