changes in PDF: font, no cover, paragraph style
[librarian.git] / librarian / formats / pdf / __init__.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 import os
7 import shutil
8 from subprocess import call, PIPE
9 from tempfile import NamedTemporaryFile, mkdtemp
10 from lxml import etree
11 from urllib import urlretrieve
12 from StringIO import StringIO
13 from Texml.processor import process
14 from librarian import DCNS, XMLNamespace, BuildError
15 from librarian.formats import Format
16 from librarian.output import OutputFile
17 from librarian.renderers import Register, TreeRenderer
18 from librarian.utils import Context, get_resource
19 from librarian import core
20 from PIL import Image
21 from ..html import Silent
22
23
24 TexmlNS = XMLNamespace('http://getfo.sourceforge.net/texml/ns1')
25
26
def texml_cmd(name, *parms, **kwargs):
    """Build a TeXML ``cmd`` element for the command ``name``.

    Every item of the optional ``opts`` keyword argument becomes an ``opt``
    child; every positional parameter then becomes a ``parm`` child, in the
    order given.
    """
    command = etree.Element(TexmlNS('cmd'), name=name)
    children = [('opt', text) for text in kwargs.get('opts', [])]
    children.extend(('parm', text) for text in parms)
    for tag, text in children:
        etree.SubElement(command, TexmlNS(tag)).text = text
    return command
34
35
36 class PdfFormat(Format):
37     format_name = 'PDF'
38     format_ext = 'pdf'
39     tex_passes = 1
40     style = get_resource('formats/pdf/res/default.sty')
41
42     local_packages = [
43         get_resource('formats/pdf/res/coverimage.sty'),
44         get_resource('formats/pdf/res/insertimage.sty'),
45     ]
46
47     renderers = Register()
48
49     def retrieve_file(self, url, save_as):
50         # TODO: local sheme
51         return False
52
53     def add_file(self, ctx, filename, url=None, path=None, image=False):
54         from subprocess import call
55         if not url and not path:
56             raise BuildError('No URL or path for image')
57         save_as = os.path.join(ctx.workdir, filename)
58         if path is not None:
59             ext = path.rsplit('.', 1)[-1]
60             if image:
61                 if ext == 'gif':
62                     call(['convert', path, save_as])
63                 else:
64                     # JPEGs with bad density will break LaTeX with 'Dimension too large'.
65                     call(['convert', '-units', 'PixelsPerInch', path, '-density', '300', save_as + '_.' + ext])
66                     shutil.move(save_as + '_.' + ext, save_as)
67             else:
68                 shutil.copy(path, save_as)
69         elif not self.retrieve_file(url, save_as):
70             if url.startswith('file://'):
71                 url = ctx.files_path + url[7:]
72
73             if url.startswith('/'):
74                 url = 'http://milpeer.eu' + url
75
76             if '.' not in url:
77                 raise BuildError('Linked file without extension: %s' % url)
78             ext = url.rsplit('.', 1)[-1]
79             if image:
80                 urlretrieve(url, save_as + '_.' + ext)
81                 if ext == 'gif':
82                     call(['convert', save_as + '_.' + ext, save_as])
83                 else:
84                     # JPEGs with bad density will break LaTeX with 'Dimension too large'.
85                     r = call(['convert', '-units', 'PixelsPerInch', save_as + '_.' + ext, '-density', '300',
86                               save_as + '_2.' + ext])
87                     if r:
88                         shutil.move(save_as + '_.' + ext, save_as)
89                     else:
90                         shutil.move(save_as + '_2.' + ext, save_as)
91             else:
92                 urlretrieve(url, save_as)
93
94     def get_file(self, ctx, filename):
95         return os.path.join(ctx.workdir, filename)
96
    def get_texml(self, build_ctx):
        """Build and return the TeXML tree for ``self.doc``.

        The tree consists of the preamble (document class, style and local
        packages, author and title), a title block, an optional cover logo
        taken from ``build_ctx.cover_logo``, the rendered document body and a
        closing "Information about the resource" section.  Class, style and
        package files are copied into the build directory along the way.
        """
        t = etree.Element(TexmlNS('TeXML'))

        self.add_file(build_ctx, 'wl.cls', path=get_resource('formats/pdf/res/wl.cls'))
        t.append(texml_cmd("documentclass", "wl"))

        # global packages
        self.add_file(build_ctx, 'style.sty', path=self.style)
        t.append(texml_cmd("usepackage", "style"))
        t.append(texml_cmd("usepackage", "hyphenat"))

        # local packages
        for i, package in enumerate(self.local_packages):
            self.add_file(build_ctx, "librarianlocalpackage%s.sty" % i, path=package)
            t.append(texml_cmd("usepackage", "librarianlocalpackage%s" % i))

        # An absent creator list yields an empty author string.
        author = ", ". join(self.doc.meta.get(DCNS('creator')) or '')
        title = self.doc.meta.title()
        t.append(texml_cmd("author", author))
        t.append(texml_cmd("title", title))

        doc = etree.SubElement(t, TexmlNS('env'), name="document")
        doc.append(texml_cmd("thispagestyle", "empty"))

        # Sizes!
        grp = etree.SubElement(doc, 'group')
        grp.append(texml_cmd("raggedright"))
        grp.append(texml_cmd("vfill"))
        if author:
            # p[0] is the single parm of the "par" command; the size and
            # layout commands are nested inside it.
            p = texml_cmd("par", "")
            grp.append(p)
            p[0].append(texml_cmd("Large"))
            p[0].append(texml_cmd("noindent"))
            p[0].append(texml_cmd("nohyphens", author))
            p[0].append(texml_cmd("vspace", "1em"))
            # p[0][-1].tail = author
        if title:
            p = texml_cmd("par", "")
            grp.append(p)
            p[0].append(texml_cmd("Huge"))
            p[0].append(texml_cmd("noindent"))
            p[0].append(texml_cmd("nohyphens", title))
            # p[0][-1].tail = title

        # IOFile probably would be better
        cover_logo_url = getattr(build_ctx, 'cover_logo', None)
        # TEST
        # TODO: convert
        # cover_logo_url = 'http://milpeer.mdrn.pl/media/dynamic/people/logo/nowoczesnapolska.org.pl.png'
        if cover_logo_url:
            self.add_file(build_ctx, 'coverlogo.png', cover_logo_url, image=True)
            size = Image.open(self.get_file(build_ctx, 'coverlogo.png')).size
            p = texml_cmd("par", "")
            doc.append(p)
            p[0].append(texml_cmd("noindent"))
            # Fixed 2cm height; the width keeps the image's aspect ratio.
            p[0].append(texml_cmd("insertimage", 'coverlogo.png', "%fcm" % (2.0 * size[0] / size[1]), "2cm"))
        doc.append(texml_cmd("vspace", "2em"))

        # Render the document body; the context's "img" counter starts at 1.
        ctx = Context(build_ctx, format=self, img=1)
        doc.extend(self.render(self.doc.edoc.getroot(), ctx))

        # Editorial information at the end.
        doc.append(texml_cmd("section*", "Information about the resource"))
        doc.append(texml_cmd("vspace", "1em"))

        # (label, metadata field, whether the field is joined from many values)
        for m, f, multiple in (
                ('Publisher: ', DCNS('publisher'), False),
                ('Rights: ', DCNS('rights'), False),
                ('', DCNS('description'), False)):
            if multiple:
                v = ', '.join(self.doc.meta.get(f))
            else:
                v = self.doc.meta.get_one(f)
            if v:
                e = texml_cmd("par", "")
                e[0].append(texml_cmd("noindent"))
                # The tail is the text that follows the "noindent" command.
                e[0][0].tail = "%s%s" % (m, v)
                doc.append(e)
                doc.append(texml_cmd("vspace", "1em"))

        e = texml_cmd("par", "")
        e[0].append(texml_cmd("noindent"))
        e[0][0].tail = "Resource prepared using "
        e[0].append(texml_cmd("href", "http://milpeer.eu", "MIL/PEER"))
        e[0][-1].tail = " editing platform. "
        doc.append(e)

        source_url = getattr(build_ctx, 'source_url', None)
        # source_url = 'http://milpeer.mdrn.pl/documents/27/'
        if source_url:
            e = texml_cmd("par", "")
            doc.append(e)
            e[0].append(texml_cmd("noindent"))
            e[0][0].tail = "Source available at "
            e[0].append(texml_cmd("href", source_url, source_url))

        return t
194
195     def get_tex_dir(self, ctx):
196         ctx.workdir = mkdtemp('-wl2pdf')
197         texml = self.get_texml(ctx)
198         tex_path = os.path.join(ctx.workdir, 'doc.tex')
199         with open(tex_path, 'w') as fout:
200             # print etree.tostring(texml)
201             process(StringIO(etree.tostring(texml)), fout, 'utf-8')
202
203         # if self.save_tex:
204         #     shutil.copy(tex_path, self.save_tex)
205
206         # for sfile in ['wasysym.sty', 'uwasyvar.fd', 'uwasy.fd']:
207         #     shutil.copy(get_resource(os.path.join('res/wasysym', sfile)), temp)
208         return ctx.workdir
209
210     def build(self, ctx=None, verbose=False):
211         temp = self.get_tex_dir(ctx)
212         tex_path = os.path.join(temp, 'doc.tex')
213         try:
214             cwd = os.getcwd()
215         except OSError:
216             cwd = None
217         os.chdir(temp)
218
219         if verbose:
220             for i in range(self.tex_passes):
221                 p = call(['xelatex', tex_path])
222         else:
223             for i in range(self.tex_passes):
224                 p = call(['xelatex', '-interaction=batchmode', tex_path],
225                          stdout=PIPE, stderr=PIPE)
226         if p:
227             # raise ParseError("Error parsing .tex file: %s" % tex_path)
228             raise RuntimeError("Error parsing .tex file: %s" % tex_path)
229
230         if cwd is not None:
231             os.chdir(cwd)
232
233         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
234         pdf_path = os.path.join(temp, 'doc.pdf')
235         shutil.move(pdf_path, output_file.name)
236         shutil.rmtree(temp)
237         os.system("ls -l " + output_file.name)
238         return OutputFile.from_filename(output_file.name)
239     
240     def render(self, element, ctx):
241         return self.renderers.get_for(element).render(element, ctx)
242
243
class CmdRenderer(TreeRenderer):
    """Renderer whose container is a TeXML command named ``tag_name``."""

    def parms(self):
        """Return the parameters that precede the final, empty parameter."""
        return list()

    def container(self):
        """Return ``(root, inner)`` for a newly built command.

        ``root`` wraps the command; ``inner`` is the command's last child,
        i.e. the empty parameter appended after ``parms()``.
        """
        command_parms = self.parms() + [""]
        command = texml_cmd(self.tag_name, *command_parms)
        root = etree.Element(self.root_name)
        root.append(command)
        return root, command[-1]
253
254
class EnvRenderer(TreeRenderer):
    """Renderer whose container is an ``env`` element named ``tag_name``."""

    def container(self):
        """Return ``(root, inner)`` where ``inner`` is the ``env`` element."""
        root = etree.Element(self.root_name)
        environment = etree.Element('env', name=self.tag_name)
        root.append(environment)
        return root, environment
260
261
class GroupRenderer(CmdRenderer):
    """Renderer whose container is a ``group`` element."""

    def container(self):
        """Return ``(root, inner)`` where ``inner`` is the ``group`` element.

        When ``tag_name`` is set, the group starts with that command.
        """
        root = etree.Element(self.root_name)
        group = etree.SubElement(root, 'group')
        if not self.tag_name:
            return root, group
        command_parms = self.parms() + [""]
        group.append(texml_cmd(self.tag_name, *command_parms))
        return root, group
269
270
class SectionRenderer(CmdRenderer):
    """Command renderer that puts a ``pagebreak[1]`` before its command."""

    def subcontext(self, element, ctx):
        """Return a child context with ``toc_level`` raised by two."""
        # here?
        current_level = getattr(ctx, 'toc_level', 1)
        return Context(ctx, toc_level=current_level + 2)

    def container(self):
        """Return ``(root, inner)``: a page break followed by the command.

        ``inner`` is the first child of the command.
        """
        page_break = texml_cmd('pagebreak', opts=['1'])
        command = texml_cmd(self.tag_name, *(self.parms() + [""]))
        root = etree.Element(self.root_name)
        root.extend([page_break, command])
        return root, command[0]
282
# Sections become a "par" command preceded by a page break.
PdfFormat.renderers.register(core.Section, None, SectionRenderer('par'))

# TODO: heading levels
PdfFormat.renderers.register(core.Header, None, CmdRenderer('section*'))

# Divs without a class become plain paragraphs.
PdfFormat.renderers.register(core.Div, None, CmdRenderer('par'))
289
290
class ImgRenderer(CmdRenderer):
    """Renders an image element as a command with file, width and height."""

    def parms(self):
        """Reserve two parameters ahead of the final one."""
        return [""] * 2

    def render(self, element, ctx):
        """Fetch the image into the build directory and fill in the command.

        The image is stored as ``f<N>.png``, where N is taken from the
        context's ``img`` counter, and is scaled to a width of 15cm.  When
        the fetched file cannot be opened as an image, the command is dropped
        from the result.
        """
        root = super(ImgRenderer, self).render(element, ctx)
        index = getattr(ctx, 'img', 0)
        ctx.img = index + 1
        filename = 'f%d.png' % index
        ctx.format.add_file(ctx, filename, element.get('src'), image=True)
        command = root[0]
        command[0].text = filename
        try:
            width, height = Image.open(ctx.format.get_file(ctx, filename)).size
        except IOError:  # not an image
            del root[0]
            return root
        command[1].text = '15cm'
        command[2].text = '%fcm' % (15.0 * height / width)
        return root

PdfFormat.renderers.register(core.Div, 'img', ImgRenderer('insertimage'))
312
313
class VideoRenderer(CmdRenderer):
    """Renders a video element as a link to the video on YouTube."""

    def render(self, element, ctx):
        """Replace the text of the first parameter with an ``href`` command."""
        root = super(VideoRenderer, self).render(element, ctx)
        video_id = element.attrib.get('videoid')
        url = 'https://www.youtube.com/watch?v=%s' % video_id
        target = root[0][0]
        target.text = None
        target.append(texml_cmd('href', url, url))
        return root

PdfFormat.renderers.register(core.Div, 'video', VideoRenderer('par'))
324
325
# Block-level classes: definitions, list items and lists.
PdfFormat.renderers.register(core.Div, 'defined', CmdRenderer('textbf'))
PdfFormat.renderers.register(core.Div, 'item', CmdRenderer('item'))
PdfFormat.renderers.register(core.Span, 'item', CmdRenderer('item'))
PdfFormat.renderers.register(core.Div, 'list', EnvRenderer('itemize'))
PdfFormat.renderers.register(core.Div, 'list.enum', EnvRenderer('enumerate'))


# Inline spans: a span without a class gets no command of its own.
PdfFormat.renderers.register(core.Span, None, TreeRenderer())
PdfFormat.renderers.register(core.Span, 'cite', CmdRenderer('emph'))
PdfFormat.renderers.register(core.Span, 'cite.code', CmdRenderer('texttt'))
PdfFormat.renderers.register(core.Span, 'emp', CmdRenderer('textbf'))
PdfFormat.renderers.register(core.Span, 'emph', CmdRenderer('emph'))
338
339
class SpanUri(CmdRenderer):
    """Renders a span as a command whose first parameter is the span's text."""

    def parms(self):
        """Reserve one parameter (the link target) ahead of the final one."""
        return [""]

    def render(self, element, ctx):
        """Use the element's text as the target, resolving ``file://`` URLs.

        ``file://`` targets are rewritten relative to ``ctx.files_path``.
        """
        root = super(SpanUri, self).render(element, ctx)
        # An empty element has no text at all; treat that as an empty
        # target, like SpanLink does for a missing href.
        src = element.text or ''
        if src.startswith('file://'):
            src = ctx.files_path + src[7:]
        root[0][0].text = src
        return root
PdfFormat.renderers.register(core.Span, 'uri', SpanUri('href'))
352
353
class SpanLink(CmdRenderer):
    """Renders a span as a command whose first parameter is its ``href``."""

    def parms(self):
        """Reserve one parameter (the link target) ahead of the final one."""
        return [""]

    def render(self, element, ctx):
        """Fill in the target from ``href``, resolving ``file://`` URLs.

        A missing ``href`` gives an empty target; ``file://`` targets are
        rewritten relative to ``ctx.files_path``.
        """
        root = super(SpanLink, self).render(element, ctx)
        target = element.attrib.get('href', '')
        scheme = 'file://'
        if target.startswith(scheme):
            target = ctx.files_path + target[len(scheme):]
        root[0][0].text = target
        return root
PdfFormat.renderers.register(core.Span, 'link', SpanLink('href'))
366
367
# Asides: plain by default, "editorial" ones use the editorialpage command,
# and comments are handled by the Silent renderer from the HTML format.
PdfFormat.renderers.register(core.Aside, None, TreeRenderer())
PdfFormat.renderers.register(core.Aside, 'editorial', CmdRenderer('editorialpage'))
PdfFormat.renderers.register(core.Aside, 'comment', Silent())