fd1fd5e4889dd032d184d4e70452e9b73d276a47
[librarian.git] / librarian / formats / pdf / __init__.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 import os
7 import shutil
8 from subprocess import call, PIPE
9 from tempfile import NamedTemporaryFile, mkdtemp
10 from lxml import etree
11 from urllib import urlretrieve
12 from StringIO import StringIO
13 from Texml.processor import process
14 from librarian import DCNS, XMLNamespace, BuildError
15 from librarian.formats import Format
16 from librarian.output import OutputFile
17 from librarian.renderers import Register, TreeRenderer
18 from librarian.utils import Context, get_resource
19 from librarian import core
20 from PIL import Image
21 from ..html import Silent
22
23
24 TexmlNS = XMLNamespace('http://getfo.sourceforge.net/texml/ns1')
25
26
def texml_cmd(name, *parms, **kwargs):
    """Build a TeXML ``cmd`` element called ``name``.

    Every entry of the optional ``opts`` keyword argument is added as an
    ``opt`` child, followed by one ``parm`` child per positional parameter.
    The text of each child is the value given.
    """
    command = etree.Element(TexmlNS('cmd'), name=name)
    children = [('opt', value) for value in kwargs.get('opts', [])]
    children.extend(('parm', value) for value in parms)
    for tag, value in children:
        child = etree.SubElement(command, TexmlNS(tag))
        child.text = value
    return command
34
35
class PdfFormat(Format):
    """Output format which builds a PDF by way of TeXML and xelatex."""

    format_name = 'PDF'
    format_ext = 'pdf'
    # Number of times build() runs xelatex over the generated document.
    tex_passes = 1
    # Style file copied into the work directory as 'style.sty'.
    style = get_resource('formats/pdf/res/default.sty')

    # Extra packages; get_texml() copies each one into the work directory
    # as 'librarianlocalpackage<N>.sty' and loads it with \usepackage.
    local_packages = [
        get_resource('formats/pdf/res/coverimage.sty'),
        get_resource('formats/pdf/res/insertimage.sty'),
    ]

    # Registry consulted by render() to find the renderer for an element.
    renderers = Register()
48
49     def retrieve_file(self, url, save_as):
50         # TODO: local sheme
51         return False
52
53     def add_file(self, ctx, filename, url=None, path=None, image=False):
54         from subprocess import call
55         if not url or path:
56             raise BuildError('No URL or path for image')
57         save_as = os.path.join(ctx.workdir, filename)
58         if path is not None:
59             ext = path.rsplit('.', 1)[-1]
60             if image:
61                 if ext == 'gif':
62                     call(['convert', path, save_as])
63                 else:
64                     # JPEGs with bad density will break LaTeX with 'Dimension too large'.
65                     call(['convert', '-units', 'PixelsPerInch', path, '-density', '300', save_as + '_.' + ext])
66                     shutil.move(save_as + '_.' + ext, save_as)
67             else:
68                 shutil.copy(path, save_as)
69         elif not self.retrieve_file(url, save_as):
70             if url.startswith('file://'):
71                 url = ctx.files_path + url[7:]
72
73             if url.startswith('/'):
74                 url = 'http://milpeer.eu' + url
75
76             ext = url.rsplit('.', 1)[-1]
77             if image:
78                 urlretrieve(url, save_as + '_.' + ext)
79                 if ext == 'gif':
80                     call(['convert', save_as + '_.' + ext, save_as])
81                 else:
82                     # JPEGs with bad density will break LaTeX with 'Dimension too large'.
83                     r = call(['convert', '-units', 'PixelsPerInch', save_as + '_.' + ext, '-density', '300',
84                               save_as + '_2.' + ext])
85                     if r:
86                         shutil.move(save_as + '_.' + ext, save_as)
87                     else:
88                         shutil.move(save_as + '_2.' + ext, save_as)
89             else:
90                 urlretrieve(url, save_as)
91
92     def get_file(self, ctx, filename):
93         return os.path.join(ctx.workdir, filename)
94
95     def get_texml(self, build_ctx):
96         t = etree.Element(TexmlNS('TeXML'))
97
98         self.add_file(build_ctx, 'wl.cls', path=get_resource('formats/pdf/res/wl.cls'))
99         t.append(texml_cmd("documentclass", "wl"))
100
101         # global packages
102         self.add_file(build_ctx, 'style.sty', path=self.style)
103         t.append(texml_cmd("usepackage", "style"))
104         t.append(texml_cmd("usepackage", "hyphenat"))
105
106         # local packages
107         for i, package in enumerate(self.local_packages):
108             self.add_file(build_ctx, "librarianlocalpackage%s.sty" % i, path=package)
109             t.append(texml_cmd("usepackage", "librarianlocalpackage%s" % i))
110
111         author = ", ". join(self.doc.meta.get(DCNS('creator')) or '')
112         title = self.doc.meta.title()
113         t.append(texml_cmd("author", author))
114         t.append(texml_cmd("title", title))
115
116         doc = etree.SubElement(t, TexmlNS('env'), name="document")
117         doc.append(texml_cmd("thispagestyle", "empty"))
118
119         # title page
120         height_left = 297
121         cover_url = self.doc.meta.get_one(DCNS('relation.coverimage.url'))
122         if cover_url:
123             self.add_file(build_ctx, 'cover.png', cover_url, image=True)
124             
125             img = Image.open(self.get_file(build_ctx, 'cover.png'))
126             size = img.size
127
128             if size[1] > size[0]:
129                 img = img.crop((0, 0, size[0], size[0]))
130                 img.save(self.get_file(build_ctx, 'cover.png'), format=img.format, quality=90)
131             size = img.size
132
133             # TODO: hardcoded paper size here
134             height = 210.0 * size[1] / size[0]
135             doc.append(texml_cmd("makecover", "%fmm" % height))
136         else:
137             doc.append(texml_cmd("vfill*"))
138
139         # Wielkości!
140         grp = etree.SubElement(doc, 'group')
141         grp.append(texml_cmd("raggedright"))
142         grp.append(texml_cmd("vfill"))
143         if author:
144             p = texml_cmd("par", "")
145             grp.append(p)
146             p[0].append(texml_cmd("Large"))
147             p[0].append(texml_cmd("noindent"))
148             p[0].append(texml_cmd("nohyphens", author))
149             p[0].append(texml_cmd("vspace", "1em"))
150             # p[0][-1].tail = author
151         if title:
152             p = texml_cmd("par", "")
153             grp.append(p)
154             p[0].append(texml_cmd("Huge"))
155             p[0].append(texml_cmd("noindent"))
156             p[0].append(texml_cmd("nohyphens", title))
157             # p[0][-1].tail = title
158         doc.append(texml_cmd("vfill"))
159         doc.append(texml_cmd("vfill"))
160
161         # IOFile probably would be better
162         cover_logo_url = getattr(build_ctx, 'cover_logo', None)
163         # TEST
164         # TODO: convert
165         # cover_logo_url = 'http://milpeer.mdrn.pl/media/dynamic/people/logo/nowoczesnapolska.org.pl.png'
166         if cover_logo_url:
167             self.add_file(build_ctx, 'coverlogo.png', cover_logo_url, image=True)
168             size = Image.open(self.get_file(build_ctx, 'coverlogo.png')).size
169             p = texml_cmd("par", "")
170             doc.append(p)
171             p[0].append(texml_cmd("noindent"))
172             p[0].append(texml_cmd("insertimage", 'coverlogo.png', "%fcm" % (1.0 * size[0] / size[1]), "1cm"))
173             
174         # logo organizacji!
175         doc.append(texml_cmd("clearpage"))
176
177         ctx = Context(build_ctx, format=self, img=1)
178         doc.extend(self.render(self.doc.edoc.getroot(), ctx))
179
180         # Redakcyjna na końcu.
181         doc.append(texml_cmd("clearpage"))
182
183         doc.append(texml_cmd("section*", "Information about the resource"))
184         doc.append(texml_cmd("vspace", "1em"))
185
186         for m, f in (
187                 ('Publisher: ', DCNS('publisher')),
188                 ('Rights: ', DCNS('rights')),
189                 ('Intended audience: ', DCNS('audience')),
190                 ('', DCNS('description'))):
191             v = self.doc.meta.get_one(f)
192             if v:
193                 e = texml_cmd("par", "")
194                 e[0].append(texml_cmd("noindent"))
195                 e[0][0].tail = "%s%s" % (m, v)
196                 doc.append(e)
197                 doc.append(texml_cmd("vspace", "1em"))
198
199         e = texml_cmd("par", "")
200         e[0].append(texml_cmd("noindent"))
201         e[0][0].tail = "Resource prepared using "
202         e[0].append(texml_cmd("href", "http://milpeer.eu", "MIL/PEER"))
203         e[0][-1].tail = " editing platform. "
204         doc.append(e)
205
206         source_url = getattr(build_ctx, 'source_url', None)
207         # source_url = 'http://milpeer.mdrn.pl/documents/27/'
208         if source_url:
209             e = texml_cmd("par", "")
210             doc.append(e)
211             e[0].append(texml_cmd("noindent"))
212             e[0][0].tail = "Source available at "
213             e[0].append(texml_cmd("href", source_url, source_url))
214
215         return t
216
217     def get_tex_dir(self, ctx):
218         ctx.workdir = mkdtemp('-wl2pdf')
219         texml = self.get_texml(ctx)
220         tex_path = os.path.join(ctx.workdir, 'doc.tex')
221         with open(tex_path, 'w') as fout:
222             # print etree.tostring(texml)
223             process(StringIO(etree.tostring(texml)), fout, 'utf-8')
224
225         # if self.save_tex:
226         #     shutil.copy(tex_path, self.save_tex)
227
228         # for sfile in ['wasysym.sty', 'uwasyvar.fd', 'uwasy.fd']:
229         #     shutil.copy(get_resource(os.path.join('res/wasysym', sfile)), temp)
230         return ctx.workdir
231
232     def build(self, ctx=None, verbose=False):
233         temp = self.get_tex_dir(ctx)
234         tex_path = os.path.join(temp, 'doc.tex')
235         try:
236             cwd = os.getcwd()
237         except OSError:
238             cwd = None
239         os.chdir(temp)
240
241         if verbose:
242             for i in range(self.tex_passes):
243                 p = call(['xelatex', tex_path])
244         else:
245             for i in range(self.tex_passes):
246                 p = call(['xelatex', '-interaction=batchmode', tex_path],
247                          stdout=PIPE, stderr=PIPE)
248         if p:
249             # raise ParseError("Error parsing .tex file: %s" % tex_path)
250             raise RuntimeError("Error parsing .tex file: %s" % tex_path)
251
252         if cwd is not None:
253             os.chdir(cwd)
254
255         output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
256         pdf_path = os.path.join(temp, 'doc.pdf')
257         shutil.move(pdf_path, output_file.name)
258         shutil.rmtree(temp)
259         os.system("ls -l " + output_file.name)
260         return OutputFile.from_filename(output_file.name)
261     
262     def render(self, element, ctx):
263         return self.renderers.get_for(element).render(element, ctx)
264
265
class CmdRenderer(TreeRenderer):
    """Renderer which wraps content in a TeXML command.

    The command is named after ``self.tag_name``; the content container is
    the command's last parameter.
    """

    def parms(self):
        """Return the parameters which precede the content; none here."""
        return list()

    def container(self):
        """Return ``(root, inner)``: the root element and the content slot."""
        root = etree.Element(self.root_name)
        arguments = self.parms() + [""]
        command = texml_cmd(self.tag_name, *arguments)
        root.append(command)
        # The trailing empty parameter is where the content goes.
        return root, command[-1]
275
276
class EnvRenderer(TreeRenderer):
    """Renderer which wraps content in an ``env`` element named ``tag_name``."""

    def container(self):
        """Return ``(root, inner)`` with the ``env`` element as content slot."""
        root = etree.Element(self.root_name)
        environment = etree.Element('env', name=self.tag_name)
        root.append(environment)
        return root, environment
282
283
class GroupRenderer(CmdRenderer):
    """Renderer which puts content in a ``group`` element.

    When ``tag_name`` is set, a command of that name is placed at the start
    of the group; the content slot is the group itself.
    """

    def container(self):
        """Return ``(root, inner)`` with the ``group`` element as content slot."""
        root = etree.Element(self.root_name)
        group = etree.SubElement(root, 'group')
        if not self.tag_name:
            return root, group
        arguments = self.parms() + [""]
        group.append(texml_cmd(self.tag_name, *arguments))
        return root, group
291
292
class SectionRenderer(CmdRenderer):
    """Command renderer which emits ``pagebreak[1]`` before its command."""

    def subcontext(self, element, ctx):
        """Return a child context with ``toc_level`` raised by two.

        A context without ``toc_level`` is treated as being at level 1.
        """
        # here?
        current_level = getattr(ctx, 'toc_level', 1)
        return Context(ctx, toc_level=current_level + 2)

    def container(self):
        """Return ``(root, inner)``; content goes in the first parameter."""
        root = etree.Element(self.root_name)
        page_break = texml_cmd('pagebreak', opts=['1'])
        section = texml_cmd(self.tag_name, *(self.parms() + [""]))
        root.extend([page_break, section])
        return root, section[0]
304
# Renderers for block-level elements without a more specific entry.
# NOTE(review): the second argument looks like an element subtype name,
# with None as the default for the element class -- confirm against Register.
PdfFormat.renderers.register(core.Section, None, SectionRenderer('par'))

# TODO: levels
PdfFormat.renderers.register(core.Header, None, CmdRenderer('section*'))

PdfFormat.renderers.register(core.Div, None, CmdRenderer('par'))
311
312
class ImgRenderer(CmdRenderer):
    """Renders an image as a command taking file name, width and height.

    The image named by the element's ``src`` attribute is added to the
    build as ``f<N>.png``, with ``N`` taken from the ``img`` counter of the
    context.  It is shown 15 cm wide, keeping its aspect ratio.
    """

    def parms(self):
        """Reserve two parameters in front of the content parameter."""
        return [""] * 2

    def render(self, element, ctx):
        """Render ``element``; the command is dropped if the file is no image."""
        root = super(ImgRenderer, self).render(element, ctx)
        source = element.get('src')
        index = getattr(ctx, 'img', 0)
        ctx.img = index + 1
        filename = 'f%d.png' % index
        ctx.format.add_file(ctx, filename, source, image=True)
        command = root[0]
        command[0].text = filename
        try:
            width, height = Image.open(ctx.format.get_file(ctx, filename)).size
        except IOError:  # not an image
            del root[0]
            return root
        command[1].text = '15cm'
        command[2].text = '%fcm' % (15.0 * height / width)
        return root


PdfFormat.renderers.register(core.Div, 'img', ImgRenderer('insertimage'))
334
335
class VideoRenderer(CmdRenderer):
    """Renders a video as a link to its YouTube page.

    The link target and the link text are both the watch URL built from the
    element's ``videoid`` attribute.
    """

    def render(self, element, ctx):
        """Render ``element`` and replace the content with the link."""
        root = super(VideoRenderer, self).render(element, ctx)
        video_id = element.attrib.get('videoid')
        url = 'https://www.youtube.com/watch?v=%s' % video_id
        target = root[0][0]
        target.text = None
        target.append(texml_cmd('href', url, url))
        return root


PdfFormat.renderers.register(core.Div, 'video', VideoRenderer('par'))
346
347
# Named Div variants: definitions in bold, lists as itemize/enumerate
# environments with their items rendered as \item.
PdfFormat.renderers.register(core.Div, 'defined', CmdRenderer('textbf'))
PdfFormat.renderers.register(core.Div, 'item', CmdRenderer('item'))
PdfFormat.renderers.register(core.Div, 'list', EnvRenderer('itemize'))
PdfFormat.renderers.register(core.Div, 'list.enum', EnvRenderer('enumerate'))


# Inline elements: a plain Span gets the base TreeRenderer, the named
# variants are wrapped in the matching LaTeX font command.
PdfFormat.renderers.register(core.Span, None, TreeRenderer())
PdfFormat.renderers.register(core.Span, 'cite', CmdRenderer('emph'))
PdfFormat.renderers.register(core.Span, 'cite.code', CmdRenderer('texttt'))
PdfFormat.renderers.register(core.Span, 'emp', CmdRenderer('textbf'))
PdfFormat.renderers.register(core.Span, 'emph', CmdRenderer('emph'))
359
360
class SpanUri(CmdRenderer):
    """Renders a bare URI as a link whose target is the element's text.

    A target starting with ``file://`` is resolved against
    ``ctx.files_path``.
    """

    def parms(self):
        """Reserve one parameter, the link target, before the content."""
        return [""]

    def render(self, element, ctx):
        """Render ``element`` and fill in the link target."""
        root = super(SpanUri, self).render(element, ctx)
        # An element without leading text has ``text`` set to None; treat
        # it as an empty target rather than failing on ``startswith``.
        src = element.text or ''
        if src.startswith('file://'):
            src = ctx.files_path + src[7:]
        root[0][0].text = src
        return root


PdfFormat.renderers.register(core.Span, 'uri', SpanUri('href'))
373
374
class SpanLink(CmdRenderer):
    """Renders a link whose target is the element's ``href`` attribute.

    A target starting with ``file://`` is resolved against
    ``ctx.files_path``; a missing attribute gives an empty target.
    """

    def parms(self):
        """Reserve one parameter, the link target, before the content."""
        return [""]

    def render(self, element, ctx):
        """Render ``element`` and fill in the link target."""
        root = super(SpanLink, self).render(element, ctx)
        href = element.attrib.get('href', '')
        local_prefix = 'file://'
        if href.startswith(local_prefix):
            href = ctx.files_path + href[len(local_prefix):]
        root[0][0].text = href
        return root


PdfFormat.renderers.register(core.Span, 'link', SpanLink('href'))
387
388
# Asides: plain ones use the base TreeRenderer, editorial ones are wrapped
# in the 'editorialpage' command and comments are handled by Silent.
# NOTE(review): Silent presumably produces no output -- confirm in ..html.
PdfFormat.renderers.register(core.Aside, None, TreeRenderer())
PdfFormat.renderers.register(core.Aside, 'editorial', CmdRenderer('editorialpage'))
PdfFormat.renderers.register(core.Aside, 'comment', Silent())