render missing tags for epub
[librarian.git] / librarian / formats / pdf / __init__.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 import os
7 import re
8 import shutil
9 from subprocess import call, PIPE
10 from tempfile import NamedTemporaryFile, mkdtemp
11 from lxml import etree
12 from urllib import urlretrieve
13 from StringIO import StringIO
14 from Texml.processor import process
15 from librarian import DCNS, XMLNamespace
16 from librarian.formats import Format
17 from librarian.output import OutputFile
18 from librarian.renderers import Register, TreeRenderer
19 from librarian.utils import Context, get_resource, extend_element
20 from librarian import core
21 from PIL import Image
22 from ..html import Silent
23
24
# Namespace helper for TeXML elements; calling it with a local name
# (e.g. TexmlNS('cmd')) gives the tag passed to lxml when building the tree.
TexmlNS = XMLNamespace('http://getfo.sourceforge.net/texml/ns1')
26
27
def texml_cmd(name, *parms, **kwargs):
    """Build a TeXML ``cmd`` element for the command called `name`.

    Every item of the optional ``opts`` keyword argument becomes an ``opt``
    child, and every positional argument becomes a ``parm`` child. Options
    are emitted before parameters; the given text is stored as the child's
    text.

    Returns the new ``cmd`` element.
    """
    command = etree.Element(TexmlNS('cmd'), name=name)
    children = [('opt', value) for value in kwargs.get('opts', [])]
    children.extend(('parm', value) for value in parms)
    for tag, value in children:
        etree.SubElement(command, TexmlNS(tag)).text = value
    return command
35
36
class PdfFormat(Format):
    """PDF output format: builds a TeXML tree, converts it to TeX and runs XeLaTeX.

    All intermediate files (class, styles, images, doc.tex) are gathered in a
    temporary work directory stored on the build context as ``ctx.workdir``.
    """
    format_name = 'PDF'
    format_ext = 'pdf'
    # Number of XeLaTeX runs over the generated document.
    tex_passes = 1
    # Style file copied into the work directory as style.sty.
    style = get_resource('formats/pdf/res/default.sty')

    # Extra packages, copied as librarianlocalpackage<N>.sty and loaded in
    # this order.
    local_packages = [
        get_resource('formats/pdf/res/coverimage.sty'),
        get_resource('formats/pdf/res/insertimage.sty'),
    ]

    # Registry of element renderers, consulted by render().
    renderers = Register()

    def retrieve_file(self, url, save_as):
        """Hook for fetching `url` into `save_as` by other means than download.

        Currently always returns False, which makes add_file() download the
        file itself.
        """
        # TODO: local scheme
        return False

    @staticmethod
    def _extension(location, fallback):
        """Return the extension (no dot) of the last segment of a path or URL.

        `fallback` is returned when the last segment has no extension, so
        that temporary file names built from the result never contain a
        path separator.
        """
        ext = os.path.splitext(os.path.basename(location))[1][1:]
        return ext or fallback

    @staticmethod
    def _convert_image(source, save_as, ext, keep_source):
        """Write a converted copy of image `source` to `save_as` with ImageMagick.

        GIFs are converted straight into `save_as`. Other images are
        re-saved with a density of 300 PPI; when that conversion fails, the
        unconverted image is used instead (copied if `keep_source` is true,
        moved otherwise).
        """
        if ext == 'gif':
            call(['convert', source, save_as])
            return

        # JPEGs with bad density will break LaTeX with 'Dimension too large'.
        converted = save_as + '_2.' + ext
        failed = call(['convert', '-units', 'PixelsPerInch', source,
                       '-density', '300', converted])
        if not failed:
            shutil.move(converted, save_as)
        elif keep_source:
            shutil.copy(source, save_as)
        else:
            shutil.move(source, save_as)

    def add_file(self, ctx, filename, url=None, path=None, image=False):
        """Put a file into the work directory under the name `filename`.

        The source is a local `path` or, failing that, a `url`; one of them
        must be given. A `url` is first offered to retrieve_file(); if that
        declines, ``file://`` URLs are rewritten relative to
        ``ctx.files_path``, URLs starting with ``/`` are prefixed with
        ``http://milpeer.eu``, and the result is downloaded.

        With `image` set, the file is passed through ImageMagick (see
        _convert_image) instead of being stored verbatim.
        """
        assert url or path
        save_as = os.path.join(ctx.workdir, filename)
        # Used when the source name itself carries no extension.
        fallback_ext = filename.rsplit('.', 1)[-1]

        if path is not None:
            if image:
                ext = self._extension(path, fallback_ext)
                # The source is not ours to move, hence keep_source.
                self._convert_image(path, save_as, ext, keep_source=True)
            else:
                shutil.copy(path, save_as)
            return

        if self.retrieve_file(url, save_as):
            return

        if url.startswith('file://'):
            url = ctx.files_path + url[7:]

        if url.startswith('/'):
            url = 'http://milpeer.eu' + url

        if not image:
            urlretrieve(url, save_as)
            return

        ext = self._extension(url, fallback_ext)
        downloaded = save_as + '_.' + ext
        urlretrieve(url, downloaded)
        self._convert_image(downloaded, save_as, ext, keep_source=False)

    def get_file(self, ctx, filename):
        """Return the path of `filename` inside the work directory."""
        return os.path.join(ctx.workdir, filename)

    @staticmethod
    def _noindent_par(text):
        """Return a ``par`` command whose body is ``noindent`` followed by `text`."""
        par = texml_cmd("par", "")
        par[0].append(texml_cmd("noindent"))
        par[0][0].tail = text
        return par

    def _add_title_page(self, doc, build_ctx, author, title):
        """Append the title page (cover, author, title, logo) to `doc`."""
        cover_url = self.doc.meta.get_one(DCNS('relation.coverimage.url'))
        if cover_url:
            self.add_file(build_ctx, 'cover.png', cover_url, image=True)
            cover_path = self.get_file(build_ctx, 'cover.png')

            img = Image.open(cover_path)
            size = img.size
            if size[1] > size[0]:
                # Portrait covers are cut down to a square, keeping the top.
                img = img.crop((0, 0, size[0], size[0]))
                img.save(cover_path, format=img.format, quality=90)
            size = img.size

            # TODO: hardcoded paper size here
            height = 210.0 * size[1] / size[0]
            doc.append(texml_cmd("makecover", "%fmm" % height))
        else:
            doc.append(texml_cmd("vfill*"))

        # Font sizes
        grp = etree.SubElement(doc, 'group')
        grp.append(texml_cmd("raggedright"))
        grp.append(texml_cmd("vfill"))
        if author:
            p = texml_cmd("par", "")
            grp.append(p)
            p[0].append(texml_cmd("Large"))
            p[0].append(texml_cmd("noindent"))
            p[0].append(texml_cmd("nohyphens", author))
            p[0].append(texml_cmd("vspace", "1em"))
        if title:
            p = texml_cmd("par", "")
            grp.append(p)
            p[0].append(texml_cmd("Huge"))
            p[0].append(texml_cmd("noindent"))
            p[0].append(texml_cmd("nohyphens", title))
        doc.append(texml_cmd("vfill"))
        doc.append(texml_cmd("vfill"))

        # Organization logo. IOFile probably would be better.
        # TODO: convert
        cover_logo_url = getattr(build_ctx, 'cover_logo', None)
        if cover_logo_url:
            self.add_file(build_ctx, 'coverlogo.png', cover_logo_url, image=True)
            size = Image.open(self.get_file(build_ctx, 'coverlogo.png')).size
            p = texml_cmd("par", "")
            doc.append(p)
            p[0].append(texml_cmd("noindent"))
            # Logo is 1cm high; the width keeps the aspect ratio.
            p[0].append(texml_cmd("insertimage", 'coverlogo.png',
                                  "%fcm" % (1.0 * size[0] / size[1]), "1cm"))

        doc.append(texml_cmd("clearpage"))

    def _add_editorial_page(self, doc, build_ctx):
        """Append the closing page with metadata and source information to `doc`."""
        doc.append(texml_cmd("clearpage"))

        doc.append(texml_cmd("section*", "Information about the resource"))
        doc.append(texml_cmd("vspace", "1em"))

        labelled_fields = (
            ('Publisher: ', DCNS('publisher')),
            ('Rights: ', DCNS('rights')),
            ('Intended audience: ', DCNS('audience')),
            ('', DCNS('description')),
        )
        for label, field in labelled_fields:
            value = self.doc.meta.get_one(field)
            if value:
                doc.append(self._noindent_par("%s%s" % (label, value)))
                doc.append(texml_cmd("vspace", "1em"))

        e = self._noindent_par("Resource prepared using ")
        e[0].append(texml_cmd("href", "http://milpeer.eu", "MIL/PEER"))
        e[0][-1].tail = " editing platform. "
        doc.append(e)

        source_url = getattr(build_ctx, 'source_url', None)
        if source_url:
            e = self._noindent_par("Source available at ")
            doc.append(e)
            e[0].append(texml_cmd("href", source_url, source_url))

    def get_texml(self, build_ctx):
        """Build and return the TeXML tree of the whole document.

        As a side effect, the class file, style files and all images used
        by the document are placed in ``build_ctx.workdir``.
        """
        t = etree.Element(TexmlNS('TeXML'))

        self.add_file(build_ctx, 'wl.cls', path=get_resource('formats/pdf/res/wl.cls'))
        t.append(texml_cmd("documentclass", "wl"))

        # global packages
        self.add_file(build_ctx, 'style.sty', path=self.style)
        t.append(texml_cmd("usepackage", "style"))
        t.append(texml_cmd("usepackage", "hyphenat"))

        # local packages
        for i, package in enumerate(self.local_packages):
            package_name = "librarianlocalpackage%s" % i
            self.add_file(build_ctx, package_name + ".sty", path=package)
            t.append(texml_cmd("usepackage", package_name))

        author = ", ".join(self.doc.meta.get(DCNS('creator')) or '')
        title = self.doc.meta.title()
        t.append(texml_cmd("author", author))
        t.append(texml_cmd("title", title))

        doc = etree.SubElement(t, TexmlNS('env'), name="document")
        doc.append(texml_cmd("thispagestyle", "empty"))

        self._add_title_page(doc, build_ctx, author, title)

        # Document body.
        ctx = Context(build_ctx, format=self, img=1)
        doc.extend(self.render(self.doc.edoc.getroot(), ctx))

        # Editorial page goes at the end.
        self._add_editorial_page(doc, build_ctx)

        return t

    def get_tex_dir(self, ctx):
        """Create the work directory, write doc.tex into it and return its path.

        The directory is also stored as ``ctx.workdir``.
        """
        ctx.workdir = mkdtemp('-wl2pdf')
        texml = self.get_texml(ctx)
        tex_path = os.path.join(ctx.workdir, 'doc.tex')
        with open(tex_path, 'w') as fout:
            process(StringIO(etree.tostring(texml)), fout, 'utf-8')
        return ctx.workdir

    def build(self, ctx=None, verbose=False):
        """Generate the PDF and return it as an OutputFile.

        XeLaTeX is run `tex_passes` times inside the work directory; its
        output is shown only when `verbose` is true.

        Raises RuntimeError when the last XeLaTeX pass exits with a non-zero
        status; the work directory is then left in place so the .tex file
        named in the message can be inspected.
        """
        # NOTE(review): ctx=None cannot work, get_tex_dir() sets ctx.workdir.
        temp = self.get_tex_dir(ctx)
        tex_path = os.path.join(temp, 'doc.tex')

        # XeLaTeX is started with cwd=temp rather than via os.chdir(), so the
        # working directory of this process is never touched.
        status = 0
        if verbose:
            for _ in range(self.tex_passes):
                status = call(['xelatex', tex_path], cwd=temp)
        else:
            # Output is discarded to os.devnull; an unread PIPE would block
            # XeLaTeX as soon as the pipe buffer filled up.
            with open(os.devnull, 'w') as devnull:
                for _ in range(self.tex_passes):
                    status = call(['xelatex', '-interaction=batchmode', tex_path],
                                  cwd=temp, stdout=devnull, stderr=devnull)
        if status:
            raise RuntimeError("Error parsing .tex file: %s" % tex_path)

        # Only the unique name is needed; the handle is closed right away.
        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
        output_file.close()
        shutil.move(os.path.join(temp, 'doc.pdf'), output_file.name)
        shutil.rmtree(temp)
        return OutputFile.from_filename(output_file.name)

    def render(self, element, ctx):
        """Render `element` with the renderer registered for it."""
        return self.renderers.get_for(element).render(element, ctx)
267
268
269
270
class CmdRenderer(TreeRenderer):
    """Renderer whose container is a single TeXML command.

    The command is named after ``self.tag_name`` and wrapped in an element
    named ``self.root_name``.
    """

    def parms(self):
        """Return the parameters placed before the content parameter."""
        return []

    def container(self):
        """Return ``(root, inner)`` for the command.

        `root` wraps the command; `inner` is the command's last ``parm``,
        an empty one added after those from parms().
        """
        wrapper = etree.Element(self.root_name)
        arguments = self.parms() + [""]
        command = texml_cmd(self.tag_name, *arguments)
        wrapper.append(command)
        return wrapper, command[-1]
279
class EnvRenderer(TreeRenderer):
    """Renderer whose container is an ``env`` element named after ``self.tag_name``."""

    def container(self):
        """Return ``(root, inner)``: the wrapping element and the ``env`` inside it."""
        wrapper = etree.Element(self.root_name)
        environment = etree.SubElement(wrapper, 'env', name=self.tag_name)
        return wrapper, environment
285
class GroupRenderer(CmdRenderer):
    """Renderer whose container is a ``group``, optionally opened by a command."""

    def container(self):
        """Return ``(root, inner)`` where `inner` is the ``group`` element.

        When ``self.tag_name`` is set, a command of that name (with the
        parameters from parms() plus one empty parameter) is put first in
        the group.
        """
        wrapper = etree.Element(self.root_name)
        group = etree.SubElement(wrapper, 'group')
        if not self.tag_name:
            return wrapper, group
        arguments = self.parms() + [""]
        group.append(texml_cmd(self.tag_name, *arguments))
        return wrapper, group
293
294
class SectionRenderer(CmdRenderer):
    """Command renderer that precedes its command with ``pagebreak[1]``."""

    def subcontext(self, element, ctx):
        """Return a child context whose ``toc_level`` is two levels deeper.

        A context without ``toc_level`` is treated as being at level 1.
        """
        # here?
        current_level = getattr(ctx, 'toc_level', 1)
        return Context(ctx, toc_level=current_level + 2)

    def container(self):
        """Return ``(root, inner)``: the wrapper and the command's first ``parm``.

        `root` holds the ``pagebreak`` command followed by the command
        named after ``self.tag_name``.
        """
        wrapper = etree.Element(self.root_name)
        page_break = texml_cmd('pagebreak', opts=['1'])
        command = texml_cmd(self.tag_name, *(self.parms() + [""]))
        wrapper.append(page_break)
        wrapper.append(command)
        # NOTE(review): this is the first parm, while CmdRenderer.container()
        # returns the last one; identical as long as parms() is empty.
        return wrapper, command[0]
306
# Default renderers for block-level elements. The second argument is None
# here and a name such as 'img' in other registrations -- presumably the
# element class/variant the renderer applies to; confirm against Register.
PdfFormat.renderers.register(core.Section, None, SectionRenderer('par'))

# TODO: heading levels
PdfFormat.renderers.register(core.Header, None, CmdRenderer('section*'))

PdfFormat.renderers.register(core.Div, None, CmdRenderer('par'))
313
class ImgRenderer(CmdRenderer):
    """Render an image element as a command with file name, width and height.

    The image named by the element's ``src`` attribute is added to the work
    directory as ``f<N>.png``, where N is the ``img`` counter on the context.
    """

    def parms(self):
        """Return two empty parameters, filled in later by render()."""
        return ["", ""]

    def render(self, element, ctx):
        """Return the rendered tree for `element`.

        The command is removed from the tree when the element has no
        ``src`` or when the fetched file cannot be opened as an image.
        """
        root = super(ImgRenderer, self).render(element, ctx)
        url = element.get('src')
        if not url:
            # Nothing to fetch; add_file() would fail its assertion.
            del root[0]
            return root

        nr = getattr(ctx, 'img', 0)
        ctx.img = nr + 1
        filename = 'f%d.png' % nr
        ctx.format.add_file(ctx, filename, url, image=True)
        root[0][0].text = filename
        try:
            size = Image.open(ctx.format.get_file(ctx, filename)).size
        except IOError:  # not an image
            del root[0]
            return root
        # Fixed width of 15cm; the height keeps the aspect ratio.
        root[0][1].text = '15cm'
        root[0][2].text = '%fcm' % (15.0 * size[1] / size[0])
        return root
333
# Named Div variants: images, definitions and lists.
PdfFormat.renderers.register(core.Div, 'img', ImgRenderer('insertimage'))


PdfFormat.renderers.register(core.Div, 'defined', CmdRenderer('textbf'))
PdfFormat.renderers.register(core.Div, 'item', CmdRenderer('item'))
PdfFormat.renderers.register(core.Div, 'list', EnvRenderer('itemize'))
PdfFormat.renderers.register(core.Div, 'list.enum', EnvRenderer('enumerate'))



# Inline elements: a plain TreeRenderer by default, LaTeX font commands for
# the named variants.
PdfFormat.renderers.register(core.Span, None, TreeRenderer())
PdfFormat.renderers.register(core.Span, 'cite', CmdRenderer('emph'))
PdfFormat.renderers.register(core.Span, 'cite.code', CmdRenderer('texttt'))
PdfFormat.renderers.register(core.Span, 'emp', CmdRenderer('textbf'))
PdfFormat.renderers.register(core.Span, 'emph', CmdRenderer('emph'))
349
class SpanUri(CmdRenderer):
    """Render a URI span as a command whose first parameter is the URI itself."""

    def parms(self):
        """Return one empty parameter; render() stores the link target in it."""
        return [""]

    def render(self, element, ctx):
        """Return the rendered tree with the element's text used as link target.

        A ``file://`` prefix is replaced with ``ctx.files_path``.
        """
        root = super(SpanUri, self).render(element, ctx)
        # element.text is None for an empty span; use '' as SpanLink does
        # for a missing href.
        src = element.text or ''
        if src.startswith('file://'):
            src = ctx.files_path + src[7:]
        root[0][0].text = src
        return root
PdfFormat.renderers.register(core.Span, 'uri', SpanUri('href'))
361
362
class SpanLink(CmdRenderer):
    """Render a link span as a command whose first parameter is the ``href``."""

    def parms(self):
        """Return one empty parameter; render() stores the link target in it."""
        return [""]

    def render(self, element, ctx):
        """Return the rendered tree with the ``href`` attribute as link target.

        A missing ``href`` yields an empty target; a ``file://`` prefix is
        replaced with ``ctx.files_path``.
        """
        rendered = super(SpanLink, self).render(element, ctx)
        target = element.attrib.get('href', '')
        scheme = 'file://'
        if target.startswith(scheme):
            target = ctx.files_path + target[len(scheme):]
        rendered[0][0].text = target
        return rendered
PdfFormat.renderers.register(core.Span, 'link', SpanLink('href'))
374
375
376
377
# Asides: a plain TreeRenderer by default, a dedicated command for editorial
# notes; comments use the Silent renderer imported from the HTML format
# (presumably producing no output -- confirm there).
PdfFormat.renderers.register(core.Aside, None, TreeRenderer())
PdfFormat.renderers.register(core.Aside, 'editorial', CmdRenderer('editorialpage'))
PdfFormat.renderers.register(core.Aside, 'comment', Silent())
381