document validation (stub)
[librarian.git] / librarian / formats / pdf / __init__.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 import os
7 import shutil
8 from subprocess import call, PIPE
9 from tempfile import NamedTemporaryFile, mkdtemp
10 from lxml import etree
11 from urllib import urlretrieve
12 from StringIO import StringIO
13 from Texml.processor import process
14 from librarian import DCNS, XMLNamespace
15 from librarian.formats import Format
16 from librarian.output import OutputFile
17 from librarian.renderers import Register, TreeRenderer
18 from librarian.utils import Context, get_resource
19 from librarian import core
20 from PIL import Image
21 from ..html import Silent
22
23
# Namespace helper used to qualify the TeXML element names built in this module.
TexmlNS = XMLNamespace(
    'http://getfo.sourceforge.net/texml/ns1'
)
25
26
def texml_cmd(name, *parms, **kwargs):
    """Build a TeXML ``cmd`` element named *name*.

    Every item of the optional ``opts`` keyword argument becomes an ``opt``
    child, then every positional *parms* value becomes a ``parm`` child, so
    options always precede parameters.  The child's text is the given value.

    Returns the newly created ``cmd`` element.
    """
    command = etree.Element(TexmlNS('cmd'), name=name)
    children = [('opt', text) for text in kwargs.get('opts', [])]
    children.extend(('parm', text) for text in parms)
    for tag, text in children:
        etree.SubElement(command, TexmlNS(tag)).text = text
    return command
34
35
class PdfFormat(Format):
    """PDF output format.

    The document is turned into a TeXML tree, converted to ``doc.tex`` in a
    temporary work directory and compiled there with ``xelatex``.
    """
    format_name = 'PDF'
    format_ext = 'pdf'
    # Number of xelatex runs performed by build().
    tex_passes = 1
    # Style file copied into the work directory as ``style.sty``.
    style = get_resource('formats/pdf/res/default.sty')

    # Extra packages copied as ``librarianlocalpackage<N>.sty`` and loaded in order.
    local_packages = [
        get_resource('formats/pdf/res/coverimage.sty'),
        get_resource('formats/pdf/res/insertimage.sty'),
    ]

    renderers = Register()

    def retrieve_file(self, url, save_as):
        """Hook for fetching *url* into *save_as* without a download.

        Returns True when the file was provided; this base implementation
        never provides anything, so add_file() always falls back to
        downloading.
        """
        # TODO: local scheme
        return False

    @staticmethod
    def _extension(location):
        """Return the file extension of *location* without the leading dot.

        Only the last path segment is inspected, so a dot in a directory or
        host name followed by a dot-less file name yields ``''`` instead of a
        path fragment.
        """
        return os.path.splitext(location)[1][1:]

    def _convert_image(self, source, save_as, ext):
        """Run ImageMagick ``convert`` on *source*, writing to *save_as*.

        GIFs are converted straight to *save_as*; other images get their
        density rewritten into a temporary file that keeps extension *ext*
        and is then moved into place.

        Returns True on success, False when ``convert`` exits with a non-zero
        status (in which case *save_as* has not been put in place by the
        density branch).
        """
        if ext == 'gif':
            return call(['convert', source, save_as]) == 0

        # JPEGs with bad density will break LaTeX with 'Dimension too large'.
        converted = save_as + '_2.' + ext
        status = call(['convert', '-units', 'PixelsPerInch', source,
                       '-density', '300', converted])
        if status != 0:
            return False
        shutil.move(converted, save_as)
        return True

    def add_file(self, ctx, filename, url=None, path=None, image=False):
        """Place a file in the work directory under the name *filename*.

        The file comes from the local *path* when given, otherwise from *url*
        (first offered to retrieve_file(), then downloaded).  ``file://`` URLs
        are resolved against ``ctx.files_path`` and URLs starting with ``/``
        are prefixed with the milpeer.eu host.

        When *image* is true the file is passed through ImageMagick
        ``convert``; if the conversion fails the unconverted file is used
        instead, for local paths as well as for downloads.

        Raises ValueError when neither *url* nor *path* is given.
        """
        if not (url or path):
            raise ValueError("add_file() requires either url or path")

        save_as = os.path.join(ctx.workdir, filename)
        if path is not None:
            if not image or not self._convert_image(path, save_as, self._extension(path)):
                shutil.copy(path, save_as)
        elif not self.retrieve_file(url, save_as):
            if url.startswith('file://'):
                url = ctx.files_path + url[7:]

            if url.startswith('/'):
                url = 'http://milpeer.eu' + url

            if image:
                # Ignore any query string or fragment when looking for the extension.
                ext = self._extension(url.split('?', 1)[0].split('#', 1)[0])
                downloaded = save_as + '_.' + ext
                urlretrieve(url, downloaded)
                if not self._convert_image(downloaded, save_as, ext):
                    shutil.move(downloaded, save_as)
            else:
                urlretrieve(url, save_as)

    def get_file(self, ctx, filename):
        """Return the path of *filename* inside the work directory."""
        return os.path.join(ctx.workdir, filename)

    def get_texml(self, build_ctx):
        """Build and return the TeXML tree for the whole document.

        Copies the class, style and local package files into the work
        directory, then emits the preamble, a title page (with optional cover
        image and cover logo), the rendered document body and a closing
        information page.
        """
        t = etree.Element(TexmlNS('TeXML'))

        self.add_file(build_ctx, 'wl.cls', path=get_resource('formats/pdf/res/wl.cls'))
        t.append(texml_cmd("documentclass", "wl"))

        # global packages
        self.add_file(build_ctx, 'style.sty', path=self.style)
        t.append(texml_cmd("usepackage", "style"))
        t.append(texml_cmd("usepackage", "hyphenat"))

        # local packages
        for i, package in enumerate(self.local_packages):
            self.add_file(build_ctx, "librarianlocalpackage%s.sty" % i, path=package)
            t.append(texml_cmd("usepackage", "librarianlocalpackage%s" % i))

        author = ", ".join(self.doc.meta.get(DCNS('creator')) or [])
        title = self.doc.meta.title()
        t.append(texml_cmd("author", author))
        t.append(texml_cmd("title", title))

        doc = etree.SubElement(t, TexmlNS('env'), name="document")
        doc.append(texml_cmd("thispagestyle", "empty"))

        # title page
        cover_url = self.doc.meta.get_one(DCNS('relation.coverimage.url'))
        if cover_url:
            self.add_file(build_ctx, 'cover.png', cover_url, image=True)

            img = Image.open(self.get_file(build_ctx, 'cover.png'))
            size = img.size

            if size[1] > size[0]:
                # Portrait covers are cut down to a square taken from the top.
                # NOTE(review): img.format is read from the cropped image; confirm
                # it still carries the source format at this point.
                img = img.crop((0, 0, size[0], size[0]))
                img.save(self.get_file(build_ctx, 'cover.png'), format=img.format, quality=90)
            size = img.size

            # TODO: hardcoded paper size here
            height = 210.0 * size[1] / size[0]
            doc.append(texml_cmd("makecover", "%fmm" % height))
        else:
            doc.append(texml_cmd("vfill*"))

        # Sizes!
        grp = etree.SubElement(doc, 'group')
        grp.append(texml_cmd("raggedright"))
        grp.append(texml_cmd("vfill"))
        if author:
            p = texml_cmd("par", "")
            grp.append(p)
            p[0].append(texml_cmd("Large"))
            p[0].append(texml_cmd("noindent"))
            p[0].append(texml_cmd("nohyphens", author))
            p[0].append(texml_cmd("vspace", "1em"))
        if title:
            p = texml_cmd("par", "")
            grp.append(p)
            p[0].append(texml_cmd("Huge"))
            p[0].append(texml_cmd("noindent"))
            p[0].append(texml_cmd("nohyphens", title))
        doc.append(texml_cmd("vfill"))
        doc.append(texml_cmd("vfill"))

        # IOFile probably would be better
        cover_logo_url = getattr(build_ctx, 'cover_logo', None)
        # TODO: convert
        if cover_logo_url:
            self.add_file(build_ctx, 'coverlogo.png', cover_logo_url, image=True)
            size = Image.open(self.get_file(build_ctx, 'coverlogo.png')).size
            p = texml_cmd("par", "")
            doc.append(p)
            p[0].append(texml_cmd("noindent"))
            # The logo is 1cm high; its width keeps the image's aspect ratio.
            p[0].append(texml_cmd("insertimage", 'coverlogo.png', "%fcm" % (1.0 * size[0] / size[1]), "1cm"))

        # organisation logo!
        doc.append(texml_cmd("clearpage"))

        ctx = Context(build_ctx, format=self, img=1)
        doc.extend(self.render(self.doc.edoc.getroot(), ctx))

        # Editorial page at the end.
        doc.append(texml_cmd("clearpage"))

        doc.append(texml_cmd("section*", "Information about the resource"))
        doc.append(texml_cmd("vspace", "1em"))

        for m, f in (
                ('Publisher: ', DCNS('publisher')),
                ('Rights: ', DCNS('rights')),
                ('Intended audience: ', DCNS('audience')),
                ('', DCNS('description'))):
            v = self.doc.meta.get_one(f)
            if v:
                e = texml_cmd("par", "")
                e[0].append(texml_cmd("noindent"))
                e[0][0].tail = "%s%s" % (m, v)
                doc.append(e)
                doc.append(texml_cmd("vspace", "1em"))

        e = texml_cmd("par", "")
        e[0].append(texml_cmd("noindent"))
        e[0][0].tail = "Resource prepared using "
        e[0].append(texml_cmd("href", "http://milpeer.eu", "MIL/PEER"))
        e[0][-1].tail = " editing platform. "
        doc.append(e)

        source_url = getattr(build_ctx, 'source_url', None)
        if source_url:
            e = texml_cmd("par", "")
            doc.append(e)
            e[0].append(texml_cmd("noindent"))
            e[0][0].tail = "Source available at "
            e[0].append(texml_cmd("href", source_url, source_url))

        return t

    def get_tex_dir(self, ctx):
        """Create the work directory, write ``doc.tex`` into it and return its path.

        The directory is stored on *ctx* as ``ctx.workdir``; the caller is
        responsible for removing it.
        """
        ctx.workdir = mkdtemp('-wl2pdf')
        texml = self.get_texml(ctx)
        tex_path = os.path.join(ctx.workdir, 'doc.tex')
        with open(tex_path, 'w') as fout:
            process(StringIO(etree.tostring(texml)), fout, 'utf-8')
        return ctx.workdir

    def build(self, ctx=None, verbose=False):
        """Compile the document and return the PDF as an OutputFile.

        *ctx* must be an object that accepts a ``workdir`` attribute (it is
        set by get_tex_dir()), so the ``None`` default cannot actually be
        used.  With *verbose* the xelatex output is shown; otherwise xelatex
        runs in batch mode with its output discarded.

        Raises RuntimeError when the last xelatex pass exits with a non-zero
        status; the work directory is left in place in that case so the
        ``.tex`` file named in the message can be inspected.
        """
        temp = self.get_tex_dir(ctx)
        tex_path = os.path.join(temp, 'doc.tex')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None
        os.chdir(temp)

        status = 0
        try:
            if verbose:
                for _ in range(self.tex_passes):
                    status = call(['xelatex', tex_path])
            else:
                # Discard the output instead of piping it: nobody reads the
                # pipes, so a chatty run could fill them and block forever.
                with open(os.devnull, 'w') as devnull:
                    for _ in range(self.tex_passes):
                        status = call(['xelatex', '-interaction=batchmode', tex_path],
                                      stdout=devnull, stderr=devnull)
        finally:
            # Restore the caller's working directory even when xelatex
            # could not be started or failed.
            if cwd is not None:
                os.chdir(cwd)

        if status:
            raise RuntimeError("Error parsing .tex file: %s" % tex_path)

        output_file = NamedTemporaryFile(prefix='librarian', suffix='.pdf', delete=False)
        # Only the reserved name is needed; close the handle before replacing the file.
        output_file.close()
        pdf_path = os.path.join(temp, 'doc.pdf')
        shutil.move(pdf_path, output_file.name)
        shutil.rmtree(temp)
        return OutputFile.from_filename(output_file.name)

    def render(self, element, ctx):
        """Render *element* with the renderer registered for it."""
        return self.renderers.get_for(element).render(element, ctx)
263
264
class CmdRenderer(TreeRenderer):
    """Renderer whose container is a TeXML command named ``self.tag_name``."""

    def parms(self):
        """Return the texts of the command's leading parameters (none here)."""
        return list()

    def container(self):
        """Return ``(root, inner)`` for this renderer.

        ``root`` is a ``self.root_name`` element holding the command; the
        command gets the parameters from parms() plus one trailing empty
        parameter, and that last parameter is returned as ``inner``.
        """
        command = texml_cmd(self.tag_name, *(self.parms() + [""]))
        root = etree.Element(self.root_name)
        root.append(command)
        return root, command[-1]
274
275
class EnvRenderer(TreeRenderer):
    """Renderer whose container is an ``env`` element named ``self.tag_name``."""

    def container(self):
        """Return ``(root, inner)`` where ``inner`` is the ``env`` child of ``root``."""
        root = etree.Element(self.root_name)
        environment = etree.SubElement(root, 'env', **{'name': self.tag_name})
        return root, environment
281
282
class GroupRenderer(CmdRenderer):
    """Renderer whose container is a ``group``, optionally opened by a command."""

    def container(self):
        """Return ``(root, inner)`` where ``inner`` is the ``group`` element.

        When ``self.tag_name`` is set, a command of that name (parameters from
        parms() plus one empty parameter) is placed first inside the group.
        """
        root = etree.Element(self.root_name)
        group = etree.SubElement(root, 'group')
        if not self.tag_name:
            return root, group
        arguments = self.parms() + [""]
        group.append(texml_cmd(self.tag_name, *arguments))
        return root, group
290
291
class SectionRenderer(CmdRenderer):
    """Command renderer that puts a ``pagebreak[1]`` command before its command."""

    def subcontext(self, element, ctx):
        """Return a child context whose ``toc_level`` is two above the parent's.

        A parent without ``toc_level`` is treated as level 1.
        """
        # here?
        parent_level = getattr(ctx, 'toc_level', 1)
        return Context(ctx, toc_level=parent_level + 2)

    def container(self):
        """Return ``(root, inner)``; ``inner`` is the first child of the command.

        ``root`` holds the ``pagebreak`` command followed by the
        ``self.tag_name`` command.
        """
        root = etree.Element(self.root_name)
        page_break = texml_cmd('pagebreak', opts=['1'])
        command = texml_cmd(self.tag_name, *(self.parms() + [""]))
        root.append(page_break)
        root.append(command)
        return root, command[0]
303
# Default renderers for sections, headers and plain divs.
PdfFormat.renderers.register(
    core.Section, None, SectionRenderer('par'))

# TODO: heading levels
PdfFormat.renderers.register(
    core.Header, None, CmdRenderer('section*'))

PdfFormat.renderers.register(
    core.Div, None, CmdRenderer('par'))
310
311
class ImgRenderer(CmdRenderer):
    """Renders an image element as a command with file, width and height parameters."""

    def parms(self):
        """Two leading parameters; with the trailing one that makes three slots."""
        return [""] * 2

    def render(self, element, ctx):
        """Fetch the element's ``src`` into the work directory and size the command.

        The image is stored as ``f<N>.png`` where N is the ``ctx.img`` counter
        (0 when unset), which is then incremented.  The image is set 15cm wide
        with the height following its aspect ratio.  If the stored file cannot
        be opened as an image, the command is dropped from the result.
        """
        root = super(ImgRenderer, self).render(element, ctx)
        source = element.get('src')
        index = getattr(ctx, 'img', 0)
        ctx.img = index + 1
        filename = 'f%d.png' % index
        ctx.format.add_file(ctx, filename, source, image=True)
        command = root[0]
        command[0].text = filename
        try:
            dimensions = Image.open(ctx.format.get_file(ctx, filename)).size
        except IOError:  # not an image
            del root[0]
            return root
        command[1].text = '15cm'
        command[2].text = '%fcm' % (15.0 * dimensions[1] / dimensions[0])
        return root
331
# Div variants.
PdfFormat.renderers.register(
    core.Div, 'img', ImgRenderer('insertimage'))
PdfFormat.renderers.register(
    core.Div, 'defined', CmdRenderer('textbf'))
PdfFormat.renderers.register(
    core.Div, 'item', CmdRenderer('item'))
PdfFormat.renderers.register(
    core.Div, 'list', EnvRenderer('itemize'))
PdfFormat.renderers.register(
    core.Div, 'list.enum', EnvRenderer('enumerate'))

# Span variants.
PdfFormat.renderers.register(
    core.Span, None, TreeRenderer())
PdfFormat.renderers.register(
    core.Span, 'cite', CmdRenderer('emph'))
PdfFormat.renderers.register(
    core.Span, 'cite.code', CmdRenderer('texttt'))
PdfFormat.renderers.register(
    core.Span, 'emp', CmdRenderer('textbf'))
PdfFormat.renderers.register(
    core.Span, 'emph', CmdRenderer('emph'))
346
347
class SpanUri(CmdRenderer):
    """Renders a URI span as a command whose first parameter is the URI itself."""

    def parms(self):
        """One leading parameter, filled with the link target in render()."""
        return [""]

    def render(self, element, ctx):
        """Render *element* and use its text as the link target.

        A ``file://`` target is resolved against ``ctx.files_path``.  An
        element without text yields an empty target instead of failing.
        """
        root = super(SpanUri, self).render(element, ctx)
        # element.text is None for an empty element; treat that as an empty target.
        src = element.text or ''
        if src.startswith('file://'):
            src = ctx.files_path + src[7:]
        root[0][0].text = src
        return root
# URI spans become href commands.
PdfFormat.renderers.register(
    core.Span, 'uri', SpanUri('href'))
360
361
class SpanLink(CmdRenderer):
    """Renders a link span as a command whose first parameter is the ``href``."""

    def parms(self):
        """One leading parameter, filled with the link target in render()."""
        return [""]

    def render(self, element, ctx):
        """Render *element* and use its ``href`` attribute as the link target.

        A missing attribute gives an empty target; a ``file://`` target is
        resolved against ``ctx.files_path``.
        """
        root = super(SpanLink, self).render(element, ctx)
        target = element.attrib.get('href', '')
        local_prefix = 'file://'
        if target.startswith(local_prefix):
            target = ctx.files_path + target[len(local_prefix):]
        root[0][0].text = target
        return root
# Link spans become href commands.
PdfFormat.renderers.register(
    core.Span, 'link', SpanLink('href'))


# Asides: plain by default, editorial ones get their own command, comments use Silent.
PdfFormat.renderers.register(
    core.Aside, None, TreeRenderer())
PdfFormat.renderers.register(
    core.Aside, 'editorial', CmdRenderer('editorialpage'))
PdfFormat.renderers.register(
    core.Aside, 'comment', Silent())