render missing tags for epub
[librarian.git] / librarian / formats / epub / __init__.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 import os
7 import re
8 import urllib
9 from copy import deepcopy
10 from mimetypes import guess_type
11 from tempfile import NamedTemporaryFile
12 import zipfile
13 from urllib2 import urlopen
14
15 from lxml import etree
16 from librarian import OPFNS, NCXNS, XHTMLNS, DCNS
17 from librarian import core
18 from librarian.formats import Format
19 from librarian.formats.cover.evens import EvensCover
20 from librarian.output import OutputFile
21 from librarian.renderers import Register, TreeRenderer, UnknownElement
22 from librarian.utils import Context, get_resource, extend_element
23
24
class EpubFormat(Format):
    """Builds an EPUB package (a ZIP archive) out of a librarian document.

    The archive receives a ``mimetype`` entry, ``META-INF/container.xml``
    and, under ``OPS/``, the OPF package file, the NCX table of contents,
    an optional cover, one HTML file per rendered part, referenced images,
    an optional footnotes page and a footer page.
    """
    format_name = 'EPUB'
    format_ext = 'epub'

    # Cover class used unless another one is passed to the constructor;
    # it is instantiated with the document while building.
    cover = EvensCover
    # Registry mapping document elements to their EPUB renderers.
    renderers = Register()

    def __init__(self, doc, cover=None, with_fonts=True):
        """Store the document and the build options.

        :param doc: document to convert; passed on to the base class.
        :param cover: cover class replacing the class-level default; the
            default is kept when this is ``None``.
        :param with_fonts: stored on the instance; not read in this class.
        """
        super(EpubFormat, self).__init__(doc)
        self.with_fonts = with_fonts
        if cover is not None:
            self.cover = cover

    def dc(self, tag):
        """Return the single metadata value stored under ``DCNS(tag)``."""
        return self.doc.meta.get_one(DCNS(tag))

    def build(self, ctx=None):
        """Build the EPUB and return it wrapped in an ``OutputFile``.

        :param ctx: optional rendering context.  When falsy a new
            ``Context`` is created; otherwise the given object is updated
            in place (``format``, ``toc``, ``toc_level``, ``footnotes``,
            ``images`` and ``part_no`` are set on it).
        :returns: ``OutputFile`` created from a temporary ``.epub`` file.

        The archive and the temporary file handle are always closed, and
        the temporary file is deleted again if building fails.
        """
        output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
        try:
            archive = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
            try:
                self._write_package(archive, ctx)
            finally:
                archive.close()
            output_file.close()
        except Exception:
            # Do not leave a half-written file behind; close first so the
            # removal also works where open files cannot be deleted.
            output_file.close()
            os.remove(output_file.name)
            raise
        return OutputFile.from_filename(output_file.name)

    def _write_package(self, archive, ctx):
        """Write every member of the EPUB into the open ``archive``."""
        opf = etree.parse(get_resource('formats/epub/res/content.opf'))
        manifest = opf.find(OPFNS('manifest'))
        guide = opf.find(OPFNS('guide'))
        spine = opf.find(OPFNS('spine'))

        author = ", ".join(self.doc.meta.get(DCNS('creator')) or [])
        title = self.doc.meta.title()
        opf.find('.//' + DCNS('creator')).text = author
        opf.find('.//' + DCNS('title')).text = title

        # The mimetype member is written first and stored uncompressed
        # (ZIP_STORED), unlike the remaining, deflated members.
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        archive.writestr(mime, 'application/epub+zip')
        archive.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" '
                         'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
                         '<rootfiles><rootfile full-path="OPS/content.opf" '
                         'media-type="application/oebps-package+xml" />'
                         '</rootfiles></container>')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
                                    '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
                                    '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
                                    'version="2005-1"><head></head><docTitle></docTitle><navMap>'
                                    '</navMap></ncx>')

        if self.cover is not None:
            self._write_cover(archive, opf, manifest, spine, guide, ctx)

        if not ctx:
            ctx = Context(format=self)
        else:
            ctx.format = self
        ctx.toc = TOC()
        ctx.toc_level = 0
        ctx.footnotes = Footnotes()
        ctx.images = []
        ctx.part_no = 0

        wrap_tmpl = etree.parse(get_resource('formats/epub/res/chapter.html'))
        for part in self.render(self.doc.edoc.getroot(), ctx):
            # Skip parts with neither child elements nor non-blank text.
            if not len(part) and not (part.text and part.text.strip()):
                continue
            wrap = deepcopy(wrap_tmpl)
            extend_element(wrap.find('.//*[@id="book-text"]'), part)

            partstr = 'part%d' % int(part.get('part_no'))
            manifest.append(manifest.makeelement(OPFNS('item'), attrib={
                'id': partstr,
                'href': partstr + ".html",
                'media-type': 'application/xhtml+xml',
            }))
            spine.append(spine.makeelement(OPFNS('itemref'), attrib={
                'idref': partstr,
            }))
            archive.writestr('OPS/%s.html' % partstr, etree.tostring(wrap, method='html'))

        # Renderers collect image URLs in ctx.images while rendering.
        for i, url in enumerate(ctx.images):
            self._add_file(archive, manifest, ctx, url, 'image%s' % i)

        if len(ctx.footnotes.output):
            ctx.toc.add("Przypisy", "footnotes.html")
            manifest.append(etree.Element(
                OPFNS('item'), id='footnotes', href='footnotes.html',
                **{'media-type': "application/xhtml+xml"}))
            # Namespaced like the other spine entries built with makeelement.
            spine.append(spine.makeelement(OPFNS('itemref'), attrib={
                'idref': 'footnotes',
            }))
            wrap = etree.parse(get_resource('formats/epub/res/footnotes.html'))
            extend_element(wrap.find('.//*[@id="footnotes"]'), ctx.footnotes.output)
            archive.writestr('OPS/footnotes.html', etree.tostring(
                wrap, method="html", pretty_print=True))

        self._write_footer(archive, manifest, spine, wrap_tmpl, ctx)

        archive.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
        # The last child of the NCX root is the <navMap> created above.
        ctx.toc.render(toc_file[-1])
        archive.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))

    def _add_file(self, archive, manifest, ctx, url, file_id):
        """Fetch ``url``, store it under ``OPS/`` and list it in the manifest."""
        filename = url.rsplit('/', 1)[-1]
        if url.startswith('file://'):
            url = ctx.files_path + urllib.quote(url[7:])
        if url.startswith('/'):
            url = 'http://milpeer.eu' + url
        response = urlopen(url)
        try:
            file_content = response.read()
        finally:
            response.close()
        archive.writestr(os.path.join('OPS', filename), file_content)
        # Built as an element (not by string interpolation) so that special
        # characters in the file name cannot break the XML; unknown types
        # get a generic media type instead of the text "None".
        manifest.append(manifest.makeelement(OPFNS('item'), attrib={
            'id': file_id,
            'href': filename,
            'media-type': guess_type(url)[0] or 'application/octet-stream',
        }))

    def _write_cover(self, archive, opf, manifest, spine, guide, ctx):
        """Write the cover image and cover page and register them in the OPF."""
        cover = self.cover(self.doc)
        # NOTE(review): ctx is still None here when build() got no context.
        cover.set_images(ctx)
        cover_output = cover.build()
        cover_name = 'cover.%s' % cover.format_ext
        archive.writestr(os.path.join('OPS', cover_name), cover_output.get_string())
        del cover_output

        cover_tree = etree.parse(get_resource('formats/epub/res/cover.html'))
        cover_tree.find('.//' + XHTMLNS('img')).set('src', cover_name)
        archive.writestr('OPS/cover.html', etree.tostring(
            cover_tree, method="html", pretty_print=True))

        if cover.uses_dc_cover:
            doc_root = self.doc.edoc.getroot()
            cover_by = self.doc.meta.get_one('cover_by')
            if cover_by:
                doc_root.set('data-cover-by', cover_by)
            cover_source = self.doc.meta.get_one('cover_source')
            if cover_source:
                doc_root.set('data-cover-source', cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        # The first child of the OPF root receives the cover <meta>.
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

    def _write_footer(self, archive, manifest, spine, wrap_tmpl, ctx):
        """Write the closing page with publication metadata as ``footer.html``."""
        footer_text = [
            'Information about the resource',
            'Publisher: %s' % self.dc('publisher'),
            'Rights: %s' % self.dc('rights'),
            'Intended audience: %s' % self.dc('audience'),
            self.dc('description'),
            'Resource prepared using MIL/PEER editing platform.',
            'Source available at %s' % ctx.source_url,
        ]
        footer_wrap = deepcopy(wrap_tmpl)
        footer_body = footer_wrap.find('.//*[@id="book-text"]')
        for line in footer_text:
            footer_line = etree.Element('p')
            footer_line.text = line
            footer_body.append(footer_line)
        manifest.append(manifest.makeelement(OPFNS('item'), attrib={
            'id': 'footer',
            'href': "footer.html",
            'media-type': 'application/xhtml+xml',
        }))
        spine.append(spine.makeelement(OPFNS('itemref'), attrib={
            'idref': 'footer',
        }))
        archive.writestr('OPS/footer.html', etree.tostring(footer_wrap, method='html'))

    def render(self, element, ctx):
        """Render ``element`` with the renderer registered for it."""
        return self.renderers.get_for(element).render(element, ctx)
190
191
192 # Helpers
193
class EpubRenderer(TreeRenderer):
    """Tree renderer whose output containers carry the current part number."""

    def container(self, ctx):
        """Return the base ``(root, inner)`` pair, tagging root with ``part_no``."""
        outer, inner = super(EpubRenderer, self).container()
        outer.set("part_no", str(ctx.part_no))
        return outer, inner

    def render(self, element, ctx):
        """Yield one or more wrapper elements holding the rendered ``element``.

        Children whose renderer sets ``epub_separate`` close the current
        wrapper, bump ``ctx.part_no`` and contribute their own parts; a new
        wrapper is opened afterwards.  Children without a registered
        renderer are skipped, but their tail text is still rendered.
        """
        child_ctx = self.subcontext(element, ctx)
        wrapper, inside = self.container(ctx)
        if element.text:
            extend_element(inside, self.render_text(element.text, ctx))
        for child in element:
            try:
                try:
                    renderer = ctx.format.renderers.get_for(child)
                except UnknownElement:
                    # No renderer: drop the element itself (tail handled below).
                    continue

                if getattr(renderer, 'epub_separate', False):
                    yield wrapper
                    ctx.part_no += 1
                    for separate_part in renderer.render(child, child_ctx):
                        yield separate_part
                    wrapper, inside = self.container(ctx)
                else:
                    rendered = list(renderer.render(child, child_ctx))
                    head, remaining = rendered[0], rendered[1:]
                    extend_element(inside, head)
                    if remaining:
                        # The child was split: emit what we have, pass the
                        # middle parts through and continue in a new wrapper
                        # seeded with the child's last part.
                        yield wrapper
                        for middle_part in remaining[:-1]:
                            yield middle_part
                        wrapper, inside = self.container(ctx)
                        extend_element(inside, remaining[-1])
            finally:
                # Tail text goes into whichever container is current now.
                if child.tail:
                    extend_element(inside, self.render_text(child.tail, ctx))
        yield wrapper
231
232
class NaturalText(EpubRenderer):
    """Renderer for running text that keeps one-letter words with what follows."""

    # Matches a space preceded by whitespace plus a single word character,
    # i.e. the space right after a one-letter word.  Written as a raw string
    # so that ``\s`` and ``\w`` are not treated as string escape sequences.
    _ORPHAN_SPACE = re.compile(r'(?<=\s\w) ')

    def render_text(self, text, ctx):
        """Return a text container for ``text`` with ``&nbsp;`` entities.

        Every space matched by ``_ORPHAN_SPACE`` is replaced with an
        ``nbsp`` entity node; the remaining text is kept unchanged.
        """
        root, inner = self.text_container()
        chunks = self._ORPHAN_SPACE.split(text)
        inner.text = chunks[0]
        for chunk in chunks[1:]:
            nbsp = etree.Entity("nbsp")
            nbsp.tail = chunk
            inner.append(nbsp)
        return root
243
244
class Silent(EpubRenderer):
    """Renderer that discards text: ``text`` never reaches the output."""

    def render_text(self, text, ctx):
        """Return the root of an untouched text container, ignoring ``text``."""
        container_root, _inner = self.text_container()
        return container_root
249
250
class Footnotes(object):
    """Collects footnote bodies and hands out numbered anchors for them."""

    def __init__(self):
        # Number of footnotes registered so far.
        self.counter = 0
        # Container accumulating back-links and footnote content.
        self.output = etree.Element("_")

    def append(self, items):
        """Register a footnote and return the anchor to put in the main text.

        :param items: rendered footnote parts; the ``part_no`` attribute of
            the first one decides which part file the back-link points to.
        :returns: ``<a>`` element linking to the footnote in
            ``footnotes.html``.
        """
        self.counter += 1
        number = self.counter
        label = "[%d]" % number

        part_no = int(items[0].get('part_no'))
        backlink = etree.Element(
            "a",
            href="part%d.html#footnote-anchor-%d" % (part_no, number),
            id="footnote-%d" % number,
            style="float:left;margin-right:1em")
        backlink.text = label
        backlink.tail = " "
        self.output.append(backlink)
        for footnote_part in items:
            extend_element(self.output, footnote_part)

        reference = etree.Element(
            "a",
            href="footnotes.html#footnote-%d" % number,
            id="footnote-anchor-%d" % number)
        reference.text = label
        return reference
271
272
class TOC(object):
    """Tree of table-of-contents entries sharing one counter kept on the root."""

    def __init__(self, title=None, href="", root=None):
        """Create an entry and take the next number from the root counter.

        :param title: label of the entry.
        :param href: link target; a ``{counter}`` placeholder is filled with
            the root counter value at creation time.
        :param root: root entry owning the counter; when ``None`` this
            entry becomes the root and starts counting from zero.
        """
        if root is None:
            self.counter = 0
            root = self
        self.root = root
        self.children = []
        self.title = title
        current = self.root.counter
        self.href = href.format(counter=current)
        self.number = current
        self.root.counter += 1

    def add(self, title, href):
        """Create a child entry, attach it to this one and return it."""
        entry = type(self)(title, href, root=self.root)
        self.children.append(entry)
        return entry

    def render(self, nav_map):
        """Append a nested ``navPoint`` element to ``nav_map`` for every child."""
        for entry in self.children:
            point = etree.Element(NCXNS('navPoint'))
            point.set('id', 'NavPoint-%d' % entry.number)
            point.set('playOrder', str(entry.number))

            label_text = etree.Element(NCXNS('text'))
            label_text.text = entry.title
            label = etree.Element(NCXNS('navLabel'))
            label.append(label_text)
            point.append(label)

            target = etree.Element(NCXNS('content'))
            target.set('src', entry.href)
            point.append(target)

            nav_map.append(point)
            # Sub-entries are nested inside this entry's navPoint.
            entry.render(point)
308
309
310 # Renderers
311
class AsideR(NaturalText):
    """Turns an aside into a footnote, leaving only its anchor in the text."""

    def render(self, element, ctx):
        """Yield a text container holding just the footnote anchor."""
        footnote_parts = list(super(AsideR, self).render(element, ctx))
        anchor = ctx.footnotes.append(footnote_parts)
        wrapper, inside = self.text_container()
        inside.append(anchor)
        yield wrapper


EpubFormat.renderers.register(core.Aside, None, AsideR('div'))

# Asides registered under 'comment' are rendered with the Silent renderer.
EpubFormat.renderers.register(core.Aside, 'comment', Silent())
322
323
class DivR(NaturalText):
    """Block renderer that switches to a block-styled span in inline context."""

    def container(self, ctx):
        """Return the container pair; inner becomes a ``span`` when ``ctx.inline``."""
        root, inner = super(DivR, self).container(ctx)
        inline = getattr(ctx, 'inline', False)
        if not inline:
            return root, inner
        inner.tag = 'span'
        inner.set('style', 'display: block;')
        return root, inner


EpubFormat.renderers.register(core.Div, None, DivR('div'))
EpubFormat.renderers.register(core.Div, 'p', NaturalText('p'))

EpubFormat.renderers.register(core.Div, 'list', NaturalText('ul'))
EpubFormat.renderers.register(core.Div, 'list.enum', NaturalText('ol'))
EpubFormat.renderers.register(core.Div, 'item', NaturalText('li'))
337
338
class DivImageR(EpubRenderer):
    """Renders an image element and records its source URL in ``ctx.images``."""

    def render(self, element, ctx):
        """Render the image with ``src`` reduced to the bare file name.

        The full source URL is appended to ``ctx.images``.  An element
        without a ``src`` attribute is rendered with an empty ``src`` and
        nothing is recorded, instead of failing with ``IndexError``.
        """
        src = element.attrib.get('src', '')
        if src:
            ctx.images.append(src)
        # [-1] also copes with sources that contain no '/' at all.
        filename = src.rsplit('/', 1)[-1]
        return super(DivImageR, self).render(element, Context(ctx, src=filename))

    def container(self, ctx):
        """Return the container pair with ``src`` set on the inner element."""
        root, inner = super(DivImageR, self).container(ctx)
        inner.set('src', getattr(ctx, 'src', ''))
        return root, inner


EpubFormat.renderers.register(core.Div, 'img', DivImageR('img'))
353
354
class HeaderR(NaturalText):
    """Header renderer; its children are rendered with ``inline`` set."""

    def subcontext(self, element, ctx):
        """Return a child context derived from ``ctx`` with ``inline=True``."""
        inline_ctx = Context(ctx, inline=True)
        return inline_ctx


EpubFormat.renderers.register(core.Header, None, HeaderR('h1'))
359
360
class SectionR(NaturalText):
    """Section renderer; each section starts a separate part of the book."""

    # Checked by EpubRenderer.render to close the current part first.
    epub_separate = True

    def render(self, element, ctx):
        """Render the section, adding a TOC entry for every non-root section."""
        # TODO: add a 'beginning' ('poczatek') entry?
        is_nested = element.getparent() is not None
        if is_nested:
            toc = ctx.toc
            entry = toc.add(element.meta.title(), 'part%d.html' % ctx.part_no)
            # Descendant sections attach their entries below this one.
            ctx = Context(ctx, toc=entry)
        return super(SectionR, self).render(element, ctx)


EpubFormat.renderers.register(core.Section, None, SectionR())
371
372
class SpanR(NaturalText):
    """Inline renderer; behaves exactly like NaturalText."""


EpubFormat.renderers.register(core.Span, None, SpanR('span'))
EpubFormat.renderers.register(core.Span, 'cite', SpanR('i'))
EpubFormat.renderers.register(core.Span, 'emp', SpanR('b'))
EpubFormat.renderers.register(core.Span, 'emph', SpanR('i'))
379
380
class SpanLink(EpubRenderer):
    """Renders a link span, setting ``href`` on every rendered part."""

    def render(self, element, ctx):
        """Yield the rendered parts with ``href`` set on each part's first child.

        Targets starting with ``file://`` are rewritten to begin with
        ``ctx.files_path`` instead.
        """
        for part in super(SpanLink, self).render(element, ctx):
            target = element.attrib.get('href', '')
            if target.startswith('file://'):
                target = ctx.files_path + target[7:]
            part[0].attrib['href'] = target
            yield part


EpubFormat.renderers.register(core.Span, 'link', SpanLink('a'))