report images without extensions, allow span.item
[librarian.git] / librarian / formats / epub / __init__.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 import os
7 import re
8 import urllib
9 from copy import deepcopy
10 from mimetypes import guess_type
11 from tempfile import NamedTemporaryFile
12 import zipfile
13 from urllib2 import urlopen
14
15 from lxml import etree
16 from librarian import OPFNS, NCXNS, XHTMLNS, DCNS, BuildError
17 from librarian import core
18 from librarian.formats import Format
19 from librarian.formats.cover.evens import EvensCover
20 from librarian.output import OutputFile
21 from librarian.renderers import Register, TreeRenderer, UnknownElement
22 from librarian.utils import Context, get_resource, extend_element
23
24
class EpubFormat(Format):
    """Format that packages a document as an EPUB file.

    ``cover`` is the cover class instantiated by ``build`` (``EvensCover``
    unless replaced per instance; ``None`` disables the cover), and
    ``renderers`` is the register that this format's element renderers are
    added to.
    """
    format_name = 'EPUB'
    format_ext = 'epub'

    cover = EvensCover
    renderers = Register()

    def __init__(self, doc, cover=None, with_fonts=True):
        """Remember the document and the build options.

        :param doc: document passed on to the base ``Format``.
        :param cover: optional cover class overriding the class-level one.
        :param with_fonts: stored on the instance; not read anywhere in
            this class.
        """
        super(EpubFormat, self).__init__(doc)
        self.with_fonts = with_fonts
        if cover is not None:
            self.cover = cover

    def dc(self, tag):
        """Return the single Dublin Core metadata value for ``tag``."""
        return self.doc.meta.get_one(DCNS(tag))

    def build(self, ctx=None):
        """Build the EPUB into a temporary file and return it.

        :param ctx: optional build context; a fresh ``Context`` is created
            when it is missing or falsy.
        :returns: ``OutputFile`` created from the temporary file's name.
        """
        opf = etree.parse(get_resource('formats/epub/res/content.opf'))
        author = ", ".join(self.doc.meta.get(DCNS('creator')) or [])
        title = self.doc.meta.title()
        opf.find('.//' + DCNS('creator')).text = author
        opf.find('.//' + DCNS('title')).text = title

        # delete=False: the file has to outlive this handle, because only
        # its name is handed to OutputFile below.
        output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
        try:
            archive = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
            try:
                self._write_package(archive, opf, ctx)
            finally:
                # Always finish the archive, also when rendering fails.
                archive.close()
        finally:
            # Release the file descriptor instead of leaving it to the GC.
            output_file.close()
        return OutputFile.from_filename(output_file.name)

    def _write_package(self, archive, opf, ctx):
        """Write every member of the EPUB package into ``archive``.

        :param archive: open, writable ``zipfile.ZipFile``.
        :param opf: parsed ``content.opf`` tree; its manifest, spine and
            guide are filled in here before it is serialized.
        :param ctx: build context or a falsy value (see ``build``).
        """
        manifest = opf.find(OPFNS('manifest'))
        guide = opf.find(OPFNS('guide'))
        spine = opf.find(OPFNS('spine'))

        def add_file(url, file_id):
            """Fetch ``url``, store it under OPS/ and list it in the manifest."""
            filename = url.rsplit('/', 1)[1]
            if url.startswith('file://'):
                url = ctx.files_path + urllib.quote(url[7:])
            if url.startswith('/'):
                url = 'http://milpeer.eu' + url
            response = urlopen(url)
            try:
                file_content = response.read()
            finally:
                response.close()
            archive.writestr(os.path.join('OPS', filename), file_content)
            # Build the element through the API rather than by formatting
            # an XML string, so special characters in the name are escaped.
            # guess_type() yields None for names without a known extension;
            # fall back to a generic type instead of writing "None".
            manifest.append(manifest.makeelement(OPFNS('item'), attrib={
                'id': file_id,
                'href': filename,
                'media-type': guess_type(url)[0] or 'application/octet-stream',
            }))

        # The mimetype member is written first and stored uncompressed.
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        archive.writestr(mime, 'application/epub+zip')
        archive.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" '
                         'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
                         '<rootfiles><rootfile full-path="OPS/content.opf" '
                         'media-type="application/oebps-package+xml" />'
                         '</rootfiles></container>')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
                                    '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
                                    '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
                                    'version="2005-1"><head></head><docTitle></docTitle><navMap>'
                                    '</navMap></ncx>')

        if self.cover is not None:
            # NOTE(review): ctx may still be None here, because the cover is
            # prepared before the context is normalized below - confirm that
            # set_images() accepts that.
            cover = self.cover(self.doc)
            cover.set_images(ctx)
            cover_output = cover.build()
            cover_name = 'cover.%s' % cover.format_ext
            archive.writestr(os.path.join('OPS', cover_name), cover_output.get_string())
            del cover_output

            cover_tree = etree.parse(get_resource('formats/epub/res/cover.html'))
            cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
            archive.writestr('OPS/cover.html', etree.tostring(
                cover_tree, method="html", pretty_print=True))

            if cover.uses_dc_cover:
                if self.doc.meta.get_one('cover_by'):
                    self.doc.edoc.getroot().set('data-cover-by', self.doc.meta.get_one('cover_by'))
                if self.doc.meta.get_one('cover_source'):
                    self.doc.edoc.getroot().set('data-cover-source', self.doc.meta.get_one('cover_source'))

            manifest.append(etree.fromstring(
                '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
            manifest.append(etree.fromstring(
                '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, cover.mime_type())))
            spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
            opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
            guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

        if not ctx:
            ctx = Context(format=self)
        else:
            ctx.format = self
        ctx.toc = TOC()
        ctx.toc_level = 0
        ctx.footnotes = Footnotes()
        ctx.images = []
        ctx.part_no = 0

        wrap_tmpl = etree.parse(get_resource('formats/epub/res/chapter.html'))
        for e in self.render(self.doc.edoc.getroot(), ctx):
            # Skip parts that carry neither child elements nor visible text.
            if not len(e) and not (e.text and e.text.strip()):
                continue
            wrap = deepcopy(wrap_tmpl)
            extend_element(wrap.find('//*[@id="book-text"]'), e)

            partstr = 'part%d' % int(e.get('part_no'))
            manifest.append(manifest.makeelement(OPFNS('item'), attrib={
                'id': partstr,
                'href': partstr + ".html",
                'media-type': 'application/xhtml+xml',
            }))
            spine.append(spine.makeelement(OPFNS('itemref'), attrib={
                'idref': partstr,
            }))
            archive.writestr('OPS/%s.html' % partstr, etree.tostring(wrap, method='html'))

        # Renderers collect image URLs in ctx.images while rendering.
        for i, url in enumerate(ctx.images):
            add_file(url, 'image%s' % i)

        if len(ctx.footnotes.output):
            ctx.toc.add("Przypisy", "footnotes.html")
            manifest.append(etree.Element(
                OPFNS('item'), id='footnotes', href='footnotes.html',
                **{'media-type': "application/xhtml+xml"}))
            # Namespaced like every other itemref added to the spine.
            spine.append(spine.makeelement(OPFNS('itemref'), attrib={
                'idref': 'footnotes',
            }))
            wrap = etree.parse(get_resource('formats/epub/res/footnotes.html'))
            extend_element(wrap.find('//*[@id="footnotes"]'), ctx.footnotes.output)
            archive.writestr('OPS/footnotes.html', etree.tostring(
                wrap, method="html", pretty_print=True))

        footer_text = [
            'Information about the resource',
            'Publisher: %s' % self.dc('publisher'),
            'Rights: %s' % self.dc('rights'),
            'Intended audience: %s' % self.dc('audience'),
            self.dc('description'),
            'Resource prepared using MIL/PEER editing platform.',
            'Source available at %s' % ctx.source_url,
        ]
        footer_wrap = deepcopy(wrap_tmpl)
        footer_body = footer_wrap.find('//*[@id="book-text"]')
        for line in footer_text:
            footer_line = etree.Element('p')
            footer_line.text = line
            footer_body.append(footer_line)
        manifest.append(manifest.makeelement(OPFNS('item'), attrib={
            'id': 'footer',
            'href': "footer.html",
            'media-type': 'application/xhtml+xml',
        }))
        spine.append(spine.makeelement(OPFNS('itemref'), attrib={
            'idref': 'footer',
        }))
        archive.writestr('OPS/footer.html', etree.tostring(footer_wrap, method='html'))

        # The package document and TOC go last, once manifest, spine and
        # table of contents are complete.
        archive.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
        ctx.toc.render(toc_file[-1])
        archive.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))

    def render(self, element, ctx):
        """Render ``element`` with the renderer registered for it."""
        return self.renderers.get_for(element).render(element, ctx)
190
191
192 # Helpers
193
class EpubRenderer(TreeRenderer):
    """Renders insides as XML in a <_/> container.

    Rendering is a generator of containers. Each container is tagged with
    the ``part_no`` that was current in the context when it was created.
    """
    def container(self, ctx):
        """Create a (root, inner) container pair tagged with the current part number."""
        root, inner = super(EpubRenderer, self).container()
        # Stored as a string attribute on the root.
        root.set("part_no", str(ctx.part_no))
        return root, inner

    def render(self, element, ctx):
        """Yield one or more containers holding the rendered ``element``.

        Children whose renderer has a truthy ``epub_separate`` attribute
        close the current container and are yielded as parts of their own;
        other children are merged into the current container, except that a
        child rendered as several parts also splits the output.
        """
        subctx = self.subcontext(element, ctx)
        wrapper, inside = self.container(ctx)
        if element.text:
            extend_element(inside, self.render_text(element.text, ctx))
        for child in element:
            try:
                child_renderer = ctx.format.renderers.get_for(child)
            except UnknownElement:
                # No renderer for this child: skip it (its tail text is
                # still added by the finally clause below).
                continue
            else:
                if getattr(child_renderer, 'epub_separate', False):
                    # Close the current container, bump the part counter,
                    # emit the child's own parts, then start a fresh
                    # container for whatever follows.
                    yield wrapper
                    ctx.part_no += 1
                    for child_part in child_renderer.render(child, subctx):
                        yield child_part
                    wrapper, inside = self.container(ctx)
                else:
                    child_parts = list(child_renderer.render(child, subctx))
                    # The first part always continues the current container.
                    extend_element(inside, child_parts[0])
                    if len(child_parts) > 1:
                        # The child was split: emit what we have, pass the
                        # middle parts through unchanged, and continue in a
                        # new container seeded with the child's last part.
                        yield wrapper
                        for child_part in child_parts[1:-1]:
                            yield child_part
                        wrapper, inside = self.container(ctx)
                        extend_element(inside, child_parts[-1])
            finally:
                # Text following the child goes into whichever container is
                # current at this point.
                if child.tail:
                    extend_element(inside, self.render_text(child.tail, ctx))
        yield wrapper
231
232
class NaturalText(EpubRenderer):
    """Renderer that outputs text and ties one-letter words to the next word."""

    def render_text(self, text, ctx):
        """Return a text container holding ``text``.

        A space that directly follows a single word character preceded by
        whitespace is replaced with an ``&nbsp;`` entity. A one-letter word
        at the very start of ``text`` is not matched, as nothing precedes it.

        :param text: text to render.
        :param ctx: rendering context (not used here).
        :returns: the root of the text container.
        """
        root, inner = self.text_container()
        # Raw string: "\s" and "\w" are regex escapes, not string escapes.
        chunks = re.split(r'(?<=\s\w) ', text)
        inner.text = chunks[0]
        for chunk in chunks[1:]:
            # Each split point becomes an entity node; the text that
            # followed the removed space hangs off it as its tail.
            nbsp = etree.Entity("nbsp")
            nbsp.tail = chunk
            inner.append(nbsp)
        return root
243
244
class Silent(EpubRenderer):
    """Renderer that drops all text content."""

    def render_text(self, text, ctx):
        """Return an empty text container; ``text`` is ignored."""
        empty_root, _inner = self.text_container()
        return empty_root
249
250
class Footnotes(object):
    """Accumulates footnote bodies and produces numbered in-text anchors."""

    def __init__(self):
        # Number of footnotes registered so far.
        self.counter = 0
        # Container element collecting the bodies of all footnotes.
        self.output = etree.Element("_")

    def append(self, items):
        """Register a footnote and return the anchor to place in the text.

        :param items: rendered footnote parts; the first one's ``part_no``
            attribute is used for the link back into the text.
        :returns: an ``<a>`` element linking to the footnote body.
        """
        self.counter += 1
        number = self.counter
        label = "[%d]" % number
        source_part = int(items[0].get('part_no'))

        # Link from the footnote body back to its anchor in the text.
        backlink = etree.Element(
            "a",
            href="part%d.html#footnote-anchor-%d" % (source_part, number),
            id="footnote-%d" % number,
            style="float:left;margin-right:1em")
        backlink.text = label
        backlink.tail = " "
        self.output.append(backlink)
        for part in items:
            extend_element(self.output, part)

        # Link from the text to the footnote body.
        anchor = etree.Element(
            "a",
            href="footnotes.html#footnote-%d" % number,
            id="footnote-anchor-%d" % number)
        anchor.text = label
        return anchor
271
272
class TOC(object):
    """Tree of table-of-contents entries sharing one counter kept on the root."""

    def __init__(self, title=None, href="", root=None):
        """Create an entry; without ``root`` the entry becomes a root itself.

        ``href`` may contain a ``{counter}`` placeholder, which is filled
        with the number assigned to this entry.
        """
        if root is None:
            # A root owns the counter shared by the whole tree.
            self.counter = 0
            root = self
        self.root = root
        self.children = []
        self.title = title
        assigned = root.counter
        self.href = href.format(counter=assigned)
        self.number = assigned
        root.counter = assigned + 1

    def add(self, title, href):
        """Append a child entry under this one and return it."""
        child = self.__class__(title, href, root=self.root)
        self.children.append(child)
        return child

    def render(self, nav_map):
        """Append a navPoint subtree for every child to ``nav_map``, recursively."""
        for entry in self.children:
            point = etree.Element(NCXNS('navPoint'))
            point.set('id', 'NavPoint-%d' % entry.number)
            point.set('playOrder', str(entry.number))

            label_text = etree.Element(NCXNS('text'))
            label_text.text = entry.title
            label = etree.Element(NCXNS('navLabel'))
            label.append(label_text)
            point.append(label)

            target = etree.Element(NCXNS('content'))
            target.set('src', entry.href)
            point.append(target)

            nav_map.append(point)
            # Nested entries are rendered inside their parent's navPoint.
            entry.render(point)
308
309
310 # Renderers
311
class AsideR(NaturalText):
    """Renders an aside as a footnote, leaving only an anchor in the text."""

    def render(self, element, ctx):
        """Move the rendered aside into the footnotes and yield its anchor."""
        footnote_anchor = ctx.footnotes.append(
            list(super(AsideR, self).render(element, ctx)))
        holder, body = self.text_container()
        body.append(footnote_anchor)
        yield holder
EpubFormat.renderers.register(core.Aside, None, AsideR('div'))

EpubFormat.renderers.register(core.Aside, 'comment', Silent())
322
323
class DivR(NaturalText):
    """Div renderer that becomes a block-styled span in an inline context."""

    def container(self, ctx):
        """Return the container, retagged as a span when ``ctx.inline`` is set."""
        root, inner = super(DivR, self).container(ctx)
        if not getattr(ctx, 'inline', False):
            return root, inner
        inner.tag = 'span'
        inner.set('style', 'display: block;')
        return root, inner
EpubFormat.renderers.register(core.Div, None, DivR('div'))
EpubFormat.renderers.register(core.Div, 'p', NaturalText('p'))

EpubFormat.renderers.register(core.Div, 'list', NaturalText('ul'))
EpubFormat.renderers.register(core.Div, 'list.enum', NaturalText('ol'))
EpubFormat.renderers.register(core.Div, 'item', NaturalText('li'))
EpubFormat.renderers.register(core.Span, 'item', NaturalText('li'))
338
339
class DivImageR(EpubRenderer):
    """Renders an image and records its URL in ``ctx.images``."""

    def render(self, element, ctx):
        """Record the image URL and render the element with a local ``src``.

        :raises BuildError: when the ``src`` attribute contains no ``/``.
        """
        src = element.attrib.get('src', '')
        # Validate before recording, so a rejected URL never ends up in
        # the list of images.
        if '/' not in src:
            raise BuildError('Bad image URL')
        # Record each URL once: an image used several times must not be
        # listed repeatedly.
        if src not in ctx.images:
            ctx.images.append(src)
        # Inside the document the image is referred to by its bare file name.
        filename = src.rsplit('/', 1)[1]
        return super(DivImageR, self).render(element, Context(ctx, src=filename))

    def container(self, ctx):
        """Return the container with ``src`` set from the context (default '')."""
        root, inner = super(DivImageR, self).container(ctx)
        src = getattr(ctx, 'src', '')
        inner.set('src', src)
        return root, inner
EpubFormat.renderers.register(core.Div, 'img', DivImageR('img'))
356
357
class DivVideoR(Silent):
    """Replaces a video with a plain link to its YouTube page."""

    def render(self, element, ctx):
        """Render with the watch URL for the element's ``videoid`` in the context."""
        video_id = element.attrib.get('videoid', '')
        video_ctx = Context(ctx, src='https://www.youtube.com/watch?v=%s' % video_id)
        return super(DivVideoR, self).render(element, video_ctx)

    def container(self, ctx):
        """Return the container with a link to ``ctx.src`` appended inside."""
        root, inner = super(DivVideoR, self).container(ctx)
        url = getattr(ctx, 'src', '')
        anchor = etree.Element('a', {'href': url})
        # The URL itself serves as the link text.
        anchor.text = url
        inner.append(anchor)
        return root, inner
EpubFormat.renderers.register(core.Div, 'video', DivVideoR('p'))
371
372
class HeaderR(NaturalText):
    """Header renderer; its children are rendered in an inline context."""

    def subcontext(self, element, ctx):
        """Return a child context with ``inline`` switched on."""
        inline_ctx = Context(ctx, inline=True)
        return inline_ctx
EpubFormat.renderers.register(core.Header, None, HeaderR('h1'))
377
378
class SectionR(NaturalText):
    """Section renderer; each section is emitted as a separate part."""
    # Checked by EpubRenderer.render via getattr(renderer, 'epub_separate').
    epub_separate = True

    def render(self, element, ctx):
        """Render the section, adding a TOC entry for it unless it is the root."""
        # Add 'poczatek' (a "beginning" entry)?
        has_parent = element.getparent() is not None
        if has_parent:
            section_title = element.meta.title()
            section_href = 'part%d.html' % ctx.part_no
            toc_entry = ctx.toc.add(section_title, section_href)
            # Nested sections add their entries under this one.
            ctx = Context(ctx, toc=toc_entry)
        return super(SectionR, self).render(element, ctx)
EpubFormat.renderers.register(core.Section, None, SectionR())
389
390
class SpanR(NaturalText):
    """Inline span renderer; adds nothing on top of NaturalText."""
EpubFormat.renderers.register(core.Span, None, SpanR('span'))
EpubFormat.renderers.register(core.Span, 'cite', SpanR('i'))
EpubFormat.renderers.register(core.Span, 'emp', SpanR('b'))
EpubFormat.renderers.register(core.Span, 'emph', SpanR('i'))
397
398
class SpanLink(EpubRenderer):
    """Renders a link, setting ``href`` on every rendered part."""

    def render(self, element, ctx):
        """Yield the rendered parts with ``href`` set on each part's first child.

        A ``file://`` target is rewritten to ``ctx.files_path`` followed by
        the rest of the URL.
        """
        scheme = 'file://'
        for part in super(SpanLink, self).render(element, ctx):
            target = element.attrib.get('href', '')
            if target.startswith(scheme):
                target = ctx.files_path + target[len(scheme):]
            part[0].attrib['href'] = target
            yield part
EpubFormat.renderers.register(core.Span, 'link', SpanLink('a'))