images in epub
[librarian.git] / librarian / formats / epub / __init__.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 import os
7 import urllib
8 from copy import deepcopy
9 from mimetypes import guess_type
10 from tempfile import NamedTemporaryFile
11 import zipfile
12 from urllib2 import urlopen
13
14 from lxml import etree
15 from librarian import OPFNS, NCXNS, XHTMLNS, DCNS
16 from librarian import core
17 from librarian.formats import Format
18 from librarian.formats.cover.wolnelektury import WLCover
19 from librarian.output import OutputFile
20 from librarian.renderers import Register, TreeRenderer, UnknownElement
21 from librarian.utils import Context, get_resource, extend_element
22
23
class EpubFormat(Format):
    """Packages a document as an EPUB archive.

    ``build`` renders the document tree with the registered renderers,
    wraps every non-empty rendered part in the chapter template and zips
    the parts together with the OPF package file, the NCX table of
    contents, an optional cover, the images collected while rendering and
    a footnotes page (when any footnotes were produced).
    """
    format_name = 'EPUB'
    format_ext = 'epub'

    cover = WLCover
    renderers = Register()

    # Host prepended to server-relative ("/...") resource URLs before they
    # are downloaded into the archive. Subclasses may override it.
    resource_host = 'http://milpeer.eu'

    def __init__(self, doc, cover=None, with_fonts=True):
        """Create the format for ``doc``.

        :param doc: document to convert; handed to the ``Format`` base class.
        :param cover: optional cover class overriding the class-level default.
        :param with_fonts: stored on the instance; not consulted by the code
            in this class.
        """
        super(EpubFormat, self).__init__(doc)
        self.with_fonts = with_fonts
        if cover is not None:
            self.cover = cover

    def build(self, ctx=None):
        """Build the EPUB into a temporary file and return it.

        :param ctx: optional rendering context. When falsy a fresh
            ``Context`` is created; otherwise the given one is reused and
            its ``format``, ``toc``, ``toc_level``, ``footnotes``,
            ``images`` and ``part_no`` attributes are overwritten.
        :returns: ``OutputFile`` created from the temporary file's name.
            The temporary file is created with ``delete=False``, so it is
            left on disk for the caller.
        """
        opf = etree.parse(get_resource('formats/epub/res/content.opf'))
        manifest = opf.find(OPFNS('manifest'))
        guide = opf.find(OPFNS('guide'))
        spine = opf.find(OPFNS('spine'))

        author = ", ".join(self.doc.meta.get(DCNS('creator')) or '')
        title = self.doc.meta.title()
        opf.find('.//' + DCNS('creator')).text = author
        opf.find('.//' + DCNS('title')).text = title

        output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
        try:
            archive = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
            try:
                self._write_container(archive)

                toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
                                            '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
                                            '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
                                            'version="2005-1"><head></head><docTitle></docTitle><navMap>'
                                            '</navMap></ncx>')

                # The cover goes first: it may set data-cover-* attributes on
                # the document root, which is rendered afterwards.
                if self.cover is not None:
                    self._add_cover(archive, opf, manifest, spine, guide)

                ctx = self._prepare_context(ctx)
                self._add_parts(archive, manifest, spine, ctx)

                for i, url in enumerate(ctx.images):
                    self._add_file(archive, manifest, ctx, url, 'image%s' % i)

                if len(ctx.footnotes.output):
                    self._add_footnotes(archive, manifest, spine, ctx)

                archive.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
                # The last child of the NCX root is the <navMap> element.
                ctx.toc.render(toc_file[-1])
                archive.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
            finally:
                archive.close()
        finally:
            # ZipFile.close() does not close a file object that was passed
            # in, so the temporary file handle is released explicitly.
            output_file.close()
        return OutputFile.from_filename(output_file.name)

    def render(self, element, ctx):
        """Render ``element`` with the renderer registered for it."""
        return self.renderers.get_for(element).render(element, ctx)

    # Private helpers used by build().

    @staticmethod
    def _write_container(archive):
        """Write the ``mimetype`` entry and ``META-INF/container.xml``."""
        # The mimetype entry is written first and stored uncompressed.
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        archive.writestr(mime, 'application/epub+zip')
        archive.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" '
                         'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
                         '<rootfiles><rootfile full-path="OPS/content.opf" '
                         'media-type="application/oebps-package+xml" />'
                         '</rootfiles></container>')

    def _add_cover(self, archive, opf, manifest, spine, guide):
        """Build the cover image and cover page and register both in the OPF."""
        cover = self.cover(self.doc)
        cover_output = cover.build()
        cover_name = 'cover.%s' % cover.format_ext
        archive.writestr('OPS/%s' % cover_name, cover_output.get_string())
        del cover_output

        cover_tree = etree.parse(get_resource('formats/epub/res/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        archive.writestr('OPS/cover.html', etree.tostring(
            cover_tree, method="html", pretty_print=True))

        if cover.uses_dc_cover:
            # Copy the cover attribution onto the document root.
            root = self.doc.edoc.getroot()
            cover_by = self.doc.meta.get_one('cover_by')
            if cover_by:
                root.set('data-cover-by', cover_by)
            cover_source = self.doc.meta.get_one('cover_source')
            if cover_source:
                root.set('data-cover-source', cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        # NOTE(review): the first child of the package root is presumably
        # the <metadata> element -- confirm against content.opf.
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

    def _prepare_context(self, ctx):
        """Return ``ctx`` (or a new ``Context``) reset for a fresh build."""
        if not ctx:
            ctx = Context(format=self)
        else:
            ctx.format = self
        ctx.toc = TOC()
        ctx.toc_level = 0
        ctx.footnotes = Footnotes()
        ctx.images = []
        ctx.part_no = 0
        return ctx

    def _add_parts(self, archive, manifest, spine, ctx):
        """Render the document and write every non-empty part as a chapter."""
        wrap_tmpl = etree.parse(get_resource('formats/epub/res/chapter.html'))
        for e in self.render(self.doc.edoc.getroot(), ctx):
            # Skip parts with no children and no visible text. ``e.text`` is
            # None (not '') for an element that never received any text.
            if not len(e) and not (e.text or '').strip():
                continue
            wrap = deepcopy(wrap_tmpl)
            extend_element(wrap.find('//*[@id="book-text"]'), e)

            partstr = 'part%d' % int(e.get('part_no'))
            manifest.append(manifest.makeelement(OPFNS('item'), attrib={
                'id': partstr,
                'href': partstr + ".html",
                'media-type': 'application/xhtml+xml',
            }))
            spine.append(spine.makeelement(OPFNS('itemref'), attrib={
                'idref': partstr,
            }))
            archive.writestr('OPS/%s.html' % partstr, etree.tostring(wrap, method='html'))

    def _add_file(self, archive, manifest, ctx, url, file_id):
        """Download ``url`` into ``OPS/`` and add it to the manifest.

        ``file://`` URLs are mapped onto ``ctx.files_path``; any URL that
        then starts with ``/`` is prefixed with ``resource_host``.
        """
        # [-1] instead of [1]: a URL without any slash is its own file name.
        filename = url.rsplit('/', 1)[-1]
        if url.startswith('file://'):
            url = ctx.files_path + urllib.quote(url[7:])
        # Deliberately not ``elif``: a files_path-based URL that is
        # server-relative also gets the host prefix.
        if url.startswith('/'):
            url = self.resource_host + url
        response = urlopen(url)
        try:
            file_content = response.read()
        finally:
            response.close()
        archive.writestr('OPS/%s' % filename, file_content)
        # Build the element through the API rather than by string
        # interpolation, so special characters in the file name are escaped,
        # and fall back to a generic type when the MIME type is unknown.
        manifest.append(manifest.makeelement(OPFNS('item'), attrib={
            'id': file_id,
            'href': filename,
            'media-type': guess_type(url)[0] or 'application/octet-stream',
        }))

    def _add_footnotes(self, archive, manifest, spine, ctx):
        """Write the footnotes page and register it in the TOC and the OPF."""
        ctx.toc.add("Przypisy", "footnotes.html")
        manifest.append(etree.Element(
            OPFNS('item'), id='footnotes', href='footnotes.html',
            **{'media-type': "application/xhtml+xml"}))
        # Namespaced like the itemrefs created for the document parts.
        spine.append(spine.makeelement(OPFNS('itemref'), attrib={
            'idref': 'footnotes',
        }))
        wrap = etree.parse(get_resource('formats/epub/res/footnotes.html'))
        extend_element(wrap.find('//*[@id="footnotes"]'), ctx.footnotes.output)
        archive.writestr('OPS/footnotes.html', etree.tostring(
            wrap, method="html", pretty_print=True))
160
161
162 # Helpers
163
class EpubRenderer(TreeRenderer):
    """Tree renderer whose output containers carry the current part number.

    ``render`` is a generator: it yields one wrapper element per output
    part, starting a new wrapper whenever a child renderer splits the
    output.
    """

    def container(self, ctx):
        """Return ``(root, inner)`` from the base class, tagging ``root``
        with ``part_no`` taken from ``ctx``."""
        outer, inner = super(EpubRenderer, self).container()
        outer.set("part_no", str(ctx.part_no))
        return outer, inner

    def render(self, element, ctx):
        """Yield the rendered wrapper elements for ``element``.

        Children without a registered renderer are skipped, but their tail
        text is still rendered.
        """
        subctx = self.subcontext(element, ctx)
        wrapper, inside = self.container(ctx)
        if element.text:
            extend_element(inside, self.render_text(element.text, ctx))

        for child in element:
            try:
                child_renderer = ctx.format.renderers.get_for(child)
            except UnknownElement:
                continue
            else:
                if not getattr(child_renderer, 'epub_separate', False):
                    # Inline child: its first part continues the current
                    # wrapper; any further parts force a split.
                    parts = list(child_renderer.render(child, subctx))
                    extend_element(inside, parts[0])
                    if len(parts) > 1:
                        yield wrapper
                        for middle_part in parts[1:-1]:
                            yield middle_part
                        wrapper, inside = self.container(ctx)
                        extend_element(inside, parts[-1])
                else:
                    # Separate child: flush what was collected so far, let
                    # the child emit its own parts, then start a new wrapper.
                    yield wrapper
                    ctx.part_no += 1
                    for separate_part in child_renderer.render(child, subctx):
                        yield separate_part
                    wrapper, inside = self.container(ctx)
            finally:
                # Runs for skipped (unknown) children as well.
                if child.tail:
                    extend_element(inside, self.render_text(child.tail, ctx))

        yield wrapper
201
202
class Footnotes(object):
    """Collects footnote bodies and hands out numbered in-text anchors."""

    def __init__(self):
        # Container element that accumulates every footnote.
        self.output = etree.Element("_")
        # Number of the most recently added footnote.
        self.counter = 0

    def append(self, items):
        """Add a footnote made of ``items`` and return its in-text anchor.

        A numbered back-link pointing at the anchor in the originating
        part (read from ``part_no`` of the first item) is appended to
        ``output``, followed by the content of every item. The returned
        ``<a>`` element links to the footnote in ``footnotes.html``.
        """
        self.counter += 1
        number = self.counter
        marker = "[%d]" % number
        origin_part = int(items[0].get('part_no'))

        backlink = etree.Element(
            "a",
            href="part%d.html#footnote-anchor-%d" % (origin_part, number),
            id="footnote-%d" % number,
            style="float:left;margin-right:1em")
        backlink.text = marker
        backlink.tail = " "
        self.output.append(backlink)
        for item in items:
            extend_element(self.output, item)

        anchor = etree.Element(
            "a",
            href="footnotes.html#footnote-%d" % number,
            id="footnote-anchor-%d" % number)
        anchor.text = marker
        return anchor
223
224
class TOC(object):
    """Tree of table-of-contents entries sharing one running counter.

    The counter lives on the root entry; every entry (the root included)
    takes the current value as its ``number`` and then increments it.
    """

    def __init__(self, title=None, href="", root=None):
        """Create an entry.

        :param title: label of the entry.
        :param href: target; ``{counter}`` is replaced with the entry number.
        :param root: root entry owning the counter; ``None`` makes this
            entry the root.
        """
        if root is None:
            root = self
            self.counter = 0
        self.root = root
        number = root.counter
        self.title = title
        self.children = []
        self.href = href.format(counter=number)
        self.number = number
        root.counter = number + 1

    def add(self, title, href):
        """Create a child entry, attach it and return it."""
        entry = type(self)(title, href, root=self.root)
        self.children.append(entry)
        return entry

    def render(self, nav_map):
        """Append a ``navPoint`` subtree for every child to ``nav_map``."""
        for entry in self.children:
            order = entry.number

            point = etree.Element(NCXNS('navPoint'))
            point.set('id', 'NavPoint-%d' % order)
            point.set('playOrder', str(order))

            label = etree.Element(NCXNS('navLabel'))
            label_text = etree.Element(NCXNS('text'))
            label_text.text = entry.title
            label.append(label_text)
            point.append(label)

            target = etree.Element(NCXNS('content'))
            target.set('src', entry.href)
            point.append(target)

            nav_map.append(point)
            # Nested entries become navPoints inside this one.
            entry.render(point)
260
261
262 # Renderers
263
class AsideR(EpubRenderer):
    """Moves the rendered aside into the footnotes and leaves an anchor."""

    def render(self, element, ctx):
        """Yield a single wrapper holding only the footnote anchor."""
        footnote_parts = list(super(AsideR, self).render(element, ctx))
        link = ctx.footnotes.append(footnote_parts)
        # NOTE: the wrapper comes from text_container() as-is; nothing here
        # sets a part_no attribute on it.
        outer, inner = self.text_container()
        inner.append(link)
        yield outer
EpubFormat.renderers.register(core.Aside, None, AsideR('div'))
272
273
class DivR(EpubRenderer):
    """Renders a div; in an inline context it becomes a block-level span."""

    def container(self, ctx):
        """Return the base container, retagging the inner element as a
        ``span`` with ``display: block;`` when ``ctx.inline`` is truthy."""
        root, inner = super(DivR, self).container(ctx)
        if not getattr(ctx, 'inline', False):
            return root, inner
        inner.tag = 'span'
        inner.set('style', 'display: block;')
        return root, inner
EpubFormat.renderers.register(core.Div, None, DivR('div'))
282
283
class DivImageR(EpubRenderer):
    """Renders an image div and queues its source for inclusion in the EPUB."""

    def render(self, element, ctx):
        """Record the image URL in ``ctx.images`` and render the element.

        The element is rendered in a sub-context whose ``src`` is the last
        path segment of the URL, i.e. the file name used inside the archive.
        """
        src = element.attrib.get('src', '')
        # Queue each real URL only once: an empty src has nothing to fetch,
        # and a repeated one would be written to the archive twice.
        if src and src not in ctx.images:
            ctx.images.append(src)
        # [-1] instead of [1]: a src without any slash (including the
        # empty default) is used as the file name instead of raising
        # IndexError.
        filename = src.rsplit('/', 1)[-1]
        return super(DivImageR, self).render(element, Context(ctx, src=filename))

    def container(self, ctx):
        """Return the base container with ``src`` set on the inner element
        (taken from ``ctx.src``, defaulting to an empty string)."""
        root, inner = super(DivImageR, self).container(ctx)
        src = getattr(ctx, 'src', '')
        inner.set('src', src)
        return root, inner
EpubFormat.renderers.register(core.Div, 'img', DivImageR('img'))
298
299
class HeaderR(EpubRenderer):
    """Renders a header; its children are rendered in an inline context."""

    def subcontext(self, element, ctx):
        """Return a context derived from ``ctx`` with ``inline`` set."""
        inline_ctx = Context(ctx, inline=True)
        return inline_ctx
EpubFormat.renderers.register(core.Header, None, HeaderR('h1'))
304
305
class SectionR(EpubRenderer):
    """Renders a section as a separate part of the EPUB."""

    # Read by EpubRenderer.render to start a new output part for this child.
    epub_separate = True

    def render(self, element, ctx):
        """Render the section; a non-root section is added to the TOC first,
        and its children are rendered with that TOC entry as ``ctx.toc``."""
        # TODO: add a 'poczatek' (beginning) entry?
        has_parent = element.getparent() is not None
        if has_parent:
            section_title = element.meta.title()
            part_href = 'part%d.html' % ctx.part_no
            toc_entry = ctx.toc.add(section_title, part_href)
            ctx = Context(ctx, toc=toc_entry)
        return super(SectionR, self).render(element, ctx)
EpubFormat.renderers.register(core.Section, None, SectionR())
316
317
class SpanR(EpubRenderer):
    """Renders a span with the default EpubRenderer behaviour."""
EpubFormat.renderers.register(core.Span, None, SpanR('span'))