708b0634b7de9427c54785d7de60362c5fe20987
[librarian.git] / librarian / formats / epub / __init__.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 import os
7 import urllib
8 from copy import deepcopy
9 from mimetypes import guess_type
10 from tempfile import NamedTemporaryFile
11 import zipfile
12 from urllib2 import urlopen
13
14 from lxml import etree
15 from librarian import OPFNS, NCXNS, XHTMLNS, DCNS
16 from librarian import core
17 from librarian.formats import Format
18 from librarian.formats.cover.evens import EvensCover
19 from librarian.output import OutputFile
20 from librarian.renderers import Register, TreeRenderer, UnknownElement
21 from librarian.utils import Context, get_resource, extend_element
22
23
class EpubFormat(Format):
    """Output format that packs a rendered document into an EPUB archive.

    Class attributes:
        format_name, format_ext: identification of the format.
        cover: cover class instantiated with the document in ``build``;
            a cover is only produced when this is not None.
        renderers: register mapping document elements to their renderers.
    """
    format_name = 'EPUB'
    format_ext = 'epub'

    cover = EvensCover
    renderers = Register()

    def __init__(self, doc, cover=None, with_fonts=True):
        """Remember the document and the build options.

        :param doc: document to be converted.
        :param cover: optional cover class overriding the class default;
            passing None keeps the class-level ``cover``.
        :param with_fonts: stored on the instance as ``with_fonts``.
        """
        super(EpubFormat, self).__init__(doc)
        self.with_fonts = with_fonts
        if cover is not None:
            self.cover = cover

    def dc(self, tag):
        """Return a single Dublin Core metadata value of the document."""
        return self.doc.meta.get_one(DCNS(tag))

    def build(self, ctx=None):
        """Build the EPUB file and return it wrapped in an OutputFile.

        The archive is written to a named temporary file that is kept on
        disk (``delete=False``) so it can be handed over by name.

        :param ctx: optional rendering context; a fresh one is created
            when none is given.
        :returns: ``OutputFile`` created from the temporary file's name.
        """
        output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
        built = False
        try:
            archive = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
            try:
                self._write_package(archive, ctx)
            finally:
                archive.close()
            built = True
        finally:
            # ZipFile.close() does not close a file object it was handed,
            # so the temporary file has to be closed explicitly.
            output_file.close()
            if not built:
                # Do not leave a half-written archive behind on failure.
                os.unlink(output_file.name)
        return OutputFile.from_filename(output_file.name)

    def _write_package(self, archive, ctx):
        """Write all EPUB members (cover, parts, images, TOC) into archive."""

        def add_file(url, file_id):
            """Download url and store it in the archive and the manifest."""
            filename = url.rsplit('/', 1)[-1]
            if url.startswith('file://'):
                url = ctx.files_path + urllib.quote(url[7:])
            if url.startswith('/'):
                url = 'http://milpeer.eu' + url
            response = urlopen(url)
            try:
                file_content = response.read()
            finally:
                response.close()
            archive.writestr(os.path.join('OPS', filename), file_content)
            # Build the element through the API instead of interpolating
            # into an XML string, so special characters in the file name
            # cannot break parsing; fall back to a generic media type
            # when it cannot be guessed from the URL.
            manifest.append(manifest.makeelement(OPFNS('item'), attrib={
                'id': file_id,
                'href': filename,
                'media-type': guess_type(url)[0] or 'application/octet-stream',
            }))

        opf = etree.parse(get_resource('formats/epub/res/content.opf'))
        manifest = opf.find(OPFNS('manifest'))
        guide = opf.find(OPFNS('guide'))
        spine = opf.find(OPFNS('spine'))

        author = ", ".join(self.doc.meta.get(DCNS('creator')) or [])
        title = self.doc.meta.title()
        opf.find('.//' + DCNS('creator')).text = author
        opf.find('.//' + DCNS('title')).text = title

        # The mimetype entry is written first and stored uncompressed.
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        archive.writestr(mime, 'application/epub+zip')
        archive.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" '
                         'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
                         '<rootfiles><rootfile full-path="OPS/content.opf" '
                         'media-type="application/oebps-package+xml" />'
                         '</rootfiles></container>')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
                                    '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
                                    '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
                                    'version="2005-1"><head></head><docTitle></docTitle><navMap>'
                                    '</navMap></ncx>')

        if self.cover is not None:
            cover = self.cover(self.doc)
            # NOTE(review): ctx is still None here when build() was called
            # without a context -- confirm set_images() accepts that.
            cover.set_images(ctx)
            cover_output = cover.build()
            cover_name = 'cover.%s' % cover.format_ext
            archive.writestr(os.path.join('OPS', cover_name), cover_output.get_string())
            del cover_output

            cover_tree = etree.parse(get_resource('formats/epub/res/cover.html'))
            cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
            archive.writestr('OPS/cover.html', etree.tostring(
                cover_tree, method="html", pretty_print=True))

            if cover.uses_dc_cover:
                if self.doc.meta.get_one('cover_by'):
                    self.doc.edoc.getroot().set('data-cover-by', self.doc.meta.get_one('cover_by'))
                if self.doc.meta.get_one('cover_source'):
                    self.doc.edoc.getroot().set('data-cover-source', self.doc.meta.get_one('cover_source'))

            manifest.append(etree.fromstring(
                '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
            manifest.append(etree.fromstring(
                '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, cover.mime_type())))
            spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
            opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
            guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

        if not ctx:
            ctx = Context(format=self)
        else:
            ctx.format = self
        # Per-build rendering state shared with the renderers.
        ctx.toc = TOC()
        ctx.toc_level = 0
        ctx.footnotes = Footnotes()
        ctx.images = []
        ctx.part_no = 0

        wrap_tmpl = etree.parse(get_resource('formats/epub/res/chapter.html'))
        for e in self.render(self.doc.edoc.getroot(), ctx):
            # Skip parts that carry neither child elements nor text.
            if not len(e) and not (e.text and e.text.strip()):
                continue
            wrap = deepcopy(wrap_tmpl)
            extend_element(wrap.find('//*[@id="book-text"]'), e)

            partstr = 'part%d' % int(e.get('part_no'))
            manifest.append(manifest.makeelement(OPFNS('item'), attrib={
                'id': partstr,
                'href': partstr + ".html",
                'media-type': 'application/xhtml+xml',
            }))
            spine.append(spine.makeelement(OPFNS('itemref'), attrib={
                'idref': partstr,
            }))
            archive.writestr('OPS/%s.html' % partstr, etree.tostring(wrap, method='html'))

        for i, url in enumerate(ctx.images):
            add_file(url, 'image%s' % i)

        if len(ctx.footnotes.output):
            ctx.toc.add("Przypisy", "footnotes.html")
            manifest.append(etree.Element(
                OPFNS('item'), id='footnotes', href='footnotes.html',
                **{'media-type': "application/xhtml+xml"}))
            # Namespaced like every other spine entry created in this method.
            spine.append(spine.makeelement(OPFNS('itemref'), attrib={
                'idref': 'footnotes',
            }))
            wrap = etree.parse(get_resource('formats/epub/res/footnotes.html'))
            extend_element(wrap.find('//*[@id="footnotes"]'), ctx.footnotes.output)
            archive.writestr('OPS/footnotes.html', etree.tostring(
                wrap, method="html", pretty_print=True))

        # NOTE(review): ctx.source_url is read unconditionally -- confirm a
        # context created above (no ctx passed in) provides it.
        footer_text = [
            'Information about the resource',
            'Publisher: %s' % self.dc('publisher'),
            'Rights: %s' % self.dc('rights'),
            'Intended audience: %s' % self.dc('audience'),
            self.dc('description'),
            'Resource prepared using MIL/PEER editing platform.',
            'Source available at %s' % ctx.source_url,
        ]
        footer_wrap = deepcopy(wrap_tmpl)
        footer_body = footer_wrap.find('//*[@id="book-text"]')
        for line in footer_text:
            footer_line = etree.Element('p')
            footer_line.text = line
            footer_body.append(footer_line)
        manifest.append(manifest.makeelement(OPFNS('item'), attrib={
            'id': 'footer',
            'href': "footer.html",
            'media-type': 'application/xhtml+xml',
        }))
        spine.append(spine.makeelement(OPFNS('itemref'), attrib={
            'idref': 'footer',
        }))
        archive.writestr('OPS/footer.html', etree.tostring(footer_wrap, method='html'))

        archive.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
        # The navMap is the last child of the NCX skeleton built above.
        ctx.toc.render(toc_file[-1])
        archive.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))

    def render(self, element, ctx):
        """Render element with the renderer registered for it."""
        return self.renderers.get_for(element).render(element, ctx)
189
190
191 # Helpers
192
class EpubRenderer(TreeRenderer):
    """Renders an element's contents as XML parts in a <_/> container.

    Each container root is stamped with the ``part_no`` taken from the
    context at the moment the container is created.
    """
    def container(self, ctx):
        """Return a fresh (root, inner) pair with root marked by part_no."""
        outer, inner = super(EpubRenderer, self).container()
        outer.set("part_no", str(ctx.part_no))
        return outer, inner

    def render(self, element, ctx):
        """Yield the rendered wrappers for element, one per output part."""
        child_ctx = self.subcontext(element, ctx)
        wrapper, inside = self.container(ctx)
        if element.text:
            extend_element(inside, self.render_text(element.text, ctx))
        for child in element:
            try:
                try:
                    renderer = ctx.format.renderers.get_for(child)
                except UnknownElement:
                    # No renderer registered: drop the element itself,
                    # its tail text is still emitted below.
                    continue
                if getattr(renderer, 'epub_separate', False):
                    # The child starts a new part: flush what we have,
                    # let the child produce its own parts, then reopen.
                    yield wrapper
                    ctx.part_no += 1
                    for separate_part in renderer.render(child, child_ctx):
                        yield separate_part
                    wrapper, inside = self.container(ctx)
                else:
                    rendered = list(renderer.render(child, child_ctx))
                    extend_element(inside, rendered[0])
                    following = rendered[1:]
                    if following:
                        # The child spilled over into further parts: the
                        # middle ones go out as-is, the last one becomes
                        # the container we keep filling.
                        yield wrapper
                        for middle_part in following[:-1]:
                            yield middle_part
                        wrapper, inside = self.container(ctx)
                        extend_element(inside, following[-1])
            finally:
                if child.tail:
                    extend_element(inside, self.render_text(child.tail, ctx))
        yield wrapper
230
231
class Footnotes(object):
    """Collects footnote bodies and hands out numbered anchors for them."""

    def __init__(self):
        # Number of the most recently added footnote (0 = none yet).
        self.counter = 0
        # Container element accumulating all footnote content.
        self.output = etree.Element("_")

    def append(self, items):
        """Store a footnote and return the anchor to place in the text.

        :param items: rendered parts of the footnote; the ``part_no`` of
            the first one determines which part the back-link points to.
        :returns: ``<a>`` element linking to the footnote in footnotes.html.
        """
        self.counter += 1
        label = "[%d]" % self.counter

        # Link placed before the footnote body, pointing back into the text.
        back_link = etree.Element(
            "a", href="part%d.html#footnote-anchor-%d" % (int(items[0].get('part_no')), self.counter),
            id="footnote-%d" % self.counter,
            style="float:left;margin-right:1em")
        back_link.text = label
        back_link.tail = " "
        self.output.append(back_link)
        for part in items:
            extend_element(self.output, part)

        # Link returned to the caller, pointing at the footnote itself.
        reference = etree.Element(
            "a", href="footnotes.html#footnote-%d" % self.counter, id="footnote-anchor-%d" % self.counter)
        reference.text = label
        return reference
252
253
class TOC(object):
    """Tree of table-of-contents entries numbered by a shared counter.

    The root entry owns ``counter``; every entry (the root included)
    takes the current counter value as its ``number`` and advances it.
    """

    def __init__(self, title=None, href="", root=None):
        """Create an entry; without ``root`` the entry becomes the root.

        ``href`` may contain a ``{counter}`` placeholder, which is filled
        with the number assigned to this entry.
        """
        if root is None:
            self.counter = 0
            root = self
        self.root = root
        self.children = []
        self.title = title
        self.number = root.counter
        self.href = href.format(counter=self.number)
        root.counter += 1

    def add(self, title, href):
        """Append a child entry sharing this tree's root and return it."""
        entry = type(self)(title, href, root=self.root)
        self.children.append(entry)
        return entry

    def render(self, nav_map):
        """Append a navPoint for each child (recursively) to nav_map."""
        for entry in self.children:
            point = etree.Element(NCXNS('navPoint'))
            point.set('id', 'NavPoint-%d' % entry.number)
            point.set('playOrder', str(entry.number))

            label = etree.Element(NCXNS('navLabel'))
            label_text = etree.Element(NCXNS('text'))
            label_text.text = entry.title
            label.append(label_text)
            point.append(label)

            target = etree.Element(NCXNS('content'))
            target.set('src', entry.href)
            point.append(target)

            nav_map.append(point)
            # Nested entries are rendered inside their parent's navPoint.
            entry.render(point)
289
290
291 # Renderers
292
class AsideR(EpubRenderer):
    """Turns an aside into a footnote, leaving only its anchor in place."""

    def render(self, element, ctx):
        """Move the rendered aside to the footnotes; yield the anchor."""
        footnote_parts = list(super(AsideR, self).render(element, ctx))
        footnote_link = ctx.footnotes.append(footnote_parts)
        holder, holder_inner = self.text_container()
        holder_inner.append(footnote_link)
        yield holder


EpubFormat.renderers.register(core.Aside, None, AsideR('div'))
301
302
class DivR(EpubRenderer):
    """Renders a div; in an inline context it becomes a block-styled span."""

    def container(self, ctx):
        """Return the container, switched to a span when ctx is inline."""
        outer, inner = super(DivR, self).container(ctx)
        if not getattr(ctx, 'inline', False):
            return outer, inner
        inner.tag = 'span'
        inner.set('style', 'display: block;')
        return outer, inner


EpubFormat.renderers.register(core.Div, None, DivR('div'))
311
312
class DivImageR(EpubRenderer):
    """Renders an image div as an <img> pointing at the bundled file."""

    def render(self, element, ctx):
        """Register the image for inclusion and render with a local src.

        The full ``src`` is queued in ``ctx.images``; the rendered element
        refers only to the last path segment.
        """
        src = element.attrib.get('src', '')
        if src:
            # Only queue real URLs; an element without src has nothing
            # to download.
            ctx.images.append(src)
        # [-1] rather than [1]: a src without any '/' (including the
        # empty default) is used as-is instead of raising IndexError.
        filename = src.rsplit('/', 1)[-1]
        return super(DivImageR, self).render(element, Context(ctx, src=filename))

    def container(self, ctx):
        """Return the container with ``src`` set from the context."""
        root, inner = super(DivImageR, self).container(ctx)
        src = getattr(ctx, 'src', '')
        inner.set('src', src)
        return root, inner


EpubFormat.renderers.register(core.Div, 'img', DivImageR('img'))
327
328
class HeaderR(EpubRenderer):
    """Renders a header; its children are rendered in an inline context."""

    def subcontext(self, element, ctx):
        """Return a child context flagged with ``inline=True``."""
        inline_ctx = Context(ctx, inline=True)
        return inline_ctx


EpubFormat.renderers.register(core.Header, None, HeaderR('h1'))
333
334
class SectionR(EpubRenderer):
    """Renders a section as a separate part with its own TOC entry."""

    # Checked by EpubRenderer.render to start a new part for this element.
    epub_separate = True

    def render(self, element, ctx):
        """Add a TOC entry for non-root sections, then render as usual."""
        # Open question left by the original author: add a 'beginning' entry?
        has_parent = element.getparent() is not None
        if has_parent:
            section_title = element.meta.title()
            part_href = 'part%d.html' % ctx.part_no
            entry = ctx.toc.add(section_title, part_href)
            # Nested sections attach their entries under this one.
            ctx = Context(ctx, toc=entry)
        return super(SectionR, self).render(element, ctx)


EpubFormat.renderers.register(core.Section, None, SectionR())
345
346
class SpanR(EpubRenderer):
    """Renders a span with the default EpubRenderer behaviour."""


EpubFormat.renderers.register(core.Span, None, SpanR('span'))