cleanup unused files
[librarian.git] / librarian / formats / epub / __init__.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 #
6 import os
7 import re
8 import urllib
9 from copy import deepcopy
10 from mimetypes import guess_type
11 from tempfile import NamedTemporaryFile
12 import zipfile
13 from urllib2 import urlopen
14
15 from lxml import etree
16 from librarian import OPFNS, NCXNS, XHTMLNS, DCNS, BuildError
17 from librarian import core
18 from librarian.formats import Format
19 from librarian.formats.cover.evens import EvensCover
20 from librarian.output import OutputFile
21 from librarian.renderers import Register, TreeRenderer, UnknownElement
22 from librarian.utils import Context, get_resource, extend_element
23
24
class EpubFormat(Format):
    """Output format that packs a document into an EPUB (zip) file.

    The package is assembled from the templates under
    ``formats/epub/res/`` (``content.opf``, ``cover.html``,
    ``chapter.html``, ``footnotes.html``) and from the parts yielded by
    the renderers registered in ``renderers``.
    """
    format_name = 'EPUB'
    format_ext = 'epub'

    # Cover class used by build(); may be overridden per instance.
    cover = EvensCover
    # Registry mapping document elements to their EPUB renderers.
    renderers = Register()

    def __init__(self, doc, cover=None, with_fonts=True):
        """Remember the build options.

        :param doc: the document to convert
        :param cover: cover class replacing the class-level default;
            ``None`` keeps the default
        :param with_fonts: stored on the instance as ``with_fonts``
        """
        super(EpubFormat, self).__init__(doc)
        self.with_fonts = with_fonts
        if cover is not None:
            self.cover = cover

    def dc(self, tag, multiple=False):
        """Return Dublin Core metadata for ``tag``.

        With ``multiple`` set, all values are joined with ``', '``
        (an empty string when there are none); otherwise the single
        value from ``meta.get_one`` is returned.
        """
        if multiple:
            # Same guard as used for the creator list in build().
            return ', '.join(self.doc.meta.get(DCNS(tag)) or [])
        else:
            return self.doc.meta.get_one(DCNS(tag))

    def build(self, ctx=None):
        """Build the EPUB and return it as an ``OutputFile``.

        :param ctx: build context; when falsy a fresh ``Context`` is
            created. ``ctx.files_path`` is read for ``file://`` images
            and ``ctx.source_url`` for the footer.
        :returns: ``OutputFile`` created from the name of the temporary
            ``.epub`` file
        """

        def add_file(url, file_id):
            """Fetch ``url``, store it under OPS/ and list it in the manifest."""
            filename = url.rsplit('/', 1)[1]
            if url.startswith('file://'):
                url = ctx.files_path + urllib.quote(url[7:])
            if url.startswith('/'):
                url = 'http://milpeer.eu' + url
            response = urlopen(url)
            try:
                file_content = response.read()
            finally:
                response.close()
            zip_file.writestr(os.path.join('OPS', filename), file_content)
            # Build the element through the API so that special characters
            # in the file name are escaped; fall back to a generic type
            # when the extension is unknown.
            manifest.append(manifest.makeelement(OPFNS('item'), attrib={
                'id': file_id,
                'href': filename,
                'media-type': guess_type(url)[0] or 'application/octet-stream',
            }))

        opf = etree.parse(get_resource('formats/epub/res/content.opf'))
        manifest = opf.find(OPFNS('manifest'))
        guide = opf.find(OPFNS('guide'))
        spine = opf.find(OPFNS('spine'))

        author = ", ".join(self.doc.meta.get(DCNS('creator')) or [])
        title = self.doc.meta.title()
        opf.find('.//' + DCNS('creator')).text = author
        opf.find('.//' + DCNS('title')).text = title

        output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
        zip_file = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

        # The mimetype entry is written first and stored uncompressed.
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        zip_file.writestr(mime, 'application/epub+zip')
        zip_file.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" '
                          'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
                          '<rootfiles><rootfile full-path="OPS/content.opf" '
                          'media-type="application/oebps-package+xml" />'
                          '</rootfiles></container>')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
                                    '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
                                    '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
                                    'version="2005-1"><head></head><docTitle></docTitle><navMap>'
                                    '</navMap></ncx>')

        if self.cover is not None:
            cover = self.cover(self.doc)
            # NOTE(review): ctx may still be None here because the default
            # Context is only created further down -- confirm that
            # set_images() copes with that.
            cover.set_images(ctx)
            cover_output = cover.build()
            cover_name = 'cover.%s' % cover.format_ext
            zip_file.writestr(os.path.join('OPS', cover_name), cover_output.get_string())
            del cover_output

            cover_tree = etree.parse(get_resource('formats/epub/res/cover.html'))
            cover_tree.find('.//' + XHTMLNS('img')).set('src', cover_name)
            zip_file.writestr('OPS/cover.html', etree.tostring(
                cover_tree, method="html", pretty_print=True))

            if cover.uses_dc_cover:
                if self.doc.meta.get_one('cover_by'):
                    self.doc.edoc.getroot().set('data-cover-by', self.doc.meta.get_one('cover_by'))
                if self.doc.meta.get_one('cover_source'):
                    self.doc.edoc.getroot().set('data-cover-source', self.doc.meta.get_one('cover_source'))

            manifest.append(etree.fromstring(
                '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
            manifest.append(etree.fromstring(
                '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, cover.mime_type())))
            spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
            opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
            guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

        if not ctx:
            ctx = Context(format=self)
        else:
            ctx.format = self
        # Per-build state shared with the renderers.
        ctx.toc = TOC()
        ctx.toc_level = 0
        ctx.footnotes = Footnotes()
        ctx.images = []
        ctx.part_no = 0

        wrap_tmpl = etree.parse(get_resource('formats/epub/res/chapter.html'))
        for e in self.render(self.doc.edoc.getroot(), ctx):
            # Skip parts that have neither children nor visible text.
            if not len(e) and not (e.text and e.text.strip()):
                continue
            wrap = deepcopy(wrap_tmpl)
            extend_element(wrap.find('.//*[@id="book-text"]'), e)

            partstr = 'part%d' % int(e.get('part_no'))
            manifest.append(manifest.makeelement(OPFNS('item'), attrib={
                'id': partstr,
                'href': partstr + ".html",
                'media-type': 'application/xhtml+xml',
            }))
            spine.append(spine.makeelement(OPFNS('itemref'), attrib={
                'idref': partstr,
            }))
            zip_file.writestr('OPS/%s.html' % partstr, etree.tostring(wrap, method='html'))

        for i, url in enumerate(ctx.images):
            add_file(url, 'image%s' % i)

        if len(ctx.footnotes.output):
            ctx.toc.add("Przypisy", "footnotes.html")
            manifest.append(etree.Element(
                OPFNS('item'), id='footnotes', href='footnotes.html',
                **{'media-type': "application/xhtml+xml"}))
            # Namespaced like the other spine entries created in this method.
            spine.append(spine.makeelement(OPFNS('itemref'), attrib={
                'idref': 'footnotes',
            }))
            wrap = etree.parse(get_resource('formats/epub/res/footnotes.html'))
            extend_element(wrap.find('.//*[@id="footnotes"]'), ctx.footnotes.output)

            zip_file.writestr('OPS/footnotes.html', etree.tostring(
                wrap, method="html", pretty_print=True))

        footer_text = [
            'Information about the resource',
            'Publisher: %s' % self.dc('publisher'),
            'Rights: %s' % self.dc('rights'),
            'Intended audience: %s' % self.dc('audience', multiple=True),
            self.dc('description'),
            'Resource prepared using MIL/PEER editing platform.',
            'Source available at %s' % ctx.source_url,
        ]
        footer_wrap = deepcopy(wrap_tmpl)
        footer_body = footer_wrap.find('.//*[@id="book-text"]')
        for line in footer_text:
            footer_line = etree.Element('p')
            footer_line.text = line
            footer_body.append(footer_line)
        manifest.append(manifest.makeelement(OPFNS('item'), attrib={
            'id': 'footer',
            'href': "footer.html",
            'media-type': 'application/xhtml+xml',
        }))
        spine.append(spine.makeelement(OPFNS('itemref'), attrib={
            'idref': 'footer',
        }))
        zip_file.writestr('OPS/footer.html', etree.tostring(footer_wrap, method='html'))

        zip_file.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
        ctx.toc.render(toc_file[-1])
        zip_file.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
        zip_file.close()
        # ZipFile leaves a file object it was handed open, so release the
        # temporary file handle explicitly before passing its name on.
        output_file.close()
        return OutputFile.from_filename(output_file.name)

    def render(self, element, ctx):
        """Render ``element`` with the renderer registered for it."""
        return self.renderers.get_for(element).render(element, ctx)
193
194
195 # Helpers
196
class EpubRenderer(TreeRenderer):
    """ Renders insides as XML in a <_/> container.

    Rendering is a generator: one element may be split into several
    output parts, each yielded as a separate wrapper carrying a
    ``part_no`` attribute.
    """
    def container(self, ctx):
        """Return the parent class's ``(root, inner)`` pair with the
        current ``ctx.part_no`` stored on ``root`` as ``part_no``."""
        root, inner = super(EpubRenderer, self).container()
        root.set("part_no", str(ctx.part_no))
        return root, inner

    def render(self, element, ctx):
        """Yield the rendered part(s) of ``element``.

        Children are rendered with the sub-context returned by
        ``self.subcontext``; text and tails are rendered with ``ctx``.
        At least one wrapper is always yielded.
        """
        subctx = self.subcontext(element, ctx)
        wrapper, inside = self.container(ctx)
        if element.text:
            extend_element(inside, self.render_text(element.text, ctx))
        for child in element:
            try:
                child_renderer = ctx.format.renderers.get_for(child)
            except UnknownElement:
                # No renderer registered: the child is dropped, but its
                # tail is still emitted by the ``finally`` clause.
                continue
            else:
                if getattr(child_renderer, 'epub_separate', False):
                    # The child starts a new part: flush what has been
                    # collected so far, bump the part counter and pass
                    # the child's parts straight through.
                    yield wrapper
                    ctx.part_no += 1
                    for child_part in child_renderer.render(child, subctx):
                        yield child_part
                    # Content following the child goes to a fresh container.
                    wrapper, inside = self.container(ctx)
                else:
                    child_parts = list(child_renderer.render(child, subctx))
                    # The first part continues the current container.
                    extend_element(inside, child_parts[0])
                    if len(child_parts) > 1:
                        # The child was split: flush the current wrapper,
                        # emit the middle parts unchanged and continue
                        # collecting in a new container seeded with the
                        # child's last part.
                        yield wrapper
                        for child_part in child_parts[1:-1]:
                            yield child_part
                        wrapper, inside = self.container(ctx)
                        extend_element(inside, child_parts[-1])
            finally:
                # Runs on every path, so the tail lands in whichever
                # container is current after the child was handled.
                if child.tail:
                    extend_element(inside, self.render_text(child.tail, ctx))
        yield wrapper
234
235
class NaturalText(EpubRenderer):
    """Renderer for running text.

    A space that follows a single word character preceded by whitespace
    (i.e. a one-letter word inside the text) is replaced with an
    ``&nbsp;`` entity.
    """
    def render_text(self, text, ctx):
        """Return a text container holding ``text`` with the spaces
        described above turned into ``nbsp`` entities."""
        root, inner = self.text_container()
        # Raw string: ``\s`` and ``\w`` are regex escapes, not string
        # escapes; the pattern itself is unchanged.
        chunks = re.split(r'(?<=\s\w) ', text)
        inner.text = chunks[0]
        for chunk in chunks[1:]:
            nbsp = etree.Entity("nbsp")
            nbsp.tail = chunk
            inner.append(nbsp)
        return root
246
247
class Silent(EpubRenderer):
    """Renderer that drops all text: every text node becomes an empty
    text container."""
    def render_text(self, text, ctx):
        """Ignore ``text`` and return a fresh, empty text container root."""
        empty_root, _unused_inner = self.text_container()
        return empty_root
252
253
class Footnotes(object):
    """Collects footnote bodies and hands out numbered anchors for them."""

    def __init__(self):
        # ``output`` accumulates the content of all footnotes so far;
        # ``counter`` is the number of the most recently added footnote.
        self.output = etree.Element("_")
        self.counter = 0

    def append(self, items):
        """Add a footnote built from the rendered ``items``.

        A back-link pointing at the part named by the first item's
        ``part_no`` is stored in ``output`` ahead of the items
        themselves.

        :returns: the ``<a>`` element to place in the main text, linking
            to the footnote in ``footnotes.html``
        """
        self.counter += 1
        number = self.counter
        label = "[%d]" % number
        source_part = int(items[0].get('part_no'))

        back_link = etree.Element(
            "a",
            href="part%d.html#footnote-anchor-%d" % (source_part, number),
            id="footnote-%d" % number,
            style="float:left;margin-right:1em")
        back_link.text = label
        back_link.tail = " "
        self.output.append(back_link)
        for rendered in items:
            extend_element(self.output, rendered)

        anchor = etree.Element(
            "a",
            href="footnotes.html#footnote-%d" % number,
            id="footnote-anchor-%d" % number)
        anchor.text = label
        return anchor
274
275
class TOC(object):
    """Node of the table-of-contents tree.

    The node created without ``root`` is the tree root and owns the
    ``counter`` from which every node (the root included) draws its
    ``number``.
    """

    def __init__(self, title=None, href="", root=None):
        """Create a node.

        ``href`` may contain a ``{counter}`` placeholder, which is
        filled with the number assigned to this node.
        """
        if root is None:
            # Top of the tree: start the shared numbering here.
            self.counter = 0
            root = self
        self.root = root
        self.children = []
        self.title = title
        assigned = root.counter
        self.href = href.format(counter=assigned)
        self.number = assigned
        root.counter = assigned + 1

    def add(self, title, href):
        """Create a child entry, attach it to this node and return it."""
        entry = type(self)(title, href, root=self.root)
        self.children.append(entry)
        return entry

    def render(self, nav_map):
        """Append one NCX ``navPoint`` per child to ``nav_map``,
        recursing into each child's own children."""
        for entry in self.children:
            point = etree.Element(NCXNS('navPoint'))
            point.set('id', 'NavPoint-%d' % entry.number)
            point.set('playOrder', str(entry.number))

            label = etree.Element(NCXNS('navLabel'))
            label_text = etree.Element(NCXNS('text'))
            label_text.text = entry.title
            label.append(label_text)
            point.append(label)

            target = etree.Element(NCXNS('content'))
            target.set('src', entry.href)
            point.append(target)

            nav_map.append(point)
            entry.render(point)
311
312
313 # Renderers
314
class AsideR(NaturalText):
    """Renders an aside as a footnote: the content goes to
    ``ctx.footnotes`` and only the anchor stays in the text flow."""
    def render(self, element, ctx):
        """Yield a single text container holding the footnote anchor."""
        rendered = list(super(AsideR, self).render(element, ctx))
        footnote_ref = ctx.footnotes.append(rendered)
        holder, body = self.text_container()
        body.append(footnote_ref)
        yield holder
EpubFormat.renderers.register(core.Aside, None, AsideR('div'))

EpubFormat.renderers.register(core.Aside, 'comment', Silent())
325
326
class DivR(NaturalText):
    """Block renderer that degrades to a block-styled ``span`` when the
    context is flagged ``inline``."""
    def container(self, ctx):
        """Return the inherited container, retagged for inline contexts."""
        outer, body = super(DivR, self).container(ctx)
        inline = getattr(ctx, 'inline', False)
        if not inline:
            return outer, body
        body.tag = 'span'
        body.set('style', 'display: block;')
        return outer, body
EpubFormat.renderers.register(core.Div, None, DivR('div'))
EpubFormat.renderers.register(core.Div, 'p', NaturalText('p'))

EpubFormat.renderers.register(core.Div, 'list', NaturalText('ul'))
EpubFormat.renderers.register(core.Div, 'list.enum', NaturalText('ol'))
EpubFormat.renderers.register(core.Div, 'item', NaturalText('li'))
EpubFormat.renderers.register(core.Span, 'item', NaturalText('li'))
340 EpubFormat.renderers.register(core.Span, 'item', NaturalText('li'))
341
342
class DivImageR(EpubRenderer):
    """Renders an image and records its URL in ``ctx.images``."""
    def render(self, element, ctx):
        """Register the image URL and render with ``src`` set to the
        last path segment of that URL.

        :raises BuildError: when the ``src`` attribute contains no ``/``
        """
        src = element.attrib.get('src', '')
        # Validate before recording, so a rejected URL never ends up in
        # the list of images.
        if '/' not in src:
            raise BuildError('Bad image URL')
        # Record each URL once; a repeated image must not be listed
        # twice.
        if src not in ctx.images:
            ctx.images.append(src)
        filename = src.rsplit('/', 1)[1]
        return super(DivImageR, self).render(element, Context(ctx, src=filename))

    def container(self, ctx):
        """Return the inherited container with ``src`` taken from the
        context (empty string when the context has none)."""
        root, inner = super(DivImageR, self).container(ctx)
        src = getattr(ctx, 'src', '')
        inner.set('src', src)
        return root, inner
EpubFormat.renderers.register(core.Div, 'img', DivImageR('img'))
359
360
class DivVideoR(Silent):
    """Renders a video as a plain link to its YouTube watch page."""
    def render(self, element, ctx):
        """Render with the watch URL for the element's ``videoid``
        stored in the context as ``src``."""
        video_id = element.attrib.get('videoid', '')
        watch_url = 'https://www.youtube.com/watch?v=%s' % video_id
        video_ctx = Context(ctx, src=watch_url)
        return super(DivVideoR, self).render(element, video_ctx)

    def container(self, ctx):
        """Return the inherited container with a link to ``ctx.src``
        (also used as the link text) appended inside it."""
        outer, body = super(DivVideoR, self).container(ctx)
        url = getattr(ctx, 'src', '')
        anchor = etree.Element('a', {'href': url})
        anchor.text = url
        body.append(anchor)
        return outer, body
EpubFormat.renderers.register(core.Div, 'video', DivVideoR('p'))
374
375
class HeaderR(NaturalText):
    """Header renderer; its children are rendered in an inline context."""
    def subcontext(self, element, ctx):
        """Return a child context flagged ``inline``."""
        inline_ctx = Context(ctx, inline=True)
        return inline_ctx
EpubFormat.renderers.register(core.Header, None, HeaderR('h1'))
380
381
class SectionR(NaturalText):
    """Section renderer; ``epub_separate`` is set so that the parent
    renderer starts a new part for every section."""
    epub_separate = True

    def render(self, element, ctx):
        """Render the section; a section that has a parent also gets a
        TOC entry, under which its descendants are registered."""
        # Open question left by the original author: add a 'poczatek'
        # (Polish for 'beginning') entry?
        has_parent = element.getparent() is not None
        if has_parent:
            entry = ctx.toc.add(element.meta.title(), 'part%d.html' % ctx.part_no)
            ctx = Context(ctx, toc=entry)
        return super(SectionR, self).render(element, ctx)
EpubFormat.renderers.register(core.Section, None, SectionR())
392
393
class SpanR(NaturalText):
    """Inline-span renderer; behaves exactly like ``NaturalText``."""
EpubFormat.renderers.register(core.Span, None, SpanR('span'))
EpubFormat.renderers.register(core.Span, 'cite', SpanR('i'))
EpubFormat.renderers.register(core.Span, 'emp', SpanR('b'))
EpubFormat.renderers.register(core.Span, 'emph', SpanR('i'))
400
401
class SpanLink(EpubRenderer):
    """Renders a link; every yielded part gets the ``href`` set on its
    first child element."""
    def render(self, element, ctx):
        """Yield the inherited parts with ``href`` filled in.

        A ``file://`` URL has that scheme prefix replaced by
        ``ctx.files_path``.
        """
        for piece in super(SpanLink, self).render(element, ctx):
            target = element.attrib.get('href', '')
            if target.startswith('file://'):
                target = ctx.files_path + target[7:]
            piece[0].set('href', target)
            yield piece
EpubFormat.renderers.register(core.Span, 'link', SpanLink('a'))