changes in pdf: footer on first page, logo next to title, no duplicate title

[librarian.git] / librarian / formats / epub / __init__.py
diff --git a/librarian/formats/epub/__init__.py b/librarian/formats/epub/__init__.py

index f9f7565..36891be 100644 (file)
--- a/librarian/formats/epub/__init__.py
+++ b/librarian/formats/epub/__init__.py
@@ -4,14 +4,19 @@
  # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
  #
  import os
+import re
+import urllib
  from copy import deepcopy
+from mimetypes import guess_type
  from tempfile import NamedTemporaryFile
  import zipfile
+from urllib2 import urlopen
+
  from lxml import etree
-from librarian import OPFNS, NCXNS, XHTMLNS
+from librarian import OPFNS, NCXNS, XHTMLNS, DCNS, BuildError
  from librarian import core
  from librarian.formats import Format
-from librarian.formats.cover.wolnelektury import WLCover
+from librarian.formats.cover.evens import EvensCover
  from librarian.output import OutputFile
  from librarian.renderers import Register, TreeRenderer, UnknownElement
  from librarian.utils import Context, get_resource, extend_element
@@ -21,7 +26,7 @@ class EpubFormat(Format):
      format_name = 'EPUB'
      format_ext = 'epub'
  
-    cover = WLCover
+    cover = EvensCover
      renderers = Register()
  
      def __init__(self, doc, cover=None, with_fonts=True):
@@ -30,12 +35,35 @@ class EpubFormat(Format):
          if cover is not None:
              self.cover = cover
  
-    def build(self):
+    def dc(self, tag, multiple=False):
+        if multiple:
+            return ', '.join(self.doc.meta.get(DCNS(tag)))
+        else:
+            return self.doc.meta.get_one(DCNS(tag))
+
+    def build(self, ctx=None):
+
+        def add_file(url, file_id):
+            filename = url.rsplit('/', 1)[1]
+            if url.startswith('file://'):
+                url = ctx.files_path + urllib.quote(url[7:])
+            if url.startswith('/'):
+                url = 'http://milpeer.eu' + url
+            file_content = urlopen(url).read()
+            zip.writestr(os.path.join('OPS', filename), file_content)
+            manifest.append(etree.fromstring(
+                '<item id="%s" href="%s" media-type="%s" />' % (file_id, filename, guess_type(url)[0])))
+
          opf = etree.parse(get_resource('formats/epub/res/content.opf'))
          manifest = opf.find(OPFNS('manifest'))
          guide = opf.find(OPFNS('guide'))
          spine = opf.find(OPFNS('spine'))
  
+        author = ", ". join(self.doc.meta.get(DCNS('creator')) or [])
+        title = self.doc.meta.title()
+        opf.find('.//' + DCNS('creator')).text = author
+        opf.find('.//' + DCNS('title')).text = title
+
          output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
          zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
  
@@ -44,21 +72,23 @@ class EpubFormat(Format):
          mime.compress_type = zipfile.ZIP_STORED
          mime.extra = ''
          zip.writestr(mime, 'application/epub+zip')
-        zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
-                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
-                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
-                       'media-type="application/oebps-package+xml" />' \
-                       '</rootfiles></container>')
-
-        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
-                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
-                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
-                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
-                               '</navMap></ncx>')
-        nav_map = toc_file[-1]
+        zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" '
+                     'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
+                     '<rootfiles><rootfile full-path="OPS/content.opf" '
+                     'media-type="application/oebps-package+xml" />'
+                     '</rootfiles></container>')
+
+        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
+                                    '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
+                                    '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
+                                    'version="2005-1"><head></head><docTitle></docTitle><navMap>'
+                                    '</navMap></ncx>')
+        # nav_map = toc_file[-1]
  
          if self.cover is not None:
+            # cover_image = self.doc.meta.get(DCNS('relation.coverimage.url'))[0]
              cover = self.cover(self.doc)
+            cover.set_images(ctx)
              cover_output = cover.build()
              cover_name = 'cover.%s' % cover.format_ext
              zip.writestr(os.path.join('OPS', cover_name), cover_output.get_string())
@@ -71,9 +101,9 @@ class EpubFormat(Format):
  
              if cover.uses_dc_cover:
                  if self.doc.meta.get_one('cover_by'):
-                    document.edoc.getroot().set('data-cover-by', self.doc.meta.get_one('cover_by'))
+                    self.doc.edoc.getroot().set('data-cover-by', self.doc.meta.get_one('cover_by'))
                  if self.doc.meta.get_one('cover_source'):
-                    document.edoc.getroot().set('data-cover-source', self.doc.meta.get_one('cover_source'))
+                    self.doc.edoc.getroot().set('data-cover-source', self.doc.meta.get_one('cover_source'))
  
              manifest.append(etree.fromstring(
                  '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
@@ -83,16 +113,19 @@ class EpubFormat(Format):
              opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
              guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
  
-
-        ctx = Context(format=self)
+        if not ctx:
+            ctx = Context(format=self)
+        else:
+            ctx.format = self
          ctx.toc = TOC()
          ctx.toc_level = 0
          ctx.footnotes = Footnotes()
+        ctx.images = []
          ctx.part_no = 0
  
          wrap_tmpl = etree.parse(get_resource('formats/epub/res/chapter.html'))
          for e in self.render(self.doc.edoc.getroot(), ctx):
-            if not len(e) and not e.text.strip():
+            if not len(e) and not (e.text and e.text.strip()):
                  continue
              wrap = deepcopy(wrap_tmpl)
              extend_element(wrap.find('//*[@id="book-text"]'), e)
@@ -108,19 +141,46 @@ class EpubFormat(Format):
                      }))
              zip.writestr('OPS/%s.html' % partstr, etree.tostring(wrap, method='html'))
  
+        for i, url in enumerate(ctx.images):
+            add_file(url, 'image%s' % i)
+
          if len(ctx.footnotes.output):
              ctx.toc.add("Przypisy", "footnotes.html")
-            manifest.append(etree.Element(OPFNS('item'),
-                    id='footnotes', href='footnotes.html',
-                    **{'media-type': "application/xhtml+xml"}))
+            manifest.append(etree.Element(
+                OPFNS('item'), id='footnotes', href='footnotes.html',
+                **{'media-type': "application/xhtml+xml"}))
              spine.append(etree.Element('itemref', idref='footnotes'))
              wrap = etree.parse(get_resource('formats/epub/res/footnotes.html'))
              extend_element(wrap.find('//*[@id="footnotes"]'), ctx.footnotes.output)
              
-            #chars = chars.union(used_chars(html_tree.getroot()))
+            # chars = chars.union(used_chars(html_tree.getroot()))
              zip.writestr('OPS/footnotes.html', etree.tostring(
                                  wrap, method="html", pretty_print=True))
  
+        footer_text = [
+            'Information about the resource',
+            'Publisher: %s' % self.dc('publisher'),
+            'Rights: %s' % self.dc('rights'),
+            'Intended audience: %s' % self.dc('audience', multiple=True),
+            self.dc('description'),
+            'Resource prepared using MIL/PEER editing platform.',
+            'Source available at %s' % ctx.source_url,
+        ]
+        footer_wrap = deepcopy(wrap_tmpl)
+        footer_body = footer_wrap.find('//*[@id="book-text"]')
+        for line in footer_text:
+            footer_line = etree.Element('p')
+            footer_line.text = line
+            footer_body.append(footer_line)
+        manifest.append(manifest.makeelement(OPFNS('item'), attrib={
+            'id': 'footer',
+            'href': "footer.html",
+            'media-type': 'application/xhtml+xml',
+        }))
+        spine.append(spine.makeelement(OPFNS('itemref'), attrib={
+            'idref': 'footer',
+        }))
+        zip.writestr('OPS/footer.html', etree.tostring(footer_wrap, method='html'))
  
          zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
          ctx.toc.render(toc_file[-1])
@@ -173,6 +233,24 @@ class EpubRenderer(TreeRenderer):
          yield wrapper
  
  
+class NaturalText(EpubRenderer):
+    def render_text(self, text, ctx):
+        root, inner = self.text_container()
+        chunks = re.split('(?<=\s\w) ', text)
+        inner.text = chunks[0]
+        for chunk in chunks[1:]:
+            x = etree.Entity("nbsp")
+            x.tail = chunk
+            inner.append(x)
+        return root
+
+
+class Silent(EpubRenderer):
+    def render_text(self, text, ctx):
+        root, inner = self.text_container()
+        return root
+
+
  class Footnotes(object):
      def __init__(self):
          self.counter = 0
@@ -180,8 +258,8 @@ class Footnotes(object):
  
      def append(self, items):
          self.counter += 1
-        e = etree.Element("a",
-            href="part%d.html#footnote-anchor-%d" % (int(items[0].get('part_no')), self.counter),
+        e = etree.Element(
+            "a", href="part%d.html#footnote-anchor-%d" % (int(items[0].get('part_no')), self.counter),
              id="footnote-%d" % self.counter,
              style="float:left;margin-right:1em")
          e.text = "[%d]" % self.counter
@@ -189,9 +267,8 @@ class Footnotes(object):
          self.output.append(e)
          for item in items:
              extend_element(self.output, item)
-        anchor = etree.Element("a",
-            id="footnote-anchor-%d" % self.counter,
-            href="footnotes.html#footnote-%d" % self.counter)
+        anchor = etree.Element(
+            "a", href="footnotes.html#footnote-%d" % self.counter, id="footnote-anchor-%d" % self.counter)
          anchor.text = "[%d]" % self.counter
          return anchor
  
@@ -230,22 +307,24 @@ class TOC(object):
              content.set('src', child.href)
              nav_point.append(content)
              nav_map.append(nav_point)
-            child.render(nav_map)
+            child.render(nav_point)
  
  
  # Renderers
  
-class AsideR(EpubRenderer):
+class AsideR(NaturalText):
      def render(self, element, ctx):
          outputs = list(super(AsideR, self).render(element, ctx))
          anchor = ctx.footnotes.append(outputs)
-        wrapper, inside = self.text_container()  #etree.Element('_', part_no=str(ctx.part_no))
+        wrapper, inside = self.text_container()  # etree.Element('_', part_no=str(ctx.part_no))
          inside.append(anchor)
          yield wrapper
  EpubFormat.renderers.register(core.Aside, None, AsideR('div'))
  
+EpubFormat.renderers.register(core.Aside, 'comment', Silent())
+
  
-class DivR(EpubRenderer):
+class DivR(NaturalText):
      def container(self, ctx):
          root, inner = super(DivR, self).container(ctx)
          if getattr(ctx, 'inline', False):
@@ -253,15 +332,54 @@ class DivR(EpubRenderer):
              inner.set('style', 'display: block;')
          return root, inner
  EpubFormat.renderers.register(core.Div, None, DivR('div'))
+EpubFormat.renderers.register(core.Div, 'p', NaturalText('p'))
+
+EpubFormat.renderers.register(core.Div, 'list', NaturalText('ul'))
+EpubFormat.renderers.register(core.Div, 'list.enum', NaturalText('ol'))
+EpubFormat.renderers.register(core.Div, 'item', NaturalText('li'))
+EpubFormat.renderers.register(core.Span, 'item', NaturalText('li'))
  
  
-class HeaderR(EpubRenderer):
+class DivImageR(EpubRenderer):
+    def render(self, element, ctx):
+        src = element.attrib.get('src', '')
+        ctx.images.append(src)
+        if '/' not in src:
+            raise BuildError('Bad image URL')
+        src = src.rsplit('/', 1)[1]
+        return super(DivImageR, self).render(element, Context(ctx, src=src))
+
+    def container(self, ctx):
+        root, inner = super(DivImageR, self).container(ctx)
+        src = getattr(ctx, 'src', '')
+        inner.set('src', src)
+        # inner.set('style', 'display: block; width: 60%; margin: 3em auto')
+        return root, inner
+EpubFormat.renderers.register(core.Div, 'img', DivImageR('img'))
+
+
+class DivVideoR(Silent):
+    def render(self, element, ctx):
+        src = 'https://www.youtube.com/watch?v=%s' % element.attrib.get('videoid', '')
+        return super(DivVideoR, self).render(element, Context(ctx, src=src))
+
+    def container(self, ctx):
+        root, inner = super(DivVideoR, self).container(ctx)
+        src = getattr(ctx, 'src', '')
+        link = etree.Element('a', {'href': src})
+        link.text = src
+        inner.append(link)
+        return root, inner
+EpubFormat.renderers.register(core.Div, 'video', DivVideoR('p'))
+
+
+class HeaderR(NaturalText):
      def subcontext(self, element, ctx):
          return Context(ctx, inline=True)
  EpubFormat.renderers.register(core.Header, None, HeaderR('h1'))
  
  
-class SectionR(EpubRenderer):
+class SectionR(NaturalText):
      epub_separate = True
  
      def render(self, element, ctx):
@@ -273,7 +391,21 @@ class SectionR(EpubRenderer):
  EpubFormat.renderers.register(core.Section, None, SectionR())
  
  
-class SpanR(EpubRenderer):
+class SpanR(NaturalText):
      pass
  EpubFormat.renderers.register(core.Span, None, SpanR('span'))
+EpubFormat.renderers.register(core.Span, 'cite', SpanR('i'))
+EpubFormat.renderers.register(core.Span, 'emp', SpanR('b'))
+EpubFormat.renderers.register(core.Span, 'emph', SpanR('i'))
+
  
+class SpanLink(EpubRenderer):
+    def render(self, element, ctx):
+        parts = super(SpanLink, self).render(element, ctx)
+        for part in parts:
+            src = element.attrib.get('href', '')
+            if src.startswith('file://'):
+                src = ctx.files_path + src[7:]
+            part[0].attrib['href'] = src
+            yield part
+EpubFormat.renderers.register(core.Span, 'link', SpanLink('a'))