joined epubs, with no themes
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.  
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import shutil
11 import sys
12 from copy import deepcopy
13 from lxml import etree
14 import zipfile
15
16 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, XHTMLNS, NoDublinCore
17 from librarian.parser import WLDocument
18
19 #TODO: shouldn't be repeated here
20 NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
21 OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
22
23
24 class DocProvider(object):
25     class DoesNotExist(Exception):
26         pass
27     
28     def by_slug(self, slug):
29         raise NotImplemented
30
31     def __getitem__(self, slug):
32         return self.by_slug(slug)
33
34     def by_uri(self, uri):
35         return self.by_slug(uri.rsplit('/', 1)[1])
36
37
38 class DirDocProvider(DocProvider):
39     def __init__(self, dir):
40         self.dir = dir
41         self.files = {}
42
43     def by_slug(self, slug):
44         return open(os.path.join(self.dir, '%s.xml' % slug))
45
46
47 def inner_xml(node):
48     """ returns node's text and children as a string
49
50     >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
51     x<b>y</b>z
52     """
53
54     nt = node.text if node.text is not None else ''
55     return ''.join([nt] + [etree.tostring(child) for child in node]) 
56
57 def set_inner_xml(node, text):
58     """ sets node's text and children from a string
59
60     >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
61     >>> set_inner_xml(e, 'x<b>y</b>z')
62     >>> print etree.tostring(e)
63     <a>x<b>y</b>z</a>
64     """
65
66     
67     p = etree.fromstring('<x>%s</x>' % text)
68     node.text = p.text
69     node[:] = p[:]
70
71
72 def node_name(node):
73     """ Find out a node's name
74
75     >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
76     XYZ
77     """
78
79     tempnode = deepcopy(node)
80
81     for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
82         for e in tempnode.findall('.//%s' % p):
83             t = e.tail
84             e.clear()
85             e.tail = t
86     etree.strip_tags(tempnode, '*')
87     return tempnode.text
88
89
90 def xslt(xml, sheet):
91     if isinstance(xml, etree._Element):
92         xml = etree.ElementTree(xml)
93     with open(sheet) as xsltf:
94         return xml.xslt(etree.parse(xsltf))
95
96
97 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
98 def res(fname):
99     return os.path.join(_resdir, fname)
100
101
102 def replace_characters(node):
103     def replace_chars(text):
104         if text is None:
105             return None
106         return text.replace("&", "&amp;")\
107                    .replace("---", "&#8212;")\
108                    .replace("--", "&#8211;")\
109                    .replace(",,", "&#8222;")\
110                    .replace('"', "&#8221;")\
111                    .replace("'", "&#8217;")
112     if node.tag == 'extra':
113         node.clear()
114     else:
115         node.text = replace_chars(node.text)
116         node.tail = replace_chars(node.tail)
117         for child in node:
118             replace_characters(child)
119
120
121 def find_annotations(annotations, source, part_no):
122     for child in source:
123         if child.tag in ('pe', 'pa', 'pt', 'pr'):
124             annotation = deepcopy(child)
125             number = str(len(annotations)+1)
126             annotation.set('number', number)
127             annotation.set('part', str(part_no))
128             annotation.tail = ''
129             annotations.append(annotation)
130             tail = child.tail
131             child.clear()
132             child.tail = tail
133             child.text = number
134         if child.tag not in ('extra', 'podtytul'):
135             find_annotations(annotations, child, part_no)
136
137
138 def replace_by_verse(tree):
139     """ Find stanzas and create new verses in place of a '/' character """
140
141     stanzas = tree.findall('.//' + WLNS('strofa'))
142     for node in stanzas:
143         for child_node in node:
144             if child_node.tag in ('slowo_obce', 'wyroznienie'):
145                 foreign_verses = inner_xml(child_node).split('/\n')
146                 if len(foreign_verses) > 1:
147                     new_foreign = ''
148                     for foreign_verse in foreign_verses:
149                         if foreign_verse.startswith('<wers'):
150                             new_foreign += foreign_verse
151                         else:
152                             new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
153                     set_inner_xml(child_node, new_foreign)
154         verses = inner_xml(node).split('/\n')
155         if len(verses) > 1:
156             modified_inner_xml = ''
157             for verse in verses:
158                 if verse.startswith('<wers') or verse.startswith('<extra'):
159                     modified_inner_xml += verse
160                 else:
161                     modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
162             set_inner_xml(node, modified_inner_xml)
163
164
165 def add_to_manifest(manifest, partno):
166     """ Adds a node to the manifest section in content.opf file """
167
168     partstr = 'part%d' % partno
169     e = manifest.makeelement(OPFNS('item'), attrib={
170                                  'id': partstr,
171                                  'href': partstr + '.html',
172                                  'media-type': 'application/xhtml+xml',
173                              })
174     manifest.append(e)
175
176
177 def add_to_spine(spine, partno):
178     """ Adds a node to the spine section in content.opf file """
179
180     e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
181     spine.append(e)
182
183
184 class TOC(object):
185     def __init__(self, name=None, part_number=None):
186         self.children = []
187         self.name = name
188         self.part_number = part_number
189         self.sub_number = None
190
191     def add(self, name, part_number, level=0, is_part=True):
192         if level > 0 and self.children:
193             return self.children[-1].add(name, part_number, level-1, is_part)
194         else:
195             t = TOC(name)
196             t.part_number = part_number
197             self.children.append(t)
198             if not is_part:
199                 t.sub_number = len(self.children) + 1
200                 return t.sub_number
201
202     def append(self, toc):
203         self.children.append(toc)
204
205     def extend(self, toc):
206         self.children.extend(toc.children)
207
208     def depth(self):
209         if self.children:
210             return max((c.depth() for c in self.children)) + 1
211         else:
212             return 0
213
214     def write_to_xml(self, nav_map, counter):
215         for child in self.children:
216             nav_point = nav_map.makeelement(NCXNS('navPoint'))
217             nav_point.set('id', 'NavPoint-%d' % counter)
218             nav_point.set('playOrder', str(counter))
219
220             nav_label = nav_map.makeelement(NCXNS('navLabel'))
221             text = nav_map.makeelement(NCXNS('text'))
222             text.text = child.name
223             nav_label.append(text)
224             nav_point.append(nav_label)
225
226             content = nav_map.makeelement(NCXNS('content'))
227             src = 'part%d.html' % child.part_number
228             if child.sub_number is not None:
229                 src += '#sub%d' % child.sub_number
230             content.set('src', src)
231             nav_point.append(content)
232             nav_map.append(nav_point)
233             counter = child.write_to_xml(nav_point, counter + 1)
234         return counter
235
236
237 def chop(main_text):
238     """ divide main content of the XML file into chunks """
239
240     # prepare a container for each chunk
241     part_xml = etree.Element('utwor')
242     etree.SubElement(part_xml, 'master')
243     main_xml_part = part_xml[0] # master
244
245     last_node_part = False
246     for one_part in main_text:
247         name = one_part.tag
248         if name == 'naglowek_czesc':
249             yield part_xml
250             last_node_part = True
251             main_xml_part[:] = [deepcopy(one_part)]
252         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
253             yield part_xml
254             main_xml_part[:] = [deepcopy(one_part)]
255         else:
256             main_xml_part.append(deepcopy(one_part))
257             last_node_part = False
258     yield part_xml
259
260
261 def transform_chunk(chunk_xml, chunk_no, annotations):
262     """ transforms one chunk, returns a HTML string and a TOC object """
263
264     toc = TOC()
265     for element in chunk_xml[0]:
266         if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
267             toc.add(node_name(element), chunk_no)
268         elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
269             subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
270             element.set('sub', str(subnumber))
271     find_annotations(annotations, chunk_xml, chunk_no)
272     replace_by_verse(chunk_xml)
273     output_html = etree.tostring(xslt(chunk_xml, res('xsltScheme.xsl')), pretty_print=True)
274     return output_html, toc
275
276
277 def transform(provider, slug, output_file):
278     """ produces an epub
279
280     provider is a DocProvider
281     output_file should be filelike object
282     """
283
284     def transform_file(input_xml, chunk_counter=1, first=True):
285         """ processes one input file and proceeds to its children """
286
287         children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
288
289         # every input file will have a TOC entry,
290         # pointing to starting chunk
291         toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
292         if first:
293             # write book title page
294             zip.writestr('OPS/title.html',
295                  etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))
296         elif children:
297             # write title page for every parent
298             zip.writestr('OPS/part%d.html' % chunk_counter, 
299                 etree.tostring(xslt(input_xml, res('xsltChunkTitle.xsl')), pretty_print=True))
300             add_to_manifest(manifest, chunk_counter)
301             add_to_spine(spine, chunk_counter)
302             chunk_counter += 1
303
304         if len(input_xml.getroot()) > 1:
305             # rdf before style master
306             main_text = input_xml.getroot()[1]
307         else:
308             # rdf in style master
309             main_text = input_xml.getroot()[0]
310             if main_text.tag == RDFNS('RDF'):
311                 main_text = None
312
313         if main_text is not None:
314             replace_characters(main_text)
315
316             for chunk_no, chunk_xml in enumerate(chop(main_text), chunk_counter):
317                 chunk_html, chunk_toc = transform_chunk(chunk_xml, chunk_counter, annotations)
318                 toc.extend(chunk_toc)
319                 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
320                 add_to_manifest(manifest, chunk_counter)
321                 add_to_spine(spine, chunk_counter)
322                 chunk_counter += 1
323
324         if children:
325             for child in children:
326                 child_xml = etree.parse(provider.by_uri(child))
327                 child_toc, chunk_counter = transform_file(child_xml, chunk_counter, first=False)
328                 toc.append(child_toc)
329
330         return toc, chunk_counter
331
332
333     zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
334
335     # write static elements
336     mime = zipfile.ZipInfo()
337     mime.filename = 'mimetype'
338     mime.compress_type = zipfile.ZIP_STORED
339     mime.extra = ''
340     zip.writestr(mime, 'application/epub+zip')
341     zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
342                        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
343                        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
344                        'media-type="application/oebps-package+xml" />' \
345                        '</rootfiles></container>')
346     for fname in 'style.css', 'logo_wolnelektury.png':
347         zip.write(res(fname), os.path.join('OPS', fname))
348
349     # metadata from first file
350     input_xml = etree.parse(provider[slug])
351     metadata = input_xml.find('.//'+RDFNS('Description'))
352     if metadata is None:
353         raise NoDublinCore('Document has no DublinCore - which is required.')
354     metadata = etree.ElementTree(metadata)
355     opf = xslt(metadata, res('xsltContent.xsl'))
356     manifest = opf.find('.//' + OPFNS('manifest'))
357     spine = opf.find('.//' + OPFNS('spine'))
358
359     annotations = etree.Element('annotations')
360
361     toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
362                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
363                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
364                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
365                                '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
366                                '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
367                                '</navPoint><navPoint id="NavPoint-2" playOrder="2"><navLabel>' \
368                                '<text>Początek utworu</text></navLabel><content src="part1.html" />' \
369                                '</navPoint></navMap></ncx>')
370     nav_map = toc_file[-1]
371
372     toc, chunk_counter = transform_file(input_xml)
373     toc_counter = toc.write_to_xml(nav_map, 3) # we already have 2 navpoints
374
375     # Last modifications in container files and EPUB creation
376     if len(annotations) > 0:
377         nav_map.append(etree.fromstring(
378             '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
379             '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
380         manifest.append(etree.fromstring(
381             '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
382         spine.append(etree.fromstring(
383             '<itemref idref="annotations" />'))
384         replace_by_verse(annotations)
385         zip.writestr('OPS/annotations.html', etree.tostring(
386                             xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))
387
388     zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
389     contents = []
390     title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
391     attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
392     for st in attributes:
393         meta = toc_file.makeelement(NCXNS('meta'))
394         meta.set('name', st)
395         meta.set('content', '0')
396         toc_file[0].append(meta)
397     toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
398     toc_file[0][1].set('content', str(toc.depth()))
399     set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
400     zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
401     zip.close()
402
403
404 if __name__ == '__main__':
405     if len(sys.argv) < 2:
406         print >> sys.stderr, 'Usage: python epub.py <input file>'
407         sys.exit(1)
408
409     main_input = sys.argv[1]
410     basepath, ext = os.path.splitext(main_input)
411     path, slug = os.path.realpath(basepath).rsplit('/', 1)
412     output = basepath + '.epub'
413     provider = DirDocProvider(path)
414     transform(provider, slug, open(output, 'w'))
415