fix special characters
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.  
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import shutil
11 import sys
12 from copy import deepcopy
13 from lxml import etree
14 import zipfile
15
16 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, XHTMLNS, NoDublinCore
17 from librarian.parser import WLDocument
18
# TODO: these namespace definitions shouldn't be repeated here.
# XML namespaces used below for elements of the EPUB navigation file
# (toc.ncx) and of the package file (content.opf) respectively.
NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
22
23
class DocProvider(object):
    """ Base class for objects that hand out source documents.

    Subclasses implement by_slug(); item access (provider[slug]) and
    by_uri() are both expressed in terms of it.
    """

    class DoesNotExist(Exception):
        """ Error type for a document that cannot be provided.

        Nothing in this module raises it; presumably it is meant for
        subclasses -- confirm against the concrete providers.
        """
        pass

    def by_slug(self, slug):
        """ Return the document identified by slug; must be overridden. """
        # NotImplemented is a comparison sentinel, not an exception class:
        # raising it produces a TypeError instead of the intended error.
        raise NotImplementedError

    def __getitem__(self, slug):
        """ provider[slug] is a shorthand for provider.by_slug(slug). """
        return self.by_slug(slug)

    def by_uri(self, uri):
        """ Return the document whose slug is the last path segment of uri.

        A uri without any '/' is treated as a bare slug.
        """
        return self.by_slug(uri.rsplit('/', 1)[-1])
36
37
class DirDocProvider(DocProvider):
    """ Provides documents stored as <slug>.xml files in one directory. """

    def __init__(self, dir):
        self.files = {}
        self.dir = dir

    def by_slug(self, slug):
        """ Open <dir>/<slug>.xml and return the file object.

        The file is returned open; closing it is left to the caller.
        """
        filename = '%s.xml' % slug
        return open(os.path.join(self.dir, filename))
45
46
def inner_xml(node):
    """ serializes what is inside a node: its leading text and its children

    Each child is serialized together with its tail, so text between
    and after the children is included as well.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    leading_text = node.text
    if leading_text is None:
        leading_text = ''
    pieces = [leading_text]
    for child in node:
        pieces.append(etree.tostring(child))
    return ''.join(pieces)
56
def set_inner_xml(node, text):
    """ replaces a node's text and children with the parsed markup in text

    text is parsed inside a throw-away <x> wrapper element, so it must be
    well-formed once wrapped; attributes and tail of node are left alone.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    wrapper = etree.fromstring('<x>%s</x>' % text)
    parsed_children = wrapper[:]
    node.text = wrapper.text
    node[:] = parsed_children
70
71
def node_name(node):
    """ Extract a plain-text name from a node

    Works on a copy, so node itself is not modified. Annotation and
    theme elements (pe, pa, pt, pr, motyw) are emptied first, so their
    content does not end up in the name; all remaining markup is then
    stripped and the resulting text returned.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    def empty_keeping_tail(element):
        # clear() would also drop the text that follows the element
        following_text = element.tail
        element.clear()
        element.tail = following_text

    scratch = deepcopy(node)
    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for match in scratch.findall('.//%s' % tag):
            empty_keeping_tail(match)
    etree.strip_tags(scratch, '*')
    return scratch.text
88
89
def xslt(xml, sheet):
    """ applies the XSLT stylesheet stored in file sheet to xml

    xml may be an element or an element tree; a bare element is wrapped
    in an ElementTree first. Returns the result of the transformation.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
95
96
# directory 'epub' located next to this module (symlinks resolved)
_resdir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    'epub',
)


def res(fname):
    """ returns the path of resource file fname inside the 'epub' directory """
    resource_path = os.path.join(_resdir, fname)
    return resource_path
100
101
def replace_characters(node):
    """ Replace ASCII punctuation with typographic characters, in place

    Applied recursively to the text and tail of node and its descendants:
    '---' becomes an em dash, '--' an en dash, ',,' a low opening quote,
    '"' a closing double quote and "'" a right single quote.

    An 'extra' element is emptied (text, children and attributes are
    dropped), but the text following it belongs to the parent's content,
    so it is kept and converted like any other text.
    """
    # order matters: '---' has to be handled before '--'
    replacements = (
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in replacements:
            text = text.replace(old, new)
        return text

    if node.tag == 'extra':
        # clear() resets the tail as well; save it so the text after
        # the element is not lost
        tail = node.tail
        node.clear()
        node.tail = replace_chars(tail)
    else:
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        for child in node:
            replace_characters(child)
118
119
def find_annotations(annotations, source, part_no):
    """ Collect annotation elements from source and number them in place

    Every pe/pa/pt/pr descendant of source is copied into annotations
    with two extra attributes: 'number' (its 1-based position in
    annotations) and 'part' (part_no). The original element is emptied
    and left holding just that number as text; the text following it is
    preserved. Contents of 'extra' and 'podtytul' elements are not
    searched.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')
    unsearched_tags = ('extra', 'podtytul')

    for element in source:
        if element.tag in annotation_tags:
            number = str(len(annotations) + 1)

            collected = deepcopy(element)
            collected.set('number', number)
            collected.set('part', str(part_no))
            collected.tail = ''
            annotations.append(collected)

            # leave only the reference number behind, keeping the tail
            following_text = element.tail
            element.clear()
            element.tail = following_text
            element.text = number
        if element.tag not in unsearched_tags:
            find_annotations(annotations, element, part_no)
135
136
def replace_by_verse(tree):
    """ Turn '/'-separated lines of every stanza into verse elements

    Inside each stanza (and inside its slowo_obce / wyroznienie
    children) the serialized content is split on a '/' followed by a
    newline. When that yields more than one fragment, each fragment is
    wrapped in <wers_normalny>, except fragments that already start with
    a verse tag (and, at stanza level, with <extra>), which are kept
    verbatim.
    """

    def wrap_fragments(fragments, verbatim_prefixes):
        wrapped = []
        for fragment in fragments:
            if fragment.startswith(verbatim_prefixes):
                wrapped.append(fragment)
            else:
                wrapped.append(''.join(('<wers_normalny>', fragment, '</wers_normalny>')))
        return ''.join(wrapped)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        # nested inline elements first: the stanza is serialized afterwards
        # and has to see their already rewritten content
        for inline in stanza:
            if inline.tag not in ('slowo_obce', 'wyroznienie'):
                continue
            inline_fragments = inner_xml(inline).split('/\n')
            if len(inline_fragments) > 1:
                set_inner_xml(inline, wrap_fragments(inline_fragments, ('<wers',)))

        stanza_fragments = inner_xml(stanza).split('/\n')
        if len(stanza_fragments) > 1:
            set_inner_xml(stanza, wrap_fragments(stanza_fragments, ('<wers', '<extra')))
162
163
def add_to_manifest(manifest, partno):
    """ Appends an item for part number partno to the manifest element

    The item gets the id 'part<partno>' and points to 'part<partno>.html'.
    """

    item_id = 'part%d' % partno
    item_attributes = {
        'id': item_id,
        'href': item_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    item = manifest.makeelement(OPFNS('item'), attrib=item_attributes)
    manifest.append(item)
174
175
def add_to_spine(spine, partno):
    """ Appends an itemref pointing at 'part<partno>' to the spine element """

    reference_attributes = {'idref': 'part%d' % partno}
    itemref = spine.makeelement(OPFNS('itemref'), attrib=reference_attributes)
    spine.append(itemref)
181
182
class TOC(object):
    """ A node of the table-of-contents tree.

    name is the label of the entry, part_number the number of the
    partN.html file it points to, sub_number an optional anchor number
    inside that file, and children the list of nested TOC entries.
    """

    def __init__(self, name=None, part_number=None):
        self.name = name
        self.part_number = part_number
        self.sub_number = None
        self.children = []

    def add(self, name, part_number, level=0, is_part=True):
        """ Add a new entry, level steps below this node where possible.

        While level is positive and the current node has children, the
        request is passed on to the last child; otherwise the entry is
        appended right here. For an entry that is not a part on its own
        (is_part false) a sub_number is assigned -- the number of
        children after the insertion plus one -- and returned; in every
        other case the result is None.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level - 1, is_part)

        entry = TOC(name, part_number)
        self.children.append(entry)
        if is_part:
            return None
        entry.sub_number = len(self.children) + 1
        return entry.sub_number

    def append(self, toc):
        """ Attach toc itself as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Attach all children of toc (not toc itself) as children. """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def write_to_xml(self, nav_map, counter):
        """ Write the children of this node as navPoint elements.

        Every child becomes a navPoint appended to nav_map, with its own
        children nested inside. counter is used for the id and playOrder
        of the first navPoint and grows by one per navPoint written; the
        next unused value is returned.
        """
        make = nav_map.makeelement
        for entry in self.children:
            nav_point = make(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            label_text = make(NCXNS('text'))
            label_text.text = entry.name
            nav_label = make(NCXNS('navLabel'))
            nav_label.append(label_text)
            nav_point.append(nav_label)

            target = 'part%d.html' % entry.part_number
            if entry.sub_number is not None:
                target += '#sub%d' % entry.sub_number
            content = make(NCXNS('content'))
            content.set('src', target)
            nav_point.append(content)

            nav_map.append(nav_point)
            counter = entry.write_to_xml(nav_point, counter + 1)
        return counter
234
235
def chop(main_text):
    """ Generate chunks of the main text, split at heading elements

    Each chunk is an <utwor> element with a single <master> child
    holding copies of consecutive children of main_text. A new chunk
    starts at every naglowek_czesc and at every naglowek_rozdzial,
    naglowek_akt or srodtytul, unless that heading directly follows a
    naglowek_czesc (then it stays in the same chunk).

    The very same <utwor> element is yielded every time and refilled
    when the generator is resumed, so each chunk has to be used up
    before the next one is requested. The first chunk holds whatever
    precedes the first heading and may therefore be empty.
    """
    chapter_headings = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    # one reusable container for all the chunks
    chunk = etree.Element('utwor')
    master = etree.SubElement(chunk, 'master')

    after_part_heading = False
    for element in main_text:
        tag = element.tag
        is_part_heading = tag == 'naglowek_czesc'
        is_chapter_break = tag in chapter_headings and not after_part_heading
        if is_part_heading or is_chapter_break:
            # hand over what was gathered so far, then start afresh
            yield chunk
            after_part_heading = is_part_heading
            master[:] = [deepcopy(element)]
        else:
            master.append(deepcopy(element))
            after_part_heading = False
    yield chunk
258
259
def transform_chunk(chunk_xml, chunk_no, annotations):
    """ Convert one chunk to HTML

    Headings found directly in the chunk's first child are registered
    in a fresh TOC (sub-headings additionally get a 'sub' attribute
    holding their anchor number), annotations are moved out of the chunk
    into annotations, stanzas are split into verses and the chunk is run
    through the xsltScheme.xsl stylesheet.

    Returns a pair: the serialized HTML and the TOC object.
    """
    part_headings = ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul")
    sub_headings = ('naglowek_podrozdzial', 'naglowek_scena')

    toc = TOC()
    for element in chunk_xml[0]:
        tag = element.tag
        if tag in part_headings:
            toc.add(node_name(element), chunk_no)
        elif tag in sub_headings:
            anchor_number = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            element.set('sub', str(anchor_number))

    find_annotations(annotations, chunk_xml, chunk_no)
    replace_by_verse(chunk_xml)

    html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
    return etree.tostring(html_tree, pretty_print=True), toc
274
275
def transform(provider, slug, output_file):
    """ produces an epub

    provider is a DocProvider
    slug identifies the main document, fetched as provider[slug]
    output_file should be filelike object

    Documents listed in dc:relation.hasPart entries are fetched with
    provider.by_uri() and processed recursively after their parent.

    Raises NoDublinCore when the main document has no rdf:Description.
    The ZIP archive is closed also when the conversion fails.
    """

    def transform_file(input_xml, chunk_counter=1, first=True):
        """ processes one input file and proceeds to its children

        chunk_counter is the number of the next partN.html to write.
        Returns a pair: the TOC of this file (children included) and
        the updated chunk_counter.
        """

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        if first:
            # write book title page
            epub_zip.writestr('OPS/title.html',
                 etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))
        elif children:
            # write title page for every parent
            epub_zip.writestr('OPS/part%d.html' % chunk_counter,
                etree.tostring(xslt(input_xml, res('xsltChunkTitle.xsl')), pretty_print=True))
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            replace_characters(main_text)

            # chop() reuses one element for all chunks, so every chunk
            # is fully processed before the next one is requested
            for chunk_xml in chop(main_text):
                chunk_html, chunk_toc = transform_chunk(chunk_xml, chunk_counter, annotations)
                toc.extend(chunk_toc)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        if children:
            for child in children:
                child_xml = etree.parse(provider.by_uri(child))
                child_toc, chunk_counter = transform_file(child_xml, chunk_counter, first=False)
                toc.append(child_toc)

        return toc, chunk_counter


    epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
    try:
        # write static elements
        # (the mimetype entry goes first and is stored uncompressed)
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        mime.extra = ''
        epub_zip.writestr(mime, 'application/epub+zip')
        epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                           'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                           '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                           'media-type="application/oebps-package+xml" />' \
                           '</rootfiles></container>')
        for fname in 'style.css', 'logo_wolnelektury.png':
            epub_zip.write(res(fname), os.path.join('OPS', fname))

        # metadata from first file
        input_xml = etree.parse(provider[slug])
        metadata = input_xml.find('.//'+RDFNS('Description'))
        if metadata is None:
            raise NoDublinCore('Document has no DublinCore - which is required.')
        metadata = etree.ElementTree(metadata)
        opf = xslt(metadata, res('xsltContent.xsl'))
        manifest = opf.find('.//' + OPFNS('manifest'))
        spine = opf.find('.//' + OPFNS('spine'))

        # filled in by transform_chunk() with annotations of all the chunks
        annotations = etree.Element('annotations')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                                   '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                                   '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                                   'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                                   '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                                   '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                                   '</navPoint><navPoint id="NavPoint-2" playOrder="2"><navLabel>' \
                                   '<text>Początek utworu</text></navLabel><content src="part1.html" />' \
                                   '</navPoint></navMap></ncx>')
        nav_map = toc_file[-1]

        toc, chunk_counter = transform_file(input_xml)
        toc_counter = toc.write_to_xml(nav_map, 3) # we already have 2 navpoints

        # Last modifications in container files and EPUB creation
        if len(annotations) > 0:
            nav_map.append(etree.fromstring(
                '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
                '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
            manifest.append(etree.fromstring(
                '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
            spine.append(etree.fromstring(
                '<itemref idref="annotations" />'))
            replace_by_verse(annotations)
            epub_zip.writestr('OPS/annotations.html', etree.tostring(
                                xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))

        epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
        title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
        # toc_file[0] is <head>: give it the four dtb:* meta entries
        attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
        for st in attributes:
            meta = toc_file.makeelement(NCXNS('meta'))
            meta.set('name', st)
            meta.set('content', '0')
            toc_file[0].append(meta)
        toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
        toc_file[0][1].set('content', str(toc.depth()))
        # toc_file[1] is <docTitle>. The title is set as element text
        # rather than spliced into markup and re-parsed, so characters
        # like '&' or '<' in a title cannot break the document.
        doc_title = toc_file[1]
        title_text = toc_file.makeelement(NCXNS('text'))
        title_text.text = title
        doc_title.text = None
        doc_title[:] = [title_text]
        epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    finally:
        # always release the archive, also when the conversion failed
        epub_zip.close()
401
402
# Command-line use: converts <input file> (an XML document) to an EPUB
# written next to it, with the extension replaced by '.epub'. Documents
# referenced by the input are looked up in the same directory.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        sys.stderr.write('Usage: python epub.py <input file>\n')
        sys.exit(1)

    main_input = sys.argv[1]
    basepath, ext = os.path.splitext(main_input)
    # os.path.split keeps the directory of a file placed directly in
    # the filesystem root and honours the platform's path separator
    path, slug = os.path.split(os.path.realpath(basepath))
    output = basepath + '.epub'
    provider = DirDocProvider(path)
    # an EPUB is a ZIP archive: write it in binary mode and make sure
    # the file gets closed
    with open(output, 'wb') as output_file:
        transform(provider, slug, output_file)
414