fix for files without dc
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.  
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import shutil
11 import sys
12 from copy import deepcopy
13 from lxml import etree
14 import zipfile
15
16 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, XHTMLNS, NoDublinCore
17
# Namespace helpers for the two XML vocabularies generated below.
# In this module NCXNS(...) is applied to tag names of the toc.ncx tree and
# OPFNS(...) to tag names of the content.opf tree; the result is passed to
# makeelement()/find().
NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
20
21
def inner_xml(node):
    """ Serialize the content of a node: its leading text and its children.

    The node's own start and end tags are not part of the result.

    >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
    x<b>y</b>z
    """
    # A missing leading text (None) contributes an empty string.
    fragments = [node.text or '']
    for child in node:
        fragments.append(etree.tostring(child))
    return ''.join(fragments)
31
def set_inner_xml(node, text):
    """ Replace the content of a node with the parsed markup from a string.

    `text` is wrapped in a throw-away <x> element so that it can be parsed
    as a single document; it therefore has to be well-formed on its own.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print(etree.tostring(e))
    <a>x<b>y</b>z</a>
    """
    wrapper = etree.fromstring('<x>%s</x>' % text)
    # Take over the parsed children and the leading text of the wrapper.
    node[:] = wrapper[:]
    node.text = wrapper.text
45
46
def node_name(node):
    """ Return the plain text of a node, used as its display name.

    Works on a deep copy, so `node` itself is left untouched.  Elements
    tagged pe, pa, pt, pr or motyw are emptied (their tail text is kept)
    before all remaining markup is stripped.

    >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
    XYZ
    """
    scratch = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for element in scratch.findall('.//%s' % tag):
            # clear() also drops the tail, so put it back afterwards.
            kept_tail = element.tail
            element.clear()
            element.tail = kept_tail

    # Dissolve every remaining tag, leaving only text in the copy.
    etree.strip_tags(scratch, '*')
    return scratch.text
63
64
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored at path `sheet` to `xml`.

    `xml` may be an element or a tree; a bare element is wrapped in an
    ElementTree first.  Returns whatever the tree's xslt() call returns.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
70
71
# Directory named 'epub' located next to this module's real (symlink-resolved)
# path; res() resolves resource names against it.
_resdir = os.path.join(
    os.path.dirname(os.path.realpath(__file__)),
    'epub',
)


def res(fname):
    """ Return the path of `fname` inside the module's 'epub' directory. """
    return os.path.join(_resdir, fname)
75
76
def replace_characters(node):
    """ Rewrite typewriter punctuation in a subtree as entity references.

    Works in place on the text and tail of `node` and, recursively, of all
    its descendants.  An element tagged 'extra' is cleared instead, and
    nothing below it is visited.
    """
    # Applied strictly in this order: '&' has to be handled before the
    # replacements that introduce '&' themselves, and '---' before '--'.
    substitutions = (
        ("&", "&amp;"),
        ("---", "&#8212;"),
        ("--", "&#8211;"),
        (",,", "&#8222;"),
        ('"', "&#8221;"),
        ("'", "&#8217;"),
    )

    def substitute(text):
        if text is None:
            return None
        for plain, entity in substitutions:
            text = text.replace(plain, entity)
        return text

    if node.tag == 'extra':
        node.clear()
        return

    node.text = substitute(node.text)
    node.tail = substitute(node.tail)
    for child in node:
        replace_characters(child)
94
95
def find_annotations(annotations, source, part_number):
    """ Move annotation elements out of `source` into `annotations`.

    Every descendant tagged pe, pa, pt or pr is deep-copied into
    `annotations`; the copy gets 'number' (its 1-based position in
    `annotations`) and 'part' (`part_number`) attributes and an empty tail.
    The original element is emptied in place and left holding only that
    number as text, with its tail preserved.  Children of 'extra' and
    'podtytul' elements are not searched.
    """
    for element in source:
        tag = element.tag
        if tag in ('pe', 'pa', 'pt', 'pr'):
            note = deepcopy(element)
            note.set('number', str(len(annotations) + 1))
            note.set('part', str(part_number))
            note.tail = ''
            annotations.append(note)

            # Turn the original into a bare reference to the note;
            # clear() drops the tail too, so restore it.
            kept_tail = element.tail
            element.clear()
            element.tail = kept_tail
            element.text = str(len(annotations))
        if tag in ('extra', 'podtytul'):
            continue
        find_annotations(annotations, element, part_number)
110
111
def replace_by_verse(tree):
    """ Turn '/'-terminated lines inside stanzas into explicit verse elements.

    For every 'strofa' element found below `tree`, the serialized content is
    split on '/' followed by a newline.  When that yields more than one
    piece, each piece is wrapped in <wers_normalny> unless it already starts
    with '<wers' (or, at stanza level, '<extra').  The same is done first
    for direct 'slowo_obce' and 'wyroznienie' children of the stanza.
    """
    def wrap_verses(pieces, untouched_prefixes):
        # Pieces starting with one of the prefixes are passed through
        # verbatim; all others become <wers_normalny> elements.
        wrapped = []
        for piece in pieces:
            if piece.startswith(untouched_prefixes):
                wrapped.append(piece)
            else:
                wrapped.append('<wers_normalny>' + piece + '</wers_normalny>')
        return ''.join(wrapped)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        # Inline children are rewritten first, so that the stanza-level
        # serialization below already sees their new content.
        for inline in stanza:
            if inline.tag not in ('slowo_obce', 'wyroznienie'):
                continue
            inline_pieces = inner_xml(inline).split('/\n')
            if len(inline_pieces) > 1:
                set_inner_xml(inline, wrap_verses(inline_pieces, ('<wers',)))

        stanza_pieces = inner_xml(stanza).split('/\n')
        if len(stanza_pieces) > 1:
            set_inner_xml(stanza, wrap_verses(stanza_pieces, ('<wers', '<extra')))
136
137
def add_to_manifest(manifest, partno):
    """ Append an <item> for part number `partno` to the OPF manifest.

    The item gets the id 'part<N>', points at 'part<N>.html' and is
    declared as application/xhtml+xml.
    """
    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
147
148
def add_to_spine(spine, partno):
    """ Append an <itemref> referencing 'part<N>' to the OPF spine. """
    reference = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=reference))
153
154
def _build_nav_point(nav_map, counter, title, src):
    """ Create a detached NCX navPoint element.

    The element gets the id 'NavPoint-<counter>' and playOrder <counter>,
    a navLabel/text child holding `title`, and a content child whose 'src'
    attribute is `src`.  It is created through `nav_map.makeelement` but is
    not attached anywhere; the caller decides where it goes.
    """
    nav_point = nav_map.makeelement(NCXNS('navPoint'))
    nav_point.set('id', 'NavPoint-%d' % counter)
    nav_point.set('playOrder', str(counter))

    nav_label = nav_map.makeelement(NCXNS('navLabel'))
    text = nav_map.makeelement(NCXNS('text'))
    text.text = title
    nav_label.append(text)
    nav_point.append(nav_label)

    content = nav_map.makeelement(NCXNS('content'))
    content.set('src', src)
    nav_point.append(content)

    return nav_point


def add_nav_point(nav_map, counter, title, part_counter):
    """ Append a top-level navPoint for 'part<part_counter>.html'.

    `counter` is used for both the id suffix and the playOrder; `title` is
    the label text.  The new element becomes the last child of `nav_map`.
    """
    nav_point = _build_nav_point(nav_map, counter, title,
                                 'part%d.html' % part_counter)
    nav_map.append(nav_point)


def add_nav_point2(nav_map, counter, title, part_counter, subcounter):
    """ Append a second-level navPoint below the last entry of `nav_map`.

    The target is 'part<part_counter>.html#sub<subcounter>'.  The new
    element is nested inside the current last child of `nav_map`, so
    `nav_map` must not be empty (an IndexError is raised otherwise).
    """
    nav_point = _build_nav_point(
        nav_map, counter, title,
        'part%d.html#sub%d' % (part_counter, subcounter))
    nav_map[-1].append(nav_point)
189
190
def transform(input_file, output_file):
    """ produces an epub

    input_file and output_file should be filelike objects

    The source document is parsed from input_file and a ZIP archive is
    written to output_file containing: mimetype, META-INF/container.xml,
    OPS/style.css, OPS/logo_wolnelektury.png, OPS/title.html, one
    OPS/part<N>.html per part, OPS/annotations.html (only when annotations
    were found), OPS/content.opf and OPS/toc.ncx.

    Raises NoDublinCore when the document has no rdf:Description element.
    The archive is finalized with zip.close(); this function does not call
    close() on input_file or output_file itself.
    """

    input_xml = etree.parse(input_file)

    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # The mimetype entry is written first and stored uncompressed
    # (the EPUB container format expects it that way).
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    zip.writestr(mime, 'application/epub+zip')

    # container.xml declares OPS/content.opf as the package root file.
    zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                       '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                       'media-type="application/oebps-package+xml" />' \
                       '</rootfiles></container>')

    # NOTE(review): by the time this check fails, the archive has already
    # been opened and two entries written to output_file.
    metadata_el = input_xml.find('.//'+RDFNS('Description'))
    if metadata_el is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    metadatasource = etree.ElementTree(metadata_el)

    # content.opf skeleton is generated from the metadata; manifest and spine
    # are filled in below as parts are written.
    opf = xslt(metadatasource, res('xsltContent.xsl'))

    manifest = opf.find('.//' + OPFNS('manifest'))
    spine = opf.find('.//' + OPFNS('spine'))

    # Static resources are copied verbatim from the module's resource directory.
    for fname in 'style.css', 'logo_wolnelektury.png':
        zip.write(res(fname), os.path.join('OPS', fname))

    # annotations collects every footnote found in any part;
    # part_xml (<utwor><master/></utwor>) is the buffer for the part being built.
    annotations = etree.Element('annotations')
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')

    # toc.ncx skeleton: empty head and docTitle plus two predefined nav points
    # (title page -> title.html, beginning of the work -> part1.html).
    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                               '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                               '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                               '</navPoint><navPoint id="NavPoint-2" playOrder="2"><navLabel>' \
                               '<text>Początek utworu</text></navLabel><content src="part1.html" />' \
                               '</navPoint></navMap></ncx>')

    main_xml_part = part_xml[0] # was [0][0], master
    nav_map = toc_file[-1] # was [-1][-1]
    depth = 1 # navmap; raised to 2 once a second-level heading is seen

    # The main text is the second child of the root when the root has more
    # than one child, otherwise the only child.
    if len(input_xml.getroot()) > 1:
        # rdf before style master
        main_text = input_xml.getroot()[1]
    else:
        # rdf in style master
        main_text = input_xml.getroot()[0]

    # Punctuation is rewritten in place before anything is rendered.
    replace_characters(main_text)
    zip.writestr('OPS/title.html', 
                 etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))

    # Search for table of contents elements and book division

    # stupid_i: number of nav points added so far, plus one (the ids derived
    #           from it are offset to account for the predefined nav points)
    # stupid_j: number of the part currently being built (part<N>.html)
    # stupid_k: next '#sub<N>' anchor number inside the current part
    stupid_i = stupid_j = stupid_k = 1
    # True only directly after a naglowek_czesc heading has started a new part.
    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            if name == "naglowek_czesc":
                # Flush the part collected so far, then start a new part
                # that begins with a copy of this heading.
                stupid_k = 1
                last_node_part = True
                find_annotations(annotations, part_xml, stupid_j)
                replace_by_verse(part_xml)
                zip.writestr('OPS/part%d.html' % stupid_j,
                            etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
                main_xml_part[:] = [deepcopy(one_part)]
                # add to manifest and spine
                add_to_manifest(manifest, stupid_j)
                add_to_spine(spine, stupid_j)
                name_toc = node_name(one_part)
                # build table of contents
                # i+2 because of title page
                add_nav_point(nav_map, stupid_i+2, name_toc, stupid_j + 1)
                stupid_i += 1
                stupid_j += 1
            else:
                if last_node_part:
                    # Heading directly after a naglowek_czesc: it stays in
                    # the part that heading has just started.
                    # NOTE(review): appended without deepcopy, unlike the
                    # other branches - confirm this is intended.
                    main_xml_part.append(one_part)
                    last_node_part = False
                    name_toc = node_name(one_part)
                    # NOTE(review): stupid_i + 1 equals the counter already
                    # used for the nav point of the preceding naglowek_czesc,
                    # and stupid_i is not incremented here - confirm intended.
                    add_nav_point(nav_map, stupid_i + 1, name_toc, stupid_j)
                else:
                    # Flush the part collected so far.
                    stupid_k = 1
                    find_annotations(annotations, part_xml, stupid_j)
                    replace_by_verse(part_xml)
                    zip.writestr('OPS/part%d.html' % stupid_j,
                                 etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
                    # start building a new part
                    main_xml_part[:] = [deepcopy(one_part)]
                    add_to_manifest(manifest, stupid_j)
                    add_to_spine(spine, stupid_j)
                    name_toc = node_name(one_part)
                    add_nav_point(nav_map, stupid_i + 2, name_toc, stupid_j + 1) # title page
                    stupid_j += 1
                    stupid_i += 1
        else:
            if name in ('naglowek_podrozdzial', 'naglowek_scena'):
                # Second-level heading: nested nav point under the last
                # top-level one, addressed through a '#sub<N>' anchor whose
                # number is also stored in the element's 'sub' attribute.
                depth = 2
                name_toc =  node_name(one_part)
                add_nav_point2(nav_map, stupid_i + 2, name_toc, stupid_j, stupid_k)
                one_part.set('sub', str(stupid_k))
                stupid_k += 1
                stupid_i += 1
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    # Flush the final part, which is still held in the buffer.
    find_annotations(annotations, part_xml, stupid_j)
    replace_by_verse(part_xml)
    add_to_manifest(manifest, stupid_j)
    add_to_spine(spine, stupid_j)

    zip.writestr('OPS/part%d.html' % stupid_j,
                 etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        # An annotations page is added to the TOC, manifest and spine only
        # when at least one annotation was collected.
        nav_map.append(etree.fromstring(
            '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
            '</navLabel><content src="annotations.html" /></navPoint>' % {'i':stupid_i+2}))
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        zip.writestr('OPS/annotations.html', etree.tostring(
                            xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))

    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    # NOTE(review): contents is not read anywhere in this function.
    contents = []
    title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
    # Four <meta> entries are appended to the NCX head, all initialised to
    # '0'; the first two (dtb:uid, dtb:depth) are then overwritten.
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(depth))
    # toc_file[1] is the docTitle element of the skeleton.
    # NOTE(review): title is spliced into markup and re-parsed here.
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    zip.close()
344
345
if __name__ == '__main__':
    # Command-line entry point:
    #     python epub.py <input file> [output file]
    # Without an explicit output file, the input path with its extension
    # replaced by '.epub' is used.

    # NOTE(review): presumably the project's own html module living next to
    # this file (it must provide transform(path, is_file=True)) - confirm.
    import html

    if len(sys.argv) < 2:
        sys.stderr.write('Usage: python epub.py <input file> [output file]\n')
        sys.exit(1)

    input_path = sys.argv[1]
    if len(sys.argv) > 2:
        output_path = sys.argv[2]
    else:
        output_path = os.path.splitext(input_path)[0] + '.epub'

    print(input_path)
    if html.transform(input_path, is_file=True) == '<empty />':
        print('empty content - skipping')
    else:
        # Both files are opened in binary mode: the output is a ZIP archive,
        # which text mode can corrupt on platforms that translate newlines,
        # and the XML parser handles the input's declared encoding itself.
        # The with-blocks make sure both handles are closed, even on errors.
        with open(input_path, 'rb') as source:
            with open(output_path, 'wb') as target:
                transform(source, target)
365
366
367