change paragraph to p in epub
[librarian.git] / librarian / epub.py
1 # -*- coding: utf-8 -*-
2 #
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.  
5 #
6 from __future__ import with_statement
7
8 import os
9 import os.path
10 import subprocess
11 from copy import deepcopy
12 from lxml import etree
13 import zipfile
14 from tempfile import mkdtemp
15 from shutil import rmtree
16
17 import sys
18 sys.path.append('..') # for running from working copy
19
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
21 from librarian.dcparser import BookInfo
22
23
def inner_xml(node):
    """ serializes the content of a node: its leading text followed by its children

    Each child is serialized together with its tail text; the node's own
    tag and attributes are not part of the result.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    fragments = ['' if node.text is None else node.text]
    for child in node:
        fragments.append(etree.tostring(child))
    return ''.join(fragments)
33
def set_inner_xml(node, text):
    """ replaces node's text and children with the parsed content of a string

    `text` has to be a well-formed XML fragment; it is parsed inside
    a temporary wrapper element.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    # a throwaway root lets a mixed text/element fragment parse as one document
    wrapper = etree.fromstring('<x>%s</x>' % text)
    leading_text = wrapper.text
    new_children = wrapper[:]
    node[:] = new_children
    node.text = leading_text
46
47
def node_name(node):
    """ returns the plain text of a node, used as its name

    Works on a copy, so the node itself is left untouched. The content of
    'pe', 'pa', 'pt', 'pr' and 'motyw' elements is left out of the name.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    scratch = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for unwanted in scratch.findall('.//' + tag):
            # clear() would drop the tail too, and the tail is part of the name
            kept_tail = unwanted.tail
            unwanted.clear()
            unwanted.tail = kept_tail
    # dissolve all remaining markup, leaving only the text
    etree.strip_tags(scratch, '*')
    return scratch.text
64
65
def xslt(xml, sheet):
    """ transforms `xml` with the XSLT stylesheet read from the file `sheet`

    `xml` may be an element or an element tree; an element is wrapped in
    an ElementTree first. Returns the result of the transformation.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
71
72
# resources live in the 'epub' directory next to this module (symlinks resolved)
_resdir = os.path.join(os.path.split(os.path.realpath(__file__))[0], 'epub')
def res(fname):
    """ returns the path of the resource file `fname` """
    resource_path = os.path.join(_resdir, fname)
    return resource_path
76
77
def replace_characters(node):
    """ replaces ASCII stand-ins with typographic characters, in place and recursively

    In the text and tail of `node` and all its descendants:
    '---' becomes an em dash, '--' an en dash, ',,' a low opening quote,
    '"' a right double quote and "'" a right single quote.

    An 'extra' element is emptied instead (its attributes, text and
    children are removed), but the text that follows it is kept.
    """

    def replace_chars(text):
        if text is None:
            return None
        # '---' has to be handled before '--', otherwise it would never match
        return text.replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag == 'extra':
        # clear() resets the tail as well, but the tail is content of the
        # parent, not of the 'extra' element - keep it
        tail = node.tail
        node.clear()
        node.tail = replace_chars(tail)
    else:
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        for child in node:
            replace_characters(child)
94
95
def find_annotations(annotations, source, part_no):
    """ collects annotations from below `source` and leaves numbered markers in their place

    Every 'pe', 'pa', 'pt' or 'pr' element found is copied into
    `annotations` with two attributes added: 'number' (its 1-based position
    in `annotations`) and 'part' (`part_no`). The element left in the text
    is emptied and gets the number as its only content.
    Content of 'extra' and 'podtytul' elements is not searched.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')
    not_searched = ('extra', 'podtytul')

    for element in source:
        if element.tag in annotation_tags:
            number = str(len(annotations) + 1)

            collected = deepcopy(element)
            collected.set('number', number)
            collected.set('part', str(part_no))
            collected.tail = ''
            annotations.append(collected)

            # clear() drops the tail as well, so carry it over by hand
            following_text = element.tail
            element.clear()
            element.tail = following_text
            element.text = number
        if element.tag in not_searched:
            continue
        find_annotations(annotations, element, part_no)
111
112
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    A stanza is split on '/' followed by a newline. Every resulting
    fragment is wrapped in <wers_normalny>, unless it already starts with
    a '<wers...' element (or, directly in a stanza, with '<extra').
    The same is done inside 'slowo_obce' and 'wyroznienie' children
    of a stanza. Stanzas without such a separator are left unchanged.
    """

    def wrap_verses(fragments, verbatim_prefixes):
        """ joins fragments, wrapping those which don't start with any of the prefixes """
        return ''.join(
            fragment if fragment.startswith(verbatim_prefixes)
            else ''.join(('<wers_normalny>', fragment, '</wers_normalny>'))
            for fragment in fragments)

    for stanza in tree.findall('.//' + WLNS('strofa')):
        # first the inline elements, so that the stanza is split with them already rewritten
        for inline in stanza:
            if inline.tag not in ('slowo_obce', 'wyroznienie'):
                continue
            inline_verses = inner_xml(inline).split('/\n')
            if len(inline_verses) > 1:
                set_inner_xml(inline, wrap_verses(inline_verses, ('<wers',)))
        stanza_verses = inner_xml(stanza).split('/\n')
        if len(stanza_verses) > 1:
            set_inner_xml(stanza, wrap_verses(stanza_verses, ('<wers', '<extra')))
138
139
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new item has the id 'part<partno>' and points to 'part<partno>.html'.
    """

    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    item = manifest.makeelement(OPFNS('item'), attrib=attributes)
    manifest.append(item)
150
151
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new itemref refers to the manifest item with the id 'part<partno>'.
    """

    attributes = {'idref': 'part%d' % partno}
    itemref = spine.makeelement(OPFNS('itemref'), attrib=attributes)
    spine.append(itemref)
157
158
class TOC(object):
    """ a node of the table of contents: a named entry with a list of child entries """

    def __init__(self, name=None, part_number=None):
        self.name = name
        self.part_number = part_number
        # set only for entries added with is_part=False;
        # such entries link to a '#sub<N>' anchor inside their part
        self.sub_number = None
        self.children = []

    def add(self, name, part_number, level=0, is_part=True):
        """ adds a new entry `level` levels below this one, under the most recent children

        If there are no children to descend into, the entry is added here.
        For is_part=False the entry gets a sub_number, which is returned;
        otherwise None is returned.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)

        entry = TOC(name, part_number)
        self.children.append(entry)
        if is_part:
            return None
        # counted after the entry has been appended
        entry.sub_number = len(self.children) + 1
        return entry.sub_number

    def append(self, toc):
        """ adds a whole TOC as a child entry """
        self.children.append(toc)

    def extend(self, toc):
        """ adds all children of another TOC as children of this one """
        self.children.extend(toc.children)

    def depth(self):
        """ returns the number of levels below this entry """
        if not self.children:
            return 0
        return 1 + max(child.depth() for child in self.children)

    def write_to_xml(self, nav_map, counter):
        """ appends navPoint elements for all descendants to `nav_map`

        `counter` is the number given to the first navPoint (used for its id
        and playOrder); returns the first number which hasn't been used.
        """
        for entry in self.children:
            label_text = nav_map.makeelement(NCXNS('text'))
            label_text.text = entry.name
            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            nav_label.append(label_text)

            target = 'part%d.html' % entry.part_number
            if entry.sub_number is not None:
                target += '#sub%d' % entry.sub_number
            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', target)

            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))
            nav_point.append(nav_label)
            nav_point.append(content)
            nav_map.append(nav_point)

            # nested entries go inside this navPoint and continue the numbering
            counter = entry.write_to_xml(nav_point, counter + 1)
        return counter
210
211
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set of all characters found in the text and tail of
    the element and of all its descendants.
    """
    found = set((element.text or '') + (element.tail or ''))
    for child in element:
        found |= used_chars(child)
    return found
218
219
220 def chop(main_text):
221     """ divide main content of the XML file into chunks """
222
223     # prepare a container for each chunk
224     part_xml = etree.Element('utwor')
225     etree.SubElement(part_xml, 'master')
226     main_xml_part = part_xml[0] # master
227
228     last_node_part = False
229     for one_part in main_text:
230         name = one_part.tag
231         if name == 'naglowek_czesc':
232             yield part_xml
233             last_node_part = True
234             main_xml_part[:] = [deepcopy(one_part)]
235         elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
236             yield part_xml
237             main_xml_part[:] = [deepcopy(one_part)]
238         else:
239             main_xml_part.append(deepcopy(one_part))
240             last_node_part = False
241     yield part_xml
242
243
def transform_chunk(chunk_xml, chunk_no, annotations):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    Headers found in the chunk become TOC entries pointing to part `chunk_no`;
    sub-headers additionally get a 'sub' attribute with their anchor number.
    Annotations found in the chunk are moved to `annotations`.
    """
    part_headers = ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul")
    sub_headers = ('naglowek_podrozdzial', 'naglowek_scena')

    toc = TOC()
    # chunk_xml[0] is the element holding the content of the chunk
    for element in chunk_xml[0]:
        tag = element.tag
        if tag in part_headers:
            toc.add(node_name(element), chunk_no)
        elif tag in sub_headers:
            sub_number = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            element.set('sub', str(sub_number))

    find_annotations(annotations, chunk_xml, chunk_no)
    replace_by_verse(chunk_xml)

    html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
    chars = used_chars(html_tree.getroot())
    return etree.tostring(html_tree, pretty_print=True), toc, chars
260
261
def _write_fonts(epub, chars):
    """ subsets the DejaVu Serif fonts to `chars` and stores them in the archive `epub` """

    tmpdir = mkdtemp('-librarian-epub')
    cwd = os.getcwd()
    used = ''.join(chars)
    try:
        # subset.pl is called by a relative path, so it is run from its own directory
        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), '../font-optimizer'))
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            subprocess.check_call(['./subset.pl', '--chars', used, res('../fonts/' + fname), os.path.join(tmpdir, fname)])
            epub.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
    finally:
        # restore the working directory and clean up even if subsetting fails
        os.chdir(cwd)
        rmtree(tmpdir)


def _write_epub_contents(epub, provider, input_xml, metadata):
    """ writes all the files making up the book into the open ZipFile `epub`

    `input_xml` is the parsed main document, `metadata` is a tree holding
    its RDF Description element; child documents are read from `provider`.
    """

    def transform_file(input_xml, chunk_counter=1, first=True):
        """ processes one input file and proceeds to its children

        returns the file's TOC, the next free chunk number and a set of used characters
        """

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, res('xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub.writestr('OPS/title.html',
                 etree.tostring(html_tree, pretty_print=True))
        elif children:
            # write title page for every parent
            html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub.writestr('OPS/part%d.html' % chunk_counter,
                etree.tostring(html_tree, pretty_print=True))
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            replace_characters(main_text)

            for chunk_xml in chop(main_text):
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        if children:
            for child in children:
                child_xml = etree.parse(provider.by_uri(child))
                child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
                toc.append(child_toc)
                chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars

    # write static elements
    # mimetype is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    epub.writestr(mime, 'application/epub+zip')
    epub.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" '
                       'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
                       '<rootfiles><rootfile full-path="OPS/content.opf" '
                       'media-type="application/oebps-package+xml" />'
                       '</rootfiles></container>')
    for fname in 'style.css', 'logo_wolnelektury.png':
        epub.write(res(fname), os.path.join('OPS', fname))

    opf = xslt(metadata, res('xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    spine = opf.find('.//' + OPFNS('spine'))

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
                               '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
                               '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
                               'version="2005-1"><head></head><docTitle></docTitle><navMap>'
                               '<navPoint id="NavPoint-1" playOrder="1"><navLabel>'
                               '<text>Strona tytułowa</text></navLabel><content src="title.html" />'
                               '</navPoint></navMap></ncx>')
    nav_map = toc_file[-1]

    toc, chunk_counter, chars = transform_file(input_xml)

    if not toc.children:
        toc.add(u"Początek utworu", 1)
    # NavPoint-1 is the title page, so the numbering of the rest starts at 2
    toc_counter = toc.write_to_xml(nav_map, 2)

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        nav_map.append(etree.fromstring(
            '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'
            '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
        chars = chars.union(used_chars(html_tree.getroot()))
        epub.writestr('OPS/annotations.html', etree.tostring(
                            html_tree, pretty_print=True))

    # strip fonts
    _write_fonts(epub, chars)

    epub.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    # toc_file[0] is <head>; its first two meta elements are dtb:uid and dtb:depth
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    # toc_file[1] is <docTitle>; the title is set as element text rather than
    # pasted into an XML string, so characters like '&' or '<' can't break it
    doc_title = toc_file[1]
    title_text = doc_title.makeelement(NCXNS('text'))
    title_text.text = title
    doc_title.text = None
    doc_title[:] = [title_text]
    epub.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))


def transform(provider, slug, output_file=None, output_dir=None):
    """ produces an epub

    provider is a DocProvider
    either output_file (a file-like object) or output_dir (path to file/dir) should be specified
    if output_dir is specified, file will be written to <output_dir>/<author>/<slug>.epub

    raises NoDublinCore if the document has no RDF Description element

    A file opened here (the output_dir case) is closed before returning;
    a file-like object passed as output_file is left open for the caller.
    """

    # read metadata from the first file
    input_xml = etree.parse(provider[slug])
    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    own_file = None
    if output_dir is not None:
        author = unicode(book_info.author)
        author_dir = os.path.join(output_dir, author)
        try:
            os.makedirs(author_dir)
        except OSError:
            # an already existing directory is fine, anything else is a real error
            if not os.path.isdir(author_dir):
                raise
        # a zip archive is binary data
        own_file = open(os.path.join(author_dir, '%s.epub' % slug), 'wb')
        output_file = own_file

    try:
        epub = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
        try:
            _write_epub_contents(epub, provider, input_xml, metadata)
        finally:
            epub.close()
    finally:
        if own_file is not None:
            own_file.close()
421
422
# Command-line use: python epub.py <input file>
# The epub is written to <directory of the input file>/<author>/<slug>.epub,
# where slug is the name of the input file without its extension.
if __name__ == '__main__':
    from librarian import DirDocProvider

    if len(sys.argv) < 2:
        print >> sys.stderr, 'Usage: python epub.py <input file>'
        sys.exit(1)

    main_input = sys.argv[1]
    basepath, ext = os.path.splitext(main_input)
    # os.path.split works with the platform's separators and gives
    # a usable directory also for files placed directly in the root
    path, slug = os.path.split(os.path.realpath(basepath))
    provider = DirDocProvider(path)
    transform(provider, slug, output_dir=path)
435