1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from copy import deepcopy
13 from lxml import etree
16 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, XHTMLNS, NoDublinCore
18 NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
19 OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
23 """ returns node's text and children as a string
25 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
29 nt = node.text if node.text is not None else ''
30 return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    node -- element whose content is replaced in place
    text -- XML fragment (leading text plus any child elements); it is
            parsed inside a temporary <x> wrapper, so it has to be
            well-formed once wrapped

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # The wrapper lets a fragment with leading text and several sibling
    # elements be parsed as a single document.
    p = etree.fromstring('<x>%s</x>' % text)
    # Move the parsed content over: the leading text first, then every child
    # (the children carry their own tails, so trailing text comes along).
    node.text = p.text
    node[:] = p[:]
48 """ Find out a node's name
50 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
54 tempnode = deepcopy(node)
56 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
57 for e in tempnode.findall('.//%s' % p):
61 etree.strip_tags(tempnode, '*')
66 if isinstance(xml, etree._Element):
67 xml = etree.ElementTree(xml)
68 with open(sheet) as xsltf:
69 return xml.xslt(etree.parse(xsltf))
72 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
74 return os.path.join(_resdir, fname)
77 def replace_characters(node):
78 def replace_chars(text):
81 return text.replace("&", "&")\
82 .replace("---", "—")\
83 .replace("--", "–")\
84 .replace(",,", "„")\
85 .replace('"', "”")\
86 .replace("'", "’")
87 if node.tag == 'extra':
90 node.text = replace_chars(node.text)
91 node.tail = replace_chars(node.tail)
93 replace_characters(child)
96 def find_annotations(annotations, source, part_number):
98 if child.tag in ('pe', 'pa', 'pt', 'pr'):
99 annotation = deepcopy(child)
100 annotation.set('number', str(len(annotations)+1))
101 annotation.set('part', str(part_number))
103 annotations.append(annotation)
107 child.text = str(len(annotations))
108 if child.tag not in ('extra', 'podtytul'):
109 find_annotations(annotations, child, part_number)
def _wrap_verses(markup, passthrough):
    """ Wraps every verse of a '/'-separated markup string in <wers_normalny>

    markup -- inner XML of a stanza (or of an inline element inside one)
    passthrough -- tuple of prefixes; a chunk starting with any of them is
                   kept as it is instead of being wrapped

    Returns the rewritten markup, or None when markup contains no verse
    separator (a '/' directly followed by a newline) and so needs no change.
    """
    chunks = markup.split('/\n')
    if len(chunks) < 2:
        return None
    return ''.join(
        chunk if chunk.startswith(passthrough)
        else ''.join(('<wers_normalny>', chunk, '</wers_normalny>'))
        for chunk in chunks)


def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    tree -- element (or tree) searched for 'strofa' elements; matching
            stanzas are rewritten in place, nothing is returned
    """
    stanzas = tree.findall('.//' + WLNS('strofa'))
    for node in stanzas:
        # Inline elements may span several verses themselves, so their
        # content is split first; chunks that already are verse elements
        # are left untouched.
        for child_node in node:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                new_foreign = _wrap_verses(inner_xml(child_node), ('<wers',))
                if new_foreign is not None:
                    set_inner_xml(child_node, new_foreign)
        # Then the stanza itself; <extra> chunks are passed through as well.
        modified_inner_xml = _wrap_verses(inner_xml(node), ('<wers', '<extra'))
        if modified_inner_xml is not None:
            set_inner_xml(node, modified_inner_xml)
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    manifest -- the OPF <manifest> element that receives the new <item>
    partno -- integer part number; the item is registered as 'part<N>' and
              points at 'part<N>.html'
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        # the id is what the spine's itemref/@idref ('part<N>') refers to
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    spine -- the OPF <spine> element that receives the new <itemref>
    partno -- integer part number; the reference targets the 'part<N>' id
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    # makeelement() only creates the element; it still has to be attached
    spine.append(e)
def add_nav_point(nav_map, counter, title, part_counter):
    """ Appends a navPoint entry directly to the NCX navigation map

    nav_map -- element the new navPoint is appended to
    counter -- integer used for both the 'NavPoint-<n>' id and playOrder
    title -- label text of the entry
    part_counter -- integer number of the part file ('part<N>.html') the
                    entry links to
    """
    nav_point = nav_map.makeelement(NCXNS('navPoint'))
    nav_point.set('id', 'NavPoint-%d' % counter)
    nav_point.set('playOrder', str(counter))

    nav_label = nav_map.makeelement(NCXNS('navLabel'))
    text = nav_map.makeelement(NCXNS('text'))
    # the label would otherwise be emitted empty and title never used
    text.text = title
    nav_label.append(text)
    nav_point.append(nav_label)

    content = nav_map.makeelement(NCXNS('content'))
    content.set('src', 'part%d.html' % part_counter)
    nav_point.append(content)

    nav_map.append(nav_point)
def add_nav_point2(nav_map, counter, title, part_counter, subcounter):
    """ Appends a nested navPoint entry under the last entry of the map

    nav_map -- element whose last child receives the new navPoint; it must
               already have at least one child
    counter -- integer used for both the 'NavPoint-<n>' id and playOrder
    title -- label text of the entry
    part_counter -- integer number of the part file the entry links to
    subcounter -- integer used for the '#sub<N>' fragment inside that file
    """
    nav_point = nav_map.makeelement(NCXNS('navPoint'))
    nav_point.set('id', 'NavPoint-%d' % counter)
    nav_point.set('playOrder', str(counter))

    nav_label = nav_map.makeelement(NCXNS('navLabel'))
    text = nav_map.makeelement(NCXNS('text'))
    # the label would otherwise be emitted empty and title never used
    text.text = title
    nav_label.append(text)
    nav_point.append(nav_label)

    content = nav_map.makeelement(NCXNS('content'))
    content.set('src', 'part%d.html#sub%d' % (part_counter, subcounter))
    nav_point.append(content)

    # nest under the most recently added top-level entry
    nav_map[-1].append(nav_point)
191 def transform(input_file, output_file):
194 input_file and output_file should be filelike objects
197 input_xml = etree.parse(input_file)
199 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
201 mime = zipfile.ZipInfo()
202 mime.filename = 'mimetype'
203 mime.compress_type = zipfile.ZIP_STORED
205 zip.writestr(mime, 'application/epub+zip')
207 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
208 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
209 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
210 'media-type="application/oebps-package+xml" />' \
211 '</rootfiles></container>')
213 metadata_el = input_xml.find('.//'+RDFNS('Description'))
214 if metadata_el is None:
215 raise NoDublinCore('Document has no DublinCore - which is required.')
216 metadatasource = etree.ElementTree(metadata_el)
218 opf = xslt(metadatasource, res('xsltContent.xsl'))
220 manifest = opf.find('.//' + OPFNS('manifest'))
221 spine = opf.find('.//' + OPFNS('spine'))
223 for fname in 'style.css', 'logo_wolnelektury.png':
224 zip.write(res(fname), os.path.join('OPS', fname))
226 annotations = etree.Element('annotations')
227 part_xml = etree.Element('utwor')
228 etree.SubElement(part_xml, 'master')
230 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
231 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
232 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
233 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
234 '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
235 '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
236 '</navPoint><navPoint id="NavPoint-2" playOrder="2"><navLabel>' \
237 '<text>Początek utworu</text></navLabel><content src="part1.html" />' \
238 '</navPoint></navMap></ncx>')
240 main_xml_part = part_xml[0] # było [0][0], master
241 nav_map = toc_file[-1] # było [-1][-1]
244 if len(input_xml.getroot()) > 1:
245 # rdf before style master
246 main_text = input_xml.getroot()[1]
248 # rdf in style master
249 main_text = input_xml.getroot()[0]
251 replace_characters(main_text)
252 zip.writestr('OPS/title.html',
253 etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))
255 # Search for table of contents elements and book division
257 stupid_i = stupid_j = stupid_k = 1
258 last_node_part = False
259 for one_part in main_text:
261 if name in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
262 if name == "naglowek_czesc":
264 last_node_part = True
265 find_annotations(annotations, part_xml, stupid_j)
266 replace_by_verse(part_xml)
267 zip.writestr('OPS/part%d.html' % stupid_j,
268 etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
269 main_xml_part[:] = [deepcopy(one_part)]
270 # add to manifest and spine
271 add_to_manifest(manifest, stupid_j)
272 add_to_spine(spine, stupid_j)
273 name_toc = node_name(one_part)
274 # build table of contents
275 # i+2 because of title page
276 add_nav_point(nav_map, stupid_i+2, name_toc, stupid_j + 1)
281 main_xml_part.append(one_part)
282 last_node_part = False
283 name_toc = node_name(one_part)
284 add_nav_point(nav_map, stupid_i + 1, name_toc, stupid_j)
287 find_annotations(annotations, part_xml, stupid_j)
288 replace_by_verse(part_xml)
289 zip.writestr('OPS/part%d.html' % stupid_j,
290 etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
291 # start building a new part
292 main_xml_part[:] = [deepcopy(one_part)]
293 add_to_manifest(manifest, stupid_j)
294 add_to_spine(spine, stupid_j)
295 name_toc = node_name(one_part)
296 add_nav_point(nav_map, stupid_i + 2, name_toc, stupid_j + 1) # title page
300 if name in ('naglowek_podrozdzial', 'naglowek_scena'):
302 name_toc = node_name(one_part)
303 add_nav_point2(nav_map, stupid_i + 2, name_toc, stupid_j, stupid_k)
304 one_part.set('sub', str(stupid_k))
307 main_xml_part.append(deepcopy(one_part))
308 last_node_part = False
309 find_annotations(annotations, part_xml, stupid_j)
310 replace_by_verse(part_xml)
311 add_to_manifest(manifest, stupid_j)
312 add_to_spine(spine, stupid_j)
314 zip.writestr('OPS/part%d.html' % stupid_j,
315 etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
317 # Last modifications in container files and EPUB creation
318 if len(annotations) > 0:
319 nav_map.append(etree.fromstring(
320 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
321 '</navLabel><content src="annotations.html" /></navPoint>' % {'i':stupid_i+2}))
322 manifest.append(etree.fromstring(
323 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
324 spine.append(etree.fromstring(
325 '<itemref idref="annotations" />'))
326 replace_by_verse(annotations)
327 zip.writestr('OPS/annotations.html', etree.tostring(
328 xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))
330 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
332 title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
333 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
334 for st in attributes:
335 meta = toc_file.makeelement(NCXNS('meta'))
337 meta.set('content', '0')
338 toc_file[0].append(meta)
339 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
340 toc_file[0][1].set('content', str(depth))
341 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
342 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
346 if __name__ == '__main__':
349 if len(sys.argv) < 2:
350 print >> sys.stderr, 'Usage: python epub.py <input file> [output file]'
354 if len(sys.argv) > 2:
357 basename, ext = os.path.splitext(input)
358 output = basename + '.epub'
361 if html.transform(input, is_file=True) == '<empty />':
362 print 'empty content - skipping'
364 transform(open(input, 'r'), open(output, 'w'))