1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from copy import deepcopy
13 from lxml import etree
16 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, XHTMLNS
# Namespace helpers used below to build qualified tag names:
# NCXNS for the toc.ncx navigation elements (navPoint, navLabel, text, ...),
# OPFNS for the content.opf package elements (manifest, spine, item, ...).
18 NCXNS = XMLNamespace("http://www.daisy.org/z3986/2005/ncx/")
19 OPFNS = XMLNamespace("http://www.idpf.org/2007/opf")
def inner_xml(node):
    """ returns node's text and children as a string

    The node's leading text (an empty string when it has none) is followed
    by the serialization of every direct child, in document order.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # NOTE(review): the ``def`` header, the expected doctest output and the
    # closing docstring quotes were missing from the extracted source; they
    # are restored from the doctest call and the one-argument call sites.
    nt = node.text if node.text is not None else ''
    # etree.tostring() serializes each child together with its tail text.
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    ``text`` must be a well-formed XML fragment; it is parsed inside a
    temporary ``<x>`` wrapper element, so malformed markup raises the
    parser's syntax error.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    p = etree.fromstring('<x>%s</x>' % text)
    # NOTE(review): the statements that copy the parsed content into ``node``
    # (and the end of the docstring) were missing from the extracted source.
    # They are reconstructed from the docstring/doctest contract -- confirm
    # against the original file.
    node.text = p.text
    node[:] = p[:]
48 """ Find out a node's name
50 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
# Operate on a deep copy of the node rather than on the node itself.
54 tempnode = deepcopy(node)
# Visit every descendant that is an annotation ('pe', 'pa', 'pt', 'pr')
# or a 'motyw' element.
# NOTE(review): the statements applied to each match `e` are not visible in
# this extract -- confirm what is done with them before relying on this.
56 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
57 for e in tempnode.findall('.//%s' % p):
# Remove all remaining element tags from the copy, merging their text
# content into the parent.
# NOTE(review): the statement that returns the result is not visible here.
61 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """Apply the XSLT stylesheet stored in file ``sheet`` to ``xml``.

    xml   -- an lxml element or element tree; a bare element is wrapped in
             an ElementTree first
    sheet -- path of the stylesheet file, opened with ``open()``

    Returns the result of ``ElementTree.xslt()``.
    """
    # NOTE(review): the ``def`` header was missing from the extracted source;
    # it is restored from the names used in the body and the two-argument
    # call sites ``xslt(tree, res('....xsl'))``.
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
# Directory named 'epub' located next to this module (symlinks resolved).
_resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')


def res(fname):
    """Return the path of resource file ``fname`` inside the 'epub' directory."""
    # NOTE(review): the ``def`` header was missing from the extracted source;
    # it is restored from the ``fname`` name used below and the
    # ``res('<file name>')`` call sites.
    return os.path.join(_resdir, fname)
# Rewrites the text and tail of `node` through replace_chars() and then
# recurses into child elements.
77 def replace_characters(node):
# Inner helper performing the typographic substitutions on one string.
# NOTE(review): statements between the helper's header and its return are
# not visible in this extract -- presumably a guard for `text` being None,
# since node.text / node.tail may be None; confirm.
78 def replace_chars(text):
# Order matters: "---" is replaced before "--" so that three dashes become
# an em dash instead of an en dash followed by a hyphen.
# NOTE(review): as shown, the first replacement maps "&" to itself, which is
# a no-op -- possibly an entity lost when this listing was extracted; confirm.
81 return text.replace("&", "&")\
82 .replace("---", "—")\
83 .replace("--", "–")\
84 .replace(",,", "„")\
85 .replace('"', "”")\
86 .replace("'", "’")
# NOTE(review): the statements guarded by this check are not visible in this
# extract, so how 'extra' elements are treated cannot be told from here.
87 if node.tag == 'extra':
90 node.text = replace_chars(node.text)
91 node.tail = replace_chars(node.tail)
# Recursive call for a child element (the loop header is not visible here).
93 replace_characters(child)
# Recursively collects annotation elements found under `source` into
# `annotations`, numbering them and tagging them with `part_number`.
# NOTE(review): the loop header that binds `child` is not visible in this
# extract -- presumably an iteration over the children of `source`; confirm.
96 def find_annotations(annotations, source, part_number):
# 'pe', 'pa', 'pt' and 'pr' are the tags treated as annotations.
98 if child.tag in ('pe', 'pa', 'pt', 'pr'):
# Store a copy of the annotation, numbered by its position in `annotations`
# (1-based) and marked with the number of the part it comes from.
99 annotation = deepcopy(child)
100 annotation.set('number', str(len(annotations)+1))
101 annotation.set('part', str(part_number))
103 annotations.append(annotation)
# The element left in the text gets the annotation's number as its text.
# NOTE(review): statements around this point are not visible in this
# extract, so how the original children/tail are handled cannot be told.
107 child.text = str(len(annotations))
# Recurse into every child except 'extra' and 'podtytul' elements.
108 if child.tag not in ('extra', 'podtytul'):
109 find_annotations(annotations, child, part_number)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Every 'strofa' element under ``tree`` has its inner XML split on the
    '/' + newline separator; each piece that is not already a '<wers...'
    (or, at stanza level, '<extra...') element is wrapped in
    '<wers_normalny>'.  The same splitting is applied first inside
    'slowo_obce' and 'wyroznienie' children of the stanza.  ``tree`` is
    modified in place; nothing is returned.
    """
    # NOTE(review): the loop headers, accumulator initialisations and
    # ``else`` arms were missing from the extracted source and are restored
    # from the names the visible statements use.
    stanzas = tree.findall('.//' + WLNS('strofa'))
    for node in stanzas:
        for child_node in node:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                foreign_verses = inner_xml(child_node).split('/\n')
                if len(foreign_verses) > 1:
                    pieces = []
                    for foreign_verse in foreign_verses:
                        if foreign_verse.startswith('<wers'):
                            pieces.append(foreign_verse)
                        else:
                            pieces.append(''.join(
                                ('<wers_normalny>', foreign_verse, '</wers_normalny>')))
                    new_foreign = ''.join(pieces)
                    set_inner_xml(child_node, new_foreign)
        verses = inner_xml(node).split('/\n')
        # NOTE(review): this guard is inferred by symmetry with the
        # ``len(foreign_verses) > 1`` check above; one statement was missing
        # here in the extracted source -- confirm against the original file.
        if len(verses) > 1:
            pieces = []
            for verse in verses:
                if verse.startswith(('<wers', '<extra')):
                    pieces.append(verse)
                else:
                    pieces.append(''.join(
                        ('<wers_normalny>', verse, '</wers_normalny>')))
            modified_inner_xml = ''.join(pieces)
            set_inner_xml(node, modified_inner_xml)
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    manifest -- the OPF ``manifest`` element to extend
    partno   -- number of the part; the item refers to ``part<partno>.html``
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        # NOTE(review): the 'id' entry, the end of this call and the append
        # below were missing from the extracted source.  The id is restored
        # to match the ``idref='part%d'`` written by add_to_spine -- confirm.
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    spine  -- the OPF ``spine`` element to extend
    partno -- number of the part; the itemref points at id ``part<partno>``
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    # NOTE(review): this append was missing from the extracted source; it is
    # restored because the element built above was otherwise never used.
    spine.append(e)
def add_nav_point(nav_map, counter, title, part_counter):
    """Append a top-level navPoint for a part to the NCX navigation map.

    nav_map      -- NCX ``navMap`` element that receives the new entry
    counter      -- number used for the ``NavPoint-<n>`` id and the playOrder
    title        -- label text of the entry
    part_counter -- number of the ``part<n>.html`` file the entry links to
    """
    nav_point = nav_map.makeelement(NCXNS('navPoint'))
    nav_point.set('id', 'NavPoint-%d' % counter)
    nav_point.set('playOrder', str(counter))

    nav_label = nav_map.makeelement(NCXNS('navLabel'))
    text = nav_map.makeelement(NCXNS('text'))
    # NOTE(review): this assignment was missing from the extracted source; it
    # is restored because ``title`` was otherwise unused and the label empty.
    text.text = title
    nav_label.append(text)
    nav_point.append(nav_label)

    content = nav_map.makeelement(NCXNS('content'))
    content.set('src', 'part%d.html' % part_counter)
    nav_point.append(content)

    nav_map.append(nav_point)
def add_nav_point2(nav_map, counter, title, part_counter, subcounter):
    """Append a nested navPoint under the last entry of the navigation map.

    nav_map      -- NCX ``navMap`` element; the new entry becomes a child of
                    its last element, so the map must not be empty
    counter      -- number used for the ``NavPoint-<n>`` id and the playOrder
    title        -- label text of the entry
    part_counter -- number of the ``part<n>.html`` file the entry links to
    subcounter   -- number of the ``#sub<n>`` anchor inside that file
    """
    nav_point = nav_map.makeelement(NCXNS('navPoint'))
    nav_point.set('id', 'NavPoint-%d' % counter)
    nav_point.set('playOrder', str(counter))

    nav_label = nav_map.makeelement(NCXNS('navLabel'))
    text = nav_map.makeelement(NCXNS('text'))
    # NOTE(review): this assignment was missing from the extracted source; it
    # is restored because ``title`` was otherwise unused and the label empty.
    text.text = title
    nav_label.append(text)
    nav_point.append(nav_label)

    content = nav_map.makeelement(NCXNS('content'))
    content.set('src', 'part%d.html#sub%d' % (part_counter, subcounter))
    nav_point.append(content)

    nav_map[-1].append(nav_point)
# Builds an EPUB archive from the XML document read from `input_file` and
# writes it, as a zip file, into `output_file`: the mimetype entry,
# META-INF/container.xml, and under OPS/ the title page, one HTML file per
# part, an optional annotations page, content.opf and toc.ncx.
# NOTE(review): many statements of this function (docstring delimiters,
# branching, counter updates) are not visible in this extract; the comments
# below describe only the visible lines.
191 def transform(input_file, output_file):
194 input_file and output_file should be filelike objects
197 input_xml = etree.parse(input_file)
# NOTE(review): the local name `zip` shadows the builtin of the same name.
199 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
# The 'mimetype' entry is written first and stored without compression.
201 mime = zipfile.ZipInfo()
202 mime.filename = 'mimetype'
203 mime.compress_type = zipfile.ZIP_STORED
205 zip.writestr(mime, 'application/epub+zip')
# container.xml declares OPS/content.opf as the package root file.
207 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
208 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
209 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
210 'media-type="application/oebps-package+xml" />' \
211 '</rootfiles></container>')
# content.opf is produced by running xsltContent.xsl over the document's
# RDF Description element; its manifest and spine are extended below.
213 metadata_el = input_xml.find('.//'+RDFNS('Description'))
214 metadatasource = etree.ElementTree(metadata_el)
216 opf = xslt(metadatasource, res('xsltContent.xsl'))
218 manifest = opf.find('.//' + OPFNS('manifest'))
219 spine = opf.find('.//' + OPFNS('spine'))
# Copy the static stylesheet and logo resources into OPS/.
221 for fname in 'style.css', 'logo_wolnelektury.png':
222 zip.write(res(fname), os.path.join('OPS', fname))
# `annotations` accumulates the elements collected by find_annotations();
# `part_xml` (utwor > master) holds the content of the part being assembled.
224 annotations = etree.Element('annotations')
225 part_xml = etree.Element('utwor')
226 etree.SubElement(part_xml, 'master')
# Skeleton NCX document, pre-populated with nav points for the title page
# (title.html) and for the first part (part1.html).
228 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
229 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
230 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
231 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
232 '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
233 '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
234 '</navPoint><navPoint id="NavPoint-2" playOrder="2"><navLabel>' \
235 '<text>Początek utworu</text></navLabel><content src="part1.html" />' \
236 '</navPoint></navMap></ncx>')
238 main_xml_part = part_xml[0] # was [0][0], master
239 nav_map = toc_file[-1] # was [-1][-1]
# Pick the main text element: the second child of the root when the root has
# more than one child, otherwise the first child.
242 if len(input_xml.getroot()) > 1:
243 # rdf before style master
244 main_text = input_xml.getroot()[1]
246 # rdf in style master
247 main_text = input_xml.getroot()[0]
# Apply the typographic replacements, then render the title page.
249 replace_characters(main_text)
250 zip.writestr('OPS/title.html',
251 etree.tostring(xslt(input_xml, res('xsltTitle.xsl')), pretty_print=True))
253 # Search for table of contents elements and book division
# Judging by their use below: stupid_i feeds the nav point ids/playOrder,
# stupid_j numbers the part files, stupid_k numbers '#sub' anchors.
# NOTE(review): the statements that update these counters are not visible.
255 stupid_i = stupid_j = stupid_k = 1
256 last_node_part = False
257 for one_part in main_text:
# NOTE(review): the assignment of `name` is not visible in this extract --
# presumably the tag of `one_part`; confirm.
259 if name in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
260 if name == "naglowek_czesc":
262 last_node_part = True
# Write out the part assembled so far, then start a new one from this heading.
263 find_annotations(annotations, part_xml, stupid_j)
264 replace_by_verse(part_xml)
265 zip.writestr('OPS/part%d.html' % stupid_j,
266 etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
267 main_xml_part[:] = [deepcopy(one_part)]
268 # add to manifest and spine
269 add_to_manifest(manifest, stupid_j)
270 add_to_spine(spine, stupid_j)
271 name_toc = node_name(one_part)
272 # build table of contents
273 # i+2 because of title page
274 add_nav_point(nav_map, stupid_i+2, name_toc, stupid_j + 1)
# Here the heading is appended to the current part instead of opening a new
# one. NOTE(review): the condition selecting this branch is not visible.
279 main_xml_part.append(one_part)
280 last_node_part = False
281 name_toc = node_name(one_part)
282 add_nav_point(nav_map, stupid_i + 1, name_toc, stupid_j)
# Same write-out-and-restart sequence as above.
# NOTE(review): the condition selecting this branch is not visible.
285 find_annotations(annotations, part_xml, stupid_j)
286 replace_by_verse(part_xml)
287 zip.writestr('OPS/part%d.html' % stupid_j,
288 etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
289 # start building a new part
290 main_xml_part[:] = [deepcopy(one_part)]
291 add_to_manifest(manifest, stupid_j)
292 add_to_spine(spine, stupid_j)
293 name_toc = node_name(one_part)
294 add_nav_point(nav_map, stupid_i + 2, name_toc, stupid_j + 1) # title page
# Sub-headings get a nested nav point and a 'sub' attribute with their number.
298 if name in ('naglowek_podrozdzial', 'naglowek_scena'):
300 name_toc = node_name(one_part)
301 add_nav_point2(nav_map, stupid_i + 2, name_toc, stupid_j, stupid_k)
302 one_part.set('sub', str(stupid_k))
305 main_xml_part.append(deepcopy(one_part))
306 last_node_part = False
# Process and write the part held in part_xml and register it in the
# manifest and spine. NOTE(review): presumably this runs once after the
# loop, for the final part -- indentation is lost in this extract; confirm.
307 find_annotations(annotations, part_xml, stupid_j)
308 replace_by_verse(part_xml)
309 add_to_manifest(manifest, stupid_j)
310 add_to_spine(spine, stupid_j)
312 zip.writestr('OPS/part%d.html' % stupid_j,
313 etree.tostring(xslt(part_xml, res('xsltScheme.xsl')), pretty_print=True))
315 # Last modifications in container files and EPUB creation
# When any annotations were collected, add annotations.html to the table of
# contents, the manifest and the spine, and render it with xsltAnnotations.xsl.
316 if len(annotations) > 0:
317 nav_map.append(etree.fromstring(
318 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
319 '</navLabel><content src="annotations.html" /></navPoint>' % {'i':stupid_i+2}))
320 manifest.append(etree.fromstring(
321 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
322 spine.append(etree.fromstring(
323 '<itemref idref="annotations" />'))
324 replace_by_verse(annotations)
325 zip.writestr('OPS/annotations.html', etree.tostring(
326 xslt(annotations, res("xsltAnnotations.xsl")), pretty_print=True))
328 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
# Fill in the NCX head: one <meta> per dtb attribute, each with content '0';
# the first two then get the uid (title + 'WolneLektury.pl') and the depth.
330 title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
331 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
332 for st in attributes:
333 meta = toc_file.makeelement(NCXNS('meta'))
335 meta.set('content', '0')
336 toc_file[0].append(meta)
337 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
# NOTE(review): no assignment to `depth` is visible in this extract -- confirm
# where it is set.
338 toc_file[0][1].set('content', str(depth))
# docTitle receives the title; the finished table of contents is then written.
339 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
340 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
# Command-line entry point: wl2epub <input file> [output file].
# NOTE(review): several statements of this block (assignments of `input` and
# `output`, `else` arms, the origin of `html`) are not visible in this extract.
344 if __name__ == '__main__':
# At least the input file argument is required.
347 if len(sys.argv) < 2:
348 print >> sys.stderr, 'Usage: wl2epub <input file> [output file]'
# A second argument, when given, is handled here; the other visible lines
# derive the output name from the input name with the extension replaced
# by '.epub'.
352 if len(sys.argv) > 2:
355 basename, ext = os.path.splitext(input)
356 output = basename + '.epub'
# The result of html.transform() is compared with the '<empty />' marker
# before the EPUB is built.
359 if html.transform(input, is_file=True) == '<empty />':
360 print 'empty content - skipping'
# NOTE(review): both files are opened without being closed here, and the zip
# output is opened in text mode ('w') rather than binary ('wb') -- confirm
# this is intended.
362 transform(open(input, 'r'), open(output, 'w'))