1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from copy import deepcopy
12 from lxml import etree
14 from tempfile import mkdtemp
15 from shutil import rmtree
19 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
20 from librarian.dcparser import BookInfo
22 from librarian import functions
24 functions.reg_person_name()
28 """ returns node's text and children as a string
30 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
34 nt = node.text if node.text is not None else ''
35 return ''.join([nt] + [etree.tostring(child) for child in node])
# set_inner_xml(node, text): per its docstring, sets the node's text and
# children from the markup string `text`. Only the parsing step is visible in
# this listing; the lines that copy the parsed content onto `node` are not.
37 def set_inner_xml(node, text):
38 """ sets node's text and children from a string
40 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
41 >>> set_inner_xml(e, 'x<b>y</b>z')
42 >>> print etree.tostring(e)
# Wrap the fragment in a throwaway <x> root: `text` may begin with bare text
# and contain several sibling elements, while etree.fromstring() needs a
# single root element.
# NOTE(review): `text` is interpolated unescaped, so it must already be
# well-formed XML markup -- confirm that every caller guarantees this.
46 p = etree.fromstring('<x>%s</x>' % text)
52 """ Find out a node's name
54 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
58 tempnode = deepcopy(node)
60 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
61 for e in tempnode.findall('.//%s' % p):
65 etree.strip_tags(tempnode, '*')
70 if isinstance(xml, etree._Element):
71 xml = etree.ElementTree(xml)
72 with open(sheet) as xsltf:
73 return xml.xslt(etree.parse(xsltf))
76 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
78 return os.path.join(_resdir, fname)
# replace_characters(node): passes node.text and node.tail through the nested
# replace_chars() helper and calls itself on `child` elements, modifying the
# tree in place.
81 def replace_characters(node):
# replace_chars(text): removes U+FEFF (byte order mark) and swaps ASCII
# punctuation for typographic characters. The replacements are applied in the
# order written, so "---" (-> U+2014, em dash) is consumed before "--"
# (-> U+2013, en dash) can match part of it. ",," becomes U+201E (low double
# quote), '"' becomes U+201D (right double quote) and "'" becomes U+2019
# (right single quote).
# NOTE(review): two lines between the `def` and the `return` are not visible
# in this listing; they presumably guard against `text` being None -- confirm.
82 def replace_chars(text):
85 return text.replace(u"\ufeff", u"")\
86 .replace("---", u"\u2014")\
87 .replace("--", u"\u2013")\
88 .replace(",,", u"\u201E")\
89 .replace('"', u"\u201D")\
90 .replace("'", u"\u2019")
# Elements tagged 'extra' take a separate branch; its body is not visible in
# this listing.
91 if node.tag == 'extra':
# Convert both the element's own leading text and the text that follows its
# closing tag (the tail).
94 node.text = replace_chars(node.text)
95 node.tail = replace_chars(node.tail)
# Recursive step. The loop that binds `child` is not visible here; presumably
# it iterates over the children of `node` -- TODO confirm.
97 replace_characters(child)
# find_annotations(annotations, source, part_no): collects copies of
# annotation elements into `annotations` and recurses into `child` elements.
# The loop that binds `child` is not visible in this listing; presumably it
# iterates over the children of `source` -- TODO confirm.
100 def find_annotations(annotations, source, part_no):
# Elements tagged 'pe', 'pa', 'pt' or 'pr' are the ones treated as
# annotations.
102 if child.tag in ('pe', 'pa', 'pt', 'pr'):
# Work on a deep copy so the attributes set below land on the copy, not on
# `child` itself.
103 annotation = deepcopy(child)
# Numbering is 1-based and derived from how many annotations have been
# collected so far, so numbers keep increasing across successive calls that
# share the same `annotations` container.
104 number = str(len(annotations)+1)
105 annotation.set('number', number)
# Record which part (chunk) the annotation came from.
106 annotation.set('part', str(part_no))
108 annotations.append(annotation)
# Do not descend into 'extra' or 'podtytul' elements; every other child is
# searched recursively with the same container and part number.
113 if child.tag not in ('extra', 'podtytul'):
114 find_annotations(annotations, child, part_no)
# replace_by_verse(tree): rewrites stanza content so that pieces separated by
# "/" followed by a newline become verse elements. Works on serialised inner
# XML (inner_xml / set_inner_xml), not on the element tree directly. Several
# loop headers, `else:` lines and initialisers are not visible in this
# listing.
117 def replace_by_verse(tree):
118 """ Find stanzas and create new verses in place of a '/' character """
# All WLNS('strofa') elements anywhere below `tree`.
120 stanzas = tree.findall('.//' + WLNS('strofa'))
# First pass, per direct child of a stanza (`node`; its loop header is not
# visible): 'slowo_obce' and 'wyroznienie' children may themselves span
# several verses, so they are split and wrapped first.
122 for child_node in node:
123 if child_node.tag in ('slowo_obce', 'wyroznienie'):
# A verse break is the two-character sequence '/' + newline.
124 foreign_verses = inner_xml(child_node).split('/\n')
# Only rewrite the child when it actually contains a verse break.
125 if len(foreign_verses) > 1:
127 for foreign_verse in foreign_verses:
# Pieces that already start with a '<wers...' element are kept verbatim.
128 if foreign_verse.startswith('<wers'):
129 new_foreign += foreign_verse
# Any other piece is wrapped in <wers_normalny> (presumably the `else` branch
# of the test above -- the `else:` line is not visible).
131 new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
132 set_inner_xml(child_node, new_foreign)
# Second pass: split the stanza's own serialised content on the same
# separator.
133 verses = inner_xml(node).split('/\n')
135 modified_inner_xml = ''
# Pieces starting with '<wers...' or '<extra...' are kept as they are;
# everything else is wrapped in <wers_normalny>.
137 if verse.startswith('<wers') or verse.startswith('<extra'):
138 modified_inner_xml += verse
140 modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
# Re-parse the rebuilt markup back into the stanza element.
141 set_inner_xml(node, modified_inner_xml)
# add_to_manifest(manifest, partno): builds an OPFNS('item') element for the
# chunk numbered `partno`. The remaining attribute(s) of the dict literal and
# the step that attaches the element to `manifest` are not visible in this
# listing.
144 def add_to_manifest(manifest, partno):
145 """ Adds a node to the manifest section in content.opf file """
# 'part<N>' is the stem shared with the chunk's file name ('part<N>.html').
147 partstr = 'part%d' % partno
# makeelement() creates the element without inserting it into the tree.
148 e = manifest.makeelement(OPFNS('item'), attrib={
150 'href': partstr + '.html',
151 'media-type': 'application/xhtml+xml',
# add_to_spine(spine, partno): builds an OPFNS('itemref') element whose
# 'idref' is 'part<N>' for the chunk numbered `partno`. The step that attaches
# it to `spine` is not visible in this listing.
156 def add_to_spine(spine, partno):
157 """ Adds a node to the spine section in content.opf file """
# The idref uses the same 'part%d' pattern that add_to_manifest() builds for
# the chunk, so the two entries refer to the same part.
159 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
# Constructor of a table-of-contents node. `name` and `part_number` are both
# optional. The lines between the signature and the assignments below are not
# visible in this listing (presumably they initialise self.name and
# self.children, which other methods read -- TODO confirm).
164 def __init__(self, name=None, part_number=None):
# Number of the part (chunk) this entry points to; write_to_xml() formats it
# into 'part%d.html'.
167 self.part_number = part_number
# None means "no in-page anchor": write_to_xml() only appends '#sub%d' to the
# link when sub_number is not None.
168 self.sub_number = None
# add(name, part_number, level=0, is_part=True): inserts a new entry `level`
# steps below this node, following the most recently added child at each
# step. The creation of `t`, the guard around the sub_number assignment and
# the final return are not visible in this listing.
170 def add(self, name, part_number, level=0, is_part=True):
# Descend while there are levels left AND a child to descend into; when there
# are no children yet, the entry is added at the current node instead.
171 if level > 0 and self.children:
172 return self.children[-1].add(name, part_number, level-1, is_part)
175 t.part_number = part_number
176 self.children.append(t)
# NOTE(review): this runs after the append above, so the value is the new
# child count plus one. Presumably it applies only when is_part is false and
# is what the caller receives as the sub-number -- confirm against the full
# source.
178 t.sub_number = len(self.children) + 1
# append(toc): attaches `toc` itself as one more child of this node.
# Contrast with extend(), which adds `toc`'s children rather than `toc`.
181 def append(self, toc):
182 self.children.append(toc)
# extend(toc): adds the children of `toc` to this node's children. `toc`
# itself is not inserted.
184 def extend(self, toc):
185 self.children.extend(toc.children)
189 return max((c.depth() for c in self.children)) + 1
# write_to_xml(nav_map, counter): emits one NCX navPoint per child under
# `nav_map`, recursing so that nested entries become nested navPoints.
# `counter` supplies both the navPoint id and its playOrder. The function's
# own return statement is not visible in this listing; since the recursive
# call's result is used as the next counter, it presumably returns the
# updated counter -- TODO confirm.
193 def write_to_xml(self, nav_map, counter):
194 for child in self.children:
195 nav_point = nav_map.makeelement(NCXNS('navPoint'))
# The same running number is used for the id suffix and the play order.
196 nav_point.set('id', 'NavPoint-%d' % counter)
197 nav_point.set('playOrder', str(counter))
# <navLabel><text>child name</text></navLabel>
199 nav_label = nav_map.makeelement(NCXNS('navLabel'))
200 text = nav_map.makeelement(NCXNS('text'))
201 text.text = child.name
202 nav_label.append(text)
203 nav_point.append(nav_label)
# <content src="part<N>.html[#sub<M>]"/>: the fragment is added only for
# entries that carry a sub_number.
205 content = nav_map.makeelement(NCXNS('content'))
206 src = 'part%d.html' % child.part_number
207 if child.sub_number is not None:
208 src += '#sub%d' % child.sub_number
209 content.set('src', src)
210 nav_point.append(content)
211 nav_map.append(nav_point)
# Descendants are written beneath this navPoint; the value handed back by the
# recursive call becomes the counter for the next sibling.
212 counter = child.write_to_xml(nav_point, counter + 1)
# used_chars(element): gathers the distinct characters found in the text and
# tail of `element` and of every descendant. The return statement is not
# visible in this listing; callers use the result as a set (they call
# .union() on it).
216 def used_chars(element):
217 """ Lists characters used in an ETree Element """
# text and tail may be None, hence the `or ''` fallbacks; set() of a string
# yields its individual characters.
218 chars = set((element.text or '') + (element.tail or ''))
219 for child in element:
# Merge in the characters of each subtree.
220 chars = chars.union(used_chars(child))
225 """ divide main content of the XML file into chunks """
227 # prepare a container for each chunk
228 part_xml = etree.Element('utwor')
229 etree.SubElement(part_xml, 'master')
230 main_xml_part = part_xml[0] # master
232 last_node_part = False
233 for one_part in main_text:
235 if name == 'naglowek_czesc':
237 last_node_part = True
238 main_xml_part[:] = [deepcopy(one_part)]
239 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
241 main_xml_part[:] = [deepcopy(one_part)]
243 main_xml_part.append(deepcopy(one_part))
244 last_node_part = False
# transform_chunk(chunk_xml, chunk_no, annotations): turns one chunk into
# HTML. Returns a 3-tuple: the serialised HTML, the chunk's TOC object, and
# the set of characters used in the HTML tree. Also appends the chunk's
# annotations to `annotations` and rewrites verses inside `chunk_xml` in
# place. The line that creates `toc` is not visible in this listing.
248 def transform_chunk(chunk_xml, chunk_no, annotations):
249 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
# chunk_xml[0] is the first child of the chunk root; its children are scanned
# for headings.
252 for element in chunk_xml[0]:
# Part/chapter/act headings and 'srodtytul' become top-level TOC entries for
# this chunk.
253 if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
254 toc.add(node_name(element), chunk_no)
# Sub-chapter and scene headings are added one level down (level=1) and are
# not parts of their own (is_part=False); the value returned by add() is
# stored on the element as its 'sub' attribute.
255 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
256 subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
257 element.set('sub', str(subnumber))
# Annotations are collected before verses are rewritten and before the XSLT
# is applied.
258 find_annotations(annotations, chunk_xml, chunk_no)
259 replace_by_verse(chunk_xml)
260 html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
# Characters are collected from the transformed HTML tree, not from the
# source XML.
261 chars = used_chars(html_tree.getroot())
262 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
263 return output_html, toc, chars
# transform(...): builds an EPUB archive for the document selected by `slug`
# (looked up through `provider`) or by `file_path`. Visible steps: parse the
# input, extract metadata, open the output zip, write the static container
# files, convert the document and its children into numbered
# OPS/part<N>.html files, write annotations, subset and embed fonts, then
# write content.opf and toc.ncx. Many branch headers, try/finally lines and
# cleanup steps are not visible in this listing, so the notes below describe
# only what can be seen.
266 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False):
267 """ produces a EPUB file
269 provider: a DocProvider
270 slug: slug of file to process, available by provider
271 output_file: file-like object or path to output file
272 output_dir: path to directory to save output file to; either this or output_file must be present
273 make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
# transform_file is a closure: it uses `zip`, `manifest`, `spine`,
# `annotations` and `provider` from the enclosing transform() scope, which
# are assigned further down, before the first call. It returns a 3-tuple
# (toc, chunk_counter, chars).
276 def transform_file(input_xml, chunk_counter=1, first=True):
277 """ processes one input file and proceeds to its children """
# Typographic replacements are applied in place to the whole document first.
279 replace_characters(input_xml.getroot())
# Text of every DCNS('relation.hasPart') element; each value is later passed
# to provider.by_uri() to load a child document.
281 children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
283 # every input file will have a TOC entry,
284 # pointing to starting chunk
285 toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
# The conditions choosing between the two title-page variants below are not
# visible in this listing (presumably `first` and whether there are children
# -- TODO confirm).
288 # write book title page
289 html_tree = xslt(input_xml, res('xsltTitle.xsl'))
290 chars = used_chars(html_tree.getroot())
291 zip.writestr('OPS/title.html',
292 etree.tostring(html_tree, method="html", pretty_print=True))
294 # write title page for every parent
295 html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
296 chars = used_chars(html_tree.getroot())
297 zip.writestr('OPS/part%d.html' % chunk_counter,
298 etree.tostring(html_tree, method="html", pretty_print=True))
299 add_to_manifest(manifest, chunk_counter)
300 add_to_spine(spine, chunk_counter)
# Locate the element holding the main text: the second child of the root when
# the root has more than one child, otherwise the first.
303 if len(input_xml.getroot()) > 1:
304 # rdf before style master
305 main_text = input_xml.getroot()[1]
307 # rdf in style master
308 main_text = input_xml.getroot()[0]
# If the selected element is the RDF block itself there is no main text; the
# body of this branch is not visible (presumably it sets main_text to None,
# given the test that follows).
309 if main_text.tag == RDFNS('RDF'):
312 if main_text is not None:
# Each chunk becomes its own OPS/part<N>.html with a manifest and a spine
# entry. The increment of chunk_counter is not visible in this listing.
313 for chunk_xml in chop(main_text):
314 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
# extend(): the chunk's entries are merged directly under this file's TOC
# node.
315 toc.extend(chunk_toc)
316 chars = chars.union(chunk_chars)
317 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
318 add_to_manifest(manifest, chunk_counter)
319 add_to_spine(spine, chunk_counter)
# Child documents are processed recursively; the running chunk counter is
# threaded through so part numbers stay unique across files.
323 for child in children:
324 child_xml = etree.parse(provider.by_uri(child))
325 child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
# append(): each child document keeps its own TOC node nested under the
# parent.
326 toc.append(child_toc)
327 chars = chars.union(chunk_chars)
329 return toc, chunk_counter, chars
331 # read metadata from the first file
# The two ValueErrors reject "both given" and "neither given"; the conditions
# that trigger them are not visible in this listing.
334 raise ValueError('slug or file_path should be specified, not both')
# NOTE(review): the closing of `f` is not visible in this listing -- confirm
# the handle is released after parsing.
335 f = open(file_path, 'r')
336 input_xml = etree.parse(f)
340 raise ValueError('either slug or file_path should be specified')
341 input_xml = etree.parse(provider[slug])
343 metadata = input_xml.find('.//'+RDFNS('Description'))
# Raised when the metadata lookup fails (the test itself is not visible).
345 raise NoDublinCore('Document has no DublinCore - which is required.')
346 book_info = BookInfo.from_element(input_xml)
# Wrap the Description element in its own tree so it can be fed to xslt()
# below.
347 metadata = etree.ElementTree(metadata)
349 # if output to dir, create the file
350 if output_dir is not None:
# Per the docstring, make_dir adds an <author> sub-directory; the guards
# around the next three lines are not visible.
352 author = unicode(book_info.author)
353 output_dir = os.path.join(output_dir, author)
355 os.makedirs(output_dir)
# The output name comes from the slug when one was given, otherwise from the
# input file's base name with its extension replaced by '.epub'.
# NOTE(review): both files are opened in text mode ('w') and then handed to
# ZipFile, which writes binary data; on platforms that translate line endings
# this can corrupt the archive -- consider 'wb'.
359 output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
361 output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
# NOTE(review): the local name `zip` shadows the builtin for the rest of this
# function, including inside transform_file.
363 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
365 # write static elements
# The 'mimetype' entry is written first and stored uncompressed (ZIP_STORED),
# unlike the archive default of ZIP_DEFLATED; the EPUB container format
# expects it that way.
366 mime = zipfile.ZipInfo()
367 mime.filename = 'mimetype'
368 mime.compress_type = zipfile.ZIP_STORED
370 zip.writestr(mime, 'application/epub+zip')
# container.xml points readers at OPS/content.opf as the package document.
371 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
372 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
373 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
374 'media-type="application/oebps-package+xml" />' \
375 '</rootfiles></container>')
# Static resources are copied from the resource directory into OPS/.
376 for fname in 'style.css', 'logo_wolnelektury.png':
377 zip.write(res(fname), os.path.join('OPS', fname))
# The OPF package document is generated from the metadata; its manifest and
# spine elements are kept so parts can be registered as they are written.
379 opf = xslt(metadata, res('xsltContent.xsl'))
380 manifest = opf.find('.//' + OPFNS('manifest'))
381 spine = opf.find('.//' + OPFNS('spine'))
# Shared container that find_annotations() fills while chunks are processed.
383 annotations = etree.Element('annotations')
# NCX skeleton: an empty <head>, an empty <docTitle>, and a <navMap> that
# already contains NavPoint-1 for the title page.
385 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
386 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
387 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
388 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
389 '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
390 '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
391 '</navPoint></navMap></ncx>')
# The last child of <ncx> in the skeleton is <navMap>.
392 nav_map = toc_file[-1]
394 toc, chunk_counter, chars = transform_file(input_xml)
# Adds an entry pointing at part 1; the condition guarding this call is not
# visible in this listing.
397 toc.add(u"Początek utworu", 1)
# Numbering starts at 2 because NavPoint-1 is already taken by the title page
# in the skeleton above.
398 toc_counter = toc.write_to_xml(nav_map, 2)
400 # Last modifications in container files and EPUB creation
# Annotations get their own page, nav point, manifest item and spine entry,
# but only when at least one annotation was collected.
401 if len(annotations) > 0:
402 nav_map.append(etree.fromstring(
403 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
404 '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
405 manifest.append(etree.fromstring(
406 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
407 spine.append(etree.fromstring(
408 '<itemref idref="annotations" />'))
409 replace_by_verse(annotations)
410 html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
411 chars = chars.union(used_chars(html_tree.getroot()))
412 zip.writestr('OPS/annotations.html', etree.tostring(
413 html_tree, method="html", pretty_print=True))
# Font subsetting: each DejaVu Serif variant is reduced to the characters
# gathered in `chars` by the external perl script, written to a temporary
# directory and then added to the archive.
416 tmpdir = mkdtemp('-librarian-epub')
# NOTE(review): os.chdir() changes the working directory of the whole
# process; restoring it and removing `tmpdir` are not visible in this listing
# -- confirm both happen, including on failure.
419 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
420 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
# All used characters are passed as one UTF-8 encoded command-line argument.
# The command is a list, so no shell is involved.
421 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), res('../fonts/' + fname), os.path.join(tmpdir, fname)]
# Two call variants: one that announces the run and lets the tool's output
# through, one that captures stdout/stderr. The selecting condition is not
# visible (presumably `verbose` -- TODO confirm).
423 print "Running font-optimizer"
424 subprocess.check_call(optimizer_call)
426 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
427 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
# content.opf is written only now, after every part has been registered in
# the manifest and spine.
431 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
433 title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
# Four <meta> elements are appended to the NCX <head> (toc_file[0]) with a
# placeholder content of '0'; the line setting each meta's name from `st` is
# not visible in this listing.
434 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
435 for st in attributes:
436 meta = toc_file.makeelement(NCXNS('meta'))
438 meta.set('content', '0')
439 toc_file[0].append(meta)
# The first meta (dtb:uid by the order above) gets the title concatenated
# with 'WolneLektury.pl'; the second (dtb:depth) gets the TOC depth.
440 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
441 toc_file[0][1].set('content', str(toc.depth()))
# toc_file[1] is <docTitle>.
# NOTE(review): set_inner_xml() parses its argument as XML, and `title` is
# concatenated in unescaped; a title containing '&' or '<' would presumably
# fail to parse -- confirm what node_name() returns.
442 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
443 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))