1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from copy import deepcopy
12 from lxml import etree
14 from tempfile import mkdtemp
15 from shutil import rmtree
19 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
20 from librarian.dcparser import BookInfo
22 from librarian import functions, get_resource
# Register the librarian XSLT extension for formatting person names —
# presumably required by the stylesheets applied below; confirm.
functions.reg_person_name()
# NOTE(review): the listing elides the `def inner_xml(node):` header and the
# doctest's expected-output/closing-quote lines around this fragment.
""" returns node's text and children as a string

>>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
# Leading text first ('' when the node has none), then every child element
# re-serialized in document order.
nt = node.text if node.text is not None else ''
return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    # NOTE(review): the doctest output, the closing docstring quotes and the
    # assignments copying the parsed wrapper back onto `node` are elided here.
    # Wrap the raw markup so lxml can parse mixed text + elements.
    p = etree.fromstring('<x>%s</x>' % text)
# NOTE(review): the `def node_name(node):` header, the doctest output and the
# inner-loop body that removes matched elements are elided in this listing.
""" Find out a node's name

>>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
# Work on a copy so the caller's tree is not mutated.
tempnode = deepcopy(node)

# Strip footnote/theme markup before flattening to plain text.
for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
    for e in tempnode.findall('.//%s' % p):
# Flatten all remaining child tags, keeping their text content.
etree.strip_tags(tempnode, '*')
# NOTE(review): the `def xslt(xml, sheet):` header is elided in this listing.
# Applies the XSLT stylesheet found at path `sheet` to `xml`.
if isinstance(xml, etree._Element):
    # lxml exposes .xslt() on trees, so wrap a bare element first.
    xml = etree.ElementTree(xml)
with open(sheet) as xsltf:
    return xml.xslt(etree.parse(xsltf))
def replace_characters(node):
    """Recursively replace typewriter punctuation in text/tail with
    typographic characters (em/en dashes, Polish-style quotes)."""
    def replace_chars(text):
        # NOTE(review): a None-guard for `text` appears to be elided here.
        return text.replace(u"\ufeff", u"")\
            .replace("---", u"\u2014")\
            .replace("--", u"\u2013")\
            .replace(",,", u"\u201E")\
            .replace('"', u"\u201D")\
            .replace("'", u"\u2019")
    if node.tag == 'extra':
        # NOTE(review): the body of the 'extra' branch and the `else:` header
        # are elided; the lines below belong to the non-'extra' path.
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        # NOTE(review): a `for child in node:` header is elided before the
        # recursive call.
            replace_characters(child)
def find_annotations(annotations, source, part_no):
    """Collect footnote elements (pe/pa/pt/pr) from `source` into the
    `annotations` container, numbering them sequentially and recording
    which chunk (`part_no`) each one came from."""
    # NOTE(review): the `for child in source:` loop header is elided here.
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            # Footnotes are numbered sequentially across the whole book.
            number = str(len(annotations)+1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            annotations.append(annotation)
            # NOTE(review): lines replacing the inline footnote with a
            # numbered reference appear elided here.
        # 'extra' and 'podtytul' subtrees are not searched for footnotes.
        if child.tag not in ('extra', 'podtytul'):
            find_annotations(annotations, child, part_no)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character """
    stanzas = tree.findall('.//' + WLNS('strofa'))
    # NOTE(review): a `for node in stanzas:` header is elided here.
        for child_node in node:
            # Verse breaks may also occur inside inline markup.
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                foreign_verses = inner_xml(child_node).split('/\n')
                if len(foreign_verses) > 1:
                    # NOTE(review): `new_foreign` initializer elided.
                    for foreign_verse in foreign_verses:
                        # Already-wrapped verses are passed through as-is.
                        if foreign_verse.startswith('<wers'):
                            new_foreign += foreign_verse
                        # NOTE(review): `else:` header elided.
                            new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
                    set_inner_xml(child_node, new_foreign)
        verses = inner_xml(node).split('/\n')
        modified_inner_xml = ''
        # NOTE(review): a `for verse in verses:` header is elided here.
            if verse.startswith('<wers') or verse.startswith('<extra'):
                modified_inner_xml += verse
            # NOTE(review): `else:` header elided — plain text is wrapped.
                modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
        set_inner_xml(node, modified_inner_xml)
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        # NOTE(review): the 'id': partstr entry and the trailing
        # manifest.append(e) appear elided in this listing.
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file """
    # NOTE(review): the trailing `spine.append(e)` appears elided in this
    # listing; the stray semicolon below is harmless but un-Pythonic.
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
    def __init__(self, name=None, part_number=None):
        # NOTE(review): the assignments of self.name and the children list
        # are elided in this listing (TOC.add appends to self.children).
        # part_number: index of the HTML chunk this TOC entry points at.
        self.part_number = part_number
        # sub_number: in-chunk anchor index; set only for subsections.
        self.sub_number = None
    def add(self, name, part_number, level=0, is_part=True):
        """Add an entry; level > 0 delegates to the newest child subtree."""
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)
        # NOTE(review): construction of the new TOC node `t` (the else
        # branch) is elided in this listing.
            t.part_number = part_number
            self.children.append(t)
        # NOTE(review): an `if not is_part:` guard and the final return of
        # the sub-number appear elided around the next line.
            t.sub_number = len(self.children) + 1
176 def append(self, toc):
177 self.children.append(toc)
179 def extend(self, toc):
180 self.children.extend(toc.children)
        # NOTE(review): the `def depth(self):` header and the base case for
        # childless nodes are elided; this is the recursive depth step.
        return max((c.depth() for c in self.children)) + 1
188 def write_to_xml(self, nav_map, counter):
189 for child in self.children:
190 nav_point = nav_map.makeelement(NCXNS('navPoint'))
191 nav_point.set('id', 'NavPoint-%d' % counter)
192 nav_point.set('playOrder', str(counter))
194 nav_label = nav_map.makeelement(NCXNS('navLabel'))
195 text = nav_map.makeelement(NCXNS('text'))
196 text.text = child.name
197 nav_label.append(text)
198 nav_point.append(nav_label)
200 content = nav_map.makeelement(NCXNS('content'))
201 src = 'part%d.html' % child.part_number
202 if child.sub_number is not None:
203 src += '#sub%d' % child.sub_number
204 content.set('src', src)
205 nav_point.append(content)
206 nav_map.append(nav_point)
207 counter = child.write_to_xml(nav_point, counter + 1)
def used_chars(element):
    """ Lists characters used in an ETree Element

    Walks the whole subtree, collecting the characters of every node's
    text and tail, and returns them as a set (later joined into the
    character list passed to the font subsetter).
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars = chars.union(used_chars(child))
    # Missing in the listing: without this the function returns None and
    # every caller's chars.union(...) would fail.
    return chars
# NOTE(review): the generator header (`chop` is called at the bottom of the
# file) and the yield statements that emit each filled chunk are elided.
""" divide main content of the XML file into chunks """

# prepare a container for each chunk
part_xml = etree.Element('utwor')
etree.SubElement(part_xml, 'master')
main_xml_part = part_xml[0] # master

last_node_part = False
for one_part in main_text:
    # NOTE(review): a `name = one_part.tag` line appears elided here.
    if name == 'naglowek_czesc':
        # A new top-level part: start a fresh chunk with this header.
        last_node_part = True
        main_xml_part[:] = [deepcopy(one_part)]
    elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
        # Chapter/act headers also open a new chunk (unless they directly
        # follow a part header, which already did).
        main_xml_part[:] = [deepcopy(one_part)]
    # NOTE(review): `else:` header elided — ordinary nodes accumulate.
        main_xml_part.append(deepcopy(one_part))
        last_node_part = False
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
    # NOTE(review): creation of the local `toc` object is elided here.
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            # Subsections become in-page anchors (the '#sub%d' links that
            # TOC.write_to_xml emits).
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            element.set('sub', str(subnumber))
    # NOTE(review): an `if empty:` header is elided before this branch.
        # _empty_html_static is a deliberate mutable-default cache: the
        # empty-chunk template is read from disk only once per process.
        if not _empty_html_static:
            _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
        output_html = _empty_html_static[0]
    # NOTE(review): `else:` header elided — the normal transformation path.
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, sample=None):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
    sample=n: generate sample e-book (with at least n paragraphs)
    """
    # NOTE(review): this listing elides many lines inside transform();
    # each elision point is marked below — do not treat this fragment as
    # complete control flow.

    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """
        replace_characters(input_xml.getroot())

        # Child documents are referenced via DC relation.hasPart entries.
        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        # NOTE(review): the branch distinguishing the root file (`first`)
        # from included parts appears elided around the next lines.
        # write book title page
        html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
        chars = used_chars(html_tree.getroot())
        zip.writestr('OPS/title.html',
             etree.tostring(html_tree, method="html", pretty_print=True))
        # write title page for every parent
        if sample is not None and sample <= 0:
            # Sample quota exhausted: emit the placeholder chunk instead.
            html_string = open(get_resource('epub/emptyChunk.html')).read()
        # NOTE(review): `else:` header elided.
            html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            html_string = etree.tostring(html_tree, method="html", pretty_print=True)
        zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
        add_to_manifest(manifest, chunk_counter)
        add_to_spine(spine, chunk_counter)
        # NOTE(review): a chunk_counter increment appears elided here.

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        # NOTE(review): `else:` header elided.
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                # NOTE(review): body elided — presumably clears main_text
                # for metadata-only files; confirm.

        if main_text is not None:
            for chunk_xml in chop(main_text):
                # NOTE(review): initialization of `empty` is elided here.
                if sample is not None:
                    # NOTE(review): lines marking the chunk empty once the
                    # sample quota runs out appear elided.
                    sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                # NOTE(review): chunk_counter increment elided.

        # Recurse into included child documents, threading the counter,
        # collected characters and remaining sample quota through.
        for child in children:
            child_xml = etree.parse(provider.by_uri(child))
            child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    # read metadata from the first file
    # NOTE(review): the if/elif chain validating slug vs file_path is
    # elided; only its raise/parse lines remain below.
    raise ValueError('slug or file_path should be specified, not both')
    f = open(file_path, 'r')
    input_xml = etree.parse(f)
    raise ValueError('either slug or file_path should be specified')
    input_xml = etree.parse(provider[slug])

    metadata = input_xml.find('.//'+RDFNS('Description'))
    # NOTE(review): the guard for missing metadata is elided before this raise.
    raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    if output_dir is not None:
        # NOTE(review): the make_dir branch header and error handling around
        # makedirs appear elided here.
        author = unicode(book_info.author)
        output_dir = os.path.join(output_dir, author)
        os.makedirs(output_dir)
        # NOTE(review): slug/file_path branch headers elided below.
        output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
        output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')

    # NOTE(review): `zip` shadows the builtin — consider renaming.
    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # The mimetype entry must be stored uncompressed (ZIP_STORED) per the
    # EPUB container (OCF) requirements.
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    zip.writestr(mime, 'application/epub+zip')
    zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
        '<rootfiles><rootfile full-path="OPS/content.opf" ' \
        'media-type="application/oebps-package+xml" />' \
        '</rootfiles></container>')
    zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
    zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))

    # The content.opf skeleton is generated from the metadata via XSLT.
    opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    spine = opf.find('.//' + OPFNS('spine'))

    # Container filled in by find_annotations() during chunk processing.
    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
        '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
        'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
        '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
        '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
        '</navPoint></navMap></ncx>')
    nav_map = toc_file[-1]  # the <navMap> element

    toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
    # NOTE(review): a guard around the next line appears elided.
    toc.add(u"Początek utworu", 1)
    # The title page takes playOrder 1, so content numbering starts at 2.
    toc_counter = toc.write_to_xml(nav_map, 2)

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        # Append a footnotes page to navMap, manifest and spine.
        nav_map.append(etree.fromstring(
            '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
            '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zip.writestr('OPS/annotations.html', etree.tostring(
            html_tree, method="html", pretty_print=True))

    # Subset the DejaVu fonts to just the characters actually used.
    tmpdir = mkdtemp('-librarian-epub')
    # NOTE(review): saving/restoring the working directory is partly elided.
    os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
    for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
        optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
            get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
        # NOTE(review): an `if verbose:` header appears elided here.
            print "Running font-optimizer"
            subprocess.check_call(optimizer_call)
        # NOTE(review): `else:` header elided — run the subsetter quietly.
            subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
        zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
    # NOTE(review): cleanup (rmtree of tmpdir, chdir back) appears elided.

    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))

    title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
    # NCX head requires these four dtb: meta entries.
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        # NOTE(review): a meta.set('name', st) call appears elided here.
        meta.set('content', '0')
        toc_file[0].append(meta)
    # Overwrite the placeholder values for uid and depth.
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    # docTitle gets the book title.
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))