1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
15 from tempfile import mkdtemp
16 from shutil import rmtree
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
21 from librarian.dcparser import BookInfo
23 from librarian import functions, get_resource
25 functions.reg_person_name()
def inner_xml(node):
    """ Return the node's text and its serialized children as one string.

    Children are serialized with etree.tostring(), so the tail text that
    follows a child ends up right after that child's markup.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # node.text is None when the node has no leading text
    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ Replace the node's text and children with the markup given in `text`.

    `text` must be a well-formed XML fragment (leading text followed by any
    number of elements); a parse error from etree.fromstring() propagates.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # Wrap the fragment in a throwaway root so that leading text and several
    # sibling elements parse as a single document.
    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text
    # slice assignment drops the old children and adopts the parsed ones
    node[:] = p[:]
53 """ Find out a node's name
55 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
59 tempnode = deepcopy(node)
61 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
62 for e in tempnode.findall('.//%s' % p):
66 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored in the file `sheet` to `xml`.

    `xml` may be an element or an element tree. A bare element is wrapped
    in an ElementTree first, because the transformation is run through the
    tree's xslt() method. Returns the result of that call.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
def replace_characters(node):
    """ Apply typographic substitutions to `node` and all its descendants.

    The text and tail of every element are rewritten in place: the BOM
    character is removed, '---' and '--' become em and en dashes, ',,'
    becomes a low opening quote, '"' a closing quote and "'" a typographic
    apostrophe.
    """
    def replace_chars(text):
        # text/tail is None when absent; there is nothing to substitute
        if text is None:
            return None
        # '---' has to be handled before '--', otherwise an em dash would
        # come out as an en dash followed by a hyphen
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
93 def find_annotations(annotations, source, part_no):
95 if child.tag in ('pe', 'pa', 'pt', 'pr'):
96 annotation = deepcopy(child)
97 number = str(len(annotations)+1)
98 annotation.set('number', number)
99 annotation.set('part', str(part_no))
101 annotations.append(annotation)
106 if child.tag not in ('extra', 'uwaga'):
107 find_annotations(annotations, child, part_no)
def _wrap_verses(verses, ready_prefixes):
    """ Join verse fragments, wrapping each in <wers_normalny> unless it
    already starts with one of the `ready_prefixes` (a tuple of strings). """
    return ''.join(
        verse if verse.startswith(ready_prefixes)
        else ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
        for verse in verses)


def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Every stanza element found in `tree` is rewritten in place: its inner
    XML is split on '/' followed by a newline and each resulting fragment
    becomes a <wers_normalny> element, unless the fragment already starts
    with a verse tag ('<wers...') or with '<extra'. Stanzas without such a
    separator are left untouched.
    """
    for node in tree.findall('.//' + WLNS('strofa')):
        # Foreign-word and emphasis spans are split first, so that a span
        # running across a verse break holds verse elements of its own.
        for child_node in node:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                foreign_verses = inner_xml(child_node).split('/\n')
                if len(foreign_verses) > 1:
                    set_inner_xml(child_node,
                                  _wrap_verses(foreign_verses, ('<wers',)))
        verses = inner_xml(node).split('/\n')
        if len(verses) > 1:
            set_inner_xml(node, _wrap_verses(verses, ('<wers', '<extra')))
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The item describes the chunk file 'part<partno>.html'. Its id is
    'part<partno>', the same value add_to_spine() uses as idref.
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The itemref points at the manifest item with id 'part<partno>'.
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
class TOC(object):
    """ A node of the table-of-contents tree.

    Each node has a name, the number of the part file it points to and,
    for entries that are not parts themselves, a sub-number used as an
    anchor ('#sub<n>') inside that part file.
    """

    def __init__(self, name=None, part_number=None):
        self.children = []
        self.name = name
        self.part_number = part_number
        self.sub_number = None

    def add(self, name, part_number, level=0, is_part=True):
        """ Add a new entry `level` levels below this node.

        The entry is pushed down along the most recently added children;
        when there is no child to descend into it is attached right here.
        Returns the sub-number given to the entry when `is_part` is false,
        otherwise None.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)
        else:
            t = TOC(name)
            t.part_number = part_number
            self.children.append(t)
            if not is_part:
                # counted after appending, so the first entry gets number 2
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """ Attach `toc` itself as the last child of this node. """
        self.children.append(toc)

    def extend(self, toc):
        """ Attach all children of `toc` (not `toc` itself) to this node. """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def write_to_xml(self, nav_map, counter):
        """ Append a navPoint element for every descendant to `nav_map`.

        `counter` is the number used for the id and playOrder of the first
        navPoint written; numbering continues depth-first. Returns the
        first number that has not been used.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            src = 'part%d.html' % child.part_number
            if child.sub_number is not None:
                src += '#sub%d' % child.sub_number
            content.set('src', src)
            nav_point.append(content)
            nav_map.append(nav_point)
            # nested entries become children of this navPoint
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set with every character found in the text or tail of
    `element` or of any of its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # update in place instead of building a new set for every child
        chars.update(used_chars(child))
    return chars
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. Each chunk is a <utwor><master>...</master></utwor> tree
    holding deep copies of consecutive children of `main_text`. A new chunk
    starts at every 'naglowek_czesc' and at every chapter-level heading,
    except for a chapter-level heading that directly follows a
    'naglowek_czesc' (it stays in the same chunk).

    The same container element is reused for all chunks, so a yielded chunk
    has to be processed before the generator is advanced. The first chunk
    is empty when `main_text` starts with a heading.
    """
    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0] # master

    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            # hand over what has been collected, then start afresh
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk tree; its first child holds the chunk's content
    chunk_no: number of the part file this chunk is written to
    annotations: element collecting the annotations found in the chunk
    empty: if true, the chunk's text is not rendered and the contents of
        'epub/emptyChunk.html' are returned instead; TOC entries are
        still collected
    _empty_html_static: internal; the mutable default is intentional and
        caches the empty-chunk HTML between calls
    """
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # stored on the element so the stylesheet can see the number
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # read the template once; close the file instead of leaking it
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
265 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
266 sample=None, cover=None, flags=None):
267 """ produces a EPUB file
269 provider: a DocProvider
270 slug: slug of file to process, available by provider
271 output_file: file-like object or path to output file
272 output_dir: path to directory to save output file to; either this or output_file must be present
273 make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
274 sample=n: generate sample e-book (with at least n paragraphs)
275 cover: a cover.Cover object
276 flags: less-advertising,
279 def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
280 """ processes one input file and proceeds to its children """
282 replace_characters(input_xml.getroot())
284 children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
286 # every input file will have a TOC entry,
287 # pointing to starting chunk
288 toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
291 # write book title page
292 html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
293 chars = used_chars(html_tree.getroot())
294 zip.writestr('OPS/title.html',
295 etree.tostring(html_tree, method="html", pretty_print=True))
297 # write title page for every parent
298 if sample is not None and sample <= 0:
300 html_string = open(get_resource('epub/emptyChunk.html')).read()
302 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
303 chars = used_chars(html_tree.getroot())
304 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
305 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
306 add_to_manifest(manifest, chunk_counter)
307 add_to_spine(spine, chunk_counter)
310 if len(input_xml.getroot()) > 1:
311 # rdf before style master
312 main_text = input_xml.getroot()[1]
314 # rdf in style master
315 main_text = input_xml.getroot()[0]
316 if main_text.tag == RDFNS('RDF'):
319 if main_text is not None:
320 for chunk_xml in chop(main_text):
322 if sample is not None:
326 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
327 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
329 toc.extend(chunk_toc)
330 chars = chars.union(chunk_chars)
331 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
332 add_to_manifest(manifest, chunk_counter)
333 add_to_spine(spine, chunk_counter)
337 for child in children:
338 child_xml = etree.parse(provider.by_uri(child))
339 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
340 toc.append(child_toc)
341 chars = chars.union(chunk_chars)
343 return toc, chunk_counter, chars, sample
345 # read metadata from the first file
348 raise ValueError('slug or file_path should be specified, not both')
349 f = open(file_path, 'r')
350 input_xml = etree.parse(f)
354 raise ValueError('either slug or file_path should be specified')
355 input_xml = etree.parse(provider[slug])
359 input_xml.getroot().set(flag, 'yes')
361 metadata = input_xml.find('.//'+RDFNS('Description'))
363 raise NoDublinCore('Document has no DublinCore - which is required.')
364 book_info = BookInfo.from_element(input_xml)
365 metadata = etree.ElementTree(metadata)
367 # if output to dir, create the file
368 if output_dir is not None:
370 author = unicode(book_info.author)
371 output_dir = os.path.join(output_dir, author)
373 os.makedirs(output_dir)
377 output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
379 output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
381 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
383 # write static elements
384 mime = zipfile.ZipInfo()
385 mime.filename = 'mimetype'
386 mime.compress_type = zipfile.ZIP_STORED
388 zip.writestr(mime, 'application/epub+zip')
389 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
390 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
391 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
392 'media-type="application/oebps-package+xml" />' \
393 '</rootfiles></container>')
394 zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
395 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
397 opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
398 manifest = opf.find('.//' + OPFNS('manifest'))
399 spine = opf.find('.//' + OPFNS('spine'))
402 cover_file = StringIO()
403 c = cover(book_info.author.readable(), book_info.title)
405 c_name = 'cover.%s' % c.ext()
406 zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
409 cover_tree = etree.parse(get_resource('epub/cover.html'))
410 cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
411 zip.writestr('OPS/cover.html', etree.tostring(
412 cover_tree, method="html", pretty_print=True))
414 manifest.append(etree.fromstring(
415 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
416 manifest.append(etree.fromstring(
417 '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
418 spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
419 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
420 opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
423 annotations = etree.Element('annotations')
425 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
426 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
427 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
428 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
429 '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
430 '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
431 '</navPoint></navMap></ncx>')
432 nav_map = toc_file[-1]
434 toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
437 toc.add(u"Początek utworu", 1)
438 toc_counter = toc.write_to_xml(nav_map, 2)
440 # Last modifications in container files and EPUB creation
441 if len(annotations) > 0:
442 nav_map.append(etree.fromstring(
443 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
444 '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
446 manifest.append(etree.fromstring(
447 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
448 spine.append(etree.fromstring(
449 '<itemref idref="annotations" />'))
450 replace_by_verse(annotations)
451 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
452 chars = chars.union(used_chars(html_tree.getroot()))
453 zip.writestr('OPS/annotations.html', etree.tostring(
454 html_tree, method="html", pretty_print=True))
456 nav_map.append(etree.fromstring(
457 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
458 '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
459 manifest.append(etree.fromstring(
460 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
461 spine.append(etree.fromstring(
462 '<itemref idref="last" />'))
463 html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
464 chars.update(used_chars(html_tree.getroot()))
465 zip.writestr('OPS/last.html', etree.tostring(
466 html_tree, method="html", pretty_print=True))
469 tmpdir = mkdtemp('-librarian-epub')
472 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
473 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
474 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
475 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
477 print "Running font-optimizer"
478 subprocess.check_call(optimizer_call)
480 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
481 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
485 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
487 title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
488 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
489 for st in attributes:
490 meta = toc_file.makeelement(NCXNS('meta'))
492 meta.set('content', '0')
493 toc_file[0].append(meta)
494 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
495 toc_file[0][1].set('content', str(toc.depth()))
496 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
497 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))