1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
15 from tempfile import mkdtemp
16 from shutil import rmtree
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
21 from librarian.dcparser import BookInfo
23 from librarian import functions, get_resource
25 functions.reg_person_name()
def inner_xml(node):
    """ returns node's text and children as a string

    The node's own start/end tags are not part of the result; each child
    is serialized with etree.tostring().

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # NOTE(review): the `def` line, the doctest's expected output and the
    # closing docstring quotes were absent from the reviewed text; they are
    # restored from the doctest call and the body -- confirm.

    # node.text is None when the element has no leading character data.
    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    `text` is an XML fragment (character data and/or elements).  It is
    wrapped in a throw-away <x> root so that it can be parsed, then the
    parsed text and children are moved onto `node`.  The node's tag,
    attributes and tail are not touched.  Errors raised by
    etree.fromstring() for malformed input propagate to the caller.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # NOTE(review): the docstring terminator and the two assignments below
    # were absent from the reviewed text (the parsed fragment was never
    # applied to `node`); restored to match the doctest -- confirm.
    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text
    # Slice assignment replaces all existing children in one step.
    node[:] = p[:]
def node_name(node):
    """ Find out a node's name

    Works on a deep copy, so `node` itself is never modified.  Returns the
    text that remains after all descendant tags have been stripped.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    # NOTE(review): the `def` line, the docstring tail, the inner loop body
    # and the `return` were absent from the reviewed text.  The header and
    # return follow from the doctest; the loop body (blank the element but
    # keep its tail, so annotations/motifs do not leak into the name) is an
    # assumption -- confirm.
    tempnode = deepcopy(node)

    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            t = e.tail
            e.clear()
            e.tail = t
    # Merge all remaining descendants' text into tempnode.text.
    etree.strip_tags(tempnode, '*')
    return tempnode.text
def xslt(xml, sheet):
    """ Apply an XSLT stylesheet to a document and return the result.

    xml: an element or an element tree; a bare element is wrapped in an
        ElementTree first, because the transformation is invoked through
        the tree's xslt() method
    sheet: path of the stylesheet file
    """
    # NOTE(review): the `def` line was absent from the reviewed text; name
    # and parameters are taken from the body and from the call sites
    # (`xslt(chunk_xml, get_resource(...))`) -- confirm.
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
def replace_characters(node):
    """ Apply typographic replacements to a node's text, recursively.

    In the text and tail of `node` and of every descendant: removes U+FEFF,
    turns '---' into an em dash, '--' into an en dash, ',,' into a low
    opening quote, '"' into a closing quote and "'" into an apostrophe.
    The tree is modified in place; nothing is returned.
    """
    def replace_chars(text):
        # text/tail are None when absent; there is nothing to replace then.
        if text is None:
            return None
        # '---' must be handled before '--', otherwise an em dash would be
        # rendered as an en dash followed by a hyphen.
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    # NOTE(review): the None guard above, the body of this `if` and the
    # `for child in node` line were absent from the reviewed text.  The
    # guard and the loop are required by the surrounding code; emptying
    # 'uwaga'/'extra' nodes while keeping their tail is an assumption
    # (find_annotations also skips these tags) -- confirm.
    if node.tag in ('uwaga', 'extra'):
        t = node.tail
        node.clear()
        node.tail = t
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """ Collect annotation elements from `source` into `annotations`.

    annotations: element that receives a copy of every annotation found
    source: element whose descendants are searched (modified in place)
    part_no: number of the part being processed; stored on each copy as
        the 'part' attribute

    Every 'pe', 'pa', 'pt' or 'pr' element is copied into `annotations`
    with a 'number' attribute (1-based position in `annotations`).
    Subtrees of 'extra' and 'uwaga' elements are not searched.
    """
    # NOTE(review): the `for child in source` line, the `annotation.tail`
    # reset and the four statements that replace the original element's
    # content with its number were absent from the reviewed text and are
    # an assumption about the intended behaviour -- confirm.
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations)+1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # The text following the annotation belongs to the main text,
            # not to the collected copy.
            annotation.tail = ''
            annotations.append(annotation)
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    For every 'strofa' element (looked up through WLNS) the inner XML is
    split on '/' followed by a newline and each piece is wrapped in
    <wers_normalny>, unless it already starts with a verse tag.  The same
    is done first inside 'slowo_obce' and 'wyroznienie' children.
    Content without any such separator is left untouched.  The tree is
    modified in place.
    """
    # NOTE(review): the loop headers, the `else:` branches, the accumulator
    # initialisation and the `len(verses) > 1` guard were absent from the
    # reviewed text; they are restored from the names the visible lines
    # use and by symmetry with the visible `len(foreign_verses) > 1`
    # guard -- confirm.

    def wrap_verses(verses, passthrough):
        # Pieces that already start with one of the `passthrough` prefixes
        # are kept verbatim; everything else becomes a normal verse.
        return ''.join(
            verse if verse.startswith(passthrough)
            else ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
            for verse in verses)

    stanzas = tree.findall('.//' + WLNS('strofa'))
    for node in stanzas:
        for child_node in node:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                foreign_verses = inner_xml(child_node).split('/\n')
                if len(foreign_verses) > 1:
                    set_inner_xml(child_node,
                                  wrap_verses(foreign_verses, ('<wers',)))
        verses = inner_xml(node).split('/\n')
        if len(verses) > 1:
            set_inner_xml(node, wrap_verses(verses, ('<wers', '<extra')))
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    manifest: the OPF manifest element
    partno: number of the part; the item gets id 'part<N>' and points to
        'part<N>.html'
    """
    # NOTE(review): the 'id' entry, the end of the attrib literal and the
    # append() call were absent from the reviewed text.  The id must equal
    # the idref written by add_to_spine ('part%d'), and without append()
    # the element is never attached -- confirm.
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    spine: the OPF spine element
    partno: number of the part; the itemref refers to item id 'part<N>'
    """
    # NOTE(review): the append() call was absent from the reviewed text;
    # without it the element is created but never added, contrary to the
    # docstring -- confirm.
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
class TOC(object):
    """ A table-of-contents tree for the generated EPUB.

    Each node has a display name, the href of the part file it points to
    and, for entries that are not parts of their own, a sub-number that
    becomes a '#sub<N>' fragment.  The root node only holds children.
    """
    # NOTE(review): the class header, the `depth`/`href`/`html` method
    # headers and several statements (attribute initialisation, `else:`
    # branches, `return`s, `texts` set-up) were absent from the reviewed
    # text.  They are restored from how the visible lines and the callers
    # use them (`toc.depth()`, `child.href()`, `toc.html()`,
    # `subnumber = toc.add(...)`, `counter = child.write_to_xml(...)`)
    # -- confirm.

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        # Set only for entries added with is_part=False.
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Add an entry and return its sub-number (None for parts).

        level: how many levels below this node the entry belongs; it is
            attached under the most recently added child at each level.
            If there is no child to descend into, the entry is added here.
        is_part: False for an entry inside a part; it gets a sub-number
        index: position among this node's children (level 0 only)
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)
        t = TOC(name)
        t.part_href = part_href
        if index is not None:
            self.children.insert(index, t)
        else:
            self.children.append(t)
        if not is_part:
            # Computed after insertion, so the first sub-entry gets 2.
            t.sub_number = len(self.children) + 1
            return t.sub_number

    def append(self, toc):
        """ Attach another TOC as the last child of this node. """
        self.children.append(toc)

    def extend(self, toc):
        """ Attach the children of another TOC (not the TOC itself). """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """ Return the link target, with '#sub<N>' for sub-entries. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Append NCX navPoint elements for all descendants to nav_map.

        counter: id/playOrder of the first navPoint written
        Returns the next unused counter value.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            # Nested entries are numbered depth-first, continuing after
            # the current one.
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Return the entries as HTML <div> lines, indented 1em per level. """
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """ Return the HTML table of contents built from epub/toc.html. """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set with every character occurring in the text or tail of
    `element` or of any of its descendants.
    """
    # NOTE(review): the `return` was absent from the reviewed text although
    # every caller, including the recursion below, uses the result;
    # restored -- confirm.
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # In-place update avoids building a new set per child.
        chars.update(used_chars(child))
    return chars
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator.  Yields an <utwor><master>...</master></utwor> element for
    each chunk.  A new chunk starts at every 'naglowek_czesc', and at every
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' that does not
    directly follow a 'naglowek_czesc'.

    The SAME element object is yielded every time and refilled afterwards,
    so a chunk must be fully processed (or copied) before the generator is
    advanced.  If the text starts with a header, the first chunk is empty.
    """
    # NOTE(review): the `def` line, `name = one_part.tag` and the three
    # `yield part_xml` statements were absent from the reviewed text; they
    # are restored from the call `for chunk_xml in chop(main_text)` and
    # from the visible chunk-resetting assignments -- confirm.

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    # True while the previous node was a part header, so that a chapter
    # header right after it stays in the same chunk.
    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk element; its first child holds the content
        (modified in place)
    chunk_no: number of the chunk; TOC entries point to 'part<N>.html'
    annotations: element collecting annotations (see find_annotations)
    empty: if true, the chunk content is not transformed and the static
        epub/emptyChunk.html page is returned instead; the TOC entries are
        still created
    _empty_html_static: private; the shared default list is used on purpose
        as a per-process cache of the empty page, do not pass it
    """
    # NOTE(review): `toc = TOC()`, the `if empty:` / `else:` lines and
    # `chars = set()` were absent from the reviewed text; they are restored
    # because `toc` and `chars` are returned and `empty` is otherwise
    # unused -- confirm.

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # Stored on the element so the stylesheet can emit the anchor
            # the TOC entry links to -- presumably; verify in the XSLT.
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # Close the resource file instead of leaking the handle.
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
290 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
291 style=None, html_toc=False,
292 sample=None, cover=None, flags=None):
293 """ produces a EPUB file
295 provider: a DocProvider
296 slug: slug of file to process, available by provider
297 output_file: file-like object or path to output file
298 output_dir: path to directory to save output file to; either this or output_file must be present
299 make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
300 sample=n: generate sample e-book (with at least n paragraphs)
301 cover: a cover.Cover object
302 flags: less-advertising, without-fonts
305 def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
306 """ processes one input file and proceeds to its children """
308 replace_characters(input_xml.getroot())
310 children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
312 # every input file will have a TOC entry,
313 # pointing to starting chunk
314 toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), "part%d.html" % chunk_counter)
317 # write book title page
318 html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
319 chars = used_chars(html_tree.getroot())
320 zip.writestr('OPS/title.html',
321 etree.tostring(html_tree, method="html", pretty_print=True))
322 # add a title page TOC entry
323 toc.add(u"Strona tytułowa", "title.html")
325 # write title page for every parent
326 if sample is not None and sample <= 0:
328 html_string = open(get_resource('epub/emptyChunk.html')).read()
330 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
331 chars = used_chars(html_tree.getroot())
332 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
333 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
334 add_to_manifest(manifest, chunk_counter)
335 add_to_spine(spine, chunk_counter)
338 if len(input_xml.getroot()) > 1:
339 # rdf before style master
340 main_text = input_xml.getroot()[1]
342 # rdf in style master
343 main_text = input_xml.getroot()[0]
344 if main_text.tag == RDFNS('RDF'):
347 if main_text is not None:
348 for chunk_xml in chop(main_text):
350 if sample is not None:
354 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
355 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
357 toc.extend(chunk_toc)
358 chars = chars.union(chunk_chars)
359 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
360 add_to_manifest(manifest, chunk_counter)
361 add_to_spine(spine, chunk_counter)
365 for child in children:
366 child_xml = etree.parse(provider.by_uri(child))
367 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
368 toc.append(child_toc)
369 chars = chars.union(chunk_chars)
371 return toc, chunk_counter, chars, sample
373 # read metadata from the first file
376 raise ValueError('slug or file_path should be specified, not both')
377 f = open(file_path, 'r')
378 input_xml = etree.parse(f)
382 raise ValueError('either slug or file_path should be specified')
383 input_xml = etree.parse(provider[slug])
387 input_xml.getroot().set(flag, 'yes')
389 metadata = input_xml.find('.//'+RDFNS('Description'))
391 raise NoDublinCore('Document has no DublinCore - which is required.')
392 book_info = BookInfo.from_element(input_xml)
393 metadata = etree.ElementTree(metadata)
395 # if output to dir, create the file
396 if output_dir is not None:
398 author = unicode(book_info.author)
399 output_dir = os.path.join(output_dir, author)
401 os.makedirs(output_dir)
405 output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
407 output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
409 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
411 # write static elements
412 mime = zipfile.ZipInfo()
413 mime.filename = 'mimetype'
414 mime.compress_type = zipfile.ZIP_STORED
416 zip.writestr(mime, 'application/epub+zip')
417 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
418 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
419 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
420 'media-type="application/oebps-package+xml" />' \
421 '</rootfiles></container>')
422 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
423 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
425 style = get_resource('epub/style.css')
426 zip.write(style, os.path.join('OPS', 'style.css'))
428 opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
429 manifest = opf.find('.//' + OPFNS('manifest'))
430 guide = opf.find('.//' + OPFNS('guide'))
431 spine = opf.find('.//' + OPFNS('spine'))
434 cover_file = StringIO()
435 c = cover(book_info.author.readable(), book_info.title)
437 c_name = 'cover.%s' % c.ext()
438 zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
441 cover_tree = etree.parse(get_resource('epub/cover.html'))
442 cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
443 zip.writestr('OPS/cover.html', etree.tostring(
444 cover_tree, method="html", pretty_print=True))
446 manifest.append(etree.fromstring(
447 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
448 manifest.append(etree.fromstring(
449 '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
450 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
451 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
452 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
455 annotations = etree.Element('annotations')
457 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
458 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
459 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
460 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
462 nav_map = toc_file[-1]
465 manifest.append(etree.fromstring(
466 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
467 spine.append(etree.fromstring(
468 '<itemref idref="html_toc" />'))
469 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
471 toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
473 if len(toc.children) < 2:
474 toc.add(u"Początek utworu", "part1.html")
476 # Last modifications in container files and EPUB creation
477 if len(annotations) > 0:
478 toc.add("Przypisy", "annotations.html")
479 manifest.append(etree.fromstring(
480 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
481 spine.append(etree.fromstring(
482 '<itemref idref="annotations" />'))
483 replace_by_verse(annotations)
484 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
485 chars = chars.union(used_chars(html_tree.getroot()))
486 zip.writestr('OPS/annotations.html', etree.tostring(
487 html_tree, method="html", pretty_print=True))
489 toc.add("Strona redakcyjna", "last.html")
490 manifest.append(etree.fromstring(
491 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
492 spine.append(etree.fromstring(
493 '<itemref idref="last" />'))
494 html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
495 chars.update(used_chars(html_tree.getroot()))
496 zip.writestr('OPS/last.html', etree.tostring(
497 html_tree, method="html", pretty_print=True))
499 if not flags or not 'without-fonts' in flags:
501 tmpdir = mkdtemp('-librarian-epub')
504 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
505 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
506 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
507 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
509 print "Running font-optimizer"
510 subprocess.check_call(optimizer_call)
512 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
513 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
514 manifest.append(etree.fromstring(
515 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
519 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
521 title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
522 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
523 for st in attributes:
524 meta = toc_file.makeelement(NCXNS('meta'))
526 meta.set('content', '0')
527 toc_file[0].append(meta)
528 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
529 toc_file[0][1].set('content', str(toc.depth()))
530 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
534 toc.add(u"Spis treści", "toc.html", index=1)
535 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
536 toc.write_to_xml(nav_map)
537 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))