1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
# Import-time side effect: call the two registration helpers from librarian.functions.
# NOTE(review): presumably these register the person-name and 3-to-2-letter
# language-code helpers for use by the XSLT stylesheets applied below -- confirm
# in librarian.functions.
24 functions.reg_person_name()
25 functions.reg_lang_code_3to2()
def inner_xml(node):
    """ returns node's text and children as a string

    The node's own leading text comes first, followed by every child
    serialized with ``etree.tostring`` (the serialization of a child
    includes the tail text that follows it).

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # node.text is None (not '') when the element has no leading text.
    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    `text` is an XML fragment (mixed text and elements).  The previous
    text and children of `node` are replaced; the node's tag, attributes
    and tail are left alone.  Raises whatever ``etree.fromstring`` raises
    when the fragment is not well-formed.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # Wrap the fragment in a throw-away root so that mixed content parses
    # as one document.
    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text
    # Slice assignment swaps the whole child list in one step.
    node[:] = p[:]
# node_name: derive a plain-text name for a node (see the docstring and doctest below).
# NOTE(review): the `def` line, the end of the docstring, the body of the inner
# loop (numbered 63-65) and the final return are not visible in this view.
53 """ Find out a node's name
55 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
# Work on a deep copy so the caller's tree is left untouched.
59 tempnode = deepcopy(node)
# Visit every 'pe', 'pa', 'pt', 'pr' and 'motyw' descendant of the copy.
61 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
62 for e in tempnode.findall('.//%s' % p):
# Remove all remaining element tags from the copy, keeping their text in place.
66 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """ applies the XSLT stylesheet stored in file `sheet` to `xml`

    `xml` may be an element or an element tree; a bare element is wrapped
    in an ``etree.ElementTree`` first because the transformation is run
    through the tree's ``xslt()`` method.  Returns the result of that
    call.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    # The stylesheet file is only needed while it is being parsed.
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
# replace_characters: apply typographic substitutions to the text and tail of
# `node`, then recurse (see the last line of the block).
77 def replace_characters(node):
# Inner helper working on one string: removes U+FEFF, then maps "---" to U+2014,
# "--" to U+2013, ",," to U+201E, '"' to U+201D and "'" to U+2019.
# "---" has to be replaced before "--", otherwise it could never match.
# NOTE(review): lines numbered 79-80 are not visible; node.text / node.tail can be
# None, so a None guard presumably lives there -- confirm.
78 def replace_chars(text):
81 return text.replace(u"\ufeff", u"")\
82 .replace("---", u"\u2014")\
83 .replace("--", u"\u2013")\
84 .replace(",,", u"\u201E")\
85 .replace('"', u"\u201D")\
86 .replace("'", u"\u2019")
# NOTE(review): the body of this branch for 'uwaga' / 'extra' nodes (88-90) is
# not visible in this view.
87 if node.tag in ('uwaga', 'extra'):
91 node.text = replace_chars(node.text)
92 node.tail = replace_chars(node.tail)
# Recursive call; NOTE(review): the loop header that binds `child` (93) is not visible.
94 replace_characters(child)
# find_annotations: collect copies of annotation elements found under `source`
# into `annotations`, tagging each copy with a running number and the part number.
# NOTE(review): the loop header that binds `child` (98) and the statements numbered
# 104 and 106-109 are not visible in this view.
97 def find_annotations(annotations, source, part_no):
# The tags 'pe', 'pa', 'pt' and 'pr' are the ones treated as annotations.
99 if child.tag in ('pe', 'pa', 'pt', 'pr'):
100 annotation = deepcopy(child)
# Numbering is 1-based and continues across calls, since it is derived from the
# current size of the shared `annotations` container.
101 number = str(len(annotations)+1)
102 annotation.set('number', number)
103 annotation.set('part', str(part_no))
105 annotations.append(annotation)
# Do not descend into 'extra' / 'uwaga' elements.
110 if child.tag not in ('extra', 'uwaga'):
111 find_annotations(annotations, child, part_no)
# Stanza: helper that rebuilds a stanza element as a sequence of verse elements.
# NOTE(review): several lines of this class are not visible in this view (the
# embedded numbering has gaps), including the docstring delimiters, the
# initialisation of `self.verses` (130) and the header of the method that starts
# at 134 (called as `versify()` in the doctest and in replace_by_verse).
114 class Stanza(object):
116 Converts / verse endings into verse elements in a stanza.
118 Slashes may only occur directly in the stanza. Any slashes in subelements
119 will be ignored, and the subelements will be put inside verse elements.
121 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
122 >>> Stanza(s).versify()
123 >>> print etree.tostring(s)
124 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
125 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
# Constructor: remembers the stanza element; no verse is open initially.
128 def __init__(self, stanza_elem):
129 self.stanza = stanza_elem
131 self.open_verse = None
# Body of the conversion: feed the stanza's leading text, then the tail of each
# child, into the verse builder; finally attach the collected verses to the
# stanza while keeping its tail.
# NOTE(review): the per-child call at 136 (presumably push_elem) and the
# statement at 139 between saving and restoring the tail are not visible -- confirm.
134 self.push_text(self.stanza.text)
135 for elem in self.stanza:
137 self.push_text(elem.tail)
138 tail = self.stanza.tail
140 self.stanza.tail = tail
141 self.stanza.extend(self.verses)
# Start a new 'wers_normalny' verse and make it the open one.
143 def open_normal_verse(self):
144 self.open_verse = self.stanza.makeelement("wers_normalny")
145 self.verses.append(self.open_verse)
# Return the open verse, creating a 'wers_normalny' one on demand.
147 def get_open_verse(self):
148 if self.open_verse is None:
149 self.open_normal_verse()
150 return self.open_verse
# Split `text` at every "/" that is followed by optional whitespace and a newline;
# each piece is added to the open verse, either as the tail of the verse's last
# child or as the verse's own text.
# NOTE(review): the guards at 153-154, 156, 159 and 161 are not visible; the
# call at 157 presumably runs only for pieces after the first -- confirm.
152 def push_text(self, text):
155 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
157 self.open_normal_verse()
158 verse = self.get_open_verse()
160 verse[-1].tail = (verse[-1].tail or "") + verse_text
162 verse.text = (verse.text or "") + verse_text
# Child elements whose tag starts with "wers" become verses themselves (as deep
# copies); any other element is deep-copied into the currently open verse.
# NOTE(review): the statements at 167, 170 and 172 are not visible in this view.
164 def push_elem(self, elem):
165 if elem.tag.startswith("wers"):
166 verse = deepcopy(elem)
168 self.verses.append(verse)
169 self.open_verse = verse
171 appended = deepcopy(elem)
173 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Every ``strofa`` descendant of `tree` (tag name resolved through
    ``WLNS``) is rewritten in place by ``Stanza(...).versify()``.
    Nothing is returned.
    """
    stanzas = tree.findall('.//' + WLNS('strofa'))
    for stanza in stanzas:
        Stanza(stanza).versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new ``item`` describes chunk number `partno`: it points at
    ``part<partno>.html`` and is identified as ``part<partno>``, which is
    the value the spine uses as ``idref`` for the same chunk.
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        # The id has to match the spine's idref for this chunk.
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new ``itemref`` refers to the manifest item ``part<partno>``.
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
# Constructor of the table-of-contents node (its methods are called on `TOC`
# instances elsewhere in this file).
# NOTE(review): the class header and the lines numbered 205-206 are not visible
# in this view; `self.children` and `self.name` are read by the other methods,
# so they are presumably initialised there -- confirm.
204 def __init__(self, name=None, part_href=None):
# Target file (optionally with a fragment) this entry links to.
207 self.part_href = part_href
# Set by add() for some entries; used to build a '#sub<N>' fragment.
208 self.sub_number = None
# add: create a new entry named `name` pointing at `part_href`.
#   level    -- how many levels below this node the entry should go; a positive
#               level is delegated to the last child when there is one
#   is_part  -- forwarded on delegation; NOTE(review): its use in this method
#               (line 221) is not visible in this view
#   index    -- position among the children; only allowed together with level 0
# NOTE(review): the lines numbered 214-215 (creation of `t`), 219, 221 and 223
# (return value) are not visible; callers use the return value as a sub-number.
210 def add(self, name, part_href, level=0, is_part=True, index=None):
211 assert level == 0 or index is None
212 if level > 0 and self.children:
213 return self.children[-1].add(name, part_href, level-1, is_part)
216 t.part_href = part_href
217 if index is not None:
218 self.children.insert(index, t)
220 self.children.append(t)
222 t.sub_number = len(self.children) + 1
def append(self, toc):
    """ Adds the node `toc` itself as the last child of this node. """
    self.children.append(toc)
def extend(self, toc):
    """ Adds the children of `toc` (not `toc` itself) after this node's children.

    `toc` keeps its own list of children unchanged.
    """
    self.children.extend(toc.children)
# Depth of the tree below this node: one more than the deepest child.
# NOTE(review): the `def` line (called as `depth()` below and in transform) and
# the lines numbered 232 and 234-235 are not visible; max() over an empty
# sequence raises, so the childless case is presumably handled there -- confirm.
233 return max((c.depth() for c in self.children)) + 1
# Tail of the method that builds this entry's link target (called as `href()`
# by write_to_xml and html_part): entries with a sub-number get a '#sub<N>' fragment.
# NOTE(review): the `def` line, the initialisation of `src` and the return are
# not visible in this view.
239 if self.sub_number is not None:
240 src += '#sub%d' % self.sub_number
def write_to_xml(self, nav_map, counter=1):
    """ Writes this node's descendants into `nav_map` as NCX navPoints.

    For every child a ``navPoint`` is appended to `nav_map`, carrying the
    child's name as label and ``child.href()`` as content source; the
    child's own children are nested inside that navPoint.  `counter`
    supplies both the ``id`` suffix and the ``playOrder`` and grows by one
    per navPoint across the whole tree.

    Returns the next unused counter value.
    """
    for child in self.children:
        nav_point = nav_map.makeelement(NCXNS('navPoint'))
        nav_point.set('id', 'NavPoint-%d' % counter)
        nav_point.set('playOrder', str(counter))

        nav_label = nav_map.makeelement(NCXNS('navLabel'))
        text = nav_map.makeelement(NCXNS('text'))
        text.text = child.name
        nav_label.append(text)
        nav_point.append(nav_label)

        content = nav_map.makeelement(NCXNS('content'))
        content.set('src', child.href())
        nav_point.append(content)
        nav_map.append(nav_point)
        # The recursion hands back the counter so that numbering stays
        # continuous after a nested subtree.
        counter = child.write_to_xml(nav_point, counter + 1)
    return counter
def html_part(self, depth=0):
    """ Renders this node's descendants as indented HTML links.

    Each child becomes a ``<div>`` with a left margin of `depth` em that
    links to ``child.href()`` and shows ``child.name``; the child's own
    subtree follows, one level deeper.  Returns the fragments joined with
    newlines (an empty string when there are no children).
    """
    texts = []
    for child in self.children:
        texts.append(
            "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
            (depth, child.href(), child.name))
        texts.append(child.html_part(depth+1))
    return "\n".join(texts)
# Body of the method that renders the whole HTML table of contents (called as
# `toc.html()` in transform): the 'epub/toc.html' resource is read as UTF-8 and
# used as a %-format template that receives the output of html_part().
# NOTE(review): the `def` line is not visible in this view. `unicode` exists
# only in Python 2.
272 with open(get_resource('epub/toc.html')) as f:
273 t = unicode(f.read(), 'utf-8')
274 return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set with every character that occurs in the text or tail of
    `element` or of any of its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars = chars.union(used_chars(child))
    return chars
# chop: split `main_text` into chunks (called as `chop(main_text)` in transform).
# NOTE(review): the `def` line and many interior lines are not visible in this
# view (the embedded numbering has gaps): among them the initialisation of
# `is_scene` / `is_act`, every assignment of `name`, the `else:` headers and
# whatever hands each finished chunk to the caller.
286 """ divide main content of the XML file into chunks """
288 # prepare a container for each chunk: <utwor><master/></utwor>
289 part_xml = etree.Element('utwor')
290 etree.SubElement(part_xml, 'master')
291 main_xml_part = part_xml[0] # master
# True right after a 'naglowek_czesc' heading; while it is set, a following
# heading does not start a new chunk.
293 last_node_part = False
295 # the loop below is a workaround for a problem with EPUBs of drama e-books without acts
298 for one_part in main_text:
300 if name == 'naglowek_scena':
302 elif name == 'naglowek_akt':
305 for one_part in main_text:
# Drama that has scenes but no acts: only 'naglowek_czesc' and 'naglowek_scena'
# are handled as chunk boundaries in this branch.
307 if is_act is False and is_scene is True:
308 if name == 'naglowek_czesc':
310 last_node_part = True
# Slice assignment restarts the container with a copy of the heading.
311 main_xml_part[:] = [deepcopy(one_part)]
# NOTE(review): ("naglowek_scena") is a plain string, not a tuple, so `in`
# performs a substring test here; a one-element tuple needs a trailing comma.
312 elif not last_node_part and name in ("naglowek_scena"):
314 main_xml_part[:] = [deepcopy(one_part)]
316 main_xml_part.append(deepcopy(one_part))
317 last_node_part = False
# All other documents: parts, chapters, acts and 'srodtytul' headings are
# handled as chunk boundaries.
319 if name == 'naglowek_czesc':
321 last_node_part = True
322 main_xml_part[:] = [deepcopy(one_part)]
323 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
325 main_xml_part[:] = [deepcopy(one_part)]
327 main_xml_part.append(deepcopy(one_part))
328 last_node_part = False
# transform_chunk: convert one chunk and collect its table-of-contents entries.
#   chunk_xml    -- chunk tree; its first child holds the chunk's elements
#   chunk_no     -- chunk number, used in the 'part<N>.html' links
#   annotations  -- shared container that find_annotations() appends to
#   empty        -- NOTE(review): the `if empty:` / `else:` headers (344, 349) are
#                   not visible in this view; presumably selects the static empty page
#   _empty_html_static -- mutable default deliberately used as a cache across calls
#                   for the contents of 'epub/emptyChunk.html'
# NOTE(review): the creation of `toc` (335) and the line numbered 347 are not
# visible in this view either.
332 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
333 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
336 for element in chunk_xml[0]:
# NOTE(review): ("naglowek_czesc") is a plain string, not a tuple, so `in`
# performs a substring test; a one-element tuple needs a trailing comma.
337 if element.tag in ("naglowek_czesc"):
338 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
339 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
340 toc.add(node_name(element), "part%d.html" % chunk_no)
341 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
# Sub-headings go one level deeper; the value returned by add() is stored on
# the element as its 'sub' attribute.
342 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
343 element.set('sub', str(subnumber))
# Read the empty-chunk page only once.
# NOTE(review): the file object returned by open() is never closed explicitly.
345 if not _empty_html_static:
346 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
348 output_html = _empty_html_static[0]
# Regular path: extract annotations, build verses, run the chunk stylesheet,
# then record the characters used and serialize as HTML.
350 find_annotations(annotations, chunk_xml, chunk_no)
351 replace_by_verse(chunk_xml)
352 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
353 chars = used_chars(html_tree.getroot())
354 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
355 return output_html, toc, chars
# transform: build an EPUB archive from `wldoc` in a named temporary file and
# return it as OutputFile.from_filename(...).
# NOTE(review): many lines of this function are not visible in this view (the
# embedded numbering has gaps), so several branch headers (`if`, `else:`, `try:`)
# and some statements are missing below. The handling of `verbose`, `style`,
# `html_toc` and `cover` is therefore only partly visible.
358 def transform(wldoc, verbose=False,
359 style=None, html_toc=False,
360 sample=None, cover=None, flags=None):
361 """ produces a EPUB file
363 sample=n: generate sample e-book (with at least n paragraphs)
364 cover: a cover.Cover factory or True for default
365 flags: less-advertising, without-fonts, working-copy, with-full-fonts
# Nested helper: converts one document and then, recursively, its child documents.
# It writes into `zip`, `manifest`, `spine` and `annotations` of the enclosing
# scope and returns (toc, next chunk counter, used characters, remaining sample budget).
368 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
369 """ processes one input file and proceeds to its children """
371 replace_characters(wldoc.edoc.getroot())
373 # every input file will have a TOC entry,
374 # pointing to starting chunk
375 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
# NOTE(review): the branch header at 376-377 is not visible; presumably `if first:`.
378 # write book title page
379 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
380 chars = used_chars(html_tree.getroot())
381 zip.writestr('OPS/title.html',
382 etree.tostring(html_tree, method="html", pretty_print=True))
383 # add a title page TOC entry
384 toc.add(u"Strona tytułowa", "title.html")
385 elif wldoc.book_info.parts:
386 # write title page for every parent
# Once the sample budget is used up, the static empty page is written instead.
# NOTE(review): the file object returned by open() is never closed explicitly.
387 if sample is not None and sample <= 0:
389 html_string = open(get_resource('epub/emptyChunk.html')).read()
391 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
392 chars = used_chars(html_tree.getroot())
393 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
394 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
395 add_to_manifest(manifest, chunk_counter)
396 add_to_spine(spine, chunk_counter)
# Locate the main text: second child of the root when there is more than one
# child, otherwise the first child unless that child is the RDF block.
399 if len(wldoc.edoc.getroot()) > 1:
400 # rdf before style master
401 main_text = wldoc.edoc.getroot()[1]
403 # rdf in style master
404 main_text = wldoc.edoc.getroot()[0]
405 if main_text.tag == RDFNS('RDF'):
408 if main_text is not None:
409 for chunk_xml in chop(main_text):
# NOTE(review): the lines that derive `empty` from the sample budget (410,
# 412-414) are not visible in this view.
411 if sample is not None:
# Count stanzas and paragraphs against the sample budget. The XPath starts
# with '//', so it is evaluated from the document root of chunk_xml.
415 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
416 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
418 toc.extend(chunk_toc)
419 chars = chars.union(chunk_chars)
420 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
421 add_to_manifest(manifest, chunk_counter)
422 add_to_spine(spine, chunk_counter)
# Recurse into child documents, threading the chunk counter and sample budget.
425 for child in wldoc.parts():
426 child_toc, chunk_counter, chunk_chars, sample = transform_file(
427 child, chunk_counter, first=False, sample=sample)
428 toc.append(child_toc)
429 chars = chars.union(chunk_chars)
431 return toc, chunk_counter, chars, sample
# ---- body of transform ----
# Work on a deep copy, because attributes are set on the document root below.
434 document = deepcopy(wldoc)
# NOTE(review): the loop over `flags` that binds `flag` (435-438) is not visible.
439 document.edoc.getroot().set(flag, 'yes')
# Expose editors, funders and thanks as root attributes (the stylesheets
# presumably read them -- confirm).
442 document.edoc.getroot().set('editors', u', '.join(sorted(
443 editor.readable() for editor in document.editors())))
444 if document.book_info.funders:
445 document.edoc.getroot().set('funders', u', '.join(
446 document.book_info.funders))
447 if document.book_info.thanks:
448 document.edoc.getroot().set('thanks', document.book_info.thanks)
# Build content.opf from the book metadata and keep handles to its sections.
450 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
451 manifest = opf.find('.//' + OPFNS('manifest'))
452 guide = opf.find('.//' + OPFNS('guide'))
453 spine = opf.find('.//' + OPFNS('spine'))
# delete=False: the temporary file must outlive this function, its name is returned.
# NOTE(review): the local name `zip` shadows the builtin of the same name.
455 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
456 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
458 # write static elements
# The 'mimetype' entry is written first and stored uncompressed.
459 mime = zipfile.ZipInfo()
460 mime.filename = 'mimetype'
461 mime.compress_type = zipfile.ZIP_STORED
463 zip.writestr(mime, 'application/epub+zip')
464 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
465 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
466 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
467 'media-type="application/oebps-package+xml" />' \
468 '</rootfiles></container>')
469 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
470 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
# NOTE(review): the guard at 471 (presumably `if not style:`) is not visible.
472 style = get_resource('epub/style.css')
473 zip.write(style, os.path.join('OPS', 'style.css'))
# Cover: NOTE(review): the guards at 474-476 and 478 are not visible; per the
# docstring, `cover` is a factory or True for the default one.
477 cover = DefaultEbookCover
# Render the cover image into memory and store it under OPS/cover.<ext>.
479 cover_file = StringIO()
480 bound_cover = cover(document.book_info)
481 bound_cover.save(cover_file)
482 cover_name = 'cover.%s' % bound_cover.ext()
483 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
# Point the cover page template at the generated image.
486 cover_tree = etree.parse(get_resource('epub/cover.html'))
487 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
488 zip.writestr('OPS/cover.html', etree.tostring(
489 cover_tree, method="html", pretty_print=True))
491 if bound_cover.uses_dc_cover:
492 if document.book_info.cover_by:
493 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
494 if document.book_info.cover_source:
495 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
# Register the cover page and image in manifest, spine, metadata and guide.
497 manifest.append(etree.fromstring(
498 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
499 manifest.append(etree.fromstring(
500 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
501 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
502 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
503 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
# Shared container filled by find_annotations() while chunks are transformed.
506 annotations = etree.Element('annotations')
# Skeleton of toc.ncx. NOTE(review): the last fragment of this literal (512)
# is not visible in this view.
508 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
509 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
510 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
511 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
513 nav_map = toc_file[-1]
# Optional HTML table of contents. NOTE(review): the guard at 514-515
# (presumably `if html_toc:`) is not visible.
516 manifest.append(etree.fromstring(
517 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
518 spine.append(etree.fromstring(
519 '<itemref idref="html_toc" />'))
520 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
# Convert the document tree; this writes all part<N>.html files.
522 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
# Add a "beginning of the work" entry when the TOC has fewer than two entries.
524 if len(toc.children) < 2:
525 toc.add(u"Początek utworu", "part1.html")
527 # Last modifications in container files and EPUB creation
# Annotations page, only when at least one annotation was collected.
528 if len(annotations) > 0:
529 toc.add("Przypisy", "annotations.html")
530 manifest.append(etree.fromstring(
531 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
532 spine.append(etree.fromstring(
533 '<itemref idref="annotations" />'))
534 replace_by_verse(annotations)
535 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
536 chars = chars.union(used_chars(html_tree.getroot()))
537 zip.writestr('OPS/annotations.html', etree.tostring(
538 html_tree, method="html", pretty_print=True))
# Static support page; its characters count towards the font subset as well.
# NOTE(review): the file object returned by open() is never closed explicitly.
540 toc.add("Wesprzyj Wolne Lektury", "support.html")
541 manifest.append(etree.fromstring(
542 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
543 spine.append(etree.fromstring(
544 '<itemref idref="support" />'))
545 html_string = open(get_resource('epub/support.html')).read()
546 chars.update(used_chars(etree.fromstring(html_string)))
547 zip.writestr('OPS/support.html', html_string)
# Editorial page generated from the document.
549 toc.add("Strona redakcyjna", "last.html")
550 manifest.append(etree.fromstring(
551 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
552 spine.append(etree.fromstring(
553 '<itemref idref="last" />'))
554 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
555 chars.update(used_chars(html_tree.getroot()))
556 zip.writestr('OPS/last.html', etree.tostring(
557 html_tree, method="html", pretty_print=True))
# Fonts are embedded unless the 'without-fonts' flag is given.
559 if not flags or not 'without-fonts' in flags:
561 tmpdir = mkdtemp('-librarian-epub')
# NOTE(review): os.chdir() changes the working directory of the whole process;
# the lines that would save and restore it, and the cleanup of `tmpdir`
# (rmtree is imported), are not visible in this view.
567 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
568 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
# Unless 'with-full-fonts' is given, subset each font to the characters that
# were actually used, via the external perl script subset.pl.
569 if not flags or not 'with-full-fonts' in flags:
570 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
571 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
# NOTE(review): the `if verbose:` / `else:` headers (572, 575) are not visible.
# `print` is used as a Python 2 statement here.
573 print "Running font-optimizer"
574 subprocess.check_call(optimizer_call)
576 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
577 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
# Full-font path: embed the font resource unchanged.
579 zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
580 manifest.append(etree.fromstring(
581 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
# content.opf is written only now, after all manifest and spine entries were added.
585 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
586 title = document.book_info.title
# Fill the NCX head: four meta elements initialised to '0', then the first
# two are overwritten with the book URL and the TOC depth.
# NOTE(review): the line that sets each meta's name (590) is not visible.
587 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
588 for st in attributes:
589 meta = toc_file.makeelement(NCXNS('meta'))
591 meta.set('content', '0')
592 toc_file[0].append(meta)
593 toc_file[0][0].set('content', str(document.book_info.url))
594 toc_file[0][1].set('content', str(toc.depth()))
# NOTE(review): `title` is inserted into XML markup without escaping here.
595 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
# NOTE(review): the guard at 596-598 (presumably `if html_toc:`) is not visible.
599 toc.add(u"Spis treści", "toc.html", index=1)
600 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
601 toc.write_to_xml(nav_map)
602 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
# NOTE(review): the lines that close `zip` and `output_file` (603-604) are not
# visible in this view.
605 return OutputFile.from_filename(output_file.name)