1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 from librarian.hyphenator import Hyphenator
# Import-time setup: call the two registration hooks from librarian.functions
# (presumably they expose person_name / lang_code_3to2 helpers to the XSLT
# stylesheets used below -- TODO confirm against librarian.functions).
functions.reg_person_name()
functions.reg_lang_code_3to2()

# Module-wide Hyphenator, loaded once from the bundled hyph_pl_PL dictionary
# resource and shared by every conversion that runs in this process.
hyph = Hyphenator(get_resource('res/hyph-dictionaries/hyph_pl_PL.dic'))
31 def hyphenate_and_fix_conjunctions(source_tree):
32 """ hyphenate only powiesc, opowiadanie and wywiad tag"""
33 texts = etree.XPath('//*[self::powiesc|self::opowiadanie|self::wywiad]//text()')(source_tree)
35 parent = t.getparent()
37 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
39 newt += hyph.inserted(w, u'\u00AD')
40 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
47 """ returns node's text and children as a string
49 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
53 nt = node.text if node.text is not None else ''
54 return ''.join([nt] + [etree.tostring(child) for child in node])
56 def set_inner_xml(node, text):
57 """ sets node's text and children from a string
59 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
60 >>> set_inner_xml(e, 'x<b>y</b>z')
61 >>> print etree.tostring(e)
65 p = etree.fromstring('<x>%s</x>' % text)
71 """ Find out a node's name
73 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
77 tempnode = deepcopy(node)
79 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
80 for e in tempnode.findall('.//%s' % p):
84 etree.strip_tags(tempnode, '*')
89 if isinstance(xml, etree._Element):
90 xml = etree.ElementTree(xml)
91 with open(sheet) as xsltf:
92 return xml.xslt(etree.parse(xsltf))
95 def replace_characters(node):
96 def replace_chars(text):
99 return text.replace(u"\ufeff", u"")\
100 .replace("---", u"\u2014")\
101 .replace("--", u"\u2013")\
102 .replace(",,", u"\u201E")\
103 .replace('"', u"\u201D")\
104 .replace("'", u"\u2019")
105 if node.tag in ('uwaga', 'extra'):
109 node.text = replace_chars(node.text)
110 node.tail = replace_chars(node.tail)
112 replace_characters(child)
115 def find_annotations(annotations, source, part_no):
117 if child.tag in ('pe', 'pa', 'pt', 'pr'):
118 annotation = deepcopy(child)
119 number = str(len(annotations)+1)
120 annotation.set('number', number)
121 annotation.set('part', str(part_no))
123 annotations.append(annotation)
128 if child.tag not in ('extra', 'uwaga'):
129 find_annotations(annotations, child, part_no)
132 class Stanza(object):
134 Converts / verse endings into verse elements in a stanza.
136 Slashes may only occur directly in the stanza. Any slashes in subelements
137 will be ignored, and the subelements will be put inside verse elements.
139 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
140 >>> Stanza(s).versify()
141 >>> print etree.tostring(s)
142 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
143 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
146 def __init__(self, stanza_elem):
147 self.stanza = stanza_elem
149 self.open_verse = None
152 self.push_text(self.stanza.text)
153 for elem in self.stanza:
155 self.push_text(elem.tail)
156 tail = self.stanza.tail
158 self.stanza.tail = tail
159 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """ Starts a new `wers_normalny` verse and makes it the open one.

    The element is created through the stanza's own makeelement(), stored
    as `self.open_verse` and recorded at the end of `self.verses`.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """ Returns the verse currently being filled.

    When no verse is open yet, a normal verse is opened first through
    open_normal_verse(), so the caller always gets `self.open_verse` as it
    stands after that call.
    """
    if self.open_verse is not None:
        return self.open_verse
    # Nothing open yet: create a verse on demand, then hand it back.
    self.open_normal_verse()
    return self.open_verse
170 def push_text(self, text):
173 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
175 self.open_normal_verse()
176 verse = self.get_open_verse()
178 verse[-1].tail = (verse[-1].tail or "") + verse_text
180 verse.text = (verse.text or "") + verse_text
182 def push_elem(self, elem):
183 if elem.tag.startswith("wers"):
184 verse = deepcopy(elem)
186 self.verses.append(verse)
187 self.open_verse = verse
189 appended = deepcopy(elem)
191 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Rewrites every stanza below `tree` into explicit verse elements.

    Each `strofa` descendant is wrapped in a Stanza, whose versify() turns
    the '/' verse endings into verse elements in place.
    """
    for stanza_elem in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza_elem).versify()
202 def add_to_manifest(manifest, partno):
203 """ Adds a node to the manifest section in content.opf file """
205 partstr = 'part%d' % partno
206 e = manifest.makeelement(OPFNS('item'), attrib={
208 'href': partstr + '.html',
209 'media-type': 'application/xhtml+xml',
214 def add_to_spine(spine, partno):
215 """ Adds a node to the spine section in content.opf file """
217 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
222 def __init__(self, name=None, part_href=None):
225 self.part_href = part_href
226 self.sub_number = None
228 def add(self, name, part_href, level=0, is_part=True, index=None):
229 assert level == 0 or index is None
230 if level > 0 and self.children:
231 return self.children[-1].add(name, part_href, level-1, is_part)
234 t.part_href = part_href
235 if index is not None:
236 self.children.insert(index, t)
238 self.children.append(t)
240 t.sub_number = len(self.children) + 1
def append(self, toc):
    """ Adds the given TOC object itself as the last child of this one """
    self.children.append(toc)
def extend(self, toc):
    """ Adds the children of the given TOC (not the TOC itself) to this one """
    self.children.extend(toc.children)
251 return max((c.depth() for c in self.children)) + 1
257 if self.sub_number is not None:
258 src += '#sub%d' % self.sub_number
261 def write_to_xml(self, nav_map, counter=1):
262 for child in self.children:
263 nav_point = nav_map.makeelement(NCXNS('navPoint'))
264 nav_point.set('id', 'NavPoint-%d' % counter)
265 nav_point.set('playOrder', str(counter))
267 nav_label = nav_map.makeelement(NCXNS('navLabel'))
268 text = nav_map.makeelement(NCXNS('text'))
269 text.text = re.sub(r'\n', ' ', child.name)
270 nav_label.append(text)
271 nav_point.append(nav_label)
273 content = nav_map.makeelement(NCXNS('content'))
274 content.set('src', child.href())
275 nav_point.append(content)
276 nav_map.append(nav_point)
277 counter = child.write_to_xml(nav_point, counter + 1)
280 def html_part(self, depth=0):
282 for child in self.children:
284 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
285 (depth, child.href(), child.name))
286 texts.append(child.html_part(depth+1))
287 return "\n".join(texts)
290 with open(get_resource('epub/toc.html')) as f:
291 t = unicode(f.read(), 'utf-8')
292 return t % self.html_part()
295 def used_chars(element):
296 """ Lists characters used in an ETree Element """
297 chars = set((element.text or '') + (element.tail or ''))
298 for child in element:
299 chars = chars.union(used_chars(child))
304 """ divide main content of the XML file into chunks """
306 # prepare a container for each chunk
307 part_xml = etree.Element('utwor')
308 etree.SubElement(part_xml, 'master')
309 main_xml_part = part_xml[0] # master
311 last_node_part = False
# the loop below is a workaround for a problem with epubs in drama ebooks without acts
316 for one_part in main_text:
318 if name == 'naglowek_scena':
320 elif name == 'naglowek_akt':
323 for one_part in main_text:
325 if is_act is False and is_scene is True:
326 if name == 'naglowek_czesc':
328 last_node_part = True
329 main_xml_part[:] = [deepcopy(one_part)]
330 elif not last_node_part and name == "naglowek_scena":
332 main_xml_part[:] = [deepcopy(one_part)]
334 main_xml_part.append(deepcopy(one_part))
335 last_node_part = False
337 if name == 'naglowek_czesc':
339 last_node_part = True
340 main_xml_part[:] = [deepcopy(one_part)]
341 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
343 main_xml_part[:] = [deepcopy(one_part)]
345 main_xml_part.append(deepcopy(one_part))
346 last_node_part = False
350 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
351 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
354 for element in chunk_xml[0]:
355 if element.tag == "naglowek_czesc":
356 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
357 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
358 toc.add(node_name(element), "part%d.html" % chunk_no)
359 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
360 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
361 element.set('sub', str(subnumber))
363 if not _empty_html_static:
364 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
366 output_html = _empty_html_static[0]
368 find_annotations(annotations, chunk_xml, chunk_no)
369 replace_by_verse(chunk_xml)
370 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
371 chars = used_chars(html_tree.getroot())
372 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
373 return output_html, toc, chars
376 def transform(wldoc, verbose=False,
377 style=None, html_toc=False,
378 sample=None, cover=None, flags=None):
379 """ produces a EPUB file
381 sample=n: generate sample e-book (with at least n paragraphs)
382 cover: a cover.Cover factory or True for default
383 flags: less-advertising, without-fonts, working-copy, with-full-fonts
386 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
387 """ processes one input file and proceeds to its children """
389 replace_characters(wldoc.edoc.getroot())
390 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot())
392 # every input file will have a TOC entry,
393 # pointing to starting chunk
394 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
397 # write book title page
398 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
399 chars = used_chars(html_tree.getroot())
400 zip.writestr('OPS/title.html',
401 etree.tostring(html_tree, method="html", pretty_print=True))
402 # add a title page TOC entry
403 toc.add(u"Strona tytułowa", "title.html")
404 elif wldoc.book_info.parts:
405 # write title page for every parent
406 if sample is not None and sample <= 0:
408 html_string = open(get_resource('epub/emptyChunk.html')).read()
410 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
411 chars = used_chars(html_tree.getroot())
412 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
413 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
414 add_to_manifest(manifest, chunk_counter)
415 add_to_spine(spine, chunk_counter)
418 if len(wldoc.edoc.getroot()) > 1:
419 # rdf before style master
420 main_text = wldoc.edoc.getroot()[1]
422 # rdf in style master
423 main_text = wldoc.edoc.getroot()[0]
424 if main_text.tag == RDFNS('RDF'):
427 if main_text is not None:
428 for chunk_xml in chop(main_text):
430 if sample is not None:
434 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
435 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
437 toc.extend(chunk_toc)
438 chars = chars.union(chunk_chars)
439 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
440 add_to_manifest(manifest, chunk_counter)
441 add_to_spine(spine, chunk_counter)
444 for child in wldoc.parts():
445 child_toc, chunk_counter, chunk_chars, sample = transform_file(
446 child, chunk_counter, first=False, sample=sample)
447 toc.append(child_toc)
448 chars = chars.union(chunk_chars)
450 return toc, chunk_counter, chars, sample
453 document = deepcopy(wldoc)
458 document.edoc.getroot().set(flag, 'yes')
461 document.edoc.getroot().set('editors', u', '.join(sorted(
462 editor.readable() for editor in document.editors())))
463 if document.book_info.funders:
464 document.edoc.getroot().set('funders', u', '.join(
465 document.book_info.funders))
466 if document.book_info.thanks:
467 document.edoc.getroot().set('thanks', document.book_info.thanks)
469 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
470 manifest = opf.find('.//' + OPFNS('manifest'))
471 guide = opf.find('.//' + OPFNS('guide'))
472 spine = opf.find('.//' + OPFNS('spine'))
474 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
475 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
477 # write static elements
478 mime = zipfile.ZipInfo()
479 mime.filename = 'mimetype'
480 mime.compress_type = zipfile.ZIP_STORED
482 zip.writestr(mime, 'application/epub+zip')
483 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
484 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
485 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
486 'media-type="application/oebps-package+xml" />' \
487 '</rootfiles></container>')
488 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
489 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
491 style = get_resource('epub/style.css')
492 zip.write(style, os.path.join('OPS', 'style.css'))
496 cover = DefaultEbookCover
498 cover_file = StringIO()
499 bound_cover = cover(document.book_info)
500 bound_cover.save(cover_file)
501 cover_name = 'cover.%s' % bound_cover.ext()
502 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
505 cover_tree = etree.parse(get_resource('epub/cover.html'))
506 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
507 zip.writestr('OPS/cover.html', etree.tostring(
508 cover_tree, method="html", pretty_print=True))
510 if bound_cover.uses_dc_cover:
511 if document.book_info.cover_by:
512 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
513 if document.book_info.cover_source:
514 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
516 manifest.append(etree.fromstring(
517 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
518 manifest.append(etree.fromstring(
519 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
520 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
521 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
522 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
525 annotations = etree.Element('annotations')
527 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
528 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
529 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
530 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
532 nav_map = toc_file[-1]
535 manifest.append(etree.fromstring(
536 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
537 spine.append(etree.fromstring(
538 '<itemref idref="html_toc" />'))
539 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
541 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
543 if len(toc.children) < 2:
544 toc.add(u"Początek utworu", "part1.html")
546 # Last modifications in container files and EPUB creation
547 if len(annotations) > 0:
548 toc.add("Przypisy", "annotations.html")
549 manifest.append(etree.fromstring(
550 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
551 spine.append(etree.fromstring(
552 '<itemref idref="annotations" />'))
553 replace_by_verse(annotations)
554 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
555 chars = chars.union(used_chars(html_tree.getroot()))
556 zip.writestr('OPS/annotations.html', etree.tostring(
557 html_tree, method="html", pretty_print=True))
559 toc.add("Wesprzyj Wolne Lektury", "support.html")
560 manifest.append(etree.fromstring(
561 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
562 spine.append(etree.fromstring(
563 '<itemref idref="support" />'))
564 html_string = open(get_resource('epub/support.html')).read()
565 chars.update(used_chars(etree.fromstring(html_string)))
566 zip.writestr('OPS/support.html', html_string)
568 toc.add("Strona redakcyjna", "last.html")
569 manifest.append(etree.fromstring(
570 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
571 spine.append(etree.fromstring(
572 '<itemref idref="last" />'))
573 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
574 chars.update(used_chars(html_tree.getroot()))
575 zip.writestr('OPS/last.html', etree.tostring(
576 html_tree, method="html", pretty_print=True))
578 if not flags or not 'without-fonts' in flags:
580 tmpdir = mkdtemp('-librarian-epub')
586 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
587 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
588 if not flags or not 'with-full-fonts' in flags:
589 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
590 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
592 print "Running font-optimizer"
593 subprocess.check_call(optimizer_call)
595 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
596 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
598 zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
599 manifest.append(etree.fromstring(
600 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
604 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
605 title = document.book_info.title
606 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
607 for st in attributes:
608 meta = toc_file.makeelement(NCXNS('meta'))
610 meta.set('content', '0')
611 toc_file[0].append(meta)
612 toc_file[0][0].set('content', str(document.book_info.url))
613 toc_file[0][1].set('content', str(toc.depth()))
614 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
618 toc.add(u"Spis treści", "toc.html", index=1)
619 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
620 toc.write_to_xml(nav_map)
621 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
624 return OutputFile.from_filename(output_file.name)