1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 from librarian.hyphenator import Hyphenator
26 functions.reg_person_name()
27 functions.reg_lang_code_3to2()
29 def set_hyph_language(source_tree):
30 def get_short_lng_code(text):
33 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
35 list = line.strip().split('|')
42 bibl_lng = etree.XPath('//dc:language//text()', namespaces = {'dc':str(DCNS)})(source_tree)
43 short_lng = get_short_lng_code(bibl_lng[0])
45 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' + short_lng + '.dic'))
49 def hyphenate_and_fix_conjunctions(source_tree, hyph):
51 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
53 parent = t.getparent()
55 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
57 newt += hyph.inserted(w, u'\u00AD')
58 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
65 """ returns node's text and children as a string
67 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
71 nt = node.text if node.text is not None else ''
72 return ''.join([nt] + [etree.tostring(child) for child in node])
74 def set_inner_xml(node, text):
75 """ sets node's text and children from a string
77 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
78 >>> set_inner_xml(e, 'x<b>y</b>z')
79 >>> print etree.tostring(e)
83 p = etree.fromstring('<x>%s</x>' % text)
89 """ Find out a node's name
91 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
95 tempnode = deepcopy(node)
97 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
98 for e in tempnode.findall('.//%s' % p):
102 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """Apply the XSLT stylesheet stored in the file `sheet` to `xml`.

    `xml` may be either an element tree or a bare element; a bare element
    is wrapped in an ElementTree first, because the transformation is
    invoked through the tree's own `xslt` method.

    Returns whatever that `xslt` call produces.
    """
    document = xml
    if isinstance(document, etree._Element):
        document = etree.ElementTree(document)
    with open(sheet) as stylesheet_file:
        # Parse and apply while the stylesheet file is still open.
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
113 def replace_characters(node):
114 def replace_chars(text):
117 return text.replace(u"\ufeff", u"")\
118 .replace("---", u"\u2014")\
119 .replace("--", u"\u2013")\
120 .replace(",,", u"\u201E")\
121 .replace('"', u"\u201D")\
122 .replace("'", u"\u2019")
123 if node.tag in ('uwaga', 'extra'):
127 node.text = replace_chars(node.text)
128 node.tail = replace_chars(node.tail)
130 replace_characters(child)
133 def find_annotations(annotations, source, part_no):
135 if child.tag in ('pe', 'pa', 'pt', 'pr'):
136 annotation = deepcopy(child)
137 number = str(len(annotations)+1)
138 annotation.set('number', number)
139 annotation.set('part', str(part_no))
141 annotations.append(annotation)
146 if child.tag not in ('extra', 'uwaga'):
147 find_annotations(annotations, child, part_no)
150 class Stanza(object):
152 Converts / verse endings into verse elements in a stanza.
154 Slashes may only occur directly in the stanza. Any slashes in subelements
155 will be ignored, and the subelements will be put inside verse elements.
157 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
158 >>> Stanza(s).versify()
159 >>> print etree.tostring(s)
160 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
161 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
164 def __init__(self, stanza_elem):
165 self.stanza = stanza_elem
167 self.open_verse = None
170 self.push_text(self.stanza.text)
171 for elem in self.stanza:
173 self.push_text(elem.tail)
174 tail = self.stanza.tail
176 self.stanza.tail = tail
177 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """Create a new `wers_normalny` element and make it the open verse.

    The element is built with the stanza's `makeelement`, stored as
    `self.open_verse` and added at the end of `self.verses`.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """Return the currently open verse, opening a normal one if none is open."""
    current = self.open_verse
    if current is None:
        # Nothing open yet: start a normal verse and pick it up.
        self.open_normal_verse()
        current = self.open_verse
    return current
188 def push_text(self, text):
191 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
193 self.open_normal_verse()
194 verse = self.get_open_verse()
196 verse[-1].tail = (verse[-1].tail or "") + verse_text
198 verse.text = (verse.text or "") + verse_text
200 def push_elem(self, elem):
201 if elem.tag.startswith("wers"):
202 verse = deepcopy(elem)
204 self.verses.append(verse)
205 self.open_verse = verse
207 appended = deepcopy(elem)
209 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """Versify every stanza found below `tree`.

    Each descendant element matching `WLNS('strofa')` is handed to
    `Stanza`, whose `versify` turns '/' verse endings into verse elements.
    """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
220 def add_to_manifest(manifest, partno):
221 """ Adds a node to the manifest section in content.opf file """
223 partstr = 'part%d' % partno
224 e = manifest.makeelement(OPFNS('item'), attrib={
226 'href': partstr + '.html',
227 'media-type': 'application/xhtml+xml',
232 def add_to_spine(spine, partno):
233 """ Adds a node to the spine section in content.opf file """
235 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
240 def __init__(self, name=None, part_href=None):
243 self.part_href = part_href
244 self.sub_number = None
246 def add(self, name, part_href, level=0, is_part=True, index=None):
247 assert level == 0 or index is None
248 if level > 0 and self.children:
249 return self.children[-1].add(name, part_href, level-1, is_part)
252 t.part_href = part_href
253 if index is not None:
254 self.children.insert(index, t)
256 self.children.append(t)
258 t.sub_number = len(self.children) + 1
def append(self, toc):
    """Add `toc` itself as the last child of this node."""
    siblings = self.children
    siblings.append(toc)
def extend(self, toc):
    """Add the children of `toc` (not `toc` itself) to this node's children."""
    adopted = toc.children
    self.children.extend(adopted)
269 return max((c.depth() for c in self.children)) + 1
275 if self.sub_number is not None:
276 src += '#sub%d' % self.sub_number
279 def write_to_xml(self, nav_map, counter=1):
280 for child in self.children:
281 nav_point = nav_map.makeelement(NCXNS('navPoint'))
282 nav_point.set('id', 'NavPoint-%d' % counter)
283 nav_point.set('playOrder', str(counter))
285 nav_label = nav_map.makeelement(NCXNS('navLabel'))
286 text = nav_map.makeelement(NCXNS('text'))
287 text.text = re.sub(r'\n', ' ', child.name)
288 nav_label.append(text)
289 nav_point.append(nav_label)
291 content = nav_map.makeelement(NCXNS('content'))
292 content.set('src', child.href())
293 nav_point.append(content)
294 nav_map.append(nav_point)
295 counter = child.write_to_xml(nav_point, counter + 1)
298 def html_part(self, depth=0):
300 for child in self.children:
302 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
303 (depth, child.href(), child.name))
304 texts.append(child.html_part(depth+1))
305 return "\n".join(texts)
308 with open(get_resource('epub/toc.html')) as f:
309 t = unicode(f.read(), 'utf-8')
310 return t % self.html_part()
313 def used_chars(element):
314 """ Lists characters used in an ETree Element """
315 chars = set((element.text or '') + (element.tail or ''))
316 for child in element:
317 chars = chars.union(used_chars(child))
322 """ divide main content of the XML file into chunks """
324 # prepare a container for each chunk
325 part_xml = etree.Element('utwor')
326 etree.SubElement(part_xml, 'master')
327 main_xml_part = part_xml[0] # master
329 last_node_part = False
# the loops below are a workaround for a problem with EPUBs of drama e-books that have no acts
334 for one_part in main_text:
336 if name == 'naglowek_scena':
338 elif name == 'naglowek_akt':
341 for one_part in main_text:
343 if is_act is False and is_scene is True:
344 if name == 'naglowek_czesc':
346 last_node_part = True
347 main_xml_part[:] = [deepcopy(one_part)]
348 elif not last_node_part and name == "naglowek_scena":
350 main_xml_part[:] = [deepcopy(one_part)]
352 main_xml_part.append(deepcopy(one_part))
353 last_node_part = False
355 if name == 'naglowek_czesc':
357 last_node_part = True
358 main_xml_part[:] = [deepcopy(one_part)]
359 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
361 main_xml_part[:] = [deepcopy(one_part)]
363 main_xml_part.append(deepcopy(one_part))
364 last_node_part = False
368 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
369 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
372 for element in chunk_xml[0]:
373 if element.tag == "naglowek_czesc":
374 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
375 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
376 toc.add(node_name(element), "part%d.html" % chunk_no)
377 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
378 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
379 element.set('sub', str(subnumber))
381 if not _empty_html_static:
382 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
384 output_html = _empty_html_static[0]
386 find_annotations(annotations, chunk_xml, chunk_no)
387 replace_by_verse(chunk_xml)
388 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
389 chars = used_chars(html_tree.getroot())
390 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
391 return output_html, toc, chars
394 def transform(wldoc, verbose=False,
395 style=None, html_toc=False,
396 sample=None, cover=None, flags=None):
397 """ produces a EPUB file
399 sample=n: generate sample e-book (with at least n paragraphs)
400 cover: a cover.Cover factory or True for default
401 flags: less-advertising, without-fonts, working-copy, with-full-fonts
404 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
405 """ processes one input file and proceeds to its children """
407 replace_characters(wldoc.edoc.getroot())
409 hyphenator = set_hyph_language(wldoc.edoc.getroot())
410 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
413 # every input file will have a TOC entry,
414 # pointing to starting chunk
415 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
418 # write book title page
419 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
420 chars = used_chars(html_tree.getroot())
421 zip.writestr('OPS/title.html',
422 etree.tostring(html_tree, method="html", pretty_print=True))
423 # add a title page TOC entry
424 toc.add(u"Strona tytułowa", "title.html")
425 elif wldoc.book_info.parts:
426 # write title page for every parent
427 if sample is not None and sample <= 0:
429 html_string = open(get_resource('epub/emptyChunk.html')).read()
431 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
432 chars = used_chars(html_tree.getroot())
433 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
434 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
435 add_to_manifest(manifest, chunk_counter)
436 add_to_spine(spine, chunk_counter)
439 if len(wldoc.edoc.getroot()) > 1:
440 # rdf before style master
441 main_text = wldoc.edoc.getroot()[1]
443 # rdf in style master
444 main_text = wldoc.edoc.getroot()[0]
445 if main_text.tag == RDFNS('RDF'):
448 if main_text is not None:
449 for chunk_xml in chop(main_text):
451 if sample is not None:
455 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
456 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
458 toc.extend(chunk_toc)
459 chars = chars.union(chunk_chars)
460 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
461 add_to_manifest(manifest, chunk_counter)
462 add_to_spine(spine, chunk_counter)
465 for child in wldoc.parts():
466 child_toc, chunk_counter, chunk_chars, sample = transform_file(
467 child, chunk_counter, first=False, sample=sample)
468 toc.append(child_toc)
469 chars = chars.union(chunk_chars)
471 return toc, chunk_counter, chars, sample
474 document = deepcopy(wldoc)
479 document.edoc.getroot().set(flag, 'yes')
482 document.edoc.getroot().set('editors', u', '.join(sorted(
483 editor.readable() for editor in document.editors())))
484 if document.book_info.funders:
485 document.edoc.getroot().set('funders', u', '.join(
486 document.book_info.funders))
487 if document.book_info.thanks:
488 document.edoc.getroot().set('thanks', document.book_info.thanks)
490 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
491 manifest = opf.find('.//' + OPFNS('manifest'))
492 guide = opf.find('.//' + OPFNS('guide'))
493 spine = opf.find('.//' + OPFNS('spine'))
495 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
496 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
498 # write static elements
499 mime = zipfile.ZipInfo()
500 mime.filename = 'mimetype'
501 mime.compress_type = zipfile.ZIP_STORED
503 zip.writestr(mime, 'application/epub+zip')
504 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
505 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
506 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
507 'media-type="application/oebps-package+xml" />' \
508 '</rootfiles></container>')
509 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
510 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
512 style = get_resource('epub/style.css')
513 zip.write(style, os.path.join('OPS', 'style.css'))
517 cover = DefaultEbookCover
519 cover_file = StringIO()
520 bound_cover = cover(document.book_info)
521 bound_cover.save(cover_file)
522 cover_name = 'cover.%s' % bound_cover.ext()
523 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
526 cover_tree = etree.parse(get_resource('epub/cover.html'))
527 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
528 zip.writestr('OPS/cover.html', etree.tostring(
529 cover_tree, method="html", pretty_print=True))
531 if bound_cover.uses_dc_cover:
532 if document.book_info.cover_by:
533 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
534 if document.book_info.cover_source:
535 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
537 manifest.append(etree.fromstring(
538 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
539 manifest.append(etree.fromstring(
540 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
541 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
542 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
543 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
546 annotations = etree.Element('annotations')
548 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
549 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
550 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
551 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
553 nav_map = toc_file[-1]
556 manifest.append(etree.fromstring(
557 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
558 spine.append(etree.fromstring(
559 '<itemref idref="html_toc" />'))
560 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
562 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
564 if len(toc.children) < 2:
565 toc.add(u"Początek utworu", "part1.html")
567 # Last modifications in container files and EPUB creation
568 if len(annotations) > 0:
569 toc.add("Przypisy", "annotations.html")
570 manifest.append(etree.fromstring(
571 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
572 spine.append(etree.fromstring(
573 '<itemref idref="annotations" />'))
574 replace_by_verse(annotations)
575 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
576 chars = chars.union(used_chars(html_tree.getroot()))
577 zip.writestr('OPS/annotations.html', etree.tostring(
578 html_tree, method="html", pretty_print=True))
580 toc.add("Wesprzyj Wolne Lektury", "support.html")
581 manifest.append(etree.fromstring(
582 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
583 spine.append(etree.fromstring(
584 '<itemref idref="support" />'))
585 html_string = open(get_resource('epub/support.html')).read()
586 chars.update(used_chars(etree.fromstring(html_string)))
587 zip.writestr('OPS/support.html', html_string)
589 toc.add("Strona redakcyjna", "last.html")
590 manifest.append(etree.fromstring(
591 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
592 spine.append(etree.fromstring(
593 '<itemref idref="last" />'))
594 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
595 chars.update(used_chars(html_tree.getroot()))
596 zip.writestr('OPS/last.html', etree.tostring(
597 html_tree, method="html", pretty_print=True))
599 if not flags or not 'without-fonts' in flags:
601 tmpdir = mkdtemp('-librarian-epub')
607 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
608 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
609 if not flags or not 'with-full-fonts' in flags:
610 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
611 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
613 print "Running font-optimizer"
614 subprocess.check_call(optimizer_call)
616 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
617 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
619 zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
620 manifest.append(etree.fromstring(
621 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
625 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
626 title = document.book_info.title
627 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
628 for st in attributes:
629 meta = toc_file.makeelement(NCXNS('meta'))
631 meta.set('content', '0')
632 toc_file[0].append(meta)
633 toc_file[0][0].set('content', str(document.book_info.url))
634 toc_file[0][1].set('content', str(toc.depth()))
635 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
639 toc.add(u"Spis treści", "toc.html", index=1)
640 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
641 toc.write_to_xml(nav_map)
642 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
645 return OutputFile.from_filename(output_file.name)