# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from __future__ import with_statement

import os
import re
import subprocess
import zipfile
from StringIO import StringIO
from copy import deepcopy
from lxml import etree
from tempfile import mkdtemp, NamedTemporaryFile
from shutil import rmtree

from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
from librarian.cover import DefaultEbookCover

from librarian import functions, get_resource

from librarian.hyphenator import Hyphenator

functions.reg_person_name()
functions.reg_lang_code_3to2()


def set_hyph_language(source_tree):
    def get_short_lng_code(text):
        result = ''
        text = ''.join(text)
        with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
            for line in f:
                list = line.strip().split('|')
                if list[0] == text:
                    result = list[2]
        return result or text

    bibl_lng = etree.XPath('//dc:language//text()', namespaces={'dc': str(DCNS)})(source_tree)
    short_lng = get_short_lng_code(bibl_lng[0])
    return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' + short_lng + '.dic'))


def hyphenate_and_fix_conjunctions(source_tree, hyph):
    """ Hyphenate text inside powiesc, opowiadanie and wywiad tags only,
    and glue single-letter conjunctions to the following word. """
    texts = etree.XPath('//*[self::powiesc|self::opowiadanie|self::wywiad]//text()')(source_tree)
    for t in texts:
        parent = t.getparent()
        newt = ''
        wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
        for w in wlist:
            newt += hyph.inserted(w, u'\u00AD')
        newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
        if t.is_text:
            parent.text = newt
        elif t.is_tail:
            parent.tail = newt
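

# Illustrative usage sketch (this is how transform_file() below applies the two
# helpers; `wldoc` stands for a parsed WLDocument):
#
#   hyph = set_hyph_language(wldoc.edoc.getroot())
#   hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyph)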


def inner_xml(node):
    """ returns node's text and children as a string

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])


def set_inner_xml(node, text):
    """ sets node's text and children from a string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text
    node[:] = p[:]


def node_name(node):
    """ Find out a node's name

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    tempnode = deepcopy(node)

    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            tail = e.tail
            e.clear()
            e.tail = tail
    etree.strip_tags(tempnode, '*')
    return tempnode.text
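

# Illustrative example (not part of the original module): footnote elements
# such as <pe> are dropped from the heading text, while their tails are kept.
#
#   >>> print node_name(etree.fromstring('<naglowek_czesc>X<pe>note</pe>Y</naglowek_czesc>'))
#   XY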


def xslt(xml, sheet):
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
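

# Illustrative sketch (not part of the original module): this helper is used
# with the stylesheets shipped in the package resources, e.g. as in
# transform_chunk() below:
#
#   html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
#   html_bytes = etree.tostring(html_tree, method="html", pretty_print=True)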


def replace_characters(node):
    def replace_chars(text):
        if text is None:
            return None
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag in ('uwaga', 'extra'):
        t = node.tail
        node.clear()
        node.tail = t
    else:
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        for child in node:
            replace_characters(child)
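

# Illustrative example (not part of the original module): ASCII markers become
# typographic characters, e.g.
#
#   >>> e = etree.fromstring(u'<akap>,,Tak" --- rzekl -- cicho</akap>')
#   >>> replace_characters(e)
#   >>> print e.text
#   „Tak” — rzekl – cicho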


def find_annotations(annotations, source, part_no):
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations) + 1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            annotation.tail = ''
            annotations.append(annotation)
            # in the main text, leave only the footnote number
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)


class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        self.verses = []
        self.open_verse = None

    def versify(self):
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        if not text:
            return
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            if i:
                self.open_normal_verse()
            verse = self.get_open_verse()
            if len(verse):
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        if elem.tag.startswith("wers"):
            verse = deepcopy(elem)
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)


def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character """

    stanzas = tree.findall('.//' + WLNS('strofa'))
    for stanza in stanzas:
        Stanza(stanza).versify()


def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file """

    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)


def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file """

    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
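

# Illustrative sketch: for partno=1 the two helpers above add entries
# equivalent to the following OPF markup:
#
#   <item id="part1" href="part1.html" media-type="application/xhtml+xml"/>
#   <itemref idref="part1"/>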


class TOC(object):
    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level - 1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        self.children.append(toc)

    def extend(self, toc):
        self.children.extend(toc.children)

    def depth(self):
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = re.sub(r'\n', ' ', child.name)
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth + 1))
        return "\n".join(texts)

    def html(self):
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
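

# Illustrative usage sketch (the entry names are made-up examples):
#
#   toc = TOC()
#   toc.add(u"Rozdział I", "part1.html")
#   toc.add(u"Scena 1", "part1.html", level=1, is_part=False)
#   toc.depth()      # -> 2
#   toc.html_part()  # nested <div> links used by the optional toc.html page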


def used_chars(element):
    """ Lists characters used in an ETree Element """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars = chars.union(used_chars(child))
    return chars
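

# Illustrative example (not part of the original module):
#
#   >>> sorted(used_chars(etree.fromstring('<a>ab<b>c</b>d</a>')))
#   ['a', 'b', 'c', 'd']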


def chop(main_text):
    """ divide main content of the XML file into chunks """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    last_node_part = False

    # the loops below are a workaround for a problem with EPUBs
    # in drama e-books without acts
    is_scene = False
    is_act = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_scena':
            is_scene = True
        elif name == 'naglowek_akt':
            is_act = True

    for one_part in main_text:
        name = one_part.tag
        if is_act is False and is_scene is True:
            if name == 'naglowek_czesc':
                yield part_xml
                last_node_part = True
                main_xml_part[:] = [deepcopy(one_part)]
            elif not last_node_part and name == "naglowek_scena":
                yield part_xml
                main_xml_part[:] = [deepcopy(one_part)]
            else:
                main_xml_part.append(deepcopy(one_part))
                last_node_part = False
        else:
            if name == 'naglowek_czesc':
                yield part_xml
                last_node_part = True
                main_xml_part[:] = [deepcopy(one_part)]
            elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
                yield part_xml
                main_xml_part[:] = [deepcopy(one_part)]
            else:
                main_xml_part.append(deepcopy(one_part))
                last_node_part = False
    yield part_xml
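

# Behavioural note and illustrative sketch: chop() is a generator that reuses a
# single <utwor><master>...</master></utwor> container, so each yielded chunk
# should be processed before the next one is requested (as transform_file()
# below does):
#
#   for chunk_no, chunk_xml in enumerate(chop(main_text), 1):
#       handle(chunk_xml)  # hypothetical per-chunk handler; chunk_xml[0] is the master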


def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns an HTML string, a TOC object and a set of used characters """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # the mutable default argument caches the empty-chunk template
            _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
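

# Illustrative usage sketch (mirrors the call made in transform_file() below):
#
#   annotations = etree.Element('annotations')
#   html, chunk_toc, chars = transform_chunk(chunk_xml, 1, annotations)
#   # html  -> serialized XHTML for OPS/part1.html
#   # chars -> characters to keep when subsetting the embedded fonts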


def transform(wldoc, verbose=False,
              style=None, html_toc=False,
              sample=None, cover=None, flags=None):
    """ produces an EPUB file

    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy, with-full-fonts
    """

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """

        replace_characters(wldoc.edoc.getroot())

        hyphenator = set_hyph_language(wldoc.edoc.getroot())
        hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zip.writestr('OPS/title.html',
                         etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                html_string = open(get_resource('epub/emptyChunk.html')).read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    document = deepcopy(wldoc)

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    document.edoc.getroot().set('editors', u', '.join(sorted(
        editor.readable() for editor in document.editors())))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    zip.writestr(mime, 'application/epub+zip')
    zip.writestr(
        'META-INF/container.xml',
        '<?xml version="1.0" ?><container version="1.0" '
        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
        '<rootfiles><rootfile full-path="OPS/content.opf" '
        'media-type="application/oebps-package+xml" />'
        '</rootfiles></container>')
    zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))

    if not style:
        style = get_resource('epub/style.css')
    zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = DefaultEbookCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        zip.writestr('OPS/cover.html', etree.tostring(
            cover_tree, method="html", pretty_print=True))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring(
        '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
        '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
        'version="2005-1"><head></head><docTitle></docTitle><navMap>'
        '</navMap></ncx>')
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zip.writestr('OPS/annotations.html', etree.tostring(
            html_tree, method="html", pretty_print=True))

    toc.add("Wesprzyj Wolne Lektury", "support.html")
    manifest.append(etree.fromstring(
        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="support" />'))
    html_string = open(get_resource('epub/support.html')).read()
    chars.update(used_chars(etree.fromstring(html_string)))
    zip.writestr('OPS/support.html', html_string)

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    zip.writestr('OPS/last.html', etree.tostring(
        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        cwd = os.getcwd()

        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            if not flags or 'with-full-fonts' not in flags:
                optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                if verbose:
                    print "Running font-optimizer"
                    subprocess.check_call(optimizer_call)
                else:
                    subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
            else:
                zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
        rmtree(tmpdir)
        os.chdir(cwd)

    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
                 xml_declaration=True, encoding='UTF-8'))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', str(document.book_info.url))
    toc_file[0][1].set('content', str(toc.depth()))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
                 xml_declaration=True, encoding='UTF-8'))
    zip.close()

    return OutputFile.from_filename(output_file.name)
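

# Minimal usage sketch (illustrative; assumes librarian.parser.WLDocument and
# OutputFile.save_as(), both defined elsewhere in the package):
#
#   from librarian.parser import WLDocument
#   wldoc = WLDocument.from_file('book.xml')
#   epub = transform(wldoc, cover=True, flags=('without-fonts',))
#   epub.save_as('book.epub')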