1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 from librarian.hyphenator import Hyphenator
26 functions.reg_person_name()
27 functions.reg_lang_code_3to2()
29 def set_hyph_language(source_tree):
30 def get_short_lng_code(text):
33 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
35 list = line.strip().split('|')
42 bibl_lng = etree.XPath('//dc:language//text()', namespaces = {'dc':str(DCNS)})(source_tree)
43 short_lng = get_short_lng_code(bibl_lng[0])
45 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' + short_lng + '.dic'))
49 def hyphenate_and_fix_conjunctions(source_tree, hyph):
51 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
53 parent = t.getparent()
55 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
57 newt += hyph.inserted(w, u'\u00AD')
58 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
65 """ returns node's text and children as a string
67 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
71 nt = node.text if node.text is not None else ''
72 return ''.join([nt] + [etree.tostring(child) for child in node])
74 def set_inner_xml(node, text):
75 """ sets node's text and children from a string
77 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
78 >>> set_inner_xml(e, 'x<b>y</b>z')
79 >>> print etree.tostring(e)
83 p = etree.fromstring('<x>%s</x>' % text)
89 """ Find out a node's name
91 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
95 tempnode = deepcopy(node)
97 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
98 for e in tempnode.findall('.//%s' % p):
102 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet read from the file `sheet` to `xml`.

    `xml` may be a bare element, in which case it is wrapped in an
    ElementTree first; anything else is used as given and must provide
    an `xslt()` method.  Returns the result of that `xslt()` call.
    """
    if isinstance(xml, etree._Element):
        document = etree.ElementTree(xml)
    else:
        document = xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
113 def replace_characters(node):
114 def replace_chars(text):
117 return text.replace(u"\ufeff", u"")\
118 .replace("---", u"\u2014")\
119 .replace("--", u"\u2013")\
120 .replace(",,", u"\u201E")\
121 .replace('"', u"\u201D")\
122 .replace("'", u"\u2019")
123 if node.tag in ('uwaga', 'extra'):
127 node.text = replace_chars(node.text)
128 node.tail = replace_chars(node.tail)
130 replace_characters(child)
133 def find_annotations(annotations, source, part_no):
135 if child.tag in ('pe', 'pa', 'pt', 'pr'):
136 annotation = deepcopy(child)
137 number = str(len(annotations)+1)
138 annotation.set('number', number)
139 annotation.set('part', str(part_no))
141 annotations.append(annotation)
146 if child.tag not in ('extra', 'uwaga'):
147 find_annotations(annotations, child, part_no)
150 class Stanza(object):
152 Converts / verse endings into verse elements in a stanza.
154 Slashes may only occur directly in the stanza. Any slashes in subelements
155 will be ignored, and the subelements will be put inside verse elements.
157 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
158 >>> Stanza(s).versify()
159 >>> print etree.tostring(s)
160 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
161 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
164 def __init__(self, stanza_elem):
165 self.stanza = stanza_elem
167 self.open_verse = None
170 self.push_text(self.stanza.text)
171 for elem in self.stanza:
173 self.push_text(elem.tail)
174 tail = self.stanza.tail
176 self.stanza.tail = tail
177 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """ Create a new `wers_normalny` element and make it the open verse.

    The element is built with the stanza's `makeelement`, stored as
    `self.open_verse` and appended to `self.verses`.
    """
    verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = verse
    self.verses.append(verse)
def get_open_verse(self):
    """ Return the currently open verse, opening a normal one if needed.

    When `self.open_verse` is None, `open_normal_verse()` is called first
    and the verse it opened is returned.
    """
    if self.open_verse is not None:
        return self.open_verse
    self.open_normal_verse()
    return self.open_verse
188 def push_text(self, text):
191 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
193 self.open_normal_verse()
194 verse = self.get_open_verse()
196 verse[-1].tail = (verse[-1].tail or "") + verse_text
198 verse.text = (verse.text or "") + verse_text
200 def push_elem(self, elem):
201 if elem.tag.startswith("wers"):
202 verse = deepcopy(elem)
204 self.verses.append(verse)
205 self.open_verse = verse
207 appended = deepcopy(elem)
209 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Versify every stanza found anywhere below `tree`.

    Each `strofa` element (looked up through WLNS) is handed to Stanza,
    whose versify() turns '/' verse endings into verse elements.
    The tree is modified in place; nothing is returned.
    """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
220 def add_to_manifest(manifest, partno):
221 """ Adds a node to the manifest section in content.opf file """
223 partstr = 'part%d' % partno
224 e = manifest.makeelement(OPFNS('item'), attrib={
226 'href': partstr + '.html',
227 'media-type': 'application/xhtml+xml',
232 def add_to_spine(spine, partno):
233 """ Adds a node to the spine section in content.opf file """
235 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
240 def __init__(self, name=None, part_href=None):
243 self.part_href = part_href
244 self.sub_number = None
246 def add(self, name, part_href, level=0, is_part=True, index=None):
247 assert level == 0 or index is None
248 if level > 0 and self.children:
249 return self.children[-1].add(name, part_href, level-1, is_part)
252 t.part_href = part_href
253 if index is not None:
254 self.children.insert(index, t)
256 self.children.append(t)
258 t.sub_number = len(self.children) + 1
def append(self, toc):
    """ Add `toc` itself as the last child of this entry. """
    children = self.children
    children.append(toc)
def extend(self, toc):
    """ Add all children of `toc` (not `toc` itself) to this entry's children. """
    incoming = toc.children
    self.children.extend(incoming)
269 return max((c.depth() for c in self.children)) + 1
275 if self.sub_number is not None:
276 src += '#sub%d' % self.sub_number
279 def write_to_xml(self, nav_map, counter=1):
280 for child in self.children:
281 nav_point = nav_map.makeelement(NCXNS('navPoint'))
282 nav_point.set('id', 'NavPoint-%d' % counter)
283 nav_point.set('playOrder', str(counter))
285 nav_label = nav_map.makeelement(NCXNS('navLabel'))
286 text = nav_map.makeelement(NCXNS('text'))
287 if child.name is not None:
288 text.text = re.sub(r'\n', ' ', child.name)
290 text.text = child.name
291 nav_label.append(text)
292 nav_point.append(nav_label)
294 content = nav_map.makeelement(NCXNS('content'))
295 content.set('src', child.href())
296 nav_point.append(content)
297 nav_map.append(nav_point)
298 counter = child.write_to_xml(nav_point, counter + 1)
301 def html_part(self, depth=0):
303 for child in self.children:
305 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
306 (depth, child.href(), child.name))
307 texts.append(child.html_part(depth+1))
308 return "\n".join(texts)
311 with open(get_resource('epub/toc.html')) as f:
312 t = unicode(f.read(), 'utf-8')
313 return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Collects every character found in the text and tail of `element`
    and, recursively, of all its descendants.

    Returns a set of characters; it is empty when neither the element
    nor any descendant carries text.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars.update(used_chars(child))
    # The result must be returned explicitly: the recursive call above
    # and every caller use it as a set.
    return chars
325 """ divide main content of the XML file into chunks """
327 # prepare a container for each chunk
328 part_xml = etree.Element('utwor')
329 etree.SubElement(part_xml, 'master')
330 main_xml_part = part_xml[0] # master
332 last_node_part = False
# the loop below is a workaround for a problem with EPUBs of drama e-books that have no acts
337 for one_part in main_text:
339 if name == 'naglowek_scena':
341 elif name == 'naglowek_akt':
344 for one_part in main_text:
346 if is_act is False and is_scene is True:
347 if name == 'naglowek_czesc':
349 last_node_part = True
350 main_xml_part[:] = [deepcopy(one_part)]
351 elif not last_node_part and name == "naglowek_scena":
353 main_xml_part[:] = [deepcopy(one_part)]
355 main_xml_part.append(deepcopy(one_part))
356 last_node_part = False
358 if name == 'naglowek_czesc':
360 last_node_part = True
361 main_xml_part[:] = [deepcopy(one_part)]
362 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
364 main_xml_part[:] = [deepcopy(one_part)]
366 main_xml_part.append(deepcopy(one_part))
367 last_node_part = False
371 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
372 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
375 for element in chunk_xml[0]:
376 if element.tag == "naglowek_czesc":
377 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
378 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
379 toc.add(node_name(element), "part%d.html" % chunk_no)
380 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
381 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
382 element.set('sub', str(subnumber))
384 if not _empty_html_static:
385 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
387 output_html = _empty_html_static[0]
389 find_annotations(annotations, chunk_xml, chunk_no)
390 replace_by_verse(chunk_xml)
391 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
392 chars = used_chars(html_tree.getroot())
393 output_html = etree.tostring(html_tree, pretty_print = True,
394 xml_declaration = True,
396 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
397 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">')
398 return output_html, toc, chars
401 def transform(wldoc, verbose=False,
402 style=None, html_toc=False,
403 sample=None, cover=None, flags=None):
404 """ produces a EPUB file
406 sample=n: generate sample e-book (with at least n paragraphs)
407 cover: a cover.Cover factory or True for default
408 flags: less-advertising, without-fonts, working-copy, with-full-fonts
411 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
412 """ processes one input file and proceeds to its children """
414 replace_characters(wldoc.edoc.getroot())
416 hyphenator = set_hyph_language(wldoc.edoc.getroot())
417 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
420 # every input file will have a TOC entry,
421 # pointing to starting chunk
422 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
425 # write book title page
426 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
427 chars = used_chars(html_tree.getroot())
428 zip.writestr('OPS/title.html',
429 etree.tostring(html_tree, pretty_print = True,
430 xml_declaration = True,
432 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
433 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'))
434 # add a title page TOC entry
435 toc.add(u"Strona tytułowa", "title.html")
436 elif wldoc.book_info.parts:
437 # write title page for every parent
438 if sample is not None and sample <= 0:
440 html_string = open(get_resource('epub/emptyChunk.html')).read()
442 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
443 chars = used_chars(html_tree.getroot())
444 html_string = etree.tostring(html_tree,
446 xml_declaration = True,
448 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
449 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">')
450 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
451 add_to_manifest(manifest, chunk_counter)
452 add_to_spine(spine, chunk_counter)
455 if len(wldoc.edoc.getroot()) > 1:
456 # rdf before style master
457 main_text = wldoc.edoc.getroot()[1]
459 # rdf in style master
460 main_text = wldoc.edoc.getroot()[0]
461 if main_text.tag == RDFNS('RDF'):
464 if main_text is not None:
465 for chunk_xml in chop(main_text):
467 if sample is not None:
471 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
472 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
474 toc.extend(chunk_toc)
475 chars = chars.union(chunk_chars)
476 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
477 add_to_manifest(manifest, chunk_counter)
478 add_to_spine(spine, chunk_counter)
481 for child in wldoc.parts():
482 child_toc, chunk_counter, chunk_chars, sample = transform_file(
483 child, chunk_counter, first=False, sample=sample)
484 toc.append(child_toc)
485 chars = chars.union(chunk_chars)
487 return toc, chunk_counter, chars, sample
490 document = deepcopy(wldoc)
495 document.edoc.getroot().set(flag, 'yes')
498 document.edoc.getroot().set('editors', u', '.join(sorted(
499 editor.readable() for editor in document.editors())))
500 if document.book_info.funders:
501 document.edoc.getroot().set('funders', u', '.join(
502 document.book_info.funders))
503 if document.book_info.thanks:
504 document.edoc.getroot().set('thanks', document.book_info.thanks)
506 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
507 manifest = opf.find('.//' + OPFNS('manifest'))
508 guide = opf.find('.//' + OPFNS('guide'))
509 spine = opf.find('.//' + OPFNS('spine'))
511 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
512 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
514 # write static elements
515 mime = zipfile.ZipInfo()
516 mime.filename = 'mimetype'
517 mime.compress_type = zipfile.ZIP_STORED
519 zip.writestr(mime, 'application/epub+zip')
520 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
521 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
522 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
523 'media-type="application/oebps-package+xml" />' \
524 '</rootfiles></container>')
525 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
526 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
528 style = get_resource('epub/style.css')
529 zip.write(style, os.path.join('OPS', 'style.css'))
533 cover = DefaultEbookCover
535 cover_file = StringIO()
536 bound_cover = cover(document.book_info)
537 bound_cover.save(cover_file)
538 cover_name = 'cover.%s' % bound_cover.ext()
539 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
542 cover_tree = etree.parse(get_resource('epub/cover.html'))
543 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
544 zip.writestr('OPS/cover.html', etree.tostring(
545 cover_tree, pretty_print = True, xml_declaration = True, encoding = "utf-8",
546 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
547 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'))
549 if bound_cover.uses_dc_cover:
550 if document.book_info.cover_by:
551 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
552 if document.book_info.cover_source:
553 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
555 manifest.append(etree.fromstring(
556 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
557 manifest.append(etree.fromstring(
558 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
559 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
560 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
561 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
564 annotations = etree.Element('annotations')
566 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
567 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
568 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
569 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
571 nav_map = toc_file[-1]
574 manifest.append(etree.fromstring(
575 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
576 spine.append(etree.fromstring(
577 '<itemref idref="html_toc" />'))
578 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
580 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
582 if len(toc.children) < 2:
583 toc.add(u"Początek utworu", "part1.html")
585 # Last modifications in container files and EPUB creation
586 if len(annotations) > 0:
587 toc.add("Przypisy", "annotations.html")
588 manifest.append(etree.fromstring(
589 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
590 spine.append(etree.fromstring(
591 '<itemref idref="annotations" />'))
592 replace_by_verse(annotations)
593 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
594 chars = chars.union(used_chars(html_tree.getroot()))
595 zip.writestr('OPS/annotations.html', etree.tostring(
596 html_tree, pretty_print = True,
597 xml_declaration = True,
599 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
600 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'))
602 toc.add("Wesprzyj Wolne Lektury", "support.html")
603 manifest.append(etree.fromstring(
604 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
605 spine.append(etree.fromstring(
606 '<itemref idref="support" />'))
607 html_string = open(get_resource('epub/support.html')).read()
608 chars.update(used_chars(etree.fromstring(html_string)))
609 zip.writestr('OPS/support.html', html_string)
611 toc.add("Strona redakcyjna", "last.html")
612 manifest.append(etree.fromstring(
613 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
614 spine.append(etree.fromstring(
615 '<itemref idref="last" />'))
616 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
617 chars.update(used_chars(html_tree.getroot()))
618 zip.writestr('OPS/last.html', etree.tostring(
619 html_tree, pretty_print = True,
620 xml_declaration = True,
622 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
623 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'))
625 if not flags or not 'without-fonts' in flags:
627 tmpdir = mkdtemp('-librarian-epub')
633 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
634 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
635 if not flags or not 'with-full-fonts' in flags:
636 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
637 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
639 print "Running font-optimizer"
640 subprocess.check_call(optimizer_call)
642 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
643 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
645 zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
646 manifest.append(etree.fromstring(
647 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
651 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print = True,
652 xml_declaration = True,
654 title = document.book_info.title
655 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
656 for st in attributes:
657 meta = toc_file.makeelement(NCXNS('meta'))
659 meta.set('content', '0')
660 toc_file[0].append(meta)
661 toc_file[0][0].set('content', str(document.book_info.url))
662 toc_file[0][1].set('content', str(toc.depth()))
663 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
667 toc.add(u"Spis treści", "toc.html", index=1)
668 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
669 toc.write_to_xml(nav_map)
670 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print = True,
671 xml_declaration = True,
675 return OutputFile.from_filename(output_file.name)