1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 from librarian.hyphenator import Hyphenator
# Import-time side effect: invoke two registration helpers from
# librarian.functions. Presumably these register extension functions used by
# the XSLT stylesheets -- TODO confirm in librarian.functions.
26 functions.reg_person_name()
27 functions.reg_lang_code_3to2()
# Build a Hyphenator for the language declared in the document metadata.
# NOTE(review): this listing omits some lines of the function; the comments
# below describe only the statements that are visible.
30 def set_hyph_language(source_tree):
# Nested helper that reads the bundled ISO-639-2 table to translate a
# language code (presumably 3-letter to 2-letter -- TODO confirm).
31 def get_short_lng_code(text):
34 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
# Table rows are '|'-separated fields.
# NOTE(review): the local name `list` shadows the builtin.
36 list = line.strip().split('|')
# All text nodes below any dc:language element in the tree; only the first
# one is used to pick the dictionary.
43 bibl_lng = etree.XPath('//dc:language//text()',
44 namespaces={'dc': str(DCNS)})(source_tree)
45 short_lng = get_short_lng_code(bibl_lng[0])
# The dictionary path starts with 'res/hyph-dictionaries/hyph_'; the rest of
# the expression is outside the visible lines.
47 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
# Insert soft hyphens into the document text and replace the whitespace that
# follows a one-character word with a no-break space.
# hyph must provide inserted(word, separator).
# NOTE(review): some lines of this function are not visible in this listing.
53 def hyphenate_and_fix_conjunctions(source_tree, hyph):
# Text nodes under the second child of the root <utwor> element.
55 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
57 parent = t.getparent()
# Tokenize into runs of word characters and single non-word characters.
59 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
# U+00AD (soft hyphen) is the separator handed to the hyphenator.
61 newt += hyph.inserted(w, u'\u00AD')
# Whitespace preceded by "whitespace + one word character" becomes U+00A0
# (no-break space).
62 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
# inner_xml -- the `def` line is not visible in this listing; the name is
# taken from the doctest below.
70 """ returns node's text and children as a string
72 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
# node.text is None when the element has no leading text; fall back to ''
# so it can be joined with the serialized children.
76 nt = node.text if node.text is not None else ''
77 return ''.join([nt] + [etree.tostring(child) for child in node])
# Replace a node's text and children with content parsed from a string.
# NOTE(review): the statements that copy the parsed content onto `node` are
# not visible in this listing.
80 def set_inner_xml(node, text):
81 """ sets node's text and children from a string
83 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
84 >>> set_inner_xml(e, 'x<b>y</b>z')
85 >>> print etree.tostring(e)
# Wrap the fragment in a temporary <x> root so it parses as one document;
# `text` must therefore be well-formed XML content.
89 p = etree.fromstring('<x>%s</x>' % text)
# node_name -- the `def` line is not visible in this listing; the name is
# taken from the doctest below.
95 """ Find out a node's name
97 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
# Work on a copy so the caller's tree is left untouched.
101 tempnode = deepcopy(node)
# Visit every 'pe', 'pa', 'pt', 'pr' and 'motyw' descendant of the copy.
# What is done with each match is in lines not visible here (presumably
# they are excluded from the name -- TODO confirm).
103 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
104 for e in tempnode.findall('.//%s' % p):
# Remove all remaining tags from the copy while keeping their text content.
108 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """Apply the XSLT stylesheet stored in the file `sheet` to `xml`.

    `xml` may be an ElementTree or a bare element; a bare element is wrapped
    in an ElementTree first. Returns the result of the tree's xslt() call.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
# Recursively replace typewriter-style punctuation in an element tree with
# typographic characters, rewriting node.text and node.tail in place.
# NOTE(review): some lines of this function are not visible in this listing.
119 def replace_characters(node):
# Nested helper applied to a single string.
120 def replace_chars(text):
# U+FEFF is removed; then '---' -> U+2014, '--' -> U+2013, ',,' -> U+201E,
# '"' -> U+201D, "'" -> U+2019. The order matters: '---' has to be replaced
# before '--', otherwise the shorter pattern would consume it.
123 return text.replace(u"\ufeff", u"")\
124 .replace("---", u"\u2014")\
125 .replace("--", u"\u2013")\
126 .replace(",,", u"\u201E")\
127 .replace('"', u"\u201D")\
128 .replace("'", u"\u2019")
# 'uwaga' and 'extra' elements are special-cased; the branch body is not
# visible in this listing.
129 if node.tag in ('uwaga', 'extra'):
133 node.text = replace_chars(node.text)
134 node.tail = replace_chars(node.tail)
# Recurse into child elements.
136 replace_characters(child)
# Collect annotation elements found under `source` into `annotations`.
#   annotations: element receiving copies of the annotations (transform()
#                in this file passes an <annotations> element).
#   source:      element whose children are scanned.
#   part_no:     chunk number; stored on each copy as the 'part' attribute.
# NOTE(review): some lines of this function are not visible in this listing.
139 def find_annotations(annotations, source, part_no):
141 if child.tag in ('pe', 'pa', 'pt', 'pr'):
# Copy the annotation so it can be placed in the separate annotations tree.
142 annotation = deepcopy(child)
# 1-based running number across everything collected so far.
143 number = str(len(annotations) + 1)
144 annotation.set('number', number)
145 annotation.set('part', str(part_no))
147 annotations.append(annotation)
# Do not descend into 'extra' / 'uwaga' elements.
152 if child.tag not in ('extra', 'uwaga'):
153 find_annotations(annotations, child, part_no)
# Helper that rebuilds one stanza element as a sequence of verse elements.
# NOTE(review): some lines of this class are not visible in this listing,
# including the `def` line of the method the doctest calls versify().
156 class Stanza(object):
158 Converts / verse endings into verse elements in a stanza.
160 Slashes may only occur directly in the stanza. Any slashes in subelements
161 will be ignored, and the subelements will be put inside verse elements.
163 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
164 >>> Stanza(s).versify()
165 >>> print etree.tostring(s)
166 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
167 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
# stanza_elem: the element to rewrite; the statements further down extend it
# in place with the collected verses.
170 def __init__(self, stanza_elem):
171 self.stanza = stanza_elem
# The verse currently receiving content; None until one has been opened.
173 self.open_verse = None
# Walk the stanza in document order: its leading text, then each child
# element followed by that child's tail text.
176 self.push_text(self.stanza.text)
177 for elem in self.stanza:
179 self.push_text(elem.tail)
# The stanza's own tail is saved and restored before the new verses are
# attached (the statement between these two lines is not visible here).
180 tail = self.stanza.tail
182 self.stanza.tail = tail
183 self.stanza.extend(self.verses)
# Start a new <wers_normalny> element and make it the current verse.
185 def open_normal_verse(self):
186 self.open_verse = self.stanza.makeelement("wers_normalny")
187 self.verses.append(self.open_verse)
# Return the current verse, opening a normal one first if there is none.
189 def get_open_verse(self):
190 if self.open_verse is None:
191 self.open_normal_verse()
192 return self.open_verse
# Distribute `text` over verses, splitting it at verse endings.
194 def push_text(self, text):
# A verse ends at '/' followed by optional whitespace and a newline.
197 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
199 self.open_normal_verse()
200 verse = self.get_open_verse()
# The text is appended either to the tail of the verse's last child or to
# the verse's own text; the condition choosing between the two is not
# visible in this listing.
202 verse[-1].tail = (verse[-1].tail or "") + verse_text
204 verse.text = (verse.text or "") + verse_text
# Add one child element of the stanza to the rebuilt structure.
206 def push_elem(self, elem):
# An element whose tag starts with "wers" is copied as a verse of its own
# and becomes the current open verse.
207 if elem.tag.startswith("wers"):
208 verse = deepcopy(elem)
210 self.verses.append(verse)
211 self.open_verse = verse
# Otherwise (the `else:` line is not visible) a copy of the element is
# appended to the currently open verse.
213 appended = deepcopy(elem)
215 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """Rewrite every stanza below `tree` so that '/' verse endings become verse elements.

    Each matching stanza element is modified in place by Stanza.versify().
    """
    for stanza_elem in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza_elem).versify()
# Create the manifest <item> for chunk number `partno`: id 'part<N>',
# href 'part<N>.html', media type application/xhtml+xml.
# NOTE(review): the closing lines of this function (where the element is
# presumably appended to `manifest`) are not visible in this listing.
226 def add_to_manifest(manifest, partno):
227 """ Adds a node to the manifest section in content.opf file """
229 partstr = 'part%d' % partno
230 e = manifest.makeelement(
231 OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html',
232 'media-type': 'application/xhtml+xml'}
# Create the spine <itemref> for chunk number `partno`; its idref is
# 'part<N>', matching the id used by add_to_manifest().
# NOTE(review): the line that presumably appends the element to `spine` is
# not visible in this listing.
237 def add_to_spine(spine, partno):
238 """ Adds a node to the spine section in content.opf file """
240 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
# Methods of the table-of-contents tree that this file instantiates as
# TOC(...). NOTE(review): the `class` line and several method lines
# (including some `def` headers) are not visible in this listing.
#   name:      label of this entry.
#   part_href: file this entry points to.
245 def __init__(self, name=None, part_href=None):
248 self.part_href = part_href
# Assigned in add(); when it is not None a '#sub<N>' fragment is appended
# to the entry's link (see the `src +=` line below).
249 self.sub_number = None
# Add an entry. With level > 0 and existing children the call is delegated
# to the most recently added child, one level down. `index` may only be
# given at level 0 (enforced by the assert).
251 def add(self, name, part_href, level=0, is_part=True, index=None):
252 assert level == 0 or index is None
253 if level > 0 and self.children:
254 return self.children[-1].add(name, part_href, level - 1, is_part)
257 t.part_href = part_href
# Insert at the requested position, or append when no index was given.
258 if index is not None:
259 self.children.insert(index, t)
261 self.children.append(t)
# Sub-entry number derived from the current number of children; the
# condition guarding this assignment is not visible in this listing.
263 t.sub_number = len(self.children) + 1
# Attach an existing TOC object as a child.
266 def append(self, toc):
267 self.children.append(toc)
# Adopt the children of another TOC object (the object itself is not added).
269 def extend(self, toc):
270 self.children.extend(toc.children)
# (depth -- header not visible) One more than the deepest child.
274 return max((c.depth() for c in self.children)) + 1
# (href -- header not visible) Append the '#sub<N>' fragment when a
# sub-entry number is set.
280 if self.sub_number is not None:
281 src += '#sub%d' % self.sub_number
# Emit one NCX navPoint per child under `nav_map`, recursing into each
# child's own children. `counter` supplies the id / playOrder value of the
# next navPoint.
284 def write_to_xml(self, nav_map, counter=1):
285 for child in self.children:
286 nav_point = nav_map.makeelement(NCXNS('navPoint'))
287 nav_point.set('id', 'NavPoint-%d' % counter)
288 nav_point.set('playOrder', str(counter))
290 nav_label = nav_map.makeelement(NCXNS('navLabel'))
291 text = nav_map.makeelement(NCXNS('text'))
# Newlines in the entry name are replaced with spaces for the label.
292 if child.name is not None:
293 text.text = re.sub(r'\n', ' ', child.name)
295 text.text = child.name
296 nav_label.append(text)
297 nav_point.append(nav_label)
299 content = nav_map.makeelement(NCXNS('content'))
300 content.set('src', child.href())
301 nav_point.append(content)
302 nav_map.append(nav_point)
# The value returned by the recursive call becomes the counter used for the
# next sibling.
303 counter = child.write_to_xml(nav_point, counter + 1)
# Render the children as nested HTML <div> links; `depth` is used as the
# left margin in em.
306 def html_part(self, depth=0):
308 for child in self.children:
310 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
311 (depth, child.href(), child.name))
312 texts.append(child.html_part(depth + 1))
313 return "\n".join(texts)
# (html -- header not visible) Read the 'epub/toc.html' template, decode it
# as UTF-8 and fill it with html_part() via %-formatting.
316 with open(get_resource('epub/toc.html')) as f:
317 t = unicode(f.read(), 'utf-8')
318 return t % self.html_part()
# Collect the set of characters occurring in an element's text and tail and,
# recursively, in all of its descendants.
# NOTE(review): the final line (presumably returning `chars`) is not visible
# in this listing.
321 def used_chars(element):
322 """ Lists characters used in an ETree Element """
# A missing text or tail (None) counts as the empty string.
323 chars = set((element.text or '') + (element.tail or ''))
324 for child in element:
325 chars = chars.union(used_chars(child))
# chop -- the `def` line is not visible in this listing; transform_file()
# calls it as chop(main_text) and iterates over the result.
# NOTE(review): the lines that hand each finished chunk back to the caller
# are not visible either.
330 """ divide main content of the XML file into chunks """
332 # prepare a container for each chunk
333 part_xml = etree.Element('utwor')
334 etree.SubElement(part_xml, 'master')
335 main_xml_part = part_xml[0] # master
# True right after a 'naglowek_czesc' heading reset the container; it keeps
# the heading that follows from resetting the container again.
337 last_node_part = False
339 # the below loop are workaround for a problem with epubs in drama ebooks without acts
# First pass: look for scene and act headings in the text.
342 for one_part in main_text:
344 if name == 'naglowek_scena':
346 elif name == 'naglowek_akt':
# Second pass: fill the container element by element.
349 for one_part in main_text:
# Scenes but no acts: scene headings take the role of chunk boundaries.
351 if is_act is False and is_scene is True:
352 if name == 'naglowek_czesc':
354 last_node_part = True
# Start over with a container holding only a copy of this heading.
355 main_xml_part[:] = [deepcopy(one_part)]
356 elif not last_node_part and name == "naglowek_scena":
358 main_xml_part[:] = [deepcopy(one_part)]
360 main_xml_part.append(deepcopy(one_part))
361 last_node_part = False
# Same logic for the general case, with chapter / act / 'srodtytul' headings
# as the boundaries.
363 if name == 'naglowek_czesc':
365 last_node_part = True
366 main_xml_part[:] = [deepcopy(one_part)]
367 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
369 main_xml_part[:] = [deepcopy(one_part)]
371 main_xml_part.append(deepcopy(one_part))
372 last_node_part = False
# Transform one chunk into HTML and collect its TOC entries.
#   chunk_xml:   chunk element; its first child holds the content elements.
#   chunk_no:    chunk number, used to build 'part<N>.html' links.
#   annotations: element that find_annotations() fills.
#   empty:       selects the empty-chunk template instead of the real
#                content (the branching lines are not visible here).
#   _empty_html_static: mutable default used on purpose as a cross-call
#                cache for 'epub/emptyChunk.html'; callers in this file do
#                not pass it.
# Returns (output_html, toc, chars).
# NOTE(review): some lines of this function are not visible in this listing.
376 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
377 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
380 for element in chunk_xml[0]:
# Part headings link to the '#book-text' anchor of the chunk file.
381 if element.tag == "naglowek_czesc":
382 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
383 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
384 toc.add(node_name(element), "part%d.html" % chunk_no)
# Sub-headings are added one level down; the value returned by toc.add() is
# stored on the element as its 'sub' attribute.
385 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
386 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
387 element.set('sub', str(subnumber))
# Load the empty-chunk template on first use and reuse it afterwards.
# NOTE(review): the file object is not closed explicitly.
389 if not _empty_html_static:
390 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
392 output_html = _empty_html_static[0]
# Regular chunk: collect annotations, convert stanzas to verses, apply the
# chunk stylesheet, then record the characters used in the result.
394 find_annotations(annotations, chunk_xml, chunk_no)
395 replace_by_verse(chunk_xml)
396 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
397 chars = used_chars(html_tree.getroot())
398 output_html = etree.tostring(
399 html_tree, pretty_print=True, xml_declaration=True,
401 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
402 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
404 return output_html, toc, chars
# Build an EPUB file from a document and return it as an OutputFile.
# The archive is written to a named temporary file that is not deleted on
# close; the result is created from that file's name.
# NOTE(review): many lines of this function are not visible in this listing
# (conditions, `else:` branches, closing brackets). The comments below
# describe only the visible statements.
# NOTE(review): the local name `zip` shadows the builtin.
407 def transform(wldoc, verbose=False,
408 style=None, html_toc=False,
409 sample=None, cover=None, flags=None, hyphenate=False):
410 """ produces a EPUB file
412 sample=n: generate sample e-book (with at least n paragraphs)
413 cover: a cover.Cover factory or True for default
414 flags: less-advertising, without-fonts, working-copy
# Nested worker: writes the pages of one document into the archive and then
# recurses into the document's parts. It uses zip, manifest, spine and
# annotations from the enclosing scope.
# Returns (toc, chunk_counter, chars, sample).
417 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
418 """ processes one input file and proceeds to its children """
# Typographic punctuation is applied to the whole document in place.
420 replace_characters(wldoc.edoc.getroot())
# Hyphenation pass (the condition guarding it is not visible here).
423 hyphenator = set_hyph_language(wldoc.edoc.getroot())
424 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
426 # every input file will have a TOC entry,
427 # pointing to starting chunk
428 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
431 # write book title page
432 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
433 chars = used_chars(html_tree.getroot())
437 html_tree, pretty_print=True, xml_declaration=True,
439 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
440 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
443 # add a title page TOC entry
444 toc.add(u"Strona tytułowa", "title.html")
445 elif wldoc.book_info.parts:
446 # write title page for every parent
# Sample quota already used up: the empty-chunk template is read instead.
# NOTE(review): the file object is not closed explicitly.
447 if sample is not None and sample <= 0:
449 html_string = open(get_resource('epub/emptyChunk.html')).read()
451 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
452 chars = used_chars(html_tree.getroot())
453 html_string = etree.tostring(
454 html_tree, pretty_print=True, xml_declaration=True,
456 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
457 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
459 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
460 add_to_manifest(manifest, chunk_counter)
461 add_to_spine(spine, chunk_counter)
# Locate the main text: the second child of the root when the root has more
# than one child, otherwise the first child.
464 if len(wldoc.edoc.getroot()) > 1:
465 # rdf before style master
466 main_text = wldoc.edoc.getroot()[1]
468 # rdf in style master
469 main_text = wldoc.edoc.getroot()[0]
470 if main_text.tag == RDFNS('RDF'):
473 if main_text is not None:
474 for chunk_xml in chop(main_text):
476 if sample is not None:
# The remaining sample quota shrinks by the number of stanza / paragraph
# elements matched by this XPath.
480 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
481 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
483 toc.extend(chunk_toc)
484 chars = chars.union(chunk_chars)
485 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
486 add_to_manifest(manifest, chunk_counter)
487 add_to_spine(spine, chunk_counter)
# Recurse into child documents, threading the chunk counter and the sample
# quota through each call.
490 for child in wldoc.parts():
491 child_toc, chunk_counter, chunk_chars, sample = transform_file(
492 child, chunk_counter, first=False, sample=sample)
493 toc.append(child_toc)
494 chars = chars.union(chunk_chars)
496 return toc, chunk_counter, chars, sample
# Work on a deep copy so the caller's document is not modified.
498 document = deepcopy(wldoc)
# Each flag is exposed as a root attribute with the value 'yes'.
503 document.edoc.getroot().set(flag, 'yes')
505 document.clean_ed_note()
506 document.clean_ed_note('abstrakt')
# Editors, funders and thanks are copied onto root attributes.
509 editors = document.editors()
511 document.edoc.getroot().set('editors', u', '.join(sorted(
512 editor.readable() for editor in editors)))
513 if document.book_info.funders:
514 document.edoc.getroot().set('funders', u', '.join(
515 document.book_info.funders))
516 if document.book_info.thanks:
517 document.edoc.getroot().set('thanks', document.book_info.thanks)
# content.opf is generated from the book metadata; manifest, guide and spine
# are filled in below.
519 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
520 manifest = opf.find('.//' + OPFNS('manifest'))
521 guide = opf.find('.//' + OPFNS('guide'))
522 spine = opf.find('.//' + OPFNS('spine'))
# delete=False keeps the file on disk after it is closed.
524 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
525 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
527 functions.reg_mathml_epub(zip)
529 # write static elements
# The 'mimetype' entry is stored uncompressed (ZIP_STORED).
530 mime = zipfile.ZipInfo()
531 mime.filename = 'mimetype'
532 mime.compress_type = zipfile.ZIP_STORED
534 zip.writestr(mime, 'application/epub+zip')
536 'META-INF/container.xml',
537 '<?xml version="1.0" ?>'
538 '<container version="1.0" '
539 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
540 '<rootfiles><rootfile full-path="OPS/content.opf" '
541 'media-type="application/oebps-package+xml" />'
542 '</rootfiles></container>'
544 zip.write(get_resource('res/wl-logo-small.png'),
545 os.path.join('OPS', 'logo_wolnelektury.png'))
546 zip.write(get_resource('res/jedenprocent.png'),
547 os.path.join('OPS', 'jedenprocent.png'))
# Bundled stylesheet (the condition selecting this default is not visible).
549 style = get_resource('epub/style.css')
550 zip.write(style, os.path.join('OPS', 'style.css'))
# Default cover factory (the condition selecting it is not visible).
554 cover = DefaultEbookCover
# Render the cover into memory and store it as OPS/cover.<ext>.
556 cover_file = StringIO()
557 bound_cover = cover(document.book_info)
558 bound_cover.save(cover_file)
559 cover_name = 'cover.%s' % bound_cover.ext()
560 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
# Point the <img> of the cover page template at the generated image.
563 cover_tree = etree.parse(get_resource('epub/cover.html'))
564 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
565 zip.writestr('OPS/cover.html', etree.tostring(
566 cover_tree, pretty_print=True, xml_declaration=True,
568 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
569 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
# Cover credits are exposed as root attributes when the cover uses them.
572 if bound_cover.uses_dc_cover:
573 if document.book_info.cover_by:
574 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
575 if document.book_info.cover_source:
576 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
# Register the cover page and image; the cover page goes first in the spine.
578 manifest.append(etree.fromstring(
579 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
580 manifest.append(etree.fromstring(
581 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
582 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
583 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
584 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
# Container that transform_file() / find_annotations() fill.
586 annotations = etree.Element('annotations')
# Skeleton of toc.ncx; its last child is used as the navMap.
588 toc_file = etree.fromstring(
589 '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
590 '"-//NISO//DTD ncx 2005-1//EN" '
591 '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
592 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
593 'version="2005-1"><head></head><docTitle></docTitle><navMap>'
596 nav_map = toc_file[-1]
# Register the HTML table of contents page (the guarding condition is not
# visible in this listing).
599 manifest.append(etree.fromstring(
600 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
601 spine.append(etree.fromstring(
602 '<itemref idref="html_toc" />'))
603 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
# Write all content pages.
605 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
# With fewer than two TOC entries, add one pointing at part1.html.
607 if len(toc.children) < 2:
608 toc.add(u"Początek utworu", "part1.html")
610 # Last modifications in container files and EPUB creation
# Annotations page, written only when annotations were collected.
611 if len(annotations) > 0:
612 toc.add("Przypisy", "annotations.html")
613 manifest.append(etree.fromstring(
614 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
615 spine.append(etree.fromstring(
616 '<itemref idref="annotations" />'))
617 replace_by_verse(annotations)
618 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
619 chars = chars.union(used_chars(html_tree.getroot()))
620 zip.writestr('OPS/annotations.html', etree.tostring(
621 html_tree, pretty_print=True, xml_declaration=True,
623 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
624 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
# Support page, copied from the bundled template.
# NOTE(review): the file object is not closed explicitly.
627 toc.add("Wesprzyj Wolne Lektury", "support.html")
628 manifest.append(etree.fromstring(
629 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
630 spine.append(etree.fromstring(
631 '<itemref idref="support" />'))
632 html_string = open(get_resource('epub/support.html')).read()
633 chars.update(used_chars(etree.fromstring(html_string)))
634 zip.writestr('OPS/support.html', html_string)
# Editorial page, generated from the document.
636 toc.add("Strona redakcyjna", "last.html")
637 manifest.append(etree.fromstring(
638 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
639 spine.append(etree.fromstring(
640 '<itemref idref="last" />'))
641 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
642 chars.update(used_chars(html_tree.getroot()))
643 zip.writestr('OPS/last.html', etree.tostring(
644 html_tree, pretty_print=True, xml_declaration=True,
646 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
647 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
# Fonts are embedded unless the 'without-fonts' flag is given.
650 if not flags or 'without-fonts' not in flags:
652 tmpdir = mkdtemp('-librarian-epub')
# NOTE(review): os.chdir changes the working directory of the whole process;
# any code restoring it is not visible in this listing.
658 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
# Each font is subset with the perl script subset.pl to the characters
# collected in `chars`, written to tmpdir and then added to the archive.
659 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
660 optimizer_call = ['perl', 'subset.pl', '--chars',
661 ''.join(chars).encode('utf-8'),
662 get_resource('fonts/' + fname),
663 os.path.join(tmpdir, fname)]
# Two ways of running the optimizer: with a progress message, or with its
# output piped away (the condition choosing between them is not visible).
665 print "Running font-optimizer"
666 subprocess.check_call(optimizer_call)
668 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
669 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
670 manifest.append(etree.fromstring(
671 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
# content.opf is written once the manifest, spine and guide are final.
675 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
676 xml_declaration=True, encoding="utf-8"))
677 title = document.book_info.title
# NCX head: one <meta> per attribute with content '0'; afterwards the first
# one gets the book URL and the second the TOC depth.
678 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
679 for st in attributes:
680 meta = toc_file.makeelement(NCXNS('meta'))
682 meta.set('content', '0')
683 toc_file[0].append(meta)
684 toc_file[0][0].set('content', str(document.book_info.url))
685 toc_file[0][1].set('content', str(toc.depth()))
# NOTE(review): the title is interpolated into XML markup without escaping;
# a title containing '<' or '&' would not parse -- confirm upstream handling.
686 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
# The HTML TOC entry is inserted as the second item (index=1); the guarding
# condition is not visible in this listing.
690 toc.add(u"Spis treści", "toc.html", index=1)
691 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
692 toc.write_to_xml(nav_map)
693 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
694 xml_declaration=True, encoding="utf-8"))
697 return OutputFile.from_filename(output_file.name)