1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 from librarian.hyphenator import Hyphenator
# Module-level calls into librarian.functions; they run once, when this module
# is imported.
# NOTE(review): judging by the names, they register helper functions (person
# name handling, 3-letter to 2-letter language codes) -- presumably for use from
# the XSLT sheets applied further down; confirm in librarian.functions.
26 functions.reg_person_name()
27 functions.reg_lang_code_3to2()
# set_hyph_language: builds a Hyphenator for the language declared in the
# document.
# Visible steps: collect the text of the dc:language element(s) of source_tree,
# pass the first value to the nested helper get_short_lng_code() (which reads the
# '|'-separated table res/ISO-639-2_8859-1.txt), and load a dictionary whose path
# starts with res/hyph-dictionaries/hyph_.
# NOTE(review): several lines of this function are not part of this listing (the
# loop over the table, the rest of the Hyphenator(...) call, any error handling
# around bibl_lng[0]), so the lookup rule and the behaviour for documents
# without dc:language cannot be confirmed from here.
30 def set_hyph_language(source_tree):
31 def get_short_lng_code(text):
34 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
# NOTE(review): `list` shadows the builtin of the same name inside this helper.
36 list = line.strip().split('|')
43 bibl_lng = etree.XPath('//dc:language//text()',
44 namespaces={'dc': str(DCNS)})(source_tree)
45 short_lng = get_short_lng_code(bibl_lng[0])
47 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
# hyphenate_and_fix_conjunctions: post-processes the text nodes found under the
# second child of the root <utwor> element (XPath /utwor/*[2]//text()).
# Visible steps: each text is split into words and single non-word characters
# (Unicode-aware), hyph.inserted() is asked to insert soft hyphens (U+00AD) into
# the pieces, and any whitespace that follows "whitespace + one word character"
# is replaced by a no-break space (U+00A0), which keeps one-letter words attached
# to the following word.
# NOTE(review): the loop headers, the initialisation of `newt` and the code that
# stores the result back through `parent` are not part of this listing.
53 def hyphenate_and_fix_conjunctions(source_tree, hyph):
55 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
57 parent = t.getparent()
59 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
61 newt += hyph.inserted(w, u'\u00AD')
62 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
# NOTE(review): the `def` line of this function is not part of this listing; the
# doctest below suggests it is inner_xml(node).
# Visible body: joins node.text ('' when it is None) with the etree.tostring()
# serialization of every direct child of the node, in document order.
70 """ returns node's text and children as a string
72 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
76 nt = node.text if node.text is not None else ''
77 return ''.join([nt] + [etree.tostring(child) for child in node])
# set_inner_xml: sets a node's text and children from a string (see docstring).
# The string is wrapped in a temporary <x>...</x> element and parsed with
# etree.fromstring(), so `text` has to be a well-formed XML fragment.
# NOTE(review): the lines that copy the parsed content into `node` are not part
# of this listing.
80 def set_inner_xml(node, text):
81 """ sets node's text and children from a string
83 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
84 >>> set_inner_xml(e, 'x<b>y</b>z')
85 >>> print etree.tostring(e)
89 p = etree.fromstring('<x>%s</x>' % text)
# NOTE(review): the `def` line of this function is not part of this listing; the
# doctest below suggests it is node_name(node).
# Visible body: the work is done on a deep copy of the node. Descendants tagged
# 'pe', 'pa', 'pt', 'pr' or 'motyw' are looked up in the copy, and afterwards
# every tag is stripped from the copy with etree.strip_tags(), leaving text only.
# NOTE(review): what is done with each element found by findall() and the
# return statement are on lines not included here.
95 """ Find out a node's name
97 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
101 tempnode = deepcopy(node)
103 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
104 for e in tempnode.findall('.//%s' % p):
108 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """Apply the XSLT stylesheet stored at path ``sheet`` to ``xml``.

    ``xml`` may be an lxml element or an element tree.  A bare element is
    wrapped in an ``ElementTree`` first, because the transformation is
    invoked through the tree's ``xslt()`` method.  The stylesheet file is
    opened, parsed and applied while the file is still open; the result of
    ``xslt()`` is returned unchanged.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        apply_sheet = document.xslt
        return apply_sheet(etree.parse(stylesheet_file))
# replace_characters: applies typographic substitutions to the text and tail of
# a node and recurses into its children (see the replace_characters(child) call).
# The nested replace_chars() removes U+FEFF and maps '---' to an em dash
# (U+2014), '--' to an en dash (U+2013), ',,' to a low double quote (U+201E),
# '"' to U+201D and "'" to U+2019. The order of the chain matters: the
# three-hyphen rule has to run before the two-hyphen rule.
# NOTE(review): the guard at the top of replace_chars(), the body of the
# 'uwaga'/'extra' branch (presumably an early return that skips such nodes) and
# the loop header over the children are not part of this listing.
119 def replace_characters(node):
120 def replace_chars(text):
123 return text.replace(u"\ufeff", u"")\
124 .replace("---", u"\u2014")\
125 .replace("--", u"\u2013")\
126 .replace(",,", u"\u201E")\
127 .replace('"', u"\u201D")\
128 .replace("'", u"\u2019")
129 if node.tag in ('uwaga', 'extra'):
133 node.text = replace_chars(node.text)
134 node.tail = replace_chars(node.tail)
136 replace_characters(child)
# find_annotations: collects annotation elements ('pe', 'pa', 'pt', 'pr') found
# in `source` into the `annotations` container.
# Each match is deep-copied; the copy gets a 'number' attribute (current length
# of `annotations` + 1, i.e. a 1-based running number) and a 'part' attribute
# (part_no), and is appended to `annotations`.
# Children other than 'extra' and 'uwaga' are searched recursively with the
# same container and part number.
# NOTE(review): the loop header over `source` and a few lines between the
# visible statements (original lines 146 and 148-151) are not part of this
# listing, so any change made to the element left in `source` is not visible.
139 def find_annotations(annotations, source, part_no):
141 if child.tag in ('pe', 'pa', 'pt', 'pr'):
142 annotation = deepcopy(child)
143 number = str(len(annotations) + 1)
144 annotation.set('number', number)
145 annotation.set('part', str(part_no))
147 annotations.append(annotation)
152 if child.tag not in ('extra', 'uwaga'):
153 find_annotations(annotations, child, part_no)
# Stanza: helper that rebuilds a stanza element as a sequence of verse elements
# (the doctest below shows the expected result for a <strofa>).
# NOTE(review): this listing omits some lines of the class: the docstring
# delimiters, the initialisation of self.verses (used by the other methods) and
# the header of the method that holds the push_text/.../extend sequence
# (presumably versify(), which the doctest calls).
156 class Stanza(object):
158 Converts / verse endings into verse elements in a stanza.
160 Slashes may only occur directly in the stanza. Any slashes in subelements
161 will be ignored, and the subelements will be put inside verse elements.
163 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
164 >>> Stanza(s).versify()
165 >>> print etree.tostring(s)
166 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
167 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
# Constructor: remembers the stanza element; no verse is open initially.
170 def __init__(self, stanza_elem):
171 self.stanza = stanza_elem
173 self.open_verse = None
# Visible conversion flow: push the stanza's leading text, then walk its child
# elements pushing each child's tail text (the call that handles the child
# itself is presumably on an omitted line). The stanza's own tail is saved and
# restored around a step not shown here, and the collected verses are finally
# appended to the stanza.
176 self.push_text(self.stanza.text)
177 for elem in self.stanza:
179 self.push_text(elem.tail)
180 tail = self.stanza.tail
182 self.stanza.tail = tail
183 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """Start a new ``wers_normalny`` verse and make it the open verse.

    The element is created through the stanza's ``makeelement()`` and is
    also recorded at the end of ``self.verses``.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """Return the verse currently being filled, opening one when none is open."""
    if self.open_verse is not None:
        return self.open_verse
    # Nothing open yet: start a normal verse, which sets self.open_verse.
    self.open_normal_verse()
    return self.open_verse
# push_text: distributes a piece of stanza text over verses.
# The text is split on a '/' followed by optional whitespace and a newline.
# Each resulting piece is appended either to the tail of the open verse's last
# child or to the open verse's own text.
# NOTE(review): the guards are on lines not included in this listing: any early
# exit for missing text, the condition under which open_normal_verse() starts a
# new verse (presumably for every piece after the first), and the if/else that
# chooses between the two assignments (presumably whether the verse already has
# children).
194 def push_text(self, text):
197 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
199 self.open_normal_verse()
200 verse = self.get_open_verse()
202 verse[-1].tail = (verse[-1].tail or "") + verse_text
204 verse.text = (verse.text or "") + verse_text
# push_elem: adds one child element of the stanza to the verses being built.
# An element whose tag starts with "wers" is deep-copied, recorded in
# self.verses and made the open verse; for other elements a deep copy is
# appended to the currently open verse (opened on demand by get_open_verse()).
# NOTE(review): the `else:` separating the two cases and one statement after
# each deepcopy() (original lines 209 and 214) are not part of this listing.
206 def push_elem(self, elem):
207 if elem.tag.startswith("wers"):
208 verse = deepcopy(elem)
210 self.verses.append(verse)
211 self.open_verse = verse
213 appended = deepcopy(elem)
215 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """Convert the '/' line endings of every stanza below ``tree`` into verses.

    All ``strofa`` descendants (looked up via ``WLNS``) are rewritten in
    place by ``Stanza.versify()``; nothing is returned.
    """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
# Builds an OPF <item> element for chunk number `partno`: id 'part<N>', href
# 'part<N>.html', media type application/xhtml+xml. The element is created via
# manifest.makeelement().
# NOTE(review): the closing of the makeelement() call and the statement that
# attaches the new element to `manifest` are not part of this listing.
226 def add_to_manifest(manifest, partno):
227 """ Adds a node to the manifest section in content.opf file """
229 partstr = 'part%d' % partno
230 e = manifest.makeelement(
231 OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html',
232 'media-type': 'application/xhtml+xml'}
# Builds an OPF <itemref> element whose idref is 'part<N>' -- the same id that
# add_to_manifest() gives to the matching manifest item.
# NOTE(review): the statement that attaches the element to `spine` is not part
# of this listing.
237 def add_to_spine(spine, partno):
238 """ Adds a node to the spine section in content.opf file """
240 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
# NOTE(review): the header of the enclosing class is not part of this listing;
# the TOC(...) calls elsewhere in the file suggest this is TOC.__init__. The
# attributes `name` and `children`, which the other methods read, are
# presumably initialised on omitted lines.
# Visible part: stores the target href of the entry and starts without a
# sub-number.
245 def __init__(self, name=None, part_href=None):
248 self.part_href = part_href
249 self.sub_number = None
# add: inserts a new table-of-contents entry `name` pointing at `part_href`.
# `level` > 0 pushes the entry down into the most recently added child, one
# level per recursion step; that delegation is skipped when there are no
# children yet. `index` selects an insert position instead of appending and may
# only be given together with level == 0 (enforced by the assert), which is why
# it is not forwarded in the recursive call.
# NOTE(review): the creation of `t`, the `else:` lines, the condition guarding
# the sub_number assignment (presumably `not is_part`) and the return of that
# number are on lines not included in this listing. transform_chunk() does use
# the return value of add(..., is_part=False) as a sub-number.
251 def add(self, name, part_href, level=0, is_part=True, index=None):
252 assert level == 0 or index is None
253 if level > 0 and self.children:
254 return self.children[-1].add(name, part_href, level - 1, is_part)
257 t.part_href = part_href
258 if index is not None:
259 self.children.insert(index, t)
261 self.children.append(t)
263 t.sub_number = len(self.children) + 1
def append(self, toc):
    """Attach ``toc`` itself as the last child entry of this node."""
    child_entries = self.children
    child_entries.append(toc)
def extend(self, toc):
    """Adopt the children of ``toc`` as additional children of this node.

    Unlike ``append()``, ``toc`` itself is not inserted -- only its child
    entries are added, in order, after the existing ones.
    """
    adopted = toc.children
    self.children.extend(adopted)
# NOTE(review): fragment of a method whose header is not part of this listing
# (presumably depth(), given the recursive c.depth() call). It returns one more
# than the largest depth among the children. max() raises ValueError for an
# empty sequence, so the childless case has to be handled on an omitted line --
# confirm.
274 return max((c.depth() for c in self.children)) + 1
# NOTE(review): fragment of a method whose header is not part of this listing
# (presumably href(), which write_to_xml() and html_part() call). When the entry
# has a sub_number, '#sub<N>' is appended to `src` as a fragment identifier; the
# initial value of `src` and the return statement are not visible here.
280 if self.sub_number is not None:
281 src += '#sub%d' % self.sub_number
# write_to_xml: serializes the children of this node as NCX <navPoint> elements
# appended to nav_map, recursing into each child with the child's navPoint as
# the new parent.
# `counter` supplies both the 'NavPoint-<n>' id and the playOrder value. Each
# recursive call receives counter + 1 and its result becomes the counter for
# the next sibling, so numbering continues across nesting levels.
# Newlines in a child's name are replaced by spaces for the label text.
# NOTE(review): the `else:` before the second text.text assignment and the
# final return (presumably `return counter`) are not part of this listing.
284 def write_to_xml(self, nav_map, counter=1):
285 for child in self.children:
286 nav_point = nav_map.makeelement(NCXNS('navPoint'))
287 nav_point.set('id', 'NavPoint-%d' % counter)
288 nav_point.set('playOrder', str(counter))
290 nav_label = nav_map.makeelement(NCXNS('navLabel'))
291 text = nav_map.makeelement(NCXNS('text'))
292 if child.name is not None:
293 text.text = re.sub(r'\n', ' ', child.name)
295 text.text = child.name
296 nav_label.append(text)
297 nav_point.append(nav_label)
299 content = nav_map.makeelement(NCXNS('content'))
300 content.set('src', child.href())
301 nav_point.append(content)
302 nav_map.append(nav_point)
303 counter = child.write_to_xml(nav_point, counter + 1)
# html_part: renders the children of this node as a flat list of <div> links,
# each indented by `depth` em, followed by the rendering of its own children at
# depth + 1. The pieces are joined with newlines.
# NOTE(review): child.href() and child.name are interpolated into the markup
# without any HTML escaping. The initialisation of `texts` and the opening of
# the first texts.append( call are on lines not included in this listing.
306 def html_part(self, depth=0):
308 for child in self.children:
310 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
311 (depth, child.href(), child.name))
312 texts.append(child.html_part(depth + 1))
313 return "\n".join(texts)
# NOTE(review): fragment of a method whose header is not part of this listing
# (presumably html(), which transform() calls as toc.html()).
# Reads the epub/toc.html template, decodes it from UTF-8 (Python 2 `unicode`)
# and fills its %-placeholder with the output of html_part().
316 with open(get_resource('epub/toc.html')) as f:
317 t = unicode(f.read(), 'utf-8')
318 return t % self.html_part()
# Gathers the set of characters occurring in the text and tail of `element` and,
# recursively, of all its descendants. Missing text/tail (None) counts as empty.
# NOTE(review): the return statement is not part of this listing; callers use
# the result as a set (union/update).
321 def used_chars(element):
322 """ Lists characters used in an ETree Element """
323 chars = set((element.text or '') + (element.tail or ''))
324 for child in element:
325 chars = chars.union(used_chars(child))
# NOTE(review): the `def` line is not part of this listing; transform() calls
# this code as chop(main_text).
# Splits the children of main_text into chunks. Every chunk is collected in a
# <utwor><master>...</master></utwor> container (part_xml / main_xml_part).
# A first pass over main_text looks for 'naglowek_scena' and 'naglowek_akt'
# headers. When scenes occur without acts, the second pass restarts the
# container at 'naglowek_czesc' and 'naglowek_scena'; otherwise it restarts at
# 'naglowek_czesc', 'naglowek_rozdzial', 'naglowek_akt' and 'srodtytul'.
# last_node_part is set right after a 'naglowek_czesc', so that a header
# immediately following it is appended to the same chunk instead of starting a
# new one; any other node resets the flag.
# NOTE(review): the computation of `name`, the is_act/is_scene bookkeeping and
# the statements that hand the finished chunks to the caller (presumably
# `yield part_xml`) are on lines not included in this listing.
330 """ divide main content of the XML file into chunks """
332 # prepare a container for each chunk
333 part_xml = etree.Element('utwor')
334 etree.SubElement(part_xml, 'master')
335 main_xml_part = part_xml[0] # master
337 last_node_part = False
339 # the below loop are workaround for a problem with epubs in drama ebooks without acts
342 for one_part in main_text:
344 if name == 'naglowek_scena':
346 elif name == 'naglowek_akt':
349 for one_part in main_text:
351 if is_act is False and is_scene is True:
352 if name == 'naglowek_czesc':
354 last_node_part = True
355 main_xml_part[:] = [deepcopy(one_part)]
356 elif not last_node_part and name == "naglowek_scena":
358 main_xml_part[:] = [deepcopy(one_part)]
360 main_xml_part.append(deepcopy(one_part))
361 last_node_part = False
363 if name == 'naglowek_czesc':
365 last_node_part = True
366 main_xml_part[:] = [deepcopy(one_part)]
367 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
369 main_xml_part[:] = [deepcopy(one_part)]
371 main_xml_part.append(deepcopy(one_part))
372 last_node_part = False
# transform_chunk: converts one chunk produced by chop() into XHTML and collects
# the table-of-contents entries for the headers it contains.
# TOC entries: 'naglowek_czesc' points at 'part<N>.html#book-text'; chapter-like
# headers ('naglowek_rozdzial', 'naglowek_akt', 'srodtytul') point at
# 'part<N>.html'; 'naglowek_podrozdzial' and 'naglowek_scena' are added one
# level down with is_part=False, and the number returned by toc.add() is stored
# in the element's 'sub' attribute.
# _empty_html_static is a deliberately shared mutable default: it serves as a
# one-slot cache for the contents of epub/emptyChunk.html, filled on first use.
# Callers are not expected to pass it.
# NOTE(review): the cache is filled with open(...).read() without closing the
# file explicitly. The creation of `toc`, the branch on `empty` and the
# remaining tostring() arguments are on lines not included in this listing.
376 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
377 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
380 for element in chunk_xml[0]:
381 if element.tag == "naglowek_czesc":
382 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
383 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
384 toc.add(node_name(element), "part%d.html" % chunk_no)
385 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
386 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
387 element.set('sub', str(subnumber))
389 if not _empty_html_static:
390 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
392 output_html = _empty_html_static[0]
394 find_annotations(annotations, chunk_xml, chunk_no)
395 replace_by_verse(chunk_xml)
396 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
397 chars = used_chars(html_tree.getroot())
398 output_html = etree.tostring(
399 html_tree, pretty_print=True, xml_declaration=True,
401 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
402 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
404 return output_html, toc, chars
# transform: builds an EPUB archive from `wldoc` and returns it wrapped in an
# OutputFile created from the temporary file's name.
# Outline of the visible code:
#   * the nested transform_file() converts one document (title page, then the
#     chunks produced by chop()/transform_chunk(), then the child documents
#     recursively) and passes the running chunk counter, the set of used
#     characters and the remaining `sample` budget back through its return value;
#   * the outer body works on a deep copy of the document, writes the static
#     archive members (mimetype, container.xml, images, stylesheet, optional
#     cover), runs transform_file(), then adds the annotations, support and
#     editorial pages, the subsetted fonts, content.opf and the NCX / HTML
#     tables of contents.
# The code is Python 2 (print statement, StringIO module).
# The local name `zip` shadows the builtin; the nested function uses it as a
# closure variable together with `manifest`, `spine` and `annotations`.
# NOTE(review): many lines of this function (short statements such as `else:`,
# `if verbose:`, `try:`/`finally:` and several call openings/closings) are not
# part of this listing; the comments describe only what is visible.
407 def transform(wldoc, verbose=False,
408 style=None, html_toc=False,
409 sample=None, cover=None, flags=None):
410 """ produces a EPUB file
412 sample=n: generate sample e-book (with at least n paragraphs)
413 cover: a cover.Cover factory or True for default
414 flags: less-advertising, without-fonts, working-copy
417 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
418 """ processes one input file and proceeds to its children """
420 replace_characters(wldoc.edoc.getroot())
422 hyphenator = set_hyph_language(wldoc.edoc.getroot())
423 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
425 # every input file will have a TOC entry,
426 # pointing to starting chunk
427 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
430 # write book title page
431 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
432 chars = used_chars(html_tree.getroot())
436 html_tree, pretty_print=True, xml_declaration=True,
438 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
439 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
442 # add a title page TOC entry
443 toc.add(u"Strona tytułowa", "title.html")
444 elif wldoc.book_info.parts:
445 # write title page for every parent
446 if sample is not None and sample <= 0:
448 html_string = open(get_resource('epub/emptyChunk.html')).read()
450 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
451 chars = used_chars(html_tree.getroot())
452 html_string = etree.tostring(
453 html_tree, pretty_print=True, xml_declaration=True,
455 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
456 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
458 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
459 add_to_manifest(manifest, chunk_counter)
460 add_to_spine(spine, chunk_counter)
463 if len(wldoc.edoc.getroot()) > 1:
464 # rdf before style master
465 main_text = wldoc.edoc.getroot()[1]
467 # rdf in style master
468 main_text = wldoc.edoc.getroot()[0]
469 if main_text.tag == RDFNS('RDF'):
472 if main_text is not None:
473 for chunk_xml in chop(main_text):
475 if sample is not None:
479 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
480 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
482 toc.extend(chunk_toc)
483 chars = chars.union(chunk_chars)
484 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
485 add_to_manifest(manifest, chunk_counter)
486 add_to_spine(spine, chunk_counter)
489 for child in wldoc.parts():
490 child_toc, chunk_counter, chunk_chars, sample = transform_file(
491 child, chunk_counter, first=False, sample=sample)
492 toc.append(child_toc)
493 chars = chars.union(chunk_chars)
495 return toc, chunk_counter, chars, sample
# Work on a deep copy: the attributes set on the root element below are then
# applied to the copy rather than to the caller's document.
497 document = deepcopy(wldoc)
502 document.edoc.getroot().set(flag, 'yes')
505 editors = document.editors()
507 document.edoc.getroot().set('editors', u', '.join(sorted(
508 editor.readable() for editor in editors)))
509 if document.book_info.funders:
510 document.edoc.getroot().set('funders', u', '.join(
511 document.book_info.funders))
512 if document.book_info.thanks:
513 document.edoc.getroot().set('thanks', document.book_info.thanks)
515 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
516 manifest = opf.find('.//' + OPFNS('manifest'))
517 guide = opf.find('.//' + OPFNS('guide'))
518 spine = opf.find('.//' + OPFNS('spine'))
# delete=False: the temporary file outlives this function; its name is handed
# to OutputFile.from_filename() at the end.
520 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
521 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
523 functions.reg_mathml_epub(zip)
# The 'mimetype' member is written with ZIP_STORED (uncompressed), unlike the
# rest of the archive, which uses ZIP_DEFLATED.
525 # write static elements
526 mime = zipfile.ZipInfo()
527 mime.filename = 'mimetype'
528 mime.compress_type = zipfile.ZIP_STORED
530 zip.writestr(mime, 'application/epub+zip')
532 'META-INF/container.xml',
533 '<?xml version="1.0" ?>'
534 '<container version="1.0" '
535 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
536 '<rootfiles><rootfile full-path="OPS/content.opf" '
537 'media-type="application/oebps-package+xml" />'
538 '</rootfiles></container>'
540 zip.write(get_resource('res/wl-logo-small.png'),
541 os.path.join('OPS', 'logo_wolnelektury.png'))
542 zip.write(get_resource('res/jedenprocent.png'),
543 os.path.join('OPS', 'jedenprocent.png'))
545 style = get_resource('epub/style.css')
546 zip.write(style, os.path.join('OPS', 'style.css'))
550 cover = DefaultEbookCover
552 cover_file = StringIO()
553 bound_cover = cover(document.book_info)
554 bound_cover.save(cover_file)
555 cover_name = 'cover.%s' % bound_cover.ext()
556 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
559 cover_tree = etree.parse(get_resource('epub/cover.html'))
560 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
561 zip.writestr('OPS/cover.html', etree.tostring(
562 cover_tree, pretty_print=True, xml_declaration=True,
564 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
565 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
568 if bound_cover.uses_dc_cover:
569 if document.book_info.cover_by:
570 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
571 if document.book_info.cover_source:
572 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
574 manifest.append(etree.fromstring(
575 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
576 manifest.append(etree.fromstring(
577 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
578 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
579 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
580 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
582 annotations = etree.Element('annotations')
584 toc_file = etree.fromstring(
585 '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
586 '"-//NISO//DTD ncx 2005-1//EN" '
587 '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
588 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
589 'version="2005-1"><head></head><docTitle></docTitle><navMap>'
592 nav_map = toc_file[-1]
595 manifest.append(etree.fromstring(
596 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
597 spine.append(etree.fromstring(
598 '<itemref idref="html_toc" />'))
599 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
601 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
603 if len(toc.children) < 2:
604 toc.add(u"Początek utworu", "part1.html")
606 # Last modifications in container files and EPUB creation
607 if len(annotations) > 0:
608 toc.add("Przypisy", "annotations.html")
609 manifest.append(etree.fromstring(
610 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
611 spine.append(etree.fromstring(
612 '<itemref idref="annotations" />'))
613 replace_by_verse(annotations)
614 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
615 chars = chars.union(used_chars(html_tree.getroot()))
616 zip.writestr('OPS/annotations.html', etree.tostring(
617 html_tree, pretty_print=True, xml_declaration=True,
619 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
620 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
623 toc.add("Wesprzyj Wolne Lektury", "support.html")
624 manifest.append(etree.fromstring(
625 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
626 spine.append(etree.fromstring(
627 '<itemref idref="support" />'))
628 html_string = open(get_resource('epub/support.html')).read()
629 chars.update(used_chars(etree.fromstring(html_string)))
630 zip.writestr('OPS/support.html', html_string)
632 toc.add("Strona redakcyjna", "last.html")
633 manifest.append(etree.fromstring(
634 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
635 spine.append(etree.fromstring(
636 '<itemref idref="last" />'))
637 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
638 chars.update(used_chars(html_tree.getroot()))
639 zip.writestr('OPS/last.html', etree.tostring(
640 html_tree, pretty_print=True, xml_declaration=True,
642 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
643 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
646 if not flags or not 'without-fonts' in flags:
648 tmpdir = mkdtemp('-librarian-epub')
# NOTE(review): os.chdir() changes the working directory of the whole process;
# restoring it (and removing tmpdir) is presumably done on lines not included
# in this listing -- confirm.
654 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
655 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
656 optimizer_call = ['perl', 'subset.pl', '--chars',
657 ''.join(chars).encode('utf-8'),
658 get_resource('fonts/' + fname),
659 os.path.join(tmpdir, fname)]
661 print "Running font-optimizer"
662 subprocess.check_call(optimizer_call)
# NOTE(review): the subprocess documentation advises against stdout=PIPE /
# stderr=PIPE with check_call(): the pipes are never read, so a child that
# writes a lot of output can block.
664 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
665 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
666 manifest.append(etree.fromstring(
667 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
671 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
672 xml_declaration=True, encoding="utf-8"))
673 title = document.book_info.title
674 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
675 for st in attributes:
676 meta = toc_file.makeelement(NCXNS('meta'))
678 meta.set('content', '0')
679 toc_file[0].append(meta)
680 toc_file[0][0].set('content', str(document.book_info.url))
681 toc_file[0][1].set('content', str(toc.depth()))
682 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
686 toc.add(u"Spis treści", "toc.html", index=1)
687 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
688 toc.write_to_xml(nav_map)
689 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
690 xml_declaration=True, encoding="utf-8"))
693 return OutputFile.from_filename(output_file.name)