1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 from librarian.hyphenator import Hyphenator
26 functions.reg_person_name()
27 functions.reg_lang_code_3to2()
30 def set_hyph_language(source_tree):
31 def get_short_lng_code(text):
34 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
36 list = line.strip().split('|')
43 bibl_lng = etree.XPath('//dc:language//text()',
44 namespaces={'dc': str(DCNS)})(source_tree)
45 short_lng = get_short_lng_code(bibl_lng[0])
47 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
def hyphenate_and_fix_conjunctions(source_tree, hyph):
    """ Insert soft hyphens and glue one-letter words to the following word.

    Every text node selected by ``/utwor/*[2]//text()`` is rewritten in place:

    * when ``hyph`` is not None, each word is passed through
      ``hyph.inserted(word, u'\\u00AD')`` (soft hyphen as the separator);
    * whitespace following a single word character that is itself preceded
      by whitespace is replaced with a no-break space (U+00A0).

    ``hyph`` may be None, in which case only the second step is applied.
    """
    # Compiled once instead of once per text node.
    word_or_char = re.compile(r'\w+|[^\w]', re.UNICODE)

    texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
    for t in texts:
        # XPath text results remember the element they were read from.
        parent = t.getparent()
        if hyph is not None:
            newt = ''.join(
                hyph.inserted(w, u'\u00AD') for w in word_or_char.findall(t))
        else:
            newt = t
        newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
        # Write the result back to where the text came from.
        if t.is_text:
            parent.text = newt
        elif t.is_tail:
            parent.tail = newt
72 """ returns node's text and children as a string
74 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
78 nt = node.text if node.text is not None else ''
79 return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    ``text`` is an XML fragment; it is wrapped in a temporary ``<x>`` element
    for parsing, so it must be well-formed once wrapped.  The node's tag,
    attributes and tail are left untouched.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    p = etree.fromstring('<x>%s</x>' % text)
    # Take over the wrapper's leading text and all of its children.
    node.text = p.text
    node[:] = p[:]
97 """ Find out a node's name
99 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
103 tempnode = deepcopy(node)
105 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
106 for e in tempnode.findall('.//%s' % p):
110 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored at path ``sheet`` to ``xml``.

    ``xml`` may be a bare element, which is first wrapped in an ElementTree,
    or any object that already offers an ``xslt()`` method.  Returns whatever
    that ``xslt()`` call returns.
    """
    tree = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return tree.xslt(stylesheet)
121 def replace_characters(node):
122 def replace_chars(text):
125 return text.replace(u"\ufeff", u"")\
126 .replace("---", u"\u2014")\
127 .replace("--", u"\u2013")\
128 .replace(",,", u"\u201E")\
129 .replace('"', u"\u201D")\
130 .replace("'", u"\u2019")
131 if node.tag in ('uwaga', 'extra'):
135 node.text = replace_chars(node.text)
136 node.tail = replace_chars(node.tail)
138 replace_characters(child)
141 def find_annotations(annotations, source, part_no):
143 if child.tag in ('pe', 'pa', 'pt', 'pr'):
144 annotation = deepcopy(child)
145 number = str(len(annotations) + 1)
146 annotation.set('number', number)
147 annotation.set('part', str(part_no))
149 annotations.append(annotation)
154 if child.tag not in ('extra', 'uwaga'):
155 find_annotations(annotations, child, part_no)
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """

    def __init__(self, stanza_elem):
        # The stanza element that versify() rewrites in place.
        self.stanza = stanza_elem
        # Verse elements built so far, in document order.
        self.verses = []
        # The verse currently receiving content, or None before the first one.
        self.open_verse = None

    def versify(self):
        """ Replace the stanza's content with the verse elements built from it. """
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        # The tail is saved and restored around clear() so that the text
        # following the stanza is kept.
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        """ Start a new ``wers_normalny`` verse and make it the open one. """
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """ Return the open verse, starting a normal one if none is open. """
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """ Add text to the verses, starting a new verse after each slash + newline. """
        if not text:
            # Nothing to add (None or empty text/tail).
            return
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            if i:
                # Every piece after the first follows a verse break.
                self.open_normal_verse()
            verse = self.get_open_verse()
            if len(verse):
                # The verse already has children: text goes after the last one.
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        """ Add a copy of a stanza child: as a verse of its own or into the open verse. """
        if elem.tag.startswith("wers"):
            # An explicit verse element becomes the open verse itself.
            verse = deepcopy(elem)
            # Tails are fed through push_text() by versify(), so drop the copy's.
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        versifier = Stanza(stanza_elem)
        versifier.versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new ``item`` element gets the id ``part<partno>``, the href
    ``part<partno>.html`` and the XHTML media type, and is appended to
    ``manifest``.
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(
        OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html',
                               'media-type': 'application/xhtml+xml'}
    )
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new ``itemref`` element refers to the id ``part<partno>`` and is
    appended to ``spine``.
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
class TOC(object):
    """ A tree of table-of-contents entries.

    Each node has a ``name``, a ``part_href`` (the target file), an optional
    ``sub_number`` (rendered as a ``#sub<N>`` fragment) and a list of
    ``children``.
    """

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Add an entry ``level`` levels below this node.

        With ``level > 0`` the entry is delegated to the last child; if there
        are no children yet it is added at this level instead.  ``index``
        (only allowed with ``level == 0``) inserts at that position rather
        than appending.

        Returns the entry's sub-number when ``is_part`` is false, otherwise
        None.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level - 1, is_part)

        t = TOC(name)
        t.part_href = part_href
        if index is not None:
            self.children.insert(index, t)
        else:
            self.children.append(t)
        if not is_part:
            # Computed after the entry has been added to the list.
            t.sub_number = len(self.children) + 1
            return t.sub_number

    def append(self, toc):
        """ Add ``toc`` itself as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Add all children of ``toc`` (not ``toc`` itself) after the existing ones. """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        return 0

    def href(self):
        """ Return the link target: ``part_href`` plus ``#sub<N>`` if sub-numbered. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Append a ``navPoint`` subtree for every descendant to ``nav_map``.

        ``counter`` numbers the nav points (id and playOrder); the next
        unused number is returned.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            if child.name is not None:
                # Labels are written on one line.
                text.text = re.sub(r'\n', ' ', child.name)
            else:
                text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Return the entries below this node as indented HTML ``div`` links.

        NOTE(review): names and hrefs are interpolated without escaping.
        """
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth + 1))
        return "\n".join(texts)

    def html(self):
        """ Return the ``epub/toc.html`` template filled with ``html_part()``. """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set of the characters found in the text and tail of the
    element and of all of its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars |= used_chars(child)
    return chars
332 """ divide main content of the XML file into chunks """
334 # prepare a container for each chunk
335 part_xml = etree.Element('utwor')
336 etree.SubElement(part_xml, 'master')
337 main_xml_part = part_xml[0] # master
339 last_node_part = False
341 # the loop below is a workaround for a problem with EPUBs of dramas that have scenes but no acts
344 for one_part in main_text:
346 if name == 'naglowek_scena':
348 elif name == 'naglowek_akt':
351 for one_part in main_text:
353 if is_act is False and is_scene is True:
354 if name == 'naglowek_czesc':
356 last_node_part = True
357 main_xml_part[:] = [deepcopy(one_part)]
358 elif not last_node_part and name == "naglowek_scena":
360 main_xml_part[:] = [deepcopy(one_part)]
362 main_xml_part.append(deepcopy(one_part))
363 last_node_part = False
365 if name == 'naglowek_czesc':
367 last_node_part = True
368 main_xml_part[:] = [deepcopy(one_part)]
369 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
371 main_xml_part[:] = [deepcopy(one_part)]
373 main_xml_part.append(deepcopy(one_part))
374 last_node_part = False
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    Heading elements found directly in the chunk's first child are added to
    the returned TOC, pointing at ``part<chunk_no>.html``; sub-headings also
    receive a ``sub`` attribute holding their TOC sub-number.

    When ``empty`` is true, the chunk is not rendered: the contents of
    ``epub/emptyChunk.html`` are returned together with an empty character
    set.  ``_empty_html_static`` is a deliberately shared default that caches
    that file between calls; callers are not expected to pass it.
    """
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            element.set('sub', str(subnumber))

    if empty:
        if not _empty_html_static:
            # Read once; the handle is closed instead of being left to the GC.
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        )
    return output_html, toc, chars
409 def transform(wldoc, verbose=False, style=None, html_toc=False,
410 sample=None, cover=None, flags=None, hyphenate=False, ilustr_path=''):
411 """ produces a EPUB file
413 sample=n: generate sample e-book (with at least n paragraphs)
414 cover: a cover.Cover factory or True for default
415 flags: less-advertising, without-fonts, working-copy
418 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
419 """ processes one input file and proceeds to its children """
421 replace_characters(wldoc.edoc.getroot())
423 hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
424 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
426 # every input file will have a TOC entry,
427 # pointing to starting chunk
428 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
431 # write book title page
432 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
433 chars = used_chars(html_tree.getroot())
437 html_tree, pretty_print=True, xml_declaration=True,
439 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
440 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
443 # add a title page TOC entry
444 toc.add(u"Strona tytułowa", "title.html")
445 elif wldoc.book_info.parts:
446 # write title page for every parent
447 if sample is not None and sample <= 0:
449 html_string = open(get_resource('epub/emptyChunk.html')).read()
451 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
452 chars = used_chars(html_tree.getroot())
453 html_string = etree.tostring(
454 html_tree, pretty_print=True, xml_declaration=True,
456 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
457 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
459 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
460 add_to_manifest(manifest, chunk_counter)
461 add_to_spine(spine, chunk_counter)
464 if len(wldoc.edoc.getroot()) > 1:
465 # rdf before style master
466 main_text = wldoc.edoc.getroot()[1]
468 # rdf in style master
469 main_text = wldoc.edoc.getroot()[0]
470 if main_text.tag == RDFNS('RDF'):
473 if main_text is not None:
474 for chunk_xml in chop(main_text):
476 if sample is not None:
480 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
481 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
483 toc.extend(chunk_toc)
484 chars = chars.union(chunk_chars)
485 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
486 add_to_manifest(manifest, chunk_counter)
487 add_to_spine(spine, chunk_counter)
490 for child in wldoc.parts():
491 child_toc, chunk_counter, chunk_chars, sample = transform_file(
492 child, chunk_counter, first=False, sample=sample)
493 toc.append(child_toc)
494 chars = chars.union(chunk_chars)
496 return toc, chunk_counter, chars, sample
498 document = deepcopy(wldoc)
503 document.edoc.getroot().set(flag, 'yes')
505 document.clean_ed_note()
506 document.clean_ed_note('abstrakt')
509 editors = document.editors()
511 document.edoc.getroot().set('editors', u', '.join(sorted(
512 editor.readable() for editor in editors)))
513 if document.book_info.funders:
514 document.edoc.getroot().set('funders', u', '.join(
515 document.book_info.funders))
516 if document.book_info.thanks:
517 document.edoc.getroot().set('thanks', document.book_info.thanks)
519 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
520 manifest = opf.find('.//' + OPFNS('manifest'))
521 guide = opf.find('.//' + OPFNS('guide'))
522 spine = opf.find('.//' + OPFNS('spine'))
524 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
525 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
527 functions.reg_mathml_epub(zip)
529 for filename in os.listdir(ilustr_path):
530 zip.write(os.path.join(ilustr_path, filename), os.path.join('OPS', filename))
532 # write static elements
533 mime = zipfile.ZipInfo()
534 mime.filename = 'mimetype'
535 mime.compress_type = zipfile.ZIP_STORED
537 zip.writestr(mime, 'application/epub+zip')
539 'META-INF/container.xml',
540 '<?xml version="1.0" ?>'
541 '<container version="1.0" '
542 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
543 '<rootfiles><rootfile full-path="OPS/content.opf" '
544 'media-type="application/oebps-package+xml" />'
545 '</rootfiles></container>'
547 zip.write(get_resource('res/wl-logo-small.png'),
548 os.path.join('OPS', 'logo_wolnelektury.png'))
549 zip.write(get_resource('res/jedenprocent.png'),
550 os.path.join('OPS', 'jedenprocent.png'))
552 style = get_resource('epub/style.css')
553 zip.write(style, os.path.join('OPS', 'style.css'))
557 cover = DefaultEbookCover
559 cover_file = StringIO()
560 bound_cover = cover(document.book_info)
561 bound_cover.save(cover_file)
562 cover_name = 'cover.%s' % bound_cover.ext()
563 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
566 cover_tree = etree.parse(get_resource('epub/cover.html'))
567 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
568 zip.writestr('OPS/cover.html', etree.tostring(
569 cover_tree, pretty_print=True, xml_declaration=True,
571 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
572 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
575 if bound_cover.uses_dc_cover:
576 if document.book_info.cover_by:
577 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
578 if document.book_info.cover_source:
579 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
581 manifest.append(etree.fromstring(
582 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
583 manifest.append(etree.fromstring(
584 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
585 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
586 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
587 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
589 annotations = etree.Element('annotations')
591 toc_file = etree.fromstring(
592 '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
593 '"-//NISO//DTD ncx 2005-1//EN" '
594 '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
595 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
596 'version="2005-1"><head></head><docTitle></docTitle><navMap>'
599 nav_map = toc_file[-1]
602 manifest.append(etree.fromstring(
603 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
604 spine.append(etree.fromstring(
605 '<itemref idref="html_toc" />'))
606 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
608 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
610 if len(toc.children) < 2:
611 toc.add(u"Początek utworu", "part1.html")
613 # Last modifications in container files and EPUB creation
614 if len(annotations) > 0:
615 toc.add("Przypisy", "annotations.html")
616 manifest.append(etree.fromstring(
617 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
618 spine.append(etree.fromstring(
619 '<itemref idref="annotations" />'))
620 replace_by_verse(annotations)
621 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
622 chars = chars.union(used_chars(html_tree.getroot()))
623 zip.writestr('OPS/annotations.html', etree.tostring(
624 html_tree, pretty_print=True, xml_declaration=True,
626 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
627 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
630 toc.add("Wesprzyj Wolne Lektury", "support.html")
631 manifest.append(etree.fromstring(
632 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
633 spine.append(etree.fromstring(
634 '<itemref idref="support" />'))
635 html_string = open(get_resource('epub/support.html')).read()
636 chars.update(used_chars(etree.fromstring(html_string)))
637 zip.writestr('OPS/support.html', html_string)
639 toc.add("Strona redakcyjna", "last.html")
640 manifest.append(etree.fromstring(
641 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
642 spine.append(etree.fromstring(
643 '<itemref idref="last" />'))
644 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
645 chars.update(used_chars(html_tree.getroot()))
646 zip.writestr('OPS/last.html', etree.tostring(
647 html_tree, pretty_print=True, xml_declaration=True,
649 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
650 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
653 if not flags or 'without-fonts' not in flags:
655 tmpdir = mkdtemp('-librarian-epub')
661 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
662 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
663 optimizer_call = ['perl', 'subset.pl', '--chars',
664 ''.join(chars).encode('utf-8'),
665 get_resource('fonts/' + fname),
666 os.path.join(tmpdir, fname)]
668 print "Running font-optimizer"
669 subprocess.check_call(optimizer_call)
671 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
672 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
673 manifest.append(etree.fromstring(
674 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
678 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
679 xml_declaration=True, encoding="utf-8"))
680 title = document.book_info.title
681 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
682 for st in attributes:
683 meta = toc_file.makeelement(NCXNS('meta'))
685 meta.set('content', '0')
686 toc_file[0].append(meta)
687 toc_file[0][0].set('content', str(document.book_info.url))
688 toc_file[0][1].set('content', str(toc.depth()))
689 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
693 toc.add(u"Spis treści", "toc.html", index=1)
694 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
695 toc.write_to_xml(nav_map)
696 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
697 xml_declaration=True, encoding="utf-8"))
700 return OutputFile.from_filename(output_file.name)