1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 from librarian.hyphenator import Hyphenator
26 functions.reg_person_name()
27 functions.reg_lang_code_3to2()
30 def set_hyph_language(source_tree):
31 def get_short_lng_code(text):
34 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
36 list = line.strip().split('|')
43 bibl_lng = etree.XPath('//dc:language//text()',
44 namespaces={'dc': str(DCNS)})(source_tree)
45 short_lng = get_short_lng_code(bibl_lng[0])
47 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
53 def hyphenate_and_fix_conjunctions(source_tree, hyph):
55 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
57 parent = t.getparent()
59 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
61 newt += hyph.inserted(w, u'\u00AD')
62 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
70 """ returns node's text and children as a string
72 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
76 nt = node.text if node.text is not None else ''
77 return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    `text` must be a well-formed XML fragment; it is parsed inside
    a temporary wrapper element, so a malformed fragment makes the parser
    raise.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    # Wrap the fragment so that mixed content (text + elements) parses
    # as a single document.
    p = etree.fromstring('<x>%s</x>' % text)
    # Replace the node's leading text and all of its children with the
    # wrapper's content; attributes and tail of `node` are left alone.
    node.text = p.text
    node[:] = p[:]
95 """ Find out a node's name
97 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
101 tempnode = deepcopy(node)
103 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
104 for e in tempnode.findall('.//%s' % p):
108 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """ Applies an XSLT stylesheet to a document

    xml: an element or an element tree; a bare element is wrapped in
        a tree first
    sheet: path of the stylesheet file

    Returns the result of the transformation.
    """
    tree = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return tree.xslt(stylesheet)
119 def replace_characters(node):
120 def replace_chars(text):
123 return text.replace(u"\ufeff", u"")\
124 .replace("---", u"\u2014")\
125 .replace("--", u"\u2013")\
126 .replace(",,", u"\u201E")\
127 .replace('"', u"\u201D")\
128 .replace("'", u"\u2019")
129 if node.tag in ('uwaga', 'extra'):
133 node.text = replace_chars(node.text)
134 node.tail = replace_chars(node.tail)
136 replace_characters(child)
139 def find_annotations(annotations, source, part_no):
141 if child.tag in ('pe', 'pa', 'pt', 'pr'):
142 annotation = deepcopy(child)
143 number = str(len(annotations) + 1)
144 annotation.set('number', number)
145 annotation.set('part', str(part_no))
147 annotations.append(annotation)
152 if child.tag not in ('extra', 'uwaga'):
153 find_annotations(annotations, child, part_no)
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    def __init__(self, stanza_elem):
        # The stanza element being rewritten in place.
        self.stanza = stanza_elem
        # Verse elements built so far, in order.
        self.verses = []
        # The verse currently receiving content, or None if the next
        # piece of content has to start a new one.
        self.open_verse = None

    def versify(self):
        """ Replaces the stanza's content with a sequence of verse elements """
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        # clear() also drops the stanza's own tail, so keep it aside
        # and put it back afterwards.
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        """ Starts a new `wers_normalny` verse and makes it the open one """
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """ Returns the open verse, starting a normal one if none is open """
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """ Adds text to the stanza, starting a new verse after each
        slash that ends a line """
        if not text:
            # Nothing to add (missing or empty text/tail).
            return
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            if i:
                # Every piece after the first follows a verse ending.
                self.open_normal_verse()
            verse = self.get_open_verse()
            if len(verse):
                # The verse already has children: the text belongs after
                # the last of them.
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        """ Adds a copy of a subelement to the stanza

        Elements whose tag starts with "wers" become verses themselves;
        anything else is put inside the open verse.
        """
        if elem.tag.startswith("wers"):
            verse = deepcopy(elem)
            # The tail is handled separately by push_text; keeping it on
            # the copy would duplicate it.
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Every `strofa` descendant of `tree` is rewritten in place.
    """
    for stanza_elem in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza_elem).versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new `item` gets the id `part<partno>`, points at
    `part<partno>.html` and is declared as XHTML.
    """

    partstr = 'part%d' % partno
    e = manifest.makeelement(
        OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html',
                               'media-type': 'application/xhtml+xml'}
    )
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new `itemref` refers to the manifest item `part<partno>`.
    """

    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
class TOC(object):
    """ A table of contents: a tree of named entries pointing at parts
    of the book """

    def __init__(self, name=None, part_href=None):
        # Sub-entries, in reading order.
        self.children = []
        # Displayed title; None for an anonymous container.
        self.name = name
        # File (optionally with a fragment) this entry points at.
        self.part_href = part_href
        # Number of the `#sub<n>` anchor inside the file, if the entry
        # does not start a file of its own.
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Adds a new entry

        level: how deep below this node the entry goes; it descends into
            the last child at each step and stops early at a node
            without children
        is_part: False for an entry pointing inside a file; such an
            entry is given a sub number
        index: position among the children (only allowed at level 0);
            by default the entry is appended

        Returns the entry's sub number if is_part is False, else None.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level - 1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """ Adds a whole TOC as a single child entry """
        self.children.append(toc)

    def extend(self, toc):
        """ Adds all children of another TOC as children of this one """
        self.children.extend(toc.children)

    def depth(self):
        """ Returns the number of levels below this node """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """ Returns the link target, with a `#sub<n>` fragment for
        entries having a sub number """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Writes the entries below this node as NCX navPoints

        nav_map: element the navPoints are appended to
        counter: number used for the first navPoint's id and playOrder

        Returns the first unused number.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            if child.name is not None:
                # Labels are single-line: fold newlines into spaces.
                text.text = re.sub(r'\n', ' ', child.name)
            else:
                text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            # Sub-entries are nested inside their parent's navPoint and
            # continue the same numbering.
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Returns the entries below this node as HTML divs, indented
        by one em per level """
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth + 1))
        return "\n".join(texts)

    def html(self):
        """ Returns the whole table of contents as an HTML page, built
        from the epub/toc.html template """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Collects the characters of the element's text and tail and,
    recursively, of all its descendants.

    Returns a set of characters.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars.update(used_chars(child))
    return chars
330 """ divide main content of the XML file into chunks """
332 # prepare a container for each chunk
333 part_xml = etree.Element('utwor')
334 etree.SubElement(part_xml, 'master')
335 main_xml_part = part_xml[0] # master
337 last_node_part = False
339 # the below loop are workaround for a problem with epubs in drama ebooks without acts
342 for one_part in main_text:
344 if name == 'naglowek_scena':
346 elif name == 'naglowek_akt':
349 for one_part in main_text:
351 if is_act is False and is_scene is True:
352 if name == 'naglowek_czesc':
354 last_node_part = True
355 main_xml_part[:] = [deepcopy(one_part)]
356 elif not last_node_part and name == "naglowek_scena":
358 main_xml_part[:] = [deepcopy(one_part)]
360 main_xml_part.append(deepcopy(one_part))
361 last_node_part = False
363 if name == 'naglowek_czesc':
365 last_node_part = True
366 main_xml_part[:] = [deepcopy(one_part)]
367 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
369 main_xml_part[:] = [deepcopy(one_part)]
371 main_xml_part.append(deepcopy(one_part))
372 last_node_part = False
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: container element whose first child holds the chunk's
        elements
    chunk_no: number of the chunk, used in the `part<n>.html` links
    annotations: element collecting the annotations found in the chunk
    empty: if True, the chunk's content is replaced with the
        epub/emptyChunk.html placeholder (TOC entries are still created)
    _empty_html_static: internal cache of the placeholder, shared between
        calls on purpose; not meant to be passed by callers
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # Mark the heading with its sub number, matching the
            # `#sub<n>` fragment of the TOC entry's link.
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # Read the placeholder once and keep it for later calls.
            with open(get_resource('epub/emptyChunk.html')) as f:
                _empty_html_static.append(f.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
            '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        )
    return output_html, toc, chars
407 def transform(wldoc, verbose=False,
408 style=None, html_toc=False,
409 sample=None, cover=None, flags=None):
410 """ produces a EPUB file
412 sample=n: generate sample e-book (with at least n paragraphs)
413 cover: a cover.Cover factory or True for default
414 flags: less-advertising, without-fonts, working-copy
417 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
418 """ processes one input file and proceeds to its children """
420 replace_characters(wldoc.edoc.getroot())
422 hyphenator = set_hyph_language(wldoc.edoc.getroot())
423 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
425 # every input file will have a TOC entry,
426 # pointing to starting chunk
427 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
430 # write book title page
431 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
432 chars = used_chars(html_tree.getroot())
436 html_tree, pretty_print=True, xml_declaration=True,
438 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
439 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
442 # add a title page TOC entry
443 toc.add(u"Strona tytułowa", "title.html")
444 elif wldoc.book_info.parts:
445 # write title page for every parent
446 if sample is not None and sample <= 0:
448 html_string = open(get_resource('epub/emptyChunk.html')).read()
450 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
451 chars = used_chars(html_tree.getroot())
452 html_string = etree.tostring(
453 html_tree, pretty_print=True, xml_declaration=True,
455 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
456 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
458 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
459 add_to_manifest(manifest, chunk_counter)
460 add_to_spine(spine, chunk_counter)
463 if len(wldoc.edoc.getroot()) > 1:
464 # rdf before style master
465 main_text = wldoc.edoc.getroot()[1]
467 # rdf in style master
468 main_text = wldoc.edoc.getroot()[0]
469 if main_text.tag == RDFNS('RDF'):
472 if main_text is not None:
473 for chunk_xml in chop(main_text):
475 if sample is not None:
479 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
480 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
482 toc.extend(chunk_toc)
483 chars = chars.union(chunk_chars)
484 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
485 add_to_manifest(manifest, chunk_counter)
486 add_to_spine(spine, chunk_counter)
489 for child in wldoc.parts():
490 child_toc, chunk_counter, chunk_chars, sample = transform_file(
491 child, chunk_counter, first=False, sample=sample)
492 toc.append(child_toc)
493 chars = chars.union(chunk_chars)
495 return toc, chunk_counter, chars, sample
497 document = deepcopy(wldoc)
502 document.edoc.getroot().set(flag, 'yes')
505 editors = document.editors()
507 document.edoc.getroot().set('editors', u', '.join(sorted(
508 editor.readable() for editor in editors)))
509 if document.book_info.funders:
510 document.edoc.getroot().set('funders', u', '.join(
511 document.book_info.funders))
512 if document.book_info.thanks:
513 document.edoc.getroot().set('thanks', document.book_info.thanks)
515 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
516 manifest = opf.find('.//' + OPFNS('manifest'))
517 guide = opf.find('.//' + OPFNS('guide'))
518 spine = opf.find('.//' + OPFNS('spine'))
520 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
521 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
523 # write static elements
524 mime = zipfile.ZipInfo()
525 mime.filename = 'mimetype'
526 mime.compress_type = zipfile.ZIP_STORED
528 zip.writestr(mime, 'application/epub+zip')
530 'META-INF/container.xml',
531 '<?xml version="1.0" ?>'
532 '<container version="1.0" '
533 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
534 '<rootfiles><rootfile full-path="OPS/content.opf" '
535 'media-type="application/oebps-package+xml" />'
536 '</rootfiles></container>'
538 zip.write(get_resource('res/wl-logo-small.png'),
539 os.path.join('OPS', 'logo_wolnelektury.png'))
540 zip.write(get_resource('res/jedenprocent.png'),
541 os.path.join('OPS', 'jedenprocent.png'))
543 style = get_resource('epub/style.css')
544 zip.write(style, os.path.join('OPS', 'style.css'))
548 cover = DefaultEbookCover
550 cover_file = StringIO()
551 bound_cover = cover(document.book_info)
552 bound_cover.save(cover_file)
553 cover_name = 'cover.%s' % bound_cover.ext()
554 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
557 cover_tree = etree.parse(get_resource('epub/cover.html'))
558 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
559 zip.writestr('OPS/cover.html', etree.tostring(
560 cover_tree, pretty_print=True, xml_declaration=True,
562 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
563 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
566 if bound_cover.uses_dc_cover:
567 if document.book_info.cover_by:
568 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
569 if document.book_info.cover_source:
570 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
572 manifest.append(etree.fromstring(
573 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
574 manifest.append(etree.fromstring(
575 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
576 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
577 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
578 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
580 annotations = etree.Element('annotations')
582 toc_file = etree.fromstring(
583 '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
584 '"-//NISO//DTD ncx 2005-1//EN" '
585 '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
586 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
587 'version="2005-1"><head></head><docTitle></docTitle><navMap>'
590 nav_map = toc_file[-1]
593 manifest.append(etree.fromstring(
594 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
595 spine.append(etree.fromstring(
596 '<itemref idref="html_toc" />'))
597 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
599 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
601 if len(toc.children) < 2:
602 toc.add(u"Początek utworu", "part1.html")
604 # Last modifications in container files and EPUB creation
605 if len(annotations) > 0:
606 toc.add("Przypisy", "annotations.html")
607 manifest.append(etree.fromstring(
608 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
609 spine.append(etree.fromstring(
610 '<itemref idref="annotations" />'))
611 replace_by_verse(annotations)
612 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
613 chars = chars.union(used_chars(html_tree.getroot()))
614 zip.writestr('OPS/annotations.html', etree.tostring(
615 html_tree, pretty_print=True, xml_declaration=True,
617 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
618 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
621 toc.add("Wesprzyj Wolne Lektury", "support.html")
622 manifest.append(etree.fromstring(
623 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
624 spine.append(etree.fromstring(
625 '<itemref idref="support" />'))
626 html_string = open(get_resource('epub/support.html')).read()
627 chars.update(used_chars(etree.fromstring(html_string)))
628 zip.writestr('OPS/support.html', html_string)
630 toc.add("Strona redakcyjna", "last.html")
631 manifest.append(etree.fromstring(
632 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
633 spine.append(etree.fromstring(
634 '<itemref idref="last" />'))
635 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
636 chars.update(used_chars(html_tree.getroot()))
637 zip.writestr('OPS/last.html', etree.tostring(
638 html_tree, pretty_print=True, xml_declaration=True,
640 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
641 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
644 if not flags or not 'without-fonts' in flags:
646 tmpdir = mkdtemp('-librarian-epub')
652 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
653 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
654 optimizer_call = ['perl', 'subset.pl', '--chars',
655 ''.join(chars).encode('utf-8'),
656 get_resource('fonts/' + fname),
657 os.path.join(tmpdir, fname)]
659 print "Running font-optimizer"
660 subprocess.check_call(optimizer_call)
662 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
663 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
664 manifest.append(etree.fromstring(
665 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
669 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
670 xml_declaration=True, encoding="utf-8"))
671 title = document.book_info.title
672 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
673 for st in attributes:
674 meta = toc_file.makeelement(NCXNS('meta'))
676 meta.set('content', '0')
677 toc_file[0].append(meta)
678 toc_file[0][0].set('content', str(document.book_info.url))
679 toc_file[0][1].set('content', str(toc.depth()))
680 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
684 toc.add(u"Spis treści", "toc.html", index=1)
685 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
686 toc.write_to_xml(nav_map)
687 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
688 xml_declaration=True, encoding="utf-8"))
691 return OutputFile.from_filename(output_file.name)