1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 from librarian.hyphenator import Hyphenator
# Import-time side effect: two helpers from librarian.functions are invoked.
# Presumably they register extension functions used by the XSLT stylesheets
# -- TODO confirm against librarian.functions.
26 functions.reg_person_name()
27 functions.reg_lang_code_3to2()
30 def set_hyph_language(source_tree):
# Returns a Hyphenator built from a dictionary resource chosen by the
# document's dc:language value.
# NOTE(review): several lines of this function are not visible in this view;
# the comments describe only the statements that are.
#
# Nested helper working on the pipe-separated table shipped as
# res/ISO-639-2_8859-1.txt; presumably it maps a language code to its short
# form -- TODO confirm against the full body.
31 def get_short_lng_code(text):
34 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
# NOTE(review): the local name `list` shadows the builtin.
36 list = line.strip().split('|')
# All text nodes below any dc:language element; only the first one is used.
43 bibl_lng = etree.XPath('//dc:language//text()',
44 namespaces={'dc': str(DCNS)})(source_tree)
45 short_lng = get_short_lng_code(bibl_lng[0])
# The dictionary resource path starts with 'res/hyph-dictionaries/hyph_'.
47 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
53 def hyphenate_and_fix_conjunctions(source_tree, hyph):
# Rewrites text nodes of the document using the given hyphenator `hyph`.
# NOTE(review): the loop headers and the write-back of the new text are not
# visible in this view.
#
# Every text node below the second child of the root <utwor> element.
55 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
57 parent = t.getparent()
# Tokenise into runs of word characters and single non-word characters
# (Unicode-aware).
59 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
# U+00AD (soft hyphen) is handed to the hyphenator, presumably as the mark to
# insert at break points -- TODO confirm against Hyphenator.inserted.
61 newt += hyph.inserted(w, u'\u00AD')
# Whitespace that follows "whitespace + one word character" becomes U+00A0
# (no-break space), which keeps a one-letter word on the same line as the
# word after it.
62 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
70 """ returns node's text and children as a string
72 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
# Leading text of the node, or '' when the node has none.
76 nt = node.text if node.text is not None else ''
# Followed by every direct child serialised with etree.tostring.
# NOTE(review): lxml's tostring includes each child's tail text by default --
# confirm against the lxml documentation.
77 return ''.join([nt] + [etree.tostring(child) for child in node])
80 def set_inner_xml(node, text):
81 """ sets node's text and children from a string
83 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
84 >>> set_inner_xml(e, 'x<b>y</b>z')
85 >>> print etree.tostring(e)
# The string is wrapped in a throw-away <x> element so that it parses as a
# single document; `text` is interpolated verbatim, so it has to be
# well-formed XML content (special characters already escaped).
# NOTE(review): the statements copying the parsed content into `node` are not
# visible in this view.
89 p = etree.fromstring('<x>%s</x>' % text)
95 """ Find out a node's name
97 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
# Work on a deep copy so that the caller's node is left untouched.
101 tempnode = deepcopy(node)
# Locate 'pe', 'pa', 'pt', 'pr' and 'motyw' descendants of the copy.
# NOTE(review): what is done with each match is not visible in this view;
# presumably they are removed so they do not leak into the name.
103 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
104 for e in tempnode.findall('.//%s' % p):
# Strip all remaining descendant tags from the copy.
108 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """Apply the XSLT stylesheet stored at path ``sheet`` to ``xml``.

    ``xml`` may be an lxml element or an element tree; a bare element is
    wrapped in an ``ElementTree`` first.  The stylesheet file is opened,
    parsed and applied while the file is still open, and the result of
    ``xml.xslt(...)`` is returned.
    """
    if isinstance(xml, etree._Element):
        document = etree.ElementTree(xml)
    else:
        document = xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
119 def replace_characters(node):
# Typographic clean-up of an element's text and tail, applied recursively to
# its children.
# NOTE(review): some guard lines and the loop header are not visible in this
# view.
120 def replace_chars(text):
# Removes U+FEFF, then maps '---' to U+2014 (em dash), '--' to U+2013
# (en dash), ',,' to U+201E, '"' to U+201D and "'" to U+2019.
# '---' is replaced before '--' so that it is not consumed as an en dash.
123 return text.replace(u"\ufeff", u"")\
124 .replace("---", u"\u2014")\
125 .replace("--", u"\u2013")\
126 .replace(",,", u"\u201E")\
127 .replace('"', u"\u201D")\
128 .replace("'", u"\u2019")
# 'uwaga' and 'extra' elements are special-cased; the body of this branch is
# not visible here.
129 if node.tag in ('uwaga', 'extra'):
133 node.text = replace_chars(node.text)
134 node.tail = replace_chars(node.tail)
# Recurse into a child element.
136 replace_characters(child)
139 def find_annotations(annotations, source, part_no):
# Collects annotation elements (tags pe, pa, pt, pr) found below `source`
# into `annotations`, tagging each with a running number and `part_no`.
# NOTE(review): the loop header and the rewriting of the original element are
# not visible in this view.
141 if child.tag in ('pe', 'pa', 'pt', 'pr'):
# A deep copy is collected; its number is the 1-based position it will have
# in `annotations`.
142 annotation = deepcopy(child)
143 number = str(len(annotations) + 1)
144 annotation.set('number', number)
145 annotation.set('part', str(part_no))
147 annotations.append(annotation)
# Recurse into everything except 'extra' and 'uwaga' elements.
152 if child.tag not in ('extra', 'uwaga'):
153 find_annotations(annotations, child, part_no)
156 class Stanza(object):
158 Converts / verse endings into verse elements in a stanza.
160 Slashes may only occur directly in the stanza. Any slashes in subelements
161 will be ignored, and the subelements will be put inside verse elements.
163 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
164 >>> Stanza(s).versify()
165 >>> print etree.tostring(s)
166 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
167 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
# NOTE(review): several lines of this class (including the header of the
# versify method and some branch lines) are not visible in this view; the
# comments below describe only the visible statements.
#
# Remember the stanza element being processed; no verse is open at the start.
170 def __init__(self, stanza_elem):
171 self.stanza = stanza_elem
173 self.open_verse = None
# Feed the stanza's leading text, then (per child) the child's tail text,
# into the verse builder.
176 self.push_text(self.stanza.text)
177 for elem in self.stanza:
179 self.push_text(elem.tail)
# The stanza's own tail is saved and written back around a step that is not
# visible here (presumably the stanza is emptied in between -- TODO confirm);
# afterwards the collected verses become the stanza's children.
180 tail = self.stanza.tail
182 self.stanza.tail = tail
183 self.stanza.extend(self.verses)
# Start a fresh <wers_normalny> element and make it the open verse.
185 def open_normal_verse(self):
186 self.open_verse = self.stanza.makeelement("wers_normalny")
187 self.verses.append(self.open_verse)
# Return the open verse, creating a normal one on demand.
189 def get_open_verse(self):
190 if self.open_verse is None:
191 self.open_normal_verse()
192 return self.open_verse
# Distribute a piece of text over verses.
194 def push_text(self, text):
# A verse ends at a slash followed by optional whitespace and a newline.
197 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
199 self.open_normal_verse()
200 verse = self.get_open_verse()
# The text goes either onto the tail of the verse's last child or onto the
# verse's own text (the condition choosing between them is not visible here).
202 verse[-1].tail = (verse[-1].tail or "") + verse_text
204 verse.text = (verse.text or "") + verse_text
# Place a child element of the stanza.
206 def push_elem(self, elem):
# An element whose tag starts with "wers" is copied and becomes the open
# verse itself.
207 if elem.tag.startswith("wers"):
208 verse = deepcopy(elem)
210 self.verses.append(verse)
211 self.open_verse = verse
# Any other element is copied into the currently open verse.
213 appended = deepcopy(elem)
215 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """Versify every stanza found below ``tree``.

    Each ``strofa`` element (in the WL namespace) is handed to ``Stanza``,
    which turns '/' verse endings into verse elements in place.
    """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
226 def add_to_manifest(manifest, partno):
227 """ Adds a node to the manifest section in content.opf file """
# Both the item id and the file name are derived from the part number:
# id 'part<N>', href 'part<N>.html'.
229 partstr = 'part%d' % partno
# NOTE(review): the end of this call and the step attaching the element to
# `manifest` are not visible in this view.
230 e = manifest.makeelement(
231 OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html',
232 'media-type': 'application/xhtml+xml'}
237 def add_to_spine(spine, partno):
238 """ Adds a node to the spine section in content.opf file """
# The itemref refers to the manifest item by the id 'part<N>'.
# NOTE(review): the step attaching the element to `spine` is not visible in
# this view.
240 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
245 def __init__(self, name=None, part_href=None):
# Both arguments are optional. part_href is stored as given and sub_number
# starts out as None (add() may assign it later).
# NOTE(review): other attribute initialisations are not visible in this view.
248 self.part_href = part_href
249 self.sub_number = None
251 def add(self, name, part_href, level=0, is_part=True, index=None):
# Adds an entry called `name` that points at `part_href`.
# An explicit insertion index is only accepted together with level 0.
252 assert level == 0 or index is None
# For level > 0, when there is at least one child, delegate to the last child
# one level lower (index is not forwarded).
253 if level > 0 and self.children:
254 return self.children[-1].add(name, part_href, level - 1, is_part)
257 t.part_href = part_href
# Insert at the requested position; the append below is presumably the else
# branch (the `else` line is not visible in this view).
258 if index is not None:
259 self.children.insert(index, t)
261 self.children.append(t)
# NOTE(review): this is computed after the new entry is already in
# self.children; the condition guarding this assignment is not visible here.
263 t.sub_number = len(self.children) + 1
def append(self, toc):
    """Attach ``toc`` itself as the last direct child of this entry."""
    children = self.children
    children.append(toc)
def extend(self, toc):
    """Adopt the direct children of ``toc`` as children of this entry.

    ``toc`` itself is not added and its own child list is left unchanged.
    """
    adopted = toc.children
    self.children.extend(adopted)
# Recursive step: one more than the depth of the deepest child.
# NOTE(review): max() raises ValueError on an empty sequence; whether a guard
# for entries without children precedes this line is not visible in this view.
274 return max((c.depth() for c in self.children)) + 1
# When a sub_number is set, a '#sub<N>' fragment is appended to the link.
# NOTE(review): how `src` is initialised and returned is not visible here.
280 if self.sub_number is not None:
281 src += '#sub%d' % self.sub_number
284 def write_to_xml(self, nav_map, counter=1):
# Appends one NCX navPoint per child to `nav_map`, numbering them from
# `counter`; grandchildren are nested inside their parent's navPoint.
# NOTE(review): an `else` line and the final return are not visible in this
# view.
285 for child in self.children:
# The same running number is used for the id and the playOrder.
286 nav_point = nav_map.makeelement(NCXNS('navPoint'))
287 nav_point.set('id', 'NavPoint-%d' % counter)
288 nav_point.set('playOrder', str(counter))
290 nav_label = nav_map.makeelement(NCXNS('navLabel'))
291 text = nav_map.makeelement(NCXNS('text'))
# Newlines inside a name are replaced by spaces for the label; the second
# assignment is presumably the branch for a missing name.
292 if child.name is not None:
293 text.text = re.sub(r'\n', ' ', child.name)
295 text.text = child.name
296 nav_label.append(text)
297 nav_point.append(nav_label)
299 content = nav_map.makeelement(NCXNS('content'))
300 content.set('src', child.href())
301 nav_point.append(content)
302 nav_map.append(nav_point)
# The recursive call returns the next free number, so numbering continues
# across nesting levels.
303 counter = child.write_to_xml(nav_point, counter + 1)
306 def html_part(self, depth=0):
# Renders the children as a newline-joined list of <div> links, indented by
# `depth` em; each child's own children follow it one level deeper.
# NOTE(review): child.href() and child.name are interpolated into the markup
# as-is, without HTML escaping.
308 for child in self.children:
310 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
311 (depth, child.href(), child.name))
312 texts.append(child.html_part(depth + 1))
313 return "\n".join(texts)
# The epub/toc.html resource is read, decoded as UTF-8 and %-formatted with
# the output of html_part().
316 with open(get_resource('epub/toc.html')) as f:
317 t = unicode(f.read(), 'utf-8')
318 return t % self.html_part()
321 def used_chars(element):
322 """ Lists characters used in an ETree Element """
# Characters of the element's own text and tail (missing values count as '').
323 chars = set((element.text or '') + (element.tail or ''))
# Merge in the characters used by each child, recursively.
# NOTE(review): the return statement is not visible in this view.
324 for child in element:
325 chars = chars.union(used_chars(child))
330 """ divide main content of the XML file into chunks """
# NOTE(review): the def line, the lines computing `name`, the is_scene/is_act
# assignments and the lines handing a finished chunk back are not visible in
# this view; the added comments describe only the visible statements.
332 # prepare a container for each chunk
333 part_xml = etree.Element('utwor')
334 etree.SubElement(part_xml, 'master')
335 main_xml_part = part_xml[0] # master
# Remembers that the node handled last was a 'naglowek_czesc' header; the
# boundary tests below are skipped for a header that directly follows one.
337 last_node_part = False
339 # the below loop are workaround for a problem with epubs in drama ebooks without acts
# First pass: look for scene and act headers.
342 for one_part in main_text:
344 if name == 'naglowek_scena':
346 elif name == 'naglowek_akt':
# Second pass: copy nodes into the container; at a boundary the container's
# content is replaced by a copy of the boundary node.
349 for one_part in main_text:
# Scenes without acts: 'naglowek_scena' is treated as a boundary.
351 if is_act is False and is_scene is True:
352 if name == 'naglowek_czesc':
354 last_node_part = True
355 main_xml_part[:] = [deepcopy(one_part)]
356 elif not last_node_part and name == "naglowek_scena":
358 main_xml_part[:] = [deepcopy(one_part)]
360 main_xml_part.append(deepcopy(one_part))
361 last_node_part = False
# Presumably the branch for all other documents (the `else` line is not
# visible): 'naglowek_rozdzial', 'naglowek_akt' and 'srodtytul' are boundaries.
363 if name == 'naglowek_czesc':
365 last_node_part = True
366 main_xml_part[:] = [deepcopy(one_part)]
367 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
369 main_xml_part[:] = [deepcopy(one_part)]
371 main_xml_part.append(deepcopy(one_part))
372 last_node_part = False
376 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
377 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
# NOTE(review): _empty_html_static is a mutable default argument; it is used
# below as a cache shared by all calls, holding the content of
# epub/emptyChunk.html after the first read.
# NOTE(review): some lines (creation of `toc`, the empty/non-empty branch
# lines, parts of the tostring call) are not visible in this view.
#
# Build TOC entries from the header elements among the children of the
# chunk's first element.
380 for element in chunk_xml[0]:
381 if element.tag == "naglowek_czesc":
382 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
383 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
384 toc.add(node_name(element), "part%d.html" % chunk_no)
# Sub-entries are added one level down; the value returned by toc.add is
# stored on the element as its 'sub' attribute.
385 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
386 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
387 element.set('sub', str(subnumber))
# Read the placeholder page only once.
# NOTE(review): the file object opened here is never explicitly closed.
389 if not _empty_html_static:
390 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
392 output_html = _empty_html_static[0]
# Regular rendering: collect annotations, versify stanzas, apply
# epub/xsltScheme.xsl and record the characters used by the result.
394 find_annotations(annotations, chunk_xml, chunk_no)
395 replace_by_verse(chunk_xml)
396 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
397 chars = used_chars(html_tree.getroot())
398 output_html = etree.tostring(
399 html_tree, pretty_print=True, xml_declaration=True,
401 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
402 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
404 return output_html, toc, chars
407 def transform(wldoc, verbose=False,
408 style=None, html_toc=False,
409 sample=None, cover=None, flags=None):
410 """ produces a EPUB file
412 sample=n: generate sample e-book (with at least n paragraphs)
413 cover: a cover.Cover factory or True for default
414 flags: less-advertising, without-fonts, working-copy
# NOTE(review): a number of lines of this function are not visible in this
# view (branch lines, some call openings/closings); the added comments
# describe only the statements that are visible.
#
# Nested helper: it uses zip, manifest, spine and annotations from the
# enclosing scope and returns (toc, chunk_counter, chars, sample).
417 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
418 """ processes one input file and proceeds to its children """
# Typographic clean-up, then hyphenation with a language-specific dictionary.
420 replace_characters(wldoc.edoc.getroot())
422 hyphenator = set_hyph_language(wldoc.edoc.getroot())
423 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
425 # every input file will have a TOC entry,
426 # pointing to starting chunk
427 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
430 # write book title page
431 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
432 chars = used_chars(html_tree.getroot())
436 html_tree, pretty_print=True, xml_declaration=True,
438 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
439 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
442 # add a title page TOC entry
443 toc.add(u"Strona tytułowa", "title.html")
444 elif wldoc.book_info.parts:
445 # write title page for every parent
# Once the sample budget has dropped to zero or below, the placeholder page is
# used instead of the rendered chunk title.
# NOTE(review): the file opened for the placeholder is never explicitly closed.
446 if sample is not None and sample <= 0:
448 html_string = open(get_resource('epub/emptyChunk.html')).read()
450 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
451 chars = used_chars(html_tree.getroot())
452 html_string = etree.tostring(
453 html_tree, pretty_print=True, xml_declaration=True,
455 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
456 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
458 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
459 add_to_manifest(manifest, chunk_counter)
460 add_to_spine(spine, chunk_counter)
# Locate the main text element: second child of the root when the root has
# more than one child, otherwise the first child.
463 if len(wldoc.edoc.getroot()) > 1:
464 # rdf before style master
465 main_text = wldoc.edoc.getroot()[1]
467 # rdf in style master
468 main_text = wldoc.edoc.getroot()[0]
469 if main_text.tag == RDFNS('RDF'):
472 if main_text is not None:
# Every chunk produced by chop() is written as OPS/part<N>.html and listed in
# the manifest and the spine.
473 for chunk_xml in chop(main_text):
475 if sample is not None:
# Charge the chunk's stanzas and paragraphs against the sample budget.
479 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
480 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
482 toc.extend(chunk_toc)
483 chars = chars.union(chunk_chars)
484 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
485 add_to_manifest(manifest, chunk_counter)
486 add_to_spine(spine, chunk_counter)
# Recurse into child documents, threading the chunk counter and the remaining
# sample budget through; each child's TOC is nested under this one.
489 for child in wldoc.parts():
490 child_toc, chunk_counter, chunk_chars, sample = transform_file(
491 child, chunk_counter, first=False, sample=sample)
492 toc.append(child_toc)
493 chars = chars.union(chunk_chars)
495 return toc, chunk_counter, chars, sample
# All modifications below are made on a deep copy of the input document.
497 document = deepcopy(wldoc)
# Each flag is recorded as a root-element attribute with the value 'yes'.
502 document.edoc.getroot().set(flag, 'yes')
504 document.clean_ed_note()
505 document.clean_ed_note('abstrakt')
# Editors (readable names, sorted, comma-joined), funders and thanks are
# likewise stored as root-element attributes.
508 editors = document.editors()
510 document.edoc.getroot().set('editors', u', '.join(sorted(
511 editor.readable() for editor in editors)))
512 if document.book_info.funders:
513 document.edoc.getroot().set('funders', u', '.join(
514 document.book_info.funders))
515 if document.book_info.thanks:
516 document.edoc.getroot().set('thanks', document.book_info.thanks)
# content.opf is generated from the book metadata; its manifest, guide and
# spine sections are extended below as files are added to the archive.
518 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
519 manifest = opf.find('.//' + OPFNS('manifest'))
520 guide = opf.find('.//' + OPFNS('guide'))
521 spine = opf.find('.//' + OPFNS('spine'))
# delete=False: the temporary file is not removed on close; its name is what
# is handed to OutputFile at the end.
523 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
# NOTE(review): the local name `zip` shadows the builtin.
524 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
526 functions.reg_mathml_epub(zip)
528 # write static elements
# The 'mimetype' entry is stored uncompressed (ZIP_STORED), unlike the rest of
# the archive, which defaults to ZIP_DEFLATED.
529 mime = zipfile.ZipInfo()
530 mime.filename = 'mimetype'
531 mime.compress_type = zipfile.ZIP_STORED
533 zip.writestr(mime, 'application/epub+zip')
535 'META-INF/container.xml',
536 '<?xml version="1.0" ?>'
537 '<container version="1.0" '
538 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
539 '<rootfiles><rootfile full-path="OPS/content.opf" '
540 'media-type="application/oebps-package+xml" />'
541 '</rootfiles></container>'
543 zip.write(get_resource('res/wl-logo-small.png'),
544 os.path.join('OPS', 'logo_wolnelektury.png'))
545 zip.write(get_resource('res/jedenprocent.png'),
546 os.path.join('OPS', 'jedenprocent.png'))
# Bundled stylesheet; presumably only used when no `style` was passed in (the
# guarding condition is not visible here).
548 style = get_resource('epub/style.css')
549 zip.write(style, os.path.join('OPS', 'style.css'))
553 cover = DefaultEbookCover
# The cover image is rendered into an in-memory buffer and added to the
# archive, together with a cover.html page whose <img> points at it.
555 cover_file = StringIO()
556 bound_cover = cover(document.book_info)
557 bound_cover.save(cover_file)
558 cover_name = 'cover.%s' % bound_cover.ext()
559 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
562 cover_tree = etree.parse(get_resource('epub/cover.html'))
563 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
564 zip.writestr('OPS/cover.html', etree.tostring(
565 cover_tree, pretty_print=True, xml_declaration=True,
567 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
568 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
# Cover credits from the metadata are stored as data-* attributes on the root.
571 if bound_cover.uses_dc_cover:
572 if document.book_info.cover_by:
573 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
574 if document.book_info.cover_source:
575 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
# Register the cover page and image: manifest items, first position in the
# spine, a <meta name="cover"> in the first child of the OPF root, and a guide
# reference.
577 manifest.append(etree.fromstring(
578 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
579 manifest.append(etree.fromstring(
580 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
581 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
582 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
583 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
# Container passed (via transform_file / transform_chunk) to find_annotations,
# which appends the collected annotations to it.
585 annotations = etree.Element('annotations')
587 toc_file = etree.fromstring(
588 '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
589 '"-//NISO//DTD ncx 2005-1//EN" '
590 '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
591 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
592 'version="2005-1"><head></head><docTitle></docTitle><navMap>'
# The last child of the <ncx> root is used as the navMap.
595 nav_map = toc_file[-1]
598 manifest.append(etree.fromstring(
599 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
600 spine.append(etree.fromstring(
601 '<itemref idref="html_toc" />'))
602 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
# Render the document (and, recursively, its parts) into the archive.
604 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
# With fewer than two top-level TOC entries, an extra entry pointing at
# part1.html is added.
606 if len(toc.children) < 2:
607 toc.add(u"Początek utworu", "part1.html")
609 # Last modifications in container files and EPUB creation
# The annotations page is only produced when at least one annotation was
# collected.
610 if len(annotations) > 0:
611 toc.add("Przypisy", "annotations.html")
612 manifest.append(etree.fromstring(
613 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
614 spine.append(etree.fromstring(
615 '<itemref idref="annotations" />'))
616 replace_by_verse(annotations)
617 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
618 chars = chars.union(used_chars(html_tree.getroot()))
619 zip.writestr('OPS/annotations.html', etree.tostring(
620 html_tree, pretty_print=True, xml_declaration=True,
622 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
623 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
# Static support page; its characters are added to `chars` as well.
# NOTE(review): the file opened here is never explicitly closed.
626 toc.add("Wesprzyj Wolne Lektury", "support.html")
627 manifest.append(etree.fromstring(
628 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
629 spine.append(etree.fromstring(
630 '<itemref idref="support" />'))
631 html_string = open(get_resource('epub/support.html')).read()
632 chars.update(used_chars(etree.fromstring(html_string)))
633 zip.writestr('OPS/support.html', html_string)
# Editorial page rendered from the document with epub/xsltLast.xsl.
635 toc.add("Strona redakcyjna", "last.html")
636 manifest.append(etree.fromstring(
637 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
638 spine.append(etree.fromstring(
639 '<itemref idref="last" />'))
640 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
641 chars.update(used_chars(html_tree.getroot()))
642 zip.writestr('OPS/last.html', etree.tostring(
643 html_tree, pretty_print=True, xml_declaration=True,
645 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
646 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
# Fonts are embedded unless the 'without-fonts' flag is present.
649 if not flags or not 'without-fonts' in flags:
651 tmpdir = mkdtemp('-librarian-epub')
# NOTE(review): os.chdir changes the working directory of the whole process;
# how it is restored is not visible in this view.
657 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
658 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
# subset.pl receives the collected characters (UTF-8 encoded), the source
# font and the output path in tmpdir; presumably it writes a font restricted
# to those characters -- TODO confirm against the script.
659 optimizer_call = ['perl', 'subset.pl', '--chars',
660 ''.join(chars).encode('utf-8'),
661 get_resource('fonts/' + fname),
662 os.path.join(tmpdir, fname)]
664 print "Running font-optimizer"
665 subprocess.check_call(optimizer_call)
# NOTE(review): the subprocess documentation advises against stdout=PIPE /
# stderr=PIPE with check_call, since the child can block once a pipe fills.
667 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
668 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
669 manifest.append(etree.fromstring(
670 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
# The OPF is serialised only now, after all manifest/spine/guide additions.
674 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
675 xml_declaration=True, encoding="utf-8"))
676 title = document.book_info.title
677 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
# One NCX <meta> per attribute is appended to <head> with content '0'; the
# first two are then overwritten by position with the book URL and TOC depth.
678 for st in attributes:
679 meta = toc_file.makeelement(NCXNS('meta'))
681 meta.set('content', '0')
682 toc_file[0].append(meta)
683 toc_file[0][0].set('content', str(document.book_info.url))
684 toc_file[0][1].set('content', str(toc.depth()))
# NOTE(review): the title is concatenated into markup without escaping, and
# set_inner_xml parses that string as XML.
685 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
# The HTML table of contents is inserted as the second top-level TOC entry
# (index=1).
689 toc.add(u"Spis treści", "toc.html", index=1)
690 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
691 toc.write_to_xml(nav_map)
692 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
693 xml_declaration=True, encoding="utf-8"))
696 return OutputFile.from_filename(output_file.name)