1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 from librarian.hyphenator import Hyphenator
26 functions.reg_person_name()
27 functions.reg_lang_code_3to2()
30 def set_hyph_language(source_tree):
31 def get_short_lng_code(text):
34 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
36 list = line.strip().split('|')
43 bibl_lng = etree.XPath('//dc:language//text()',
44 namespaces={'dc': str(DCNS)})(source_tree)
45 short_lng = get_short_lng_code(bibl_lng[0])
47 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
53 def hyphenate_and_fix_conjunctions(source_tree, hyph):
55 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
57 parent = t.getparent()
59 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
61 newt += hyph.inserted(w, u'\u00AD')
62 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
70 """ returns node's text and children as a string
72 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
76 nt = node.text if node.text is not None else ''
77 return ''.join([nt] + [etree.tostring(child) for child in node])
80 def set_inner_xml(node, text):
81 """ sets node's text and children from a string
83 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
84 >>> set_inner_xml(e, 'x<b>y</b>z')
85 >>> print etree.tostring(e)
89 p = etree.fromstring('<x>%s</x>' % text)
95 """ Find out a node's name
97 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
101 tempnode = deepcopy(node)
103 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
104 for e in tempnode.findall('.//%s' % p):
108 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """ Apply an XSLT stylesheet file to an lxml document.

    xml: an lxml element or element tree; a bare element is wrapped
        in an ElementTree before the transformation
    sheet: path of the stylesheet file

    Returns whatever the tree's xslt() call produces.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
119 def replace_characters(node):
120 def replace_chars(text):
123 return text.replace(u"\ufeff", u"")\
124 .replace("---", u"\u2014")\
125 .replace("--", u"\u2013")\
126 .replace(",,", u"\u201E")\
127 .replace('"', u"\u201D")\
128 .replace("'", u"\u2019")
129 if node.tag in ('uwaga', 'extra'):
133 node.text = replace_chars(node.text)
134 node.tail = replace_chars(node.tail)
136 replace_characters(child)
139 def find_annotations(annotations, source, part_no):
141 if child.tag in ('pe', 'pa', 'pt', 'pr'):
142 annotation = deepcopy(child)
143 number = str(len(annotations) + 1)
144 annotation.set('number', number)
145 annotation.set('part', str(part_no))
147 annotations.append(annotation)
152 if child.tag not in ('extra', 'uwaga'):
153 find_annotations(annotations, child, part_no)
156 class Stanza(object):
158 Converts / verse endings into verse elements in a stanza.
160 Slashes may only occur directly in the stanza. Any slashes in subelements
161 will be ignored, and the subelements will be put inside verse elements.
163 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
164 >>> Stanza(s).versify()
165 >>> print etree.tostring(s)
166 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
167 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
170 def __init__(self, stanza_elem):
171 self.stanza = stanza_elem
173 self.open_verse = None
176 self.push_text(self.stanza.text)
177 for elem in self.stanza:
179 self.push_text(elem.tail)
180 tail = self.stanza.tail
182 self.stanza.tail = tail
183 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """ Start a new <wers_normalny> verse and make it the open one.

    The element is created with the stanza's makeelement(), remembered
    as self.open_verse and recorded at the end of self.verses.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """ Return the verse currently being filled, opening one if needed. """
    if self.open_verse is not None:
        return self.open_verse
    # nothing open yet: start a normal verse and hand that one back
    self.open_normal_verse()
    return self.open_verse
194 def push_text(self, text):
197 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
199 self.open_normal_verse()
200 verse = self.get_open_verse()
202 verse[-1].tail = (verse[-1].tail or "") + verse_text
204 verse.text = (verse.text or "") + verse_text
206 def push_elem(self, elem):
207 if elem.tag.startswith("wers"):
208 verse = deepcopy(elem)
210 self.verses.append(verse)
211 self.open_verse = verse
213 appended = deepcopy(elem)
215 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Every WLNS('strofa') element found anywhere below tree is passed
    through Stanza(...).versify(), which rewrites it in place.
    """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    manifest: the manifest element that receives the new item
    partno: chunk number; the item gets id 'part<N>' and
        href 'part<N>.html'
    """
    partstr = 'part%d' % partno
    item = manifest.makeelement(
        OPFNS('item'),
        attrib={
            'id': partstr,
            'href': partstr + '.html',
            'media-type': 'application/xhtml+xml',
        }
    )
    # makeelement() only builds the element; it still has to be attached
    manifest.append(item)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    spine: the spine element that receives the new itemref
    partno: chunk number; the itemref points at id 'part<N>'
    """
    itemref = spine.makeelement(
        OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    # makeelement() only builds the element; it still has to be attached
    spine.append(itemref)
245 def __init__(self, name=None, part_href=None):
248 self.part_href = part_href
249 self.sub_number = None
251 def add(self, name, part_href, level=0, is_part=True, index=None):
252 assert level == 0 or index is None
253 if level > 0 and self.children:
254 return self.children[-1].add(name, part_href, level - 1, is_part)
257 t.part_href = part_href
258 if index is not None:
259 self.children.insert(index, t)
261 self.children.append(t)
263 t.sub_number = len(self.children) + 1
def append(self, toc):
    """ Attach another TOC object as the last child of this one. """
    siblings = self.children
    siblings.append(toc)
def extend(self, toc):
    """ Adopt the children of another TOC, keeping their order.

    Only toc.children are taken over; toc itself is not inserted.
    """
    own_children = self.children
    own_children.extend(toc.children)
274 return max((c.depth() for c in self.children)) + 1
280 if self.sub_number is not None:
281 src += '#sub%d' % self.sub_number
def write_to_xml(self, nav_map, counter=1):
    """ Write this TOC's children as NCX navPoint elements under nav_map.

    nav_map: element that receives one navPoint per child
    counter: number used for the first navPoint's id and playOrder

    Each child's own entries are written recursively inside the child's
    navPoint.  Returns the next unused counter value, which the
    recursive calls use to keep the numbering continuous.
    """
    for child in self.children:
        nav_point = nav_map.makeelement(NCXNS('navPoint'))
        nav_point.set('id', 'NavPoint-%d' % counter)
        nav_point.set('playOrder', str(counter))

        nav_label = nav_map.makeelement(NCXNS('navLabel'))
        text = nav_map.makeelement(NCXNS('text'))
        if child.name is not None:
            # fold newlines into spaces so the label stays on one line
            text.text = re.sub(r'\n', ' ', child.name)
        else:
            text.text = child.name
        nav_label.append(text)
        nav_point.append(nav_label)

        content = nav_map.makeelement(NCXNS('content'))
        content.set('src', child.href())
        nav_point.append(content)
        nav_map.append(nav_point)
        # the nested call hands back the next free number
        counter = child.write_to_xml(nav_point, counter + 1)
    return counter
306 def html_part(self, depth=0):
308 for child in self.children:
310 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
311 (depth, child.href(), child.name))
312 texts.append(child.html_part(depth + 1))
313 return "\n".join(texts)
316 with open(get_resource('epub/toc.html')) as f:
317 t = unicode(f.read(), 'utf-8')
318 return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Collects every character found in the text and tail of the element
    and, recursively, of all its children.  Returns them as a set.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # grow the set in place instead of building a new one per child
        chars.update(used_chars(child))
    return chars
330 """ divide main content of the XML file into chunks """
332 # prepare a container for each chunk
333 part_xml = etree.Element('utwor')
334 etree.SubElement(part_xml, 'master')
335 main_xml_part = part_xml[0] # master
337 last_node_part = False
339 # the below loop are workaround for a problem with epubs in drama ebooks without acts
342 for one_part in main_text:
344 if name == 'naglowek_scena':
346 elif name == 'naglowek_akt':
349 for one_part in main_text:
351 if is_act is False and is_scene is True:
352 if name == 'naglowek_czesc':
354 last_node_part = True
355 main_xml_part[:] = [deepcopy(one_part)]
356 elif not last_node_part and name == "naglowek_scena":
358 main_xml_part[:] = [deepcopy(one_part)]
360 main_xml_part.append(deepcopy(one_part))
361 last_node_part = False
363 if name == 'naglowek_czesc':
365 last_node_part = True
366 main_xml_part[:] = [deepcopy(one_part)]
367 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
369 main_xml_part[:] = [deepcopy(one_part)]
371 main_xml_part.append(deepcopy(one_part))
372 last_node_part = False
376 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
377 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
380 for element in chunk_xml[0]:
381 if element.tag == "naglowek_czesc":
382 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
383 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
384 toc.add(node_name(element), "part%d.html" % chunk_no)
385 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
386 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
387 element.set('sub', str(subnumber))
389 if not _empty_html_static:
390 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
392 output_html = _empty_html_static[0]
394 find_annotations(annotations, chunk_xml, chunk_no)
395 replace_by_verse(chunk_xml)
396 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
397 chars = used_chars(html_tree.getroot())
398 output_html = etree.tostring(
399 html_tree, pretty_print=True, xml_declaration=True,
401 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
402 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
404 return output_html, toc, chars
407 def transform(wldoc, verbose=False,
408 style=None, html_toc=False,
409 sample=None, cover=None, flags=None):
410 """ produces a EPUB file
412 sample=n: generate sample e-book (with at least n paragraphs)
413 cover: a cover.Cover factory or True for default
414 flags: less-advertising, without-fonts, working-copy
417 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
418 """ processes one input file and proceeds to its children """
420 replace_characters(wldoc.edoc.getroot())
422 hyphenator = set_hyph_language(wldoc.edoc.getroot())
423 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
425 # every input file will have a TOC entry,
426 # pointing to starting chunk
427 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
430 # write book title page
431 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
432 chars = used_chars(html_tree.getroot())
436 html_tree, pretty_print=True, xml_declaration=True,
438 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
439 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
442 # add a title page TOC entry
443 toc.add(u"Strona tytułowa", "title.html")
444 elif wldoc.book_info.parts:
445 # write title page for every parent
446 if sample is not None and sample <= 0:
448 html_string = open(get_resource('epub/emptyChunk.html')).read()
450 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
451 chars = used_chars(html_tree.getroot())
452 html_string = etree.tostring(
453 html_tree, pretty_print=True, xml_declaration=True,
455 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
456 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
458 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
459 add_to_manifest(manifest, chunk_counter)
460 add_to_spine(spine, chunk_counter)
463 if len(wldoc.edoc.getroot()) > 1:
464 # rdf before style master
465 main_text = wldoc.edoc.getroot()[1]
467 # rdf in style master
468 main_text = wldoc.edoc.getroot()[0]
469 if main_text.tag == RDFNS('RDF'):
472 if main_text is not None:
473 for chunk_xml in chop(main_text):
475 if sample is not None:
479 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
480 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
482 toc.extend(chunk_toc)
483 chars = chars.union(chunk_chars)
484 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
485 add_to_manifest(manifest, chunk_counter)
486 add_to_spine(spine, chunk_counter)
489 for child in wldoc.parts():
490 child_toc, chunk_counter, chunk_chars, sample = transform_file(
491 child, chunk_counter, first=False, sample=sample)
492 toc.append(child_toc)
493 chars = chars.union(chunk_chars)
495 return toc, chunk_counter, chars, sample
497 document = deepcopy(wldoc)
502 document.edoc.getroot().set(flag, 'yes')
505 document.edoc.getroot().set('editors', u', '.join(sorted(
506 editor.readable() for editor in document.editors())))
507 if document.book_info.funders:
508 document.edoc.getroot().set('funders', u', '.join(
509 document.book_info.funders))
510 if document.book_info.thanks:
511 document.edoc.getroot().set('thanks', document.book_info.thanks)
513 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
514 manifest = opf.find('.//' + OPFNS('manifest'))
515 guide = opf.find('.//' + OPFNS('guide'))
516 spine = opf.find('.//' + OPFNS('spine'))
518 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
519 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
521 # write static elements
522 mime = zipfile.ZipInfo()
523 mime.filename = 'mimetype'
524 mime.compress_type = zipfile.ZIP_STORED
526 zip.writestr(mime, 'application/epub+zip')
528 'META-INF/container.xml',
529 '<?xml version="1.0" ?>'
530 '<container version="1.0" '
531 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
532 '<rootfiles><rootfile full-path="OPS/content.opf" '
533 'media-type="application/oebps-package+xml" />'
534 '</rootfiles></container>'
536 zip.write(get_resource('res/wl-logo-small.png'),
537 os.path.join('OPS', 'logo_wolnelektury.png'))
538 zip.write(get_resource('res/jedenprocent.png'),
539 os.path.join('OPS', 'jedenprocent.png'))
541 style = get_resource('epub/style.css')
542 zip.write(style, os.path.join('OPS', 'style.css'))
546 cover = DefaultEbookCover
548 cover_file = StringIO()
549 bound_cover = cover(document.book_info)
550 bound_cover.save(cover_file)
551 cover_name = 'cover.%s' % bound_cover.ext()
552 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
555 cover_tree = etree.parse(get_resource('epub/cover.html'))
556 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
557 zip.writestr('OPS/cover.html', etree.tostring(
558 cover_tree, pretty_print=True, xml_declaration=True,
560 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
561 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
564 if bound_cover.uses_dc_cover:
565 if document.book_info.cover_by:
566 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
567 if document.book_info.cover_source:
568 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
570 manifest.append(etree.fromstring(
571 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
572 manifest.append(etree.fromstring(
573 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
574 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
575 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
576 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
578 annotations = etree.Element('annotations')
580 toc_file = etree.fromstring(
581 '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
582 '"-//NISO//DTD ncx 2005-1//EN" '
583 '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
584 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
585 'version="2005-1"><head></head><docTitle></docTitle><navMap>'
588 nav_map = toc_file[-1]
591 manifest.append(etree.fromstring(
592 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
593 spine.append(etree.fromstring(
594 '<itemref idref="html_toc" />'))
595 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
597 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
599 if len(toc.children) < 2:
600 toc.add(u"Początek utworu", "part1.html")
602 # Last modifications in container files and EPUB creation
603 if len(annotations) > 0:
604 toc.add("Przypisy", "annotations.html")
605 manifest.append(etree.fromstring(
606 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
607 spine.append(etree.fromstring(
608 '<itemref idref="annotations" />'))
609 replace_by_verse(annotations)
610 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
611 chars = chars.union(used_chars(html_tree.getroot()))
612 zip.writestr('OPS/annotations.html', etree.tostring(
613 html_tree, pretty_print=True, xml_declaration=True,
615 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
616 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
619 toc.add("Wesprzyj Wolne Lektury", "support.html")
620 manifest.append(etree.fromstring(
621 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
622 spine.append(etree.fromstring(
623 '<itemref idref="support" />'))
624 html_string = open(get_resource('epub/support.html')).read()
625 chars.update(used_chars(etree.fromstring(html_string)))
626 zip.writestr('OPS/support.html', html_string)
628 toc.add("Strona redakcyjna", "last.html")
629 manifest.append(etree.fromstring(
630 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
631 spine.append(etree.fromstring(
632 '<itemref idref="last" />'))
633 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
634 chars.update(used_chars(html_tree.getroot()))
635 zip.writestr('OPS/last.html', etree.tostring(
636 html_tree, pretty_print=True, xml_declaration=True,
638 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
639 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
642 if not flags or not 'without-fonts' in flags:
644 tmpdir = mkdtemp('-librarian-epub')
650 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
651 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
652 optimizer_call = ['perl', 'subset.pl', '--chars',
653 ''.join(chars).encode('utf-8'),
654 get_resource('fonts/' + fname),
655 os.path.join(tmpdir, fname)]
657 print "Running font-optimizer"
658 subprocess.check_call(optimizer_call)
660 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
661 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
662 manifest.append(etree.fromstring(
663 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
667 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
668 xml_declaration=True, encoding="utf-8"))
669 title = document.book_info.title
670 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
671 for st in attributes:
672 meta = toc_file.makeelement(NCXNS('meta'))
674 meta.set('content', '0')
675 toc_file[0].append(meta)
676 toc_file[0][0].set('content', str(document.book_info.url))
677 toc_file[0][1].set('content', str(toc.depth()))
678 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
682 toc.add(u"Spis treści", "toc.html", index=1)
683 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
684 toc.write_to_xml(nav_map)
685 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
686 xml_declaration=True, encoding="utf-8"))
689 return OutputFile.from_filename(output_file.name)