1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 from librarian.hyphenator import Hyphenator
26 functions.reg_person_name()
27 functions.reg_lang_code_3to2()
30 def set_hyph_language(source_tree):
31 def get_short_lng_code(text):
34 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
36 list = line.strip().split('|')
43 bibl_lng = etree.XPath('//dc:language//text()',
44 namespaces={'dc': str(DCNS)})(source_tree)
45 short_lng = get_short_lng_code(bibl_lng[0])
47 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
53 def hyphenate_and_fix_conjunctions(source_tree, hyph):
54 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
56 parent = t.getparent()
59 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
61 newt += hyph.inserted(w, u'\u00AD')
64 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
72 """ returns node's text and children as a string
74 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
78 nt = node.text if node.text is not None else ''
79 return ''.join([nt] + [etree.tostring(child) for child in node])
82 def set_inner_xml(node, text):
83 """ sets node's text and children from a string
85 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
86 >>> set_inner_xml(e, 'x<b>y</b>z')
87 >>> print etree.tostring(e)
91 p = etree.fromstring('<x>%s</x>' % text)
97 """ Find out a node's name
99 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
103 tempnode = deepcopy(node)
105 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
106 for e in tempnode.findall('.//%s' % p):
110 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """ Run the XSLT stylesheet stored in the file `sheet` over `xml`.

    A bare element is wrapped in an ElementTree first; any other value
    is used as given and must provide an xslt() method itself.
    Returns the result of that xslt() call.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        # Parse and apply while the stylesheet file is still open.
        stylesheet = etree.parse(stylesheet_file)
        return document.xslt(stylesheet)
121 def replace_characters(node):
122 def replace_chars(text):
125 return text.replace(u"\ufeff", u"")\
126 .replace("---", u"\u2014")\
127 .replace("--", u"\u2013")\
128 .replace(",,", u"\u201E")\
129 .replace('"', u"\u201D")\
130 .replace("'", u"\u2019")
131 if node.tag in ('uwaga', 'extra'):
135 node.text = replace_chars(node.text)
136 node.tail = replace_chars(node.tail)
138 replace_characters(child)
141 def find_annotations(annotations, source, part_no):
143 if child.tag in ('pe', 'pa', 'pt', 'pr'):
144 annotation = deepcopy(child)
145 number = str(len(annotations) + 1)
146 annotation.set('number', number)
147 annotation.set('part', str(part_no))
149 annotations.append(annotation)
154 if child.tag not in ('extra', 'uwaga'):
155 find_annotations(annotations, child, part_no)
158 class Stanza(object):
160 Converts / verse endings into verse elements in a stanza.
162 Slashes may only occur directly in the stanza. Any slashes in subelements
163 will be ignored, and the subelements will be put inside verse elements.
165 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
166 >>> Stanza(s).versify()
167 >>> print etree.tostring(s)
168 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
169 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
172 def __init__(self, stanza_elem):
173 self.stanza = stanza_elem
175 self.open_verse = None
178 self.push_text(self.stanza.text)
179 for elem in self.stanza:
181 self.push_text(elem.tail)
182 tail = self.stanza.tail
184 self.stanza.tail = tail
185 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """ Start a fresh 'wers_normalny' verse and make it the open one.

    The new element is created through the stanza element's factory,
    remembered as the currently open verse and queued in self.verses.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """ Return the currently open verse, opening a normal one if none is open. """
    if self.open_verse is not None:
        return self.open_verse
    # Nothing open yet: open_normal_verse() sets self.open_verse for us.
    self.open_normal_verse()
    return self.open_verse
196 def push_text(self, text):
199 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
201 self.open_normal_verse()
202 verse = self.get_open_verse()
204 verse[-1].tail = (verse[-1].tail or "") + verse_text
206 verse.text = (verse.text or "") + verse_text
208 def push_elem(self, elem):
209 if elem.tag.startswith("wers"):
210 verse = deepcopy(elem)
212 self.verses.append(verse)
213 self.open_verse = verse
215 appended = deepcopy(elem)
217 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Convert '/' verse endings into verse elements in every stanza under `tree`.

    Each WLNS('strofa') descendant is rewritten in place through
    Stanza.versify(); nothing is returned.
    """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
228 def add_to_manifest(manifest, partno):
229 """ Adds a node to the manifest section in content.opf file """
231 partstr = 'part%d' % partno
232 e = manifest.makeelement(
233 OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html',
234 'media-type': 'application/xhtml+xml'}
239 def add_to_spine(spine, partno):
240 """ Adds a node to the spine section in content.opf file """
242 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
247 def __init__(self, name=None, part_href=None):
250 self.part_href = part_href
251 self.sub_number = None
253 def add(self, name, part_href, level=0, is_part=True, index=None):
254 assert level == 0 or index is None
255 if level > 0 and self.children:
256 return self.children[-1].add(name, part_href, level - 1, is_part)
259 t.part_href = part_href
260 if index is not None:
261 self.children.insert(index, t)
263 self.children.append(t)
265 t.sub_number = len(self.children) + 1
def append(self, toc):
    """ Attach `toc` as the last child of this node. """
    siblings = self.children
    siblings.append(toc)
def extend(self, toc):
    """ Adopt all children of `toc` as additional children of this node. """
    adopted = toc.children
    self.children.extend(adopted)
276 return max((c.depth() for c in self.children)) + 1
282 if self.sub_number is not None:
283 src += '#sub%d' % self.sub_number
286 def write_to_xml(self, nav_map, counter=1):
287 for child in self.children:
288 nav_point = nav_map.makeelement(NCXNS('navPoint'))
289 nav_point.set('id', 'NavPoint-%d' % counter)
290 nav_point.set('playOrder', str(counter))
292 nav_label = nav_map.makeelement(NCXNS('navLabel'))
293 text = nav_map.makeelement(NCXNS('text'))
294 if child.name is not None:
295 text.text = re.sub(r'\n', ' ', child.name)
297 text.text = child.name
298 nav_label.append(text)
299 nav_point.append(nav_label)
301 content = nav_map.makeelement(NCXNS('content'))
302 content.set('src', child.href())
303 nav_point.append(content)
304 nav_map.append(nav_point)
305 counter = child.write_to_xml(nav_point, counter + 1)
308 def html_part(self, depth=0):
310 for child in self.children:
312 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
313 (depth, child.href(), child.name))
314 texts.append(child.html_part(depth + 1))
315 return "\n".join(texts)
318 with open(get_resource('epub/toc.html')) as f:
319 t = unicode(f.read(), 'utf-8')
320 return t % self.html_part()
323 def used_chars(element):
324 """ Lists characters used in an ETree Element """
325 chars = set((element.text or '') + (element.tail or ''))
326 for child in element:
327 chars = chars.union(used_chars(child))
332 """ divide main content of the XML file into chunks """
334 # prepare a container for each chunk
335 part_xml = etree.Element('utwor')
336 etree.SubElement(part_xml, 'master')
337 main_xml_part = part_xml[0] # master
339 last_node_part = False
341 # the below loop are workaround for a problem with epubs in drama ebooks without acts
344 for one_part in main_text:
346 if name == 'naglowek_scena':
348 elif name == 'naglowek_akt':
351 for one_part in main_text:
353 if is_act is False and is_scene is True:
354 if name == 'naglowek_czesc':
356 last_node_part = True
357 main_xml_part[:] = [deepcopy(one_part)]
358 elif not last_node_part and name == "naglowek_scena":
360 main_xml_part[:] = [deepcopy(one_part)]
362 main_xml_part.append(deepcopy(one_part))
363 last_node_part = False
365 if name == 'naglowek_czesc':
367 last_node_part = True
368 main_xml_part[:] = [deepcopy(one_part)]
369 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
371 main_xml_part[:] = [deepcopy(one_part)]
373 main_xml_part.append(deepcopy(one_part))
374 last_node_part = False
378 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
379 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
382 for element in chunk_xml[0]:
383 if element.tag == "naglowek_czesc":
384 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
385 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
386 toc.add(node_name(element), "part%d.html" % chunk_no)
387 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
388 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
389 element.set('sub', str(subnumber))
391 if not _empty_html_static:
392 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
394 output_html = _empty_html_static[0]
396 find_annotations(annotations, chunk_xml, chunk_no)
397 replace_by_verse(chunk_xml)
398 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
399 chars = used_chars(html_tree.getroot())
400 output_html = etree.tostring(
401 html_tree, pretty_print=True, xml_declaration=True,
403 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
404 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
406 return output_html, toc, chars
409 def transform(wldoc, verbose=False,
410 style=None, html_toc=False,
411 sample=None, cover=None, flags=None, hyphenate=False):
412 """ produces a EPUB file
414 sample=n: generate sample e-book (with at least n paragraphs)
415 cover: a cover.Cover factory or True for default
416 flags: less-advertising, without-fonts, working-copy
419 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
420 """ processes one input file and proceeds to its children """
422 replace_characters(wldoc.edoc.getroot())
424 hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
425 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
427 # every input file will have a TOC entry,
428 # pointing to starting chunk
429 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
432 # write book title page
433 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
434 chars = used_chars(html_tree.getroot())
438 html_tree, pretty_print=True, xml_declaration=True,
440 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
441 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
444 # add a title page TOC entry
445 toc.add(u"Strona tytułowa", "title.html")
446 elif wldoc.book_info.parts:
447 # write title page for every parent
448 if sample is not None and sample <= 0:
450 html_string = open(get_resource('epub/emptyChunk.html')).read()
452 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
453 chars = used_chars(html_tree.getroot())
454 html_string = etree.tostring(
455 html_tree, pretty_print=True, xml_declaration=True,
457 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
458 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
460 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
461 add_to_manifest(manifest, chunk_counter)
462 add_to_spine(spine, chunk_counter)
465 if len(wldoc.edoc.getroot()) > 1:
466 # rdf before style master
467 main_text = wldoc.edoc.getroot()[1]
469 # rdf in style master
470 main_text = wldoc.edoc.getroot()[0]
471 if main_text.tag == RDFNS('RDF'):
474 if main_text is not None:
475 for chunk_xml in chop(main_text):
477 if sample is not None:
481 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
482 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
484 toc.extend(chunk_toc)
485 chars = chars.union(chunk_chars)
486 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
487 add_to_manifest(manifest, chunk_counter)
488 add_to_spine(spine, chunk_counter)
491 for child in wldoc.parts():
492 child_toc, chunk_counter, chunk_chars, sample = transform_file(
493 child, chunk_counter, first=False, sample=sample)
494 toc.append(child_toc)
495 chars = chars.union(chunk_chars)
497 return toc, chunk_counter, chars, sample
499 document = deepcopy(wldoc)
504 document.edoc.getroot().set(flag, 'yes')
506 document.clean_ed_note()
507 document.clean_ed_note('abstrakt')
510 editors = document.editors()
512 document.edoc.getroot().set('editors', u', '.join(sorted(
513 editor.readable() for editor in editors)))
514 if document.book_info.funders:
515 document.edoc.getroot().set('funders', u', '.join(
516 document.book_info.funders))
517 if document.book_info.thanks:
518 document.edoc.getroot().set('thanks', document.book_info.thanks)
520 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
521 manifest = opf.find('.//' + OPFNS('manifest'))
522 guide = opf.find('.//' + OPFNS('guide'))
523 spine = opf.find('.//' + OPFNS('spine'))
525 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
526 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
528 functions.reg_mathml_epub(zip)
530 # write static elements
531 mime = zipfile.ZipInfo()
532 mime.filename = 'mimetype'
533 mime.compress_type = zipfile.ZIP_STORED
535 zip.writestr(mime, 'application/epub+zip')
537 'META-INF/container.xml',
538 '<?xml version="1.0" ?>'
539 '<container version="1.0" '
540 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
541 '<rootfiles><rootfile full-path="OPS/content.opf" '
542 'media-type="application/oebps-package+xml" />'
543 '</rootfiles></container>'
545 zip.write(get_resource('res/wl-logo-small.png'),
546 os.path.join('OPS', 'logo_wolnelektury.png'))
547 zip.write(get_resource('res/jedenprocent.png'),
548 os.path.join('OPS', 'jedenprocent.png'))
550 style = get_resource('epub/style.css')
551 zip.write(style, os.path.join('OPS', 'style.css'))
555 cover = DefaultEbookCover
557 cover_file = StringIO()
558 bound_cover = cover(document.book_info)
559 bound_cover.save(cover_file)
560 cover_name = 'cover.%s' % bound_cover.ext()
561 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
564 cover_tree = etree.parse(get_resource('epub/cover.html'))
565 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
566 zip.writestr('OPS/cover.html', etree.tostring(
567 cover_tree, pretty_print=True, xml_declaration=True,
569 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
570 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
573 if bound_cover.uses_dc_cover:
574 if document.book_info.cover_by:
575 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
576 if document.book_info.cover_source:
577 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
579 manifest.append(etree.fromstring(
580 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
581 manifest.append(etree.fromstring(
582 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
583 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
584 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
585 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
587 annotations = etree.Element('annotations')
589 toc_file = etree.fromstring(
590 '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
591 '"-//NISO//DTD ncx 2005-1//EN" '
592 '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
593 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
594 'version="2005-1"><head></head><docTitle></docTitle><navMap>'
597 nav_map = toc_file[-1]
600 manifest.append(etree.fromstring(
601 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
602 spine.append(etree.fromstring(
603 '<itemref idref="html_toc" />'))
604 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
606 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
608 if len(toc.children) < 2:
609 toc.add(u"Początek utworu", "part1.html")
611 # Last modifications in container files and EPUB creation
612 if len(annotations) > 0:
613 toc.add("Przypisy", "annotations.html")
614 manifest.append(etree.fromstring(
615 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
616 spine.append(etree.fromstring(
617 '<itemref idref="annotations" />'))
618 replace_by_verse(annotations)
619 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
620 chars = chars.union(used_chars(html_tree.getroot()))
621 zip.writestr('OPS/annotations.html', etree.tostring(
622 html_tree, pretty_print=True, xml_declaration=True,
624 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
625 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
628 toc.add("Wesprzyj Wolne Lektury", "support.html")
629 manifest.append(etree.fromstring(
630 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
631 spine.append(etree.fromstring(
632 '<itemref idref="support" />'))
633 html_string = open(get_resource('epub/support.html')).read()
634 chars.update(used_chars(etree.fromstring(html_string)))
635 zip.writestr('OPS/support.html', html_string)
637 toc.add("Strona redakcyjna", "last.html")
638 manifest.append(etree.fromstring(
639 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
640 spine.append(etree.fromstring(
641 '<itemref idref="last" />'))
642 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
643 chars.update(used_chars(html_tree.getroot()))
644 zip.writestr('OPS/last.html', etree.tostring(
645 html_tree, pretty_print=True, xml_declaration=True,
647 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
648 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
651 if not flags or 'without-fonts' not in flags:
653 tmpdir = mkdtemp('-librarian-epub')
659 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
660 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
661 optimizer_call = ['perl', 'subset.pl', '--chars',
662 ''.join(chars).encode('utf-8'),
663 get_resource('fonts/' + fname),
664 os.path.join(tmpdir, fname)]
666 print "Running font-optimizer"
667 subprocess.check_call(optimizer_call)
669 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
670 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
671 manifest.append(etree.fromstring(
672 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
676 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
677 xml_declaration=True, encoding="utf-8"))
678 title = document.book_info.title
679 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
680 for st in attributes:
681 meta = toc_file.makeelement(NCXNS('meta'))
683 meta.set('content', '0')
684 toc_file[0].append(meta)
685 toc_file[0][0].set('content', str(document.book_info.url))
686 toc_file[0][1].set('content', str(toc.depth()))
687 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
691 toc.add(u"Spis treści", "toc.html", index=1)
692 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
693 toc.write_to_xml(nav_map)
694 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
695 xml_declaration=True, encoding="utf-8"))
698 return OutputFile.from_filename(output_file.name)