# -*- coding: utf-8 -*- # # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # from __future__ import with_statement import os import os.path import subprocess from StringIO import StringIO from copy import deepcopy from lxml import etree import zipfile from tempfile import mkdtemp, NamedTemporaryFile from shutil import rmtree from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile from librarian.cover import ImageCover as WLCover from librarian import functions, get_resource functions.reg_person_name() def inner_xml(node): """ returns node's text and children as a string >>> print inner_xml(etree.fromstring('xyz')) xyz """ nt = node.text if node.text is not None else '' return ''.join([nt] + [etree.tostring(child) for child in node]) def set_inner_xml(node, text): """ sets node's text and children from a string >>> e = etree.fromstring('bxx') >>> set_inner_xml(e, 'xyz') >>> print etree.tostring(e) xyz """ p = etree.fromstring('%s' % text) node.text = p.text node[:] = p[:] def node_name(node): """ Find out a node's name >>> print node_name(etree.fromstring('XYZ')) XYZ """ tempnode = deepcopy(node) for p in ('pe', 'pa', 'pt', 'pr', 'motyw'): for e in tempnode.findall('.//%s' % p): t = e.tail e.clear() e.tail = t etree.strip_tags(tempnode, '*') return tempnode.text def xslt(xml, sheet): if isinstance(xml, etree._Element): xml = etree.ElementTree(xml) with open(sheet) as xsltf: return xml.xslt(etree.parse(xsltf)) def replace_characters(node): def replace_chars(text): if text is None: return None return text.replace(u"\ufeff", u"")\ .replace("---", u"\u2014")\ .replace("--", u"\u2013")\ .replace(",,", u"“")\ .replace('"', u"\u201D")\ .replace("'", u"\u2019") if node.tag in ('uwaga', 'extra'): t = node.tail node.clear() node.tail = t node.text = replace_chars(node.text) node.tail = replace_chars(node.tail) for child in node: replace_characters(child) def find_annotations(annotations, source, part_no): for child in source: if child.tag in ('pe', 'pa', 'pt', 'pr'): annotation = deepcopy(child) number = str(len(annotations)+1) annotation.set('number', number) annotation.set('part', str(part_no)) annotation.tail = '' annotations.append(annotation) tail = child.tail child.clear() child.tail = tail child.text = number if child.tag not in ('extra', 'uwaga'): find_annotations(annotations, child, part_no) def replace_by_verse(tree): """ Find stanzas and create new verses in place of a '/' character """ stanzas = tree.findall('.//' + WLNS('strofa')) for node in stanzas: for child_node in node: if child_node.tag in ('slowo_obce', 'wyroznienie'): foreign_verses = inner_xml(child_node).split('/\n') if len(foreign_verses) > 1: new_foreign = '' for foreign_verse in foreign_verses: if foreign_verse.startswith('', foreign_verse, '')) set_inner_xml(child_node, new_foreign) verses = inner_xml(node).split('/\n') if len(verses) > 1: modified_inner_xml = '' for verse in verses: if verse.startswith('', verse, '')) set_inner_xml(node, modified_inner_xml) def add_to_manifest(manifest, partno): """ Adds a node to the manifest section in content.opf file """ partstr = 'part%d' % partno e = manifest.makeelement(OPFNS('item'), attrib={ 'id': partstr, 'href': partstr + '.html', 'media-type': 'application/xhtml+xml', }) manifest.append(e) def add_to_spine(spine, partno): """ Adds a node to the spine section in content.opf file """ e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno}); spine.append(e) class TOC(object): def __init__(self, name=None, part_href=None): self.children = [] self.name = name self.part_href = part_href self.sub_number = None def add(self, name, part_href, level=0, is_part=True, index=None): assert level == 0 or index is None if level > 0 and self.children: return self.children[-1].add(name, part_href, level-1, is_part) else: t = TOC(name) t.part_href = part_href if index is not None: self.children.insert(index, t) else: self.children.append(t) if not is_part: t.sub_number = len(self.children) + 1 return t.sub_number def append(self, toc): self.children.append(toc) def extend(self, toc): self.children.extend(toc.children) def depth(self): if self.children: return max((c.depth() for c in self.children)) + 1 else: return 0 def href(self): src = self.part_href if self.sub_number is not None: src += '#sub%d' % self.sub_number return src def write_to_xml(self, nav_map, counter=1): for child in self.children: nav_point = nav_map.makeelement(NCXNS('navPoint')) nav_point.set('id', 'NavPoint-%d' % counter) nav_point.set('playOrder', str(counter)) nav_label = nav_map.makeelement(NCXNS('navLabel')) text = nav_map.makeelement(NCXNS('text')) text.text = child.name nav_label.append(text) nav_point.append(nav_label) content = nav_map.makeelement(NCXNS('content')) content.set('src', child.href()) nav_point.append(content) nav_map.append(nav_point) counter = child.write_to_xml(nav_point, counter + 1) return counter def html_part(self, depth=0): texts = [] for child in self.children: texts.append( "

" % (depth, child.href(), child.name)) texts.append(child.html_part(depth+1)) return "\n".join(texts) def html(self): with open(get_resource('epub/toc.html')) as f: t = unicode(f.read(), 'utf-8') return t % self.html_part() def used_chars(element): """ Lists characters used in an ETree Element """ chars = set((element.text or '') + (element.tail or '')) for child in element: chars = chars.union(used_chars(child)) return chars def chop(main_text): """ divide main content of the XML file into chunks """ # prepare a container for each chunk part_xml = etree.Element('utwor') etree.SubElement(part_xml, 'master') main_xml_part = part_xml[0] # master last_node_part = False for one_part in main_text: name = one_part.tag #if name == 'naglowek_czesc': # yield part_xml # last_node_part = True # main_xml_part[:] = [deepcopy(one_part)] #elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"): # yield part_xml # main_xml_part[:] = [deepcopy(one_part)] #else: if True: main_xml_part.append(deepcopy(one_part)) last_node_part = False yield part_xml def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]): """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """ toc = TOC() #for element in chunk_xml[0]: # if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"): # toc.add(node_name(element), "part%d.html" % chunk_no) # elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'): # subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False) # element.set('sub', str(subnumber)) if empty: if not _empty_html_static: _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read()) chars = set() output_html = _empty_html_static[0] else: if chunk_no == 1: html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme-FoC.xsl')) else: find_annotations(annotations, chunk_xml, chunk_no) replace_by_verse(chunk_xml) html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl')) chars = used_chars(html_tree.getroot()) output_html = etree.tostring(html_tree, method="html", pretty_print=True) return output_html, toc, chars def transform(wldoc, verbose=False, style=None, html_toc=False, sample=None, cover=None, flags=None): """ produces a EPUB file sample=n: generate sample e-book (with at least n paragraphs) cover: a cover.Cover object or True for default flags: less-advertising, without-fonts, working-copy """ def transform_file(wldoc, chunk_counter=1, first=True, sample=None): """ processes one input file and proceeds to its children """ replace_characters(wldoc.edoc.getroot()) # every input file will have a TOC entry, # pointing to starting chunk # hack for FoC: if wldoc.book_info.author is not None: toc_title = "%s, %s" % (wldoc.book_info.author.readable(), wldoc.book_info.title) note = wldoc.edoc.find('//dzielo_nadrzedne') if note is not None: toc_title += " (%s)" % note.text else: toc_title = wldoc.book_info.title toc = TOC(toc_title, "part%d.html" % chunk_counter) chars = set() if first: # write book title page html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl')) chars = used_chars(html_tree.getroot()) zip.writestr('OPS/title.html', etree.tostring(html_tree, method="html", pretty_print=True)) # add a title page TOC entry toc.add(u"Title page", "title.html") toc.add(u"Dear readers!", "part1.html") elif wldoc.book_info.parts: # write title page for every parent if sample is not None and sample <= 0: chars = set() html_string = open(get_resource('epub/emptyChunk.html')).read() else: html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl')) chars = used_chars(html_tree.getroot()) html_string = etree.tostring(html_tree, method="html", pretty_print=True) zip.writestr('OPS/part%d.html' % chunk_counter, html_string) add_to_manifest(manifest, chunk_counter) add_to_spine(spine, chunk_counter) chunk_counter += 1 if len(wldoc.edoc.getroot()) > 1: # rdf before style master main_text = wldoc.edoc.getroot()[1] else: # rdf in style master main_text = wldoc.edoc.getroot()[0] if main_text.tag == RDFNS('RDF'): main_text = None if main_text is not None: for chunk_xml in chop(main_text): empty = False if sample is not None: if sample <= 0: empty = True else: sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog')) chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty) toc.extend(chunk_toc) chars = chars.union(chunk_chars) zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html) add_to_manifest(manifest, chunk_counter) add_to_spine(spine, chunk_counter) chunk_counter += 1 for child in wldoc.parts(): child_toc, chunk_counter, chunk_chars, sample = transform_file( child, chunk_counter, first=False, sample=sample) toc.append(child_toc) chars = chars.union(chunk_chars) return toc, chunk_counter, chars, sample document = deepcopy(wldoc) del wldoc if flags: for flag in flags: document.edoc.getroot().set(flag, 'yes') opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl')) manifest = opf.find('.//' + OPFNS('manifest')) guide = opf.find('.//' + OPFNS('guide')) spine = opf.find('.//' + OPFNS('spine')) output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False) zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED) # write static elements mime = zipfile.ZipInfo() mime.filename = 'mimetype' mime.compress_type = zipfile.ZIP_STORED mime.extra = '' zip.writestr(mime, 'application/epub+zip') zip.writestr('META-INF/container.xml', '' \ '' \ '') #zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png')) #zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png')) zip.write('logo.png', os.path.join('OPS', 'logo.png')) if not style: style = get_resource('epub/style.css') zip.write(style, os.path.join('OPS', 'style.css')) if cover: if cover is True: cover = WLCover if cover.uses_dc_cover: if document.book_info.cover_by: document.edoc.getroot().set('data-cover-by', document.book_info.cover_by) if document.book_info.cover_source: document.edoc.getroot().set('data-cover-source', document.book_info.cover_source) cover_file = StringIO() c = cover(document.book_info) import Image c.im = Image.open('cover.jpg') c.ext = lambda: 'jpg' c.save(cover_file) c_name = 'cover.%s' % c.ext() zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue()) del cover_file cover_tree = etree.parse(get_resource('epub/cover.html')) cover_tree.find('//' + XHTMLNS('img')).set('src', c_name) zip.writestr('OPS/cover.html', etree.tostring( cover_tree, method="html", pretty_print=True)) manifest.append(etree.fromstring( '')) manifest.append(etree.fromstring( '' % (c_name, c.mime_type()))) spine.insert(0, etree.fromstring('')) opf.getroot()[0].append(etree.fromstring('')) guide.append(etree.fromstring('')) annotations = etree.Element('annotations') toc_file = etree.fromstring('' \ '' \ '') nav_map = toc_file[-1] if html_toc: manifest.append(etree.fromstring( '')) spine.append(etree.fromstring( '')) guide.append(etree.fromstring('')) toc, chunk_counter, chars, sample = transform_file(document, sample=sample) if len(toc.children) < 2: toc.add(u"Początek utworu", "part1.html") # Last modifications in container files and EPUB creation if len(annotations) > 0: toc.add("Przypisy", "annotations.html") manifest.append(etree.fromstring( '')) spine.append(etree.fromstring( '')) replace_by_verse(annotations) html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl')) chars = chars.union(used_chars(html_tree.getroot())) zip.writestr('OPS/annotations.html', etree.tostring( html_tree, method="html", pretty_print=True)) toc.add("Editorial page", "last.html") manifest.append(etree.fromstring( '')) spine.append(etree.fromstring( '')) html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl')) chars.update(used_chars(html_tree.getroot())) zip.writestr('OPS/last.html', etree.tostring( html_tree, method="html", pretty_print=True)) if not flags or not 'without-fonts' in flags: # strip fonts tmpdir = mkdtemp('-librarian-epub') cwd = os.getcwd() os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer')) for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf': optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), get_resource('fonts/' + fname), os.path.join(tmpdir, fname)] if verbose: print "Running font-optimizer" subprocess.check_call(optimizer_call) else: subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE) zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname)) manifest.append(etree.fromstring( '' % (fname, fname))) rmtree(tmpdir) os.chdir(cwd) zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True)) title = document.book_info.title attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber" for st in attributes: meta = toc_file.makeelement(NCXNS('meta')) meta.set('name', st) meta.set('content', '0') toc_file[0].append(meta) toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl'))) toc_file[0][1].set('content', str(toc.depth())) set_inner_xml(toc_file[1], ''.join(('', title, ''))) # write TOC if html_toc: toc.add(u"Table of Contents", "toc.html", index=1) zip.writestr('OPS/toc.html', toc.html().encode('utf-8')) toc.write_to_xml(nav_map) zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True)) zip.close() return OutputFile.from_filename(output_file.name)