1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
15 from tempfile import mkdtemp, NamedTemporaryFile
16 from shutil import rmtree
18 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
19 from librarian.cover import ImageCover as WLCover
21 from librarian import functions, get_resource
23 functions.reg_person_name()
27 """ returns node's text and children as a string
29 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
33 nt = node.text if node.text is not None else ''
34 return ''.join([nt] + [etree.tostring(child) for child in node])
36 def set_inner_xml(node, text):
37 """ sets node's text and children from a string
39 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
40 >>> set_inner_xml(e, 'x<b>y</b>z')
41 >>> print etree.tostring(e)
45 p = etree.fromstring('<x>%s</x>' % text)
51 """ Find out a node's name
53 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
57 tempnode = deepcopy(node)
59 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
60 for e in tempnode.findall('.//%s' % p):
64 etree.strip_tags(tempnode, '*')
69 if isinstance(xml, etree._Element):
70 xml = etree.ElementTree(xml)
71 with open(sheet) as xsltf:
72 return xml.xslt(etree.parse(xsltf))
75 def replace_characters(node):
76 def replace_chars(text):
79 return text.replace(u"\ufeff", u"")\
80 .replace("---", u"\u2014")\
81 .replace("--", u"\u2013")\
83 .replace('"', u"\u201D")\
84 .replace("'", u"\u2019")
85 if node.tag in ('uwaga', 'extra'):
89 node.text = replace_chars(node.text)
90 node.tail = replace_chars(node.tail)
92 replace_characters(child)
95 def find_annotations(annotations, source, part_no):
97 if child.tag in ('pe', 'pa', 'pt', 'pr'):
98 annotation = deepcopy(child)
99 number = str(len(annotations)+1)
100 annotation.set('number', number)
101 annotation.set('part', str(part_no))
103 annotations.append(annotation)
108 if child.tag not in ('extra', 'uwaga'):
109 find_annotations(annotations, child, part_no)
112 def replace_by_verse(tree):
113 """ Find stanzas and create new verses in place of a '/' character """
115 stanzas = tree.findall('.//' + WLNS('strofa'))
117 for child_node in node:
118 if child_node.tag in ('slowo_obce', 'wyroznienie'):
119 foreign_verses = inner_xml(child_node).split('/\n')
120 if len(foreign_verses) > 1:
122 for foreign_verse in foreign_verses:
123 if foreign_verse.startswith('<wers'):
124 new_foreign += foreign_verse
126 new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
127 set_inner_xml(child_node, new_foreign)
128 verses = inner_xml(node).split('/\n')
130 modified_inner_xml = ''
132 if verse.startswith('<wers') or verse.startswith('<extra'):
133 modified_inner_xml += verse
135 modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
136 set_inner_xml(node, modified_inner_xml)
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    Creates an OPF ``item`` element for the chunk ``part<partno>.html``
    and appends it to `manifest`.

    manifest: the ``manifest`` element of the OPF tree
    partno: integer chunk number
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        # The id has to equal the idref written by add_to_spine(),
        # which is also 'part%d' % partno.
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    Creates an OPF ``itemref`` element whose ``idref`` is ``part<partno>``
    and appends it to `spine`.

    spine: the ``spine`` element of the OPF tree
    partno: integer chunk number
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
159 def __init__(self, name=None, part_href=None):
162 self.part_href = part_href
163 self.sub_number = None
165 def add(self, name, part_href, level=0, is_part=True, index=None):
166 assert level == 0 or index is None
167 if level > 0 and self.children:
168 return self.children[-1].add(name, part_href, level-1, is_part)
171 t.part_href = part_href
172 if index is not None:
173 self.children.insert(index, t)
175 self.children.append(t)
177 t.sub_number = len(self.children) + 1
def append(self, toc):
    """ Adds `toc` as the last child entry of this node """
    entries = self.children
    entries.append(toc)
def extend(self, toc):
    """ Adds every child entry of `toc` to the end of this node's children """
    adopted = toc.children
    self.children.extend(adopted)
188 return max((c.depth() for c in self.children)) + 1
194 if self.sub_number is not None:
195 src += '#sub%d' % self.sub_number
def write_to_xml(self, nav_map, counter=1):
    """ Writes the children of this node into `nav_map` as NCX navPoints

    Every child becomes a ``navPoint`` (with a ``navLabel``/``text`` holding
    its name and a ``content`` pointing at its href) appended to `nav_map`;
    the child's own children are then written inside that ``navPoint``.

    nav_map: element that receives the generated ``navPoint`` elements
    counter: number used for the first generated id / playOrder

    Returns the next unused counter value.
    """
    for child in self.children:
        nav_point = nav_map.makeelement(NCXNS('navPoint'))
        nav_point.set('id', 'NavPoint-%d' % counter)
        nav_point.set('playOrder', str(counter))

        nav_label = nav_map.makeelement(NCXNS('navLabel'))
        text = nav_map.makeelement(NCXNS('text'))
        text.text = child.name
        nav_label.append(text)
        nav_point.append(nav_label)

        content = nav_map.makeelement(NCXNS('content'))
        content.set('src', child.href())
        nav_point.append(content)
        nav_map.append(nav_point)
        # The nested call numbers the whole subtree and hands back the
        # first number that is still free for the next sibling.
        counter = child.write_to_xml(nav_point, counter + 1)
    # Without this the recursive assignment above would turn the counter
    # into None after the first child.
    return counter
def html_part(self, depth=0):
    """ Renders the children of this node as HTML table-of-contents rows

    Each child yields one ``<div>`` with a left margin of `depth` em that
    links to the child's href, followed by the rendering of the child's own
    children at ``depth + 1``.  The pieces are joined with newlines.

    Names and hrefs are escaped, because they are plain text and would
    otherwise break the markup when they contain ``&``, ``<`` or quotes.
    """
    from xml.sax.saxutils import escape

    def as_html(value):
        # '%s' keeps the previous rendering for non-string values.
        return escape('%s' % value, {"'": "&#39;", '"': "&quot;"})

    texts = []
    for child in self.children:
        texts.append(
            "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
            (depth, as_html(child.href()), as_html(child.name)))
        texts.append(child.html_part(depth+1))
    return "\n".join(texts)
227 with open(get_resource('epub/toc.html')) as f:
228 t = unicode(f.read(), 'utf-8')
229 return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Collects the characters of the element's text and tail and, recursively,
    those of all its descendants.

    Returns a set of characters (empty when there is no text at all).
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # Grow the one set in place instead of building a new one per child.
        chars.update(used_chars(child))
    return chars
241 """ divide main content of the XML file into chunks """
243 # prepare a container for each chunk
244 part_xml = etree.Element('utwor')
245 etree.SubElement(part_xml, 'master')
246 main_xml_part = part_xml[0] # master
248 last_node_part = False
249 for one_part in main_text:
251 #if name == 'naglowek_czesc':
253 # last_node_part = True
254 # main_xml_part[:] = [deepcopy(one_part)]
255 #elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
257 # main_xml_part[:] = [deepcopy(one_part)]
260 main_xml_part.append(deepcopy(one_part))
261 last_node_part = False
265 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
266 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
269 #for element in chunk_xml[0]:
270 # if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
271 # toc.add(node_name(element), "part%d.html" % chunk_no)
272 # elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
273 # subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
274 # element.set('sub', str(subnumber))
276 if not _empty_html_static:
277 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
279 output_html = _empty_html_static[0]
282 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme-FoC.xsl'))
284 find_annotations(annotations, chunk_xml, chunk_no)
285 replace_by_verse(chunk_xml)
286 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
287 chars = used_chars(html_tree.getroot())
288 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
289 return output_html, toc, chars
292 def transform(wldoc, verbose=False,
293 style=None, html_toc=False,
294 sample=None, cover=None, flags=None):
295 """ produces a EPUB file
297 sample=n: generate sample e-book (with at least n paragraphs)
298 cover: a cover.Cover object or True for default
299 flags: less-advertising, without-fonts, working-copy
302 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
303 """ processes one input file and proceeds to its children """
305 replace_characters(wldoc.edoc.getroot())
307 # every input file will have a TOC entry,
308 # pointing to starting chunk
311 if wldoc.book_info.author is not None:
312 toc_title = "%s, %s" % (wldoc.book_info.author.readable(), wldoc.book_info.title)
313 note = wldoc.edoc.find('//dzielo_nadrzedne')
315 toc_title += " (%s)" % note.text
317 toc_title = wldoc.book_info.title
318 toc = TOC(toc_title, "part%d.html" % chunk_counter)
321 # write book title page
322 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
323 chars = used_chars(html_tree.getroot())
324 zip.writestr('OPS/title.html',
325 etree.tostring(html_tree, method="html", pretty_print=True))
326 # add a title page TOC entry
327 toc.add(u"Title page", "title.html")
328 toc.add(u"Dear readers!", "part1.html")
329 elif wldoc.book_info.parts:
330 # write title page for every parent
331 if sample is not None and sample <= 0:
333 html_string = open(get_resource('epub/emptyChunk.html')).read()
335 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
336 chars = used_chars(html_tree.getroot())
337 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
338 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
339 add_to_manifest(manifest, chunk_counter)
340 add_to_spine(spine, chunk_counter)
343 if len(wldoc.edoc.getroot()) > 1:
344 # rdf before style master
345 main_text = wldoc.edoc.getroot()[1]
347 # rdf in style master
348 main_text = wldoc.edoc.getroot()[0]
349 if main_text.tag == RDFNS('RDF'):
352 if main_text is not None:
353 for chunk_xml in chop(main_text):
355 if sample is not None:
359 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
360 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
362 toc.extend(chunk_toc)
363 chars = chars.union(chunk_chars)
364 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
365 add_to_manifest(manifest, chunk_counter)
366 add_to_spine(spine, chunk_counter)
369 for child in wldoc.parts():
370 child_toc, chunk_counter, chunk_chars, sample = transform_file(
371 child, chunk_counter, first=False, sample=sample)
372 toc.append(child_toc)
373 chars = chars.union(chunk_chars)
375 return toc, chunk_counter, chars, sample
378 document = deepcopy(wldoc)
383 document.edoc.getroot().set(flag, 'yes')
385 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
386 manifest = opf.find('.//' + OPFNS('manifest'))
387 guide = opf.find('.//' + OPFNS('guide'))
388 spine = opf.find('.//' + OPFNS('spine'))
390 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
391 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
393 # write static elements
394 mime = zipfile.ZipInfo()
395 mime.filename = 'mimetype'
396 mime.compress_type = zipfile.ZIP_STORED
398 zip.writestr(mime, 'application/epub+zip')
399 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
400 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
401 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
402 'media-type="application/oebps-package+xml" />' \
403 '</rootfiles></container>')
404 #zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
405 #zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
406 zip.write('logo.png', os.path.join('OPS', 'logo.png'))
408 style = get_resource('epub/style.css')
409 zip.write(style, os.path.join('OPS', 'style.css'))
414 if cover.uses_dc_cover:
415 if document.book_info.cover_by:
416 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
417 if document.book_info.cover_source:
418 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
420 cover_file = StringIO()
421 c = cover(document.book_info)
423 c.im = Image.open('cover.jpg')
424 c.ext = lambda: 'jpg'
426 c_name = 'cover.%s' % c.ext()
427 zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
430 cover_tree = etree.parse(get_resource('epub/cover.html'))
431 cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
432 zip.writestr('OPS/cover.html', etree.tostring(
433 cover_tree, method="html", pretty_print=True))
435 manifest.append(etree.fromstring(
436 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
437 manifest.append(etree.fromstring(
438 '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
439 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
440 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
441 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
444 annotations = etree.Element('annotations')
446 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
447 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
448 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
449 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
451 nav_map = toc_file[-1]
454 manifest.append(etree.fromstring(
455 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
456 spine.append(etree.fromstring(
457 '<itemref idref="html_toc" />'))
458 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Table of Contents"/>'))
460 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
462 if len(toc.children) < 2:
463 toc.add(u"Początek utworu", "part1.html")
465 # Last modifications in container files and EPUB creation
466 if len(annotations) > 0:
467 toc.add("Przypisy", "annotations.html")
468 manifest.append(etree.fromstring(
469 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
470 spine.append(etree.fromstring(
471 '<itemref idref="annotations" />'))
472 replace_by_verse(annotations)
473 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
474 chars = chars.union(used_chars(html_tree.getroot()))
475 zip.writestr('OPS/annotations.html', etree.tostring(
476 html_tree, method="html", pretty_print=True))
478 toc.add("Editorial page", "last.html")
479 manifest.append(etree.fromstring(
480 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
481 spine.append(etree.fromstring(
482 '<itemref idref="last" />'))
483 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
484 chars.update(used_chars(html_tree.getroot()))
485 zip.writestr('OPS/last.html', etree.tostring(
486 html_tree, method="html", pretty_print=True))
488 if not flags or not 'without-fonts' in flags:
490 tmpdir = mkdtemp('-librarian-epub')
493 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
494 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
495 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
496 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
498 print "Running font-optimizer"
499 subprocess.check_call(optimizer_call)
501 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
502 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
503 manifest.append(etree.fromstring(
504 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
508 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
509 title = document.book_info.title
510 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
511 for st in attributes:
512 meta = toc_file.makeelement(NCXNS('meta'))
514 meta.set('content', '0')
515 toc_file[0].append(meta)
516 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
517 toc_file[0][1].set('content', str(toc.depth()))
518 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
522 toc.add(u"Table of Contents", "toc.html", index=1)
523 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
524 toc.write_to_xml(nav_map)
525 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
528 return OutputFile.from_filename(output_file.name)