1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
from __future__ import with_statement

import os
import re
import subprocess
import zipfile
from copy import deepcopy
from shutil import rmtree
from StringIO import StringIO
from tempfile import mkdtemp, NamedTemporaryFile

from lxml import etree

from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
from librarian import functions, get_resource
from librarian.cover import DefaultEbookCover
# Run librarian's registration helpers once, at import time.
# NOTE(review): presumably these expose person_name / lang_code_3to2 to the
# XSLT stylesheets applied below -- confirm in librarian.functions.
functions.reg_person_name()
functions.reg_lang_code_3to2()
def inner_xml(node):
    """ returns node's text and children as a string

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # Leading text first (None means "no text"), then each child serialized
    # in document order.
    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # Wrap the fragment in a throwaway root so that mixed text/element
    # content parses as one well-formed document.
    p = etree.fromstring('<x>%s</x>' % text)
    # Transplant the parsed content: leading text, then all children
    # (slice assignment replaces whatever children node had before).
    node.text = p.text
    node[:] = p[:]
def node_name(node):
    """ Find out a node's name

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    # Work on a copy so the caller's tree is not modified.
    tempnode = deepcopy(node)

    # NOTE(review): the body of this inner loop was missing from the listing.
    # Emptying annotation/motif elements while keeping the text that follows
    # them is the presumed intent -- confirm against the upstream source.
    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            t = e.tail
            e.clear()  # clear() also drops the tail, so it is put back below
            e.tail = t
    # Dissolve all remaining markup, leaving only text in tempnode.
    etree.strip_tags(tempnode, '*')
    return tempnode.text
def xslt(xml, sheet):
    """ Applies the XSLT stylesheet stored in the file `sheet` to `xml`.

    `xml` may be an element or an element tree; a bare element is wrapped
    in an ElementTree first. Returns the result of the tree's xslt() call.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
def replace_characters(node):
    """ Applies typographic replacements to all text in `node`'s subtree.

    Works in place, recursively: the BOM character is removed, '---' and
    '--' become em and en dashes, ',,' becomes a low opening quote, and
    straight double/single quotes become their curly closing forms.
    """
    def replace_chars(text):
        # Elements without text/tail carry None there; keep it as None.
        if text is None:
            return None
        # Order matters: '---' must be handled before '--'.
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag in ('uwaga', 'extra'):
        # NOTE(review): these three statements were missing from the listing;
        # dropping the element's content while keeping the text that follows
        # it is the presumed intent -- confirm against the upstream source.
        t = node.tail
        node.clear()
        node.tail = t
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """ Collects annotation elements from `source` into `annotations`.

    Every descendant tagged 'pe', 'pa', 'pt' or 'pr' is copied into
    `annotations` with a running 'number' attribute (1-based, continuing
    from what `annotations` already holds) and a 'part' attribute set to
    `part_no`. Subtrees of 'extra' and 'uwaga' elements are not searched.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations)+1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # The copied tail belongs to the main text, not to the note.
            annotation.tail = ''
            annotations.append(annotation)
            # NOTE(review): the next four statements were missing from the
            # listing; replacing the note in the main text by its number is
            # the presumed intent -- confirm against the upstream source.
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        # Verse elements built so far, in order.
        self.verses = []
        # Verse currently receiving content; None until the first is needed.
        self.open_verse = None

    def versify(self):
        """ Rebuilds the stanza in place as a sequence of verse elements. """
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        # clear() also resets the stanza's own tail, so save and restore it.
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        """ Starts a new, empty 'wers_normalny' and makes it the open verse. """
        # The explicit empty attribute dict keeps this call valid for both
        # lxml and xml.etree elements.
        self.open_verse = self.stanza.makeelement("wers_normalny", {})
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """ Returns the open verse, creating a normal one if none is open. """
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """ Appends text to the verses, starting a new verse at each '/' + newline. """
        if not text:
            return
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            # Every piece after the first follows a verse ending.
            if i:
                self.open_normal_verse()
            verse = self.get_open_verse()
            if len(verse):
                # The verse already has children: text goes after the last one.
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        """ Adds a copy of a stanza child: verse elements become the open
        verse, anything else is appended to the open verse. """
        if elem.tag.startswith("wers"):
            verse = deepcopy(elem)
            # The tail is handled separately through push_text().
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Turns '/' verse endings into verse elements in every stanza of the tree """
    stanza_path = './/' + WLNS('strofa')
    for strofa in tree.findall(stanza_path):
        Stanza(strofa).versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file """

    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        # The id is what add_to_spine() refers to through its idref.
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file """

    # idref must match the id given to the part by add_to_manifest().
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
class TOC(object):
    """ A tree of table-of-contents entries.

    Each node has a display name, the href of the part file it points to,
    an optional sub-section number and a list of child entries.
    """

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        # Set only for entries added with is_part=False; see href().
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Adds a new entry and returns its sub number, if it has one.

        With level > 0 the entry is added `level` levels below this node,
        under the most recently added child at each step; if a node on the
        way has no children yet, the entry is added at that node instead.
        `index` (only allowed with level == 0) inserts the entry at that
        position instead of appending it.
        Entries with is_part=False get a sub number, which is returned;
        otherwise None is returned.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)

        t = TOC(name)
        t.part_href = part_href
        if index is not None:
            self.children.insert(index, t)
        else:
            self.children.append(t)
        if not is_part:
            # Computed after the entry has been added to the list.
            t.sub_number = len(self.children) + 1
            return t.sub_number
        return None

    def append(self, toc):
        """ Adds `toc` itself as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Adds all children of `toc` (not `toc` itself) as children. """
        self.children.extend(toc.children)

    def depth(self):
        """ Returns the number of levels below this node (0 for a leaf). """
        # max() of an empty sequence raises, hence the guard.
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """ Returns the link target: part href plus '#sub<N>' if numbered. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Writes the children as NCX navPoint elements under `nav_map`.

        `counter` is used for the first navPoint's id and playOrder and is
        incremented for each one written, depth-first. Returns the next
        unused counter value.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            # Nested entries continue the same numbering.
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Returns the entries below this node as indented HTML link divs. """
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """ Returns the HTML table of contents page, built by filling the
        'epub/toc.html' resource template with html_part(). """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element """
    # The element's own text and tail, then the same for every descendant.
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars.update(used_chars(child))
    return chars
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. A new chunk starts at every 'naglowek_czesc' and at every
    'naglowek_rozdzial' / 'naglowek_akt' / 'srodtytul' that does not directly
    follow a 'naglowek_czesc'.

    The same <utwor><master/></utwor> container is yielded every time and is
    refilled after each yield, so each chunk has to be processed before the
    next one is requested. If the content starts with a chunk header, the
    first chunk yielded is empty.
    """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    # True while the previous element was a 'naglowek_czesc', so that a
    # chapter header right after a part header stays in the same chunk.
    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk container as produced by chop(); its first child holds
        the chunk's elements
    chunk_no: number of the chunk, used in the 'part<N>.html' hrefs
    annotations: element collecting the annotations found in the chunk
    empty: if true, the chunk's text is not rendered; the content of the
        'epub/emptyChunk.html' resource is returned instead
    _empty_html_static: internal; the shared default list deliberately
        caches the empty-chunk HTML between calls
    """

    toc = TOC()
    for element in chunk_xml[0]:
        # Compared with ==: `in ("naglowek_czesc")` would be a substring
        # test against a plain string, not a tuple membership test.
        if element.tag == "naglowek_czesc":
            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # Stored on the element for use by the stylesheet.
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            with open(get_resource('epub/emptyChunk.html')) as f:
                _empty_html_static.append(f.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
def transform(wldoc, verbose=False,
              style=None, html_toc=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    wldoc: the document to convert; it is deep-copied, not modified
    verbose: if true, the font optimizer's output is shown
    style: path to a CSS file; defaults to the 'epub/style.css' resource
    html_toc: if true, an HTML table of contents page is added
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy, with-full-fonts

    Returns an OutputFile for the generated temporary .epub file.
    """

    def read_resource(path):
        """ returns the content of a resource file, closing it afterwards """
        with open(get_resource(path)) as f:
            return f.read()

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns the file's TOC, the next free chunk number, the set of
        characters used and the remaining sample budget.
        """

        replace_characters(wldoc.edoc.getroot())

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr('OPS/title.html',
                              etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                # sample budget used up: emit a placeholder page
                chars = set()
                html_string = read_resource('epub/emptyChunk.html')
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            epub_zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    document = deepcopy(wldoc)

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    # add editors info
    document.edoc.getroot().set('editors', u', '.join(sorted(
        editor.readable() for editor in document.editors())))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # the mimetype entry is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    epub_zip.writestr(mime, 'application/epub+zip')
    epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                      'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                      '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                      'media-type="application/oebps-package+xml" />' \
                      '</rootfiles></container>')
    epub_zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    epub_zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    epub_zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = DefaultEbookCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        epub_zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        epub_zip.writestr('OPS/cover.html', etree.tostring(
            cover_tree, method="html", pretty_print=True))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                                '</navMap></ncx>')
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        epub_zip.writestr('OPS/annotations.html', etree.tostring(
            html_tree, method="html", pretty_print=True))

    toc.add("Wesprzyj Wolne Lektury", "support.html")
    manifest.append(etree.fromstring(
        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="support" />'))
    html_string = read_resource('epub/support.html')
    chars.update(used_chars(etree.fromstring(html_string)))
    epub_zip.writestr('OPS/support.html', html_string)

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    epub_zip.writestr('OPS/last.html', etree.tostring(
        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        try:
            cwd = os.getcwd()
        except OSError:
            # the current directory may not be accessible any more
            cwd = None

        try:
            # subset.pl is run from inside its own directory
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                if not flags or 'with-full-fonts' not in flags:
                    # reduce the font to the characters the book actually uses
                    optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
                                      get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                    if verbose:
                        print("Running font-optimizer")
                        subprocess.check_call(optimizer_call)
                    else:
                        # Discard the output through os.devnull: check_call()
                        # never reads from a PIPE, so a chatty child process
                        # could fill it and block forever.
                        with open(os.devnull, 'w') as devnull:
                            subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
                    epub_zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
                else:
                    epub_zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
                manifest.append(etree.fromstring(
                    '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
        finally:
            # clean up and restore the working directory even on failure
            rmtree(tmpdir)
            if cwd is not None:
                os.chdir(cwd)

    epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration=True, encoding='UTF-8'))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    # the first two metas (dtb:uid, dtb:depth) get real values
    toc_file[0][0].set('content', str(document.book_info.url))
    toc_file[0][1].set('content', str(toc.depth()))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        epub_zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration=True, encoding='UTF-8'))
    epub_zip.close()
    # ZipFile does not close a file object it was given; the temporary file
    # itself stays on disk because it was created with delete=False.
    output_file.close()

    return OutputFile.from_filename(output_file.name)