1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
# Module-level side effects: both registration calls run once, at import time.
# NOTE(review): presumably these register the helper functions (person-name
# formatting, 3-to-2 letter language codes) that the epub XSLT sheets rely on
# -- confirm in librarian.functions.
24 functions.reg_person_name()
25 functions.reg_lang_code_3to2()
def inner_xml(node):
    """ Return a node's text and children serialized as one string.

    The node's own start and end tags are not part of the result.  A
    missing leading text (None) is treated as an empty string.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # NOTE(review): the `def` line and the end of the docstring were absent
    # from the extracted source; the name and parameter are taken from the
    # doctest call and from the body below.
    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ Replace a node's text and children with content parsed from a string.

    `text` has to be well-formed XML once wrapped in a single element
    (so special characters must already be escaped).  The node's tag,
    attributes and tail are left as they are.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # Wrap the fragment in a throwaway element so it parses as one document.
    p = etree.fromstring('<x>%s</x>' % text)
    # NOTE(review): the two assignments below were missing from the extracted
    # source and are restored to match the doctest above -- confirm upstream.
    node.text = p.text
    node[:] = p[:]
# node_name: derives a plain-text name for `node`; transform_chunk passes the
# result to TOC.add() as the entry title.
# NOTE(review): this extract is missing the `def` line, the end of the
# docstring, the body of the inner loop and the final return statement, so the
# function is documented here only as far as the visible lines go.
53 """ Find out a node's name
55 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
# Work on a deep copy so the caller's tree is never modified.
59 tempnode = deepcopy(node)
# Visit every 'pe'/'pa'/'pt'/'pr' (the tags find_annotations treats as
# annotations) and every 'motyw' element below the copy.
61 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
62 for e in tempnode.findall('.//%s' % p):
# Remove all remaining element tags from the copy, merging their text into it.
66 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored in the file `sheet` to `xml`.

    `xml` may be an element or an element tree; a bare element is wrapped
    in an ElementTree first because the `xslt()` method is called on a tree.
    Returns whatever `xml.xslt(...)` returns.
    """
    # NOTE(review): the `def` line was absent from the extracted source; the
    # signature is taken from the call sites (`xslt(tree, path)`) and the body.
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    # The stylesheet file is closed as soon as it has been parsed and applied.
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
# replace_characters: normalizes typography in the text and tail of `node`,
# then recurses into child elements (the tree is modified in place).
# NOTE(review): this extract skips several lines of the function (the start
# of the helper, the body of the 'uwaga'/'extra' branch and the loop header
# before the recursive call), so those parts are not documented here.
77 def replace_characters(node):
# Local helper that performs the substitutions on a single string.
78 def replace_chars(text):
81 #text = re.sub(r"(?<=\s\w)\s+", u"\u00a0", text) #fix for hanging single letter conjunctions – for future use.
# Removes U+FEFF, then maps '---' -> U+2014, '--' -> U+2013, ',,' -> U+201E,
# '"' -> U+201D and "'" -> U+2019.  The order matters: '---' has to be
# replaced before '--'.
82 return text.replace(u"\ufeff", u"")\
83 .replace("---", u"\u2014")\
84 .replace("--", u"\u2013")\
85 .replace(",,", u"\u201E")\
86 .replace('"', u"\u201D")\
87 .replace("'", u"\u2019")
# 'uwaga' and 'extra' elements are special-cased (branch body not visible).
88 if node.tag in ('uwaga', 'extra'):
# Apply the substitutions to this node's own text and tail.
92 node.text = replace_chars(node.text)
93 node.tail = replace_chars(node.tail)
# Recursive call for a child element.
95 replace_characters(child)
# find_annotations: walks `source` recursively and collects copies of the
# annotation elements into the `annotations` container.
#   annotations -- element that receives the collected copies (appended to)
#   source      -- element whose descendants are scanned
#   part_no     -- chunk number, stored on every copy as its 'part' attribute
# NOTE(review): the loop header and several statements of this function are
# missing from this extract (the embedded line numbers jump 98->100, 104->106
# and 106->111); only the visible lines are described.
98 def find_annotations(annotations, source, part_no):
# 'pe', 'pa', 'pt' and 'pr' are the tags handled as annotations.
100 if child.tag in ('pe', 'pa', 'pt', 'pr'):
# The copy, not the original element, goes into the annotations container.
101 annotation = deepcopy(child)
# Numbers are 1-based and continue across calls, since they are derived from
# how many annotations have been collected so far.
102 number = str(len(annotations)+1)
103 annotation.set('number', number)
104 annotation.set('part', str(part_no))
106 annotations.append(annotation)
# Do not descend into 'extra' and 'uwaga' elements.
111 if child.tag not in ('extra', 'uwaga'):
112 find_annotations(annotations, child, part_no)
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    # NOTE(review): a number of one-line statements (the `versify` header,
    # the `verses` initialiser, the empty-text guard, `if`/`else` headers and
    # the tail resets) were absent from the extracted source.  They are
    # restored here so that the class reproduces the doctest output above --
    # confirm against the upstream file.

    def __init__(self, stanza_elem):
        """ Remember the stanza element; no conversion happens yet. """
        self.stanza = stanza_elem
        # Verse elements built so far, in document order.
        self.verses = []
        # Verse currently receiving content; None until one is needed.
        self.open_verse = None

    def versify(self):
        """ Rebuild the stanza in place so that it contains only verses. """
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            # The tail belongs to the stanza level, so it may hold slashes.
            self.push_text(elem.tail)
        # clear() also wipes the stanza's own tail, hence save and restore it.
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        """ Start a new, empty `wers_normalny` verse and make it current. """
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """ Return the current verse, opening a normal one if none is open. """
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """ Add stanza-level text, starting a new verse after each slash. """
        if not text:
            return
        # A verse ends at a slash followed by optional whitespace and a newline.
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            if i:
                # Every fragment after the first one follows a verse ending.
                self.open_normal_verse()
            verse = self.get_open_verse()
            if len(verse):
                # Text after the last child element lives in that child's tail.
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        """ Add a child element of the stanza to the verse list. """
        if elem.tag.startswith("wers"):
            # An explicit verse element becomes a verse of its own and stays
            # open, so that following text is attached to it.
            verse = deepcopy(elem)
            # The tail is handled separately by push_text().
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Turn the '/' verse endings of every stanza below `tree` into verse elements.

    The stanzas are rewritten in place; nothing is returned.
    """
    for strofa in tree.findall('.//' + WLNS('strofa')):
        Stanza(strofa).versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new `item` element describes the chunk file `part<partno>.html`
    and is appended to `manifest` in place.
    """
    partstr = 'part%d' % partno
    # NOTE(review): the 'id' entry, the end of this call and the append were
    # absent from the extracted source.  The id is assumed to be `partstr`,
    # which is the value add_to_spine() uses as `idref` -- confirm upstream.
    e = manifest.makeelement(OPFNS('item'), attrib={
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new `itemref` element refers to the chunk `part<partno>` and is
    appended to `spine` in place.
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    # NOTE(review): this append was absent from the extracted source; without
    # it the element built above would never be used -- confirm upstream.
    spine.append(e)
class TOC(object):
    """ A node of the table-of-contents tree.

    Every node has a display `name`, the `part_href` of the file it points
    to, an optional `sub_number` (anchor inside that file) and a list of
    child nodes.  The root node only serves as a container for its children.
    """
    # NOTE(review): the class header, some method headers (`depth`, `href`,
    # `html`), a few initialisers and the `return` lines were absent from the
    # extracted source and are restored from the way the visible code uses
    # them -- confirm against the upstream file.

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Create an entry and attach it `level` steps below this node.

        With `level` > 0 the entry is delegated to the last child, as long
        as there is one; otherwise it becomes a direct child, inserted at
        `index` or appended.  `index` may only be combined with level 0.

        For entries that are not parts (`is_part` false) an anchor number
        is assigned and returned; for parts the result is None.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                # Computed after the new entry was attached, so the first
                # anchor of a parent is numbered 2.
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """ Attach a whole TOC node as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Take over the children of `toc` (but not `toc` itself). """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """ Return the link target, including the '#sub<n>' anchor if any. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Append navPoint elements for all descendants to `nav_map`.

        `counter` is the play order of the first entry written.  Returns
        the next unused counter value.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            # Descendants are nested inside their parent's navPoint.
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Return the entries below this node as indented HTML link lines. """
        # NOTE(review): names are interpolated without HTML escaping.
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """ Return the complete HTML table of contents page. """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns the set of all characters found in the text and tail of
    `element` and of every element below it.  Missing (None) text and
    tail count as empty.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # Grow the set in place instead of building a new one per child.
        chars.update(used_chars(child))
    # NOTE(review): this return was absent from the extracted source; callers
    # use the result as a set (`chars.union(...)`), so it is restored here.
    return chars
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator.  Every yielded value is the same `utwor` element with a
    single `master` child whose content is replaced between yields, so a
    chunk has to be processed before the next one is requested.  The
    children of `main_text` are deep-copied and never modified.
    """
    # NOTE(review): the `def` line, the flag initialisation and the `yield`
    # statements were absent from the extracted source; they are restored
    # from the visible structure (one chunk is emitted right before the
    # container is reset) -- confirm against the upstream file.

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    last_node_part = False

    # The check below is a workaround for a problem with epubs of dramas
    # without acts: such texts are split on scene headers instead.
    tags = [one_part.tag for one_part in main_text]
    if 'naglowek_scena' in tags and 'naglowek_akt' not in tags:
        chunk_starters = ("naglowek_scena",)
    else:
        chunk_starters = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            # A header directly after a part header stays in the same chunk.
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in chunk_starters:
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    # Whatever was collected after the last header forms the final chunk.
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml   -- chunk element; its first child holds the content
    chunk_no    -- number of the chunk, used in the `part<n>.html` links
    annotations -- container that receives the annotations found in the chunk
    empty       -- if true, the placeholder page is returned instead of the
                   transformed content
    _empty_html_static -- private cache of the placeholder page; the shared
                   mutable default is intentional, so the template is read
                   from disk only once per process
    """
    # NOTE(review): `toc = TOC()`, the `if empty:` / `else:` headers and
    # `chars = set()` were absent from the extracted source and are restored
    # from the way the visible lines use them -- confirm upstream.
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # Stored on the element as its 'sub' attribute.
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # Close the template file explicitly instead of leaking the handle.
            with open(get_resource('epub/emptyChunk.html')) as template:
                _empty_html_static.append(template.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
# transform: builds an EPUB archive for `wldoc` in a temporary file and returns
# OutputFile.from_filename() for that file.
# NOTE(review): this extract has lost its indentation and skips lines (the
# embedded original line numbers jump), so a number of `if`/`else` headers,
# counter updates and cleanup statements are not visible.  The comments below
# describe only what the visible lines do.
359 def transform(wldoc, verbose=False,
360 style=None, html_toc=False,
361 sample=None, cover=None, flags=None):
362 """ produces a EPUB file
364 sample=n: generate sample e-book (with at least n paragraphs)
365 cover: a cover.Cover factory or True for default
366 flags: less-advertising, without-fonts, working-copy, with-full-fonts
# Nested helper; it uses `zip`, `manifest`, `spine` and `annotations` from the
# enclosing function.  Returns the tuple (toc, chunk_counter, chars, sample).
369 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
370 """ processes one input file and proceeds to its children """
# Typographic normalization is applied to the whole document tree in place.
372 replace_characters(wldoc.edoc.getroot())
374 # every input file will have a TOC entry,
375 # pointing to starting chunk
376 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
379 # write book title page
380 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
381 chars = used_chars(html_tree.getroot())
382 zip.writestr('OPS/title.html',
383 etree.tostring(html_tree, method="html", pretty_print=True))
384 # add a title page TOC entry
385 toc.add(u"Strona tytułowa", "title.html")
386 elif wldoc.book_info.parts:
387 # write title page for every parent
388 if sample is not None and sample <= 0:
# NOTE(review): the file opened here is never closed explicitly.
390 html_string = open(get_resource('epub/emptyChunk.html')).read()
392 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
393 chars = used_chars(html_tree.getroot())
394 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
395 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
396 add_to_manifest(manifest, chunk_counter)
397 add_to_spine(spine, chunk_counter)
# Locate the main text: the second child of the root if there is one, else the
# first child.
400 if len(wldoc.edoc.getroot()) > 1:
401 # rdf before style master
402 main_text = wldoc.edoc.getroot()[1]
404 # rdf in style master
405 main_text = wldoc.edoc.getroot()[0]
406 if main_text.tag == RDFNS('RDF'):
# Each chunk becomes its own part<n>.html file with manifest and spine entries.
409 if main_text is not None:
410 for chunk_xml in chop(main_text):
# When a sample is requested, the remaining budget is reduced by the number of
# stanza/paragraph elements matched by the XPath expression.
412 if sample is not None:
416 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
417 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
419 toc.extend(chunk_toc)
420 chars = chars.union(chunk_chars)
421 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
422 add_to_manifest(manifest, chunk_counter)
423 add_to_spine(spine, chunk_counter)
# Recurse into child documents; their TOC becomes a subtree of this one, and
# the chunk counter and sample budget are threaded through the calls.
426 for child in wldoc.parts():
427 child_toc, chunk_counter, chunk_chars, sample = transform_file(
428 child, chunk_counter, first=False, sample=sample)
429 toc.append(child_toc)
430 chars = chars.union(chunk_chars)
432 return toc, chunk_counter, chars, sample
# Work on a deep copy: the attributes set on the root element below must not
# leak into the caller's document.
435 document = deepcopy(wldoc)
# A flag is recorded on the root element as an attribute with the value 'yes'.
440 document.edoc.getroot().set(flag, 'yes')
# Editors (sorted), funders and thanks are exposed as root attributes too.
443 document.edoc.getroot().set('editors', u', '.join(sorted(
444 editor.readable() for editor in document.editors())))
445 if document.book_info.funders:
446 document.edoc.getroot().set('funders', u', '.join(
447 document.book_info.funders))
448 if document.book_info.thanks:
449 document.edoc.getroot().set('thanks', document.book_info.thanks)
# Generate content.opf from the book metadata and keep handles to the sections
# that are filled in step by step below.
451 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
452 manifest = opf.find('.//' + OPFNS('manifest'))
453 guide = opf.find('.//' + OPFNS('guide'))
454 spine = opf.find('.//' + OPFNS('spine'))
# delete=False keeps the temporary file on disk; it is handed to the caller by
# name at the end.  Note that the local name `zip` shadows the builtin.
456 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
457 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
459 # write static elements
# The mimetype entry is written with ZIP_STORED (uncompressed), unlike the
# other entries, which use the archive's ZIP_DEFLATED default.
460 mime = zipfile.ZipInfo()
461 mime.filename = 'mimetype'
462 mime.compress_type = zipfile.ZIP_STORED
464 zip.writestr(mime, 'application/epub+zip')
465 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
466 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
467 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
468 'media-type="application/oebps-package+xml" />' \
469 '</rootfiles></container>')
470 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
471 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
# The stylesheet is stored as OPS/style.css; the bundled one is assigned here.
473 style = get_resource('epub/style.css')
474 zip.write(style, os.path.join('OPS', 'style.css'))
# Cover: render the cover image into memory and add it to the archive.
478 cover = DefaultEbookCover
480 cover_file = StringIO()
481 bound_cover = cover(document.book_info)
482 bound_cover.save(cover_file)
483 cover_name = 'cover.%s' % bound_cover.ext()
484 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
# The cover page template gets its <img> pointed at the generated image.
487 cover_tree = etree.parse(get_resource('epub/cover.html'))
488 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
489 zip.writestr('OPS/cover.html', etree.tostring(
490 cover_tree, method="html", pretty_print=True))
# Cover credits are passed on as data-* attributes of the root element.
492 if bound_cover.uses_dc_cover:
493 if document.book_info.cover_by:
494 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
495 if document.book_info.cover_source:
496 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
# Register the cover page and image in the manifest, spine, metadata and guide.
498 manifest.append(etree.fromstring(
499 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
500 manifest.append(etree.fromstring(
501 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
502 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
503 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
504 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
# Container that collects the annotations found in all chunks.
507 annotations = etree.Element('annotations')
# Skeleton of toc.ncx; its last child is used as the navigation map.
509 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
510 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
511 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
512 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
514 nav_map = toc_file[-1]
# Manifest, spine and guide entries for the HTML table of contents page.
517 manifest.append(etree.fromstring(
518 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
519 spine.append(etree.fromstring(
520 '<itemref idref="html_toc" />'))
521 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
# Convert the document and all of its parts.
523 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
# With fewer than two entries, add one that points at the first chunk.
525 if len(toc.children) < 2:
526 toc.add(u"Początek utworu", "part1.html")
528 # Last modifications in container files and EPUB creation
# The annotations page is only produced when annotations were collected.
529 if len(annotations) > 0:
530 toc.add("Przypisy", "annotations.html")
531 manifest.append(etree.fromstring(
532 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
533 spine.append(etree.fromstring(
534 '<itemref idref="annotations" />'))
535 replace_by_verse(annotations)
536 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
537 chars = chars.union(used_chars(html_tree.getroot()))
538 zip.writestr('OPS/annotations.html', etree.tostring(
539 html_tree, method="html", pretty_print=True))
# Static support page; its characters count towards the font subset as well.
541 toc.add("Wesprzyj Wolne Lektury", "support.html")
542 manifest.append(etree.fromstring(
543 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
544 spine.append(etree.fromstring(
545 '<itemref idref="support" />'))
# NOTE(review): the file opened here is never closed explicitly.
546 html_string = open(get_resource('epub/support.html')).read()
547 chars.update(used_chars(etree.fromstring(html_string)))
548 zip.writestr('OPS/support.html', html_string)
# Editorial page, generated from the document itself.
550 toc.add("Strona redakcyjna", "last.html")
551 manifest.append(etree.fromstring(
552 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
553 spine.append(etree.fromstring(
554 '<itemref idref="last" />'))
555 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
556 chars.update(used_chars(html_tree.getroot()))
557 zip.writestr('OPS/last.html', etree.tostring(
558 html_tree, method="html", pretty_print=True))
# Fonts are embedded unless the 'without-fonts' flag is given.
560 if not flags or not 'without-fonts' in flags:
562 tmpdir = mkdtemp('-librarian-epub')
# The working directory of the process is changed to the font-optimizer
# directory next to this module.
568 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
569 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
# Without 'with-full-fonts', subset.pl is run with the characters collected in
# `chars`, and its output in tmpdir is what gets packed.
570 if not flags or not 'with-full-fonts' in flags:
571 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
572 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
# Python 2 print statement.  The second check_call below redirects the tool's
# output to pipes, the first one does not.
574 print "Running font-optimizer"
575 subprocess.check_call(optimizer_call)
577 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
578 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
# Otherwise the complete font file is packed unchanged.
580 zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
581 manifest.append(etree.fromstring(
582 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
# content.opf is serialized only now, after all manifest/spine additions.
586 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
587 title = document.book_info.title
# One <meta> element per attribute name is added to the NCX head with the
# content '0'; the first two are then overwritten with the book URL and the
# TOC depth.
588 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
589 for st in attributes:
590 meta = toc_file.makeelement(NCXNS('meta'))
592 meta.set('content', '0')
593 toc_file[0].append(meta)
594 toc_file[0][0].set('content', str(document.book_info.url))
595 toc_file[0][1].set('content', str(toc.depth()))
# NOTE(review): the title is pasted into an XML string that set_inner_xml()
# parses, so a title containing '&' or '<' would presumably fail to parse --
# verify.
596 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
# The entry for the HTML contents page is inserted as the second TOC entry.
600 toc.add(u"Spis treści", "toc.html", index=1)
601 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
602 toc.write_to_xml(nav_map)
603 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
# The result is handed over by file name.
606 return OutputFile.from_filename(output_file.name)