1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
# Import-time side effect: these two registration hooks from
# librarian.functions run once, when this module is loaded.
# NOTE(review): presumably they register the person-name and 3-to-2 letter
# language-code helpers as extension functions for the XSLT sheets used
# in this module -- confirm in librarian.functions.
functions.reg_person_name()
functions.reg_lang_code_3to2()
29 """ returns node's text and children as a string
31 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
35 nt = node.text if node.text is not None else ''
36 return ''.join([nt] + [etree.tostring(child) for child in node])
38 def set_inner_xml(node, text):
39 """ sets node's text and children from a string
41 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
42 >>> set_inner_xml(e, 'x<b>y</b>z')
43 >>> print etree.tostring(e)
47 p = etree.fromstring('<x>%s</x>' % text)
53 """ Find out a node's name
55 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
59 tempnode = deepcopy(node)
61 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
62 for e in tempnode.findall('.//%s' % p):
66 etree.strip_tags(tempnode, '*')
71 if isinstance(xml, etree._Element):
72 xml = etree.ElementTree(xml)
73 with open(sheet) as xsltf:
74 return xml.xslt(etree.parse(xsltf))
77 def replace_characters(node):
78 def replace_chars(text):
81 return text.replace(u"\ufeff", u"")\
82 .replace("---", u"\u2014")\
83 .replace("--", u"\u2013")\
84 .replace(",,", u"\u201E")\
85 .replace('"', u"\u201D")\
86 .replace("'", u"\u2019")
87 if node.tag in ('uwaga', 'extra'):
91 node.text = replace_chars(node.text)
92 node.tail = replace_chars(node.tail)
94 replace_characters(child)
97 def find_annotations(annotations, source, part_no):
99 if child.tag in ('pe', 'pa', 'pt', 'pr'):
100 annotation = deepcopy(child)
101 number = str(len(annotations)+1)
102 annotation.set('number', number)
103 annotation.set('part', str(part_no))
105 annotations.append(annotation)
110 if child.tag not in ('extra', 'uwaga'):
111 find_annotations(annotations, child, part_no)
114 class Stanza(object):
116 Converts / verse endings into verse elements in a stanza.
118 Slashes may only occur directly in the stanza. Any slashes in subelements
119 will be ignored, and the subelements will be put inside verse elements.
121 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
122 >>> Stanza(s).versify()
123 >>> print etree.tostring(s)
124 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
125 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
128 def __init__(self, stanza_elem):
129 self.stanza = stanza_elem
131 self.open_verse = None
134 self.push_text(self.stanza.text)
135 for elem in self.stanza:
137 self.push_text(elem.tail)
138 tail = self.stanza.tail
140 self.stanza.tail = tail
141 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """Start a new ``wers_normalny`` verse and make it the open verse.

    The element is created through the stanza's own ``makeelement`` and is
    recorded both as ``self.open_verse`` and as the last item of
    ``self.verses``.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """Return the verse currently being filled.

    When no verse is open yet, a normal verse is opened first via
    ``open_normal_verse`` and that one is returned.
    """
    current = self.open_verse
    if current is not None:
        return current
    self.open_normal_verse()
    return self.open_verse
152 def push_text(self, text):
155 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
157 self.open_normal_verse()
158 verse = self.get_open_verse()
160 verse[-1].tail = (verse[-1].tail or "") + verse_text
162 verse.text = (verse.text or "") + verse_text
164 def push_elem(self, elem):
165 if elem.tag.startswith("wers"):
166 verse = deepcopy(elem)
168 self.verses.append(verse)
169 self.open_verse = verse
171 appended = deepcopy(elem)
173 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Versify every stanza found below `tree`.

    Each descendant `strofa` element (looked up with the WLNS-qualified
    name) is wrapped in a Stanza and versified in place.
    """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
184 def add_to_manifest(manifest, partno):
185 """ Adds a node to the manifest section in content.opf file """
187 partstr = 'part%d' % partno
188 e = manifest.makeelement(OPFNS('item'), attrib={
190 'href': partstr + '.html',
191 'media-type': 'application/xhtml+xml',
196 def add_to_spine(spine, partno):
197 """ Adds a node to the spine section in content.opf file """
199 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
204 def __init__(self, name=None, part_href=None):
207 self.part_href = part_href
208 self.sub_number = None
210 def add(self, name, part_href, level=0, is_part=True, index=None):
211 assert level == 0 or index is None
212 if level > 0 and self.children:
213 return self.children[-1].add(name, part_href, level-1, is_part)
216 t.part_href = part_href
217 if index is not None:
218 self.children.insert(index, t)
220 self.children.append(t)
222 t.sub_number = len(self.children) + 1
def append(self, toc):
    """Attach ``toc`` as the last child entry of this TOC."""
    entries = self.children
    entries.append(toc)
def extend(self, toc):
    """Add every child entry of ``toc`` to the end of this TOC's children.

    Only the children are taken over; ``toc`` itself is not added.
    """
    incoming = toc.children
    self.children.extend(incoming)
233 return max((c.depth() for c in self.children)) + 1
239 if self.sub_number is not None:
240 src += '#sub%d' % self.sub_number
243 def write_to_xml(self, nav_map, counter=1):
244 for child in self.children:
245 nav_point = nav_map.makeelement(NCXNS('navPoint'))
246 nav_point.set('id', 'NavPoint-%d' % counter)
247 nav_point.set('playOrder', str(counter))
249 nav_label = nav_map.makeelement(NCXNS('navLabel'))
250 text = nav_map.makeelement(NCXNS('text'))
251 text.text = child.name
252 nav_label.append(text)
253 nav_point.append(nav_label)
255 content = nav_map.makeelement(NCXNS('content'))
256 content.set('src', child.href())
257 nav_point.append(content)
258 nav_map.append(nav_point)
259 counter = child.write_to_xml(nav_point, counter + 1)
262 def html_part(self, depth=0):
264 for child in self.children:
266 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
267 (depth, child.href(), child.name))
268 texts.append(child.html_part(depth+1))
269 return "\n".join(texts)
272 with open(get_resource('epub/toc.html')) as f:
273 t = unicode(f.read(), 'utf-8')
274 return t % self.html_part()
277 def used_chars(element):
278 """ Lists characters used in an ETree Element """
279 chars = set((element.text or '') + (element.tail or ''))
280 for child in element:
281 chars = chars.union(used_chars(child))
286 """ divide main content of the XML file into chunks """
288 # prepare a container for each chunk
289 part_xml = etree.Element('utwor')
290 etree.SubElement(part_xml, 'master')
291 main_xml_part = part_xml[0] # master
293 last_node_part = False
# The loops below are a workaround for a problem with EPUBs generated for dramas without acts.
297 for one_part in main_text:
299 if name in ('naglowek_scena'):
303 is_scene_with_acts = False
304 for one_part in main_text:
305 if one_part.tag == 'naglowek_akt':
306 is_scene_with_acts = True
309 is_scene_with_acts = False
311 for one_part in main_text:
313 if is_scene_with_acts is False and is_scene is True:
314 if name == 'naglowek_czesc':
316 last_node_part = True
317 main_xml_part[:] = [deepcopy(one_part)]
318 elif not last_node_part and name in ("naglowek_scena"):
320 main_xml_part[:] = [deepcopy(one_part)]
322 main_xml_part.append(deepcopy(one_part))
323 last_node_part = False
325 if name == 'naglowek_czesc':
327 last_node_part = True
328 main_xml_part[:] = [deepcopy(one_part)]
329 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
331 main_xml_part[:] = [deepcopy(one_part)]
333 main_xml_part.append(deepcopy(one_part))
334 last_node_part = False
338 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
339 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
342 for element in chunk_xml[0]:
343 if element.tag in ("naglowek_czesc"):
344 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
345 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
346 toc.add(node_name(element), "part%d.html" % chunk_no)
347 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
348 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
349 element.set('sub', str(subnumber))
351 if not _empty_html_static:
352 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
354 output_html = _empty_html_static[0]
356 find_annotations(annotations, chunk_xml, chunk_no)
357 replace_by_verse(chunk_xml)
358 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
359 chars = used_chars(html_tree.getroot())
360 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
361 return output_html, toc, chars
364 def transform(wldoc, verbose=False,
365 style=None, html_toc=False,
366 sample=None, cover=None, flags=None):
367 """ produces a EPUB file
369 sample=n: generate sample e-book (with at least n paragraphs)
370 cover: a cover.Cover factory or True for default
371 flags: less-advertising, without-fonts, working-copy, with-full-fonts
374 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
375 """ processes one input file and proceeds to its children """
377 replace_characters(wldoc.edoc.getroot())
379 # every input file will have a TOC entry,
380 # pointing to starting chunk
381 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
384 # write book title page
385 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
386 chars = used_chars(html_tree.getroot())
387 zip.writestr('OPS/title.html',
388 etree.tostring(html_tree, method="html", pretty_print=True))
389 # add a title page TOC entry
390 toc.add(u"Strona tytułowa", "title.html")
391 elif wldoc.book_info.parts:
392 # write title page for every parent
393 if sample is not None and sample <= 0:
395 html_string = open(get_resource('epub/emptyChunk.html')).read()
397 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
398 chars = used_chars(html_tree.getroot())
399 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
400 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
401 add_to_manifest(manifest, chunk_counter)
402 add_to_spine(spine, chunk_counter)
405 if len(wldoc.edoc.getroot()) > 1:
406 # rdf before style master
407 main_text = wldoc.edoc.getroot()[1]
409 # rdf in style master
410 main_text = wldoc.edoc.getroot()[0]
411 if main_text.tag == RDFNS('RDF'):
414 if main_text is not None:
415 for chunk_xml in chop(main_text):
417 if sample is not None:
421 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
422 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
424 toc.extend(chunk_toc)
425 chars = chars.union(chunk_chars)
426 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
427 add_to_manifest(manifest, chunk_counter)
428 add_to_spine(spine, chunk_counter)
431 for child in wldoc.parts():
432 child_toc, chunk_counter, chunk_chars, sample = transform_file(
433 child, chunk_counter, first=False, sample=sample)
434 toc.append(child_toc)
435 chars = chars.union(chunk_chars)
437 return toc, chunk_counter, chars, sample
440 document = deepcopy(wldoc)
445 document.edoc.getroot().set(flag, 'yes')
448 document.edoc.getroot().set('editors', u', '.join(sorted(
449 editor.readable() for editor in document.editors())))
450 if document.book_info.funders:
451 document.edoc.getroot().set('funders', u', '.join(
452 document.book_info.funders))
453 if document.book_info.thanks:
454 document.edoc.getroot().set('thanks', document.book_info.thanks)
456 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
457 manifest = opf.find('.//' + OPFNS('manifest'))
458 guide = opf.find('.//' + OPFNS('guide'))
459 spine = opf.find('.//' + OPFNS('spine'))
461 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
462 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
464 # write static elements
465 mime = zipfile.ZipInfo()
466 mime.filename = 'mimetype'
467 mime.compress_type = zipfile.ZIP_STORED
469 zip.writestr(mime, 'application/epub+zip')
470 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
471 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
472 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
473 'media-type="application/oebps-package+xml" />' \
474 '</rootfiles></container>')
475 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
476 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
478 style = get_resource('epub/style.css')
479 zip.write(style, os.path.join('OPS', 'style.css'))
483 cover = DefaultEbookCover
485 cover_file = StringIO()
486 bound_cover = cover(document.book_info)
487 bound_cover.save(cover_file)
488 cover_name = 'cover.%s' % bound_cover.ext()
489 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
492 cover_tree = etree.parse(get_resource('epub/cover.html'))
493 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
494 zip.writestr('OPS/cover.html', etree.tostring(
495 cover_tree, method="html", pretty_print=True))
497 if bound_cover.uses_dc_cover:
498 if document.book_info.cover_by:
499 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
500 if document.book_info.cover_source:
501 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
503 manifest.append(etree.fromstring(
504 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
505 manifest.append(etree.fromstring(
506 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
507 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
508 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
509 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
512 annotations = etree.Element('annotations')
514 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
515 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
516 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
517 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
519 nav_map = toc_file[-1]
522 manifest.append(etree.fromstring(
523 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
524 spine.append(etree.fromstring(
525 '<itemref idref="html_toc" />'))
526 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
528 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
530 if len(toc.children) < 2:
531 toc.add(u"Początek utworu", "part1.html")
533 # Last modifications in container files and EPUB creation
534 if len(annotations) > 0:
535 toc.add("Przypisy", "annotations.html")
536 manifest.append(etree.fromstring(
537 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
538 spine.append(etree.fromstring(
539 '<itemref idref="annotations" />'))
540 replace_by_verse(annotations)
541 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
542 chars = chars.union(used_chars(html_tree.getroot()))
543 zip.writestr('OPS/annotations.html', etree.tostring(
544 html_tree, method="html", pretty_print=True))
546 toc.add("Wesprzyj Wolne Lektury", "support.html")
547 manifest.append(etree.fromstring(
548 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
549 spine.append(etree.fromstring(
550 '<itemref idref="support" />'))
551 html_string = open(get_resource('epub/support.html')).read()
552 chars.update(used_chars(etree.fromstring(html_string)))
553 zip.writestr('OPS/support.html', html_string)
555 toc.add("Strona redakcyjna", "last.html")
556 manifest.append(etree.fromstring(
557 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
558 spine.append(etree.fromstring(
559 '<itemref idref="last" />'))
560 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
561 chars.update(used_chars(html_tree.getroot()))
562 zip.writestr('OPS/last.html', etree.tostring(
563 html_tree, method="html", pretty_print=True))
565 if not flags or not 'without-fonts' in flags:
567 tmpdir = mkdtemp('-librarian-epub')
573 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
574 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
575 if not flags or not 'with-full-fonts' in flags:
576 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
577 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
579 print "Running font-optimizer"
580 subprocess.check_call(optimizer_call)
582 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
583 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
585 zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
586 manifest.append(etree.fromstring(
587 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
591 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
592 title = document.book_info.title
593 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
594 for st in attributes:
595 meta = toc_file.makeelement(NCXNS('meta'))
597 meta.set('content', '0')
598 toc_file[0].append(meta)
599 toc_file[0][0].set('content', str(document.book_info.url))
600 toc_file[0][1].set('content', str(toc.depth()))
601 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
605 toc.add(u"Spis treści", "toc.html", index=1)
606 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
607 toc.write_to_xml(nav_map)
608 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
611 return OutputFile.from_filename(output_file.name)