1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 functions.reg_person_name()
def inner_xml(node):
    """Return the node's text and its serialized children as one string.

    The opening and closing tags of `node` itself are not included.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    parts = [node.text if node.text is not None else '']
    parts.extend(etree.tostring(child) for child in node)
    return ''.join(parts)
def set_inner_xml(node, text):
    """Replace the node's text and children with the content parsed from `text`.

    `text` is wrapped in a temporary element and parsed as XML, so it has to
    be a well-formed fragment (special characters already escaped).

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    wrapper = etree.fromstring('<x>%s</x>' % text)
    # Move both the leading text and the child elements over to `node`.
    node.text = wrapper.text
    node[:] = wrapper[:]
def node_name(node):
    """Find out a node's name: its text with all markup stripped.

    The content of annotation ('pe', 'pa', 'pt', 'pr') and 'motyw' elements
    is dropped; the text following them is kept.  `node` is not modified,
    the work is done on a deep copy.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    tempnode = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for elem in tempnode.findall('.//%s' % tag):
            # clear() also resets the tail, so keep it across the call.
            # NOTE(review): these three statements were missing from the
            # reviewed listing and were restored from context - confirm.
            tail = elem.tail
            elem.clear()
            elem.tail = tail
    # Merge the text of all remaining descendants into tempnode.text.
    etree.strip_tags(tempnode, '*')
    return tempnode.text
def xslt(xml, sheet):
    """Apply the XSLT stylesheet stored at path `sheet` to `xml`.

    `xml` may be an element or an element tree; a bare element is wrapped
    in a tree first.  Returns whatever the tree's `xslt()` call returns.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
def replace_characters(node):
    """Apply typographic replacements to the text of `node` and its subtree.

    Dashes, quotes and apostrophes are replaced with their typographic
    counterparts and byte order marks are removed.  'uwaga' and 'extra'
    elements are emptied (their tail text is kept).  Works in place.
    """
    def replace_chars(text):
        # Elements without text or tail have None there; leave it alone.
        if text is None:
            return None
        # Order matters: '---' has to be handled before '--'.
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag in ('uwaga', 'extra'):
        # clear() also resets the tail, so keep it across the call.
        tail = node.tail
        node.clear()
        node.tail = tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """Collect annotations from `source` and replace them with their numbers.

    Every 'pe', 'pa', 'pt' or 'pr' element found anywhere below `source`
    (outside of 'extra' and 'uwaga' elements) is copied into `annotations`
    with 'number' (1-based position in `annotations`) and 'part' (`part_no`)
    attributes set.

    NOTE(review): the statements that blank the copy's tail and replace the
    original element's content with the number were missing from the
    reviewed listing and were restored from context - confirm.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations) + 1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # The text following the annotation belongs to the main text.
            annotation.tail = ''
            annotations.append(annotation)
            # Leave only the annotation number in place of its content;
            # clear() also resets the tail, so keep it across the call.
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """

    # A verse ends with a slash, optional whitespace and a newline.
    _VERSE_END = re.compile(r"/\s*\n")

    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        # Verse elements built so far, in document order.
        self.verses = []
        # The verse that currently receives text and subelements.
        self.open_verse = None

    def versify(self):
        """Rebuild the stanza in place as a sequence of verse elements."""
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        # clear() also resets the tail, so keep it across the call.
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        """Start a new 'wers_normalny' verse and make it the open one."""
        # An explicit attrib dict keeps the call valid for both lxml and
        # xml.etree elements.
        self.open_verse = self.stanza.makeelement("wers_normalny", {})
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """Return the open verse, starting a normal one if there is none."""
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """Add text to the open verse, starting a new verse at each ending."""
        if not text:
            return
        for i, verse_text in enumerate(self._VERSE_END.split(text)):
            if i:
                # Every piece after the first one follows a verse ending.
                self.open_normal_verse()
            verse = self.get_open_verse()
            if len(verse):
                # Text after the last subelement goes into its tail.
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        """Add a copy of a stanza subelement.

        Elements whose tag starts with "wers" become verses on their own;
        anything else is put inside the open verse.  The tail is dropped
        from the copy, as versify() pushes it separately as text.
        """
        if elem.tag.startswith("wers"):
            verse = deepcopy(elem)
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Rebuild every stanza below `tree`, turning '/' verse endings into verse elements """
    for stanza_elem in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza_elem).versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The item describes the chunk file 'part<partno>.html'.
    """
    partstr = 'part%d' % partno
    item = manifest.makeelement(OPFNS('item'), attrib={
        # NOTE(review): the 'id' entry was missing from the reviewed
        # listing; restored to match the idref used by add_to_spine().
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(item)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The itemref points at the manifest item 'part<partno>'.
    """
    itemref = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(itemref)
class TOC(object):
    """A node of the table of contents tree.

    Each node has a name, the href of the file it points to, an optional
    anchor number within that file and a list of child nodes.
    """

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        # Set only for entries pointing inside a file (see add()).
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """Add an entry `level` levels below this node.

        The entry goes under the last child on each level; if a level has
        no children yet, it is added right there instead.  `index` (allowed
        only with level 0) inserts at the given position instead of
        appending.

        For entries that are not parts (`is_part` false) an anchor number
        is assigned and returned; otherwise None is returned.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)

        t = TOC(name)
        t.part_href = part_href
        if index is not None:
            self.children.insert(index, t)
        else:
            self.children.append(t)
        if not is_part:
            # Computed after the entry has been added to the list.
            t.sub_number = len(self.children) + 1
            return t.sub_number
        return None

    def append(self, toc):
        """Add the given TOC node as the last child."""
        self.children.append(toc)

    def extend(self, toc):
        """Add all children of the given TOC node as children of this one."""
        self.children.extend(toc.children)

    def depth(self):
        """Return the number of levels below this node (0 for a leaf)."""
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """Return the link target: the file href plus '#sub<n>' if set."""
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """Write the children as nested navPoint elements under `nav_map`.

        `counter` numbers the navPoints (id and playOrder) in document
        order; the next free number is returned.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """Return the entries below this node as HTML, one div per entry.

        Entries are indented by `depth` em per level.  Names and hrefs are
        inserted as they are, without escaping.
        """
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """Return the HTML table of contents page.

        The 'epub/toc.html' resource is used as a %-format template for
        the output of html_part().
        """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns the set of characters found in the text and tail of the
    element and, recursively, of all its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # Grow one set in place instead of building a new one per child.
        chars.update(used_chars(child))
    return chars
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator.  Children of `main_text` are copied into an 'utwor/master'
    container, which is yielded every time a new chunk begins: at every
    'naglowek_czesc', and at 'naglowek_rozdzial', 'naglowek_akt' or
    'srodtytul' unless it directly follows a 'naglowek_czesc'.

    The same container object is yielded each time and refilled
    afterwards, so a chunk has to be processed before asking for the next
    one.  The first chunk is empty if `main_text` starts with a header.

    NOTE(review): the yield statements were missing from the reviewed
    listing and were restored from context - confirm.
    """
    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    # True right after a part header, so that a chapter header following
    # it stays in the same chunk.
    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    Headers found in the chunk get TOC entries pointing at
    'part<chunk_no>.html'; subchapter and scene headers also get a 'sub'
    attribute with their anchor number.

    With `empty` set, the chunk content is not transformed: the
    'epub/emptyChunk.html' resource is returned instead, with an empty
    character set.  Otherwise annotations are moved to `annotations`,
    stanzas are split into verses and the chunk goes through
    'epub/xsltScheme.xsl'.

    `_empty_html_static` is not meant to be passed by callers: the mutable
    default is used on purpose, as a cache of the empty chunk HTML shared
    between calls.
    """
    toc = TOC()
    part_href = "part%d.html" % chunk_no
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), part_href)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), part_href, level=1, is_part=False)
            element.set('sub', str(subnumber))

    if empty:
        if not _empty_html_static:
            # Read the resource once and close the file right away.
            with open(get_resource('epub/emptyChunk.html')) as f:
                _empty_html_static.append(f.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
def _embed_fonts(epub, manifest, chars, flags, verbose):
    """Add the DejaVu Serif fonts to the archive and to the manifest.

    Unless the 'with-full-fonts' flag is set, every font is first reduced
    to the characters in `chars` with the font-optimizer Perl script.
    """
    tmpdir = mkdtemp('-librarian-epub')
    try:
        cwd = os.getcwd()
    except OSError:
        cwd = None

    try:
        # The optimizer script is run from its own directory.
        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            if not flags or 'with-full-fonts' not in flags:
                optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                if verbose:
                    print("Running font-optimizer")
                    subprocess.check_call(optimizer_call)
                else:
                    # Discard the output.  Pipes must not be used here:
                    # nobody reads them, so the child could block once
                    # a pipe buffer fills up.
                    with open(os.devnull, 'w') as devnull:
                        subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
                epub.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
            else:
                epub.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
    finally:
        # Clean up and go back even if the optimizer fails.
        rmtree(tmpdir)
        if cwd is not None:
            os.chdir(cwd)


def transform(wldoc, verbose=False,
              style=None, html_toc=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    wldoc: the document to convert; it is deep-copied, not modified
    verbose: show the output of the font optimizer
    style: path to a CSS file to use instead of the default one
    html_toc: add a HTML table of contents page
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy, with-full-fonts

    Returns an OutputFile created from the name of the temporary .epub file.

    NOTE(review): a number of statements (mostly branch headers and
    counter increments) were missing from the reviewed listing and were
    restored from context - confirm against version control.
    """
    from xml.sax.saxutils import escape

    def read_resource(path):
        """ reads a whole resource file, closing it afterwards """
        with open(get_resource(path)) as f:
            return f.read()

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns the file's TOC, the next free chunk number, the set of
        characters used and the remaining sample paragraph allowance.
        """

        replace_characters(wldoc.edoc.getroot())

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub.writestr('OPS/title.html',
                          etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                # the sample is complete: leave the page empty
                chars = set()
                html_string = read_resource('epub/emptyChunk.html')
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            epub.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        # enough paragraphs already: blank the rest
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    # work on a copy, the transformation modifies the document
    document = deepcopy(wldoc)
    del wldoc

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    # add editors info
    document.edoc.getroot().set('editors', u', '.join(sorted(
        editor.readable() for editor in document.editors())))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    epub = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # the mimetype entry is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    epub.writestr(mime, 'application/epub+zip')
    epub.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" '
                  'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
                  '<rootfiles><rootfile full-path="OPS/content.opf" '
                  'media-type="application/oebps-package+xml" />'
                  '</rootfiles></container>')
    epub.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    epub.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    epub.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = DefaultEbookCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        epub.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        epub.writestr('OPS/cover.html', etree.tostring(
            cover_tree, method="html", pretty_print=True))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
                                'version="2005-1"><head></head><docTitle></docTitle><navMap>'
                                '</navMap></ncx>')
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        epub.writestr('OPS/annotations.html', etree.tostring(
            html_tree, method="html", pretty_print=True))

    toc.add("Wesprzyj Wolne Lektury", "support.html")
    manifest.append(etree.fromstring(
        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="support" />'))
    html_string = read_resource('epub/support.html')
    chars.update(used_chars(etree.fromstring(html_string)))
    epub.writestr('OPS/support.html', html_string)

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    epub.writestr('OPS/last.html', etree.tostring(
        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        _embed_fonts(epub, manifest, chars, flags, verbose)

    epub.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    # The title is parsed as XML here, so it has to be escaped first.
    set_inner_xml(toc_file[1], ''.join(('<text>', escape(title), '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        epub.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    epub.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    epub.close()
    # ZipFile does not close a file object it was given; close it here so
    # that everything is on disk before the file is reopened by name.
    output_file.close()

    return OutputFile.from_filename(output_file.name)