# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from __future__ import with_statement

import os
import os.path
import subprocess
import zipfile
from StringIO import StringIO
from copy import deepcopy
from shutil import rmtree
from tempfile import mkdtemp, NamedTemporaryFile

from lxml import etree

from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
from librarian import functions, get_resource
from librarian.cover import WLCover
# Module-level side effect executed on import.
# NOTE(review): presumably registers a person-name helper used by the XSLT
# sheets loaded below -- confirm against librarian.functions.
functions.reg_person_name()
def inner_xml(node):
    """ Return node's text and children serialized as one string.

    The node's own tag is not part of the result. Each child is serialized
    with etree.tostring(), so (as the example shows) the text following a
    child is included as well.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # node.text is None when the element starts directly with a child
    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ Set node's text and children from a string of markup.

    `text` is parsed as the content of a temporary wrapper element, so it
    must be well-formed once wrapped (a parse error is raised otherwise).
    The node's previous text and children are replaced; its tag, attributes
    and tail are left alone.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # parse the fragment inside a throw-away wrapper to get a single root
    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text
    node[:] = p[:]
def node_name(node):
    """ Find out a node's name: its text content with all markup removed.

    Works on a deep copy, so `node` itself is not modified. Footnote
    elements (pe, pa, pt, pr) and 'motyw' elements are emptied first, so
    their content does not end up in the name; the text following them
    is kept.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    tempnode = deepcopy(node)

    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            # clear() also drops the tail, which belongs to the parent's
            # running text -- save and restore it
            t = e.tail
            e.clear()
            e.tail = t
    # dissolve every remaining tag, merging its text into the parent
    etree.strip_tags(tempnode, '*')
    return tempnode.text
def xslt(xml, sheet):
    """ Apply an XSLT stylesheet to a document and return the result tree.

    xml: an ElementTree, or a bare Element (wrapped in an ElementTree first)
    sheet: path to the stylesheet file
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
def replace_characters(node):
    """ Apply typographic replacements to the whole subtree, in place.

    Removes U+FEFF and turns '---', '--', ',,', '"' and "'" into an em dash,
    an en dash, a low opening quote, a closing quote and a typographic
    apostrophe. 'uwaga' and 'extra' elements are emptied (content and
    attributes dropped); only the text following them is kept.
    """
    def replace_chars(text):
        if text is None:
            return None
        # order matters: '---' has to be handled before '--'
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag in ('uwaga', 'extra'):
        # clear() would drop the tail too -- save and restore it
        t = node.tail
        node.clear()
        node.tail = t
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """ Move footnotes out of `source` into `annotations`.

    Walks the subtree of `source`. A copy of every pe/pa/pt/pr element is
    appended to `annotations` with two extra attributes: 'number' (its
    1-based position in `annotations`) and 'part' (`part_no`). The original
    element is emptied and its text replaced by that number. Subtrees of
    'extra' and 'uwaga' elements are not searched.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations)+1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # the copy carries the running text that followed the footnote
            annotation.tail = ''
            annotations.append(annotation)
            # leave only the reference number in place of the footnote,
            # keeping the text that follows it
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
def _split_into_verses(markup, keep_prefixes):
    """ Wrap the '/'-separated fragments of `markup` in <wers_normalny>.

    Returns the rebuilt markup, or None when `markup` contains no
    slash-newline separator and should be left untouched. Fragments that
    start with one of `keep_prefixes` (a tuple of strings) are kept as-is.
    """
    verses = markup.split('/\n')
    if len(verses) < 2:
        return None
    return ''.join(
        verse if verse.startswith(keep_prefixes)
        else ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
        for verse in verses)


def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Every 'strofa' element below `tree` is rewritten in place: its inner
    markup is split on a slash followed by a newline and each fragment is
    wrapped in <wers_normalny>, unless it already starts with '<wers' or
    '<extra'. 'slowo_obce' and 'wyroznienie' children of a stanza are
    processed the same way first (there only '<wers' fragments are kept
    unwrapped). Stanzas without any separator are left unchanged.
    """
    for node in tree.findall('.//' + WLNS('strofa')):
        for child_node in node:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                new_foreign = _split_into_verses(inner_xml(child_node), ('<wers',))
                if new_foreign is not None:
                    set_inner_xml(child_node, new_foreign)
        # done after the children, so their new verses are already in place
        modified_inner_xml = _split_into_verses(inner_xml(node), ('<wers', '<extra'))
        if modified_inner_xml is not None:
            set_inner_xml(node, modified_inner_xml)
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new 'item' element describes the file part<partno>.html and has the
    id part<partno>, the same identifier add_to_spine() refers to.
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new 'itemref' element points (via idref) at the item part<partno>.
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
class TOC(object):
    """ A node of the table-of-contents tree.

    Every node has a name, the href of the file it lives in and, for
    entries that point inside a file, a sub_number used as a '#sub<n>'
    anchor. The root node only groups its children.
    """

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Add a new entry `level` levels below this node.

        With level > 0 the entry goes under the most recently added child
        (recursively); if there are no children yet, it is added here.
        `index` inserts at that position instead of appending and may only
        be used with level 0. For is_part=False the entry gets a sub_number,
        which is returned; otherwise None is returned.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                # computed after the entry was added, so numbering starts at 2
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """ Add another TOC node as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Add all children of another TOC node as own children. """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """ Return the link target: the part file plus an optional anchor. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Append navPoint elements for all descendants to `nav_map`.

        `counter` is the id / playOrder of the first navPoint written;
        the first unused value is returned.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            # nested entries go inside this navPoint and continue the count
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Return the descendants as HTML divs, indented by nesting level. """
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """ Return the HTML TOC page: the epub/toc.html template filled in. """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns the set of characters found in the text and tail of `element`
    and of all its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # grow the one set in place instead of building a new one per child
        chars.update(used_chars(child))
    return chars
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. A new chunk starts at every 'naglowek_czesc' and at every
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' that does not directly
    follow a 'naglowek_czesc'. Each chunk is an 'utwor' element with a single
    'master' child holding deep copies of the chunk's elements.

    The same container object is yielded every time and refilled afterwards,
    so a chunk has to be processed before the next one is requested. The
    content preceding the first header is yielded too, even when empty.
    """
    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            # a chapter header right after a part header stays in its chunk
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk container whose first child holds the content
    chunk_no: number of the chunk; its file is part<chunk_no>.html
    annotations: element collecting footnotes found in the chunk
    empty: if true, the chunk's content is not transformed; the
        epub/emptyChunk.html template is returned with an empty character
        set (the TOC entries are still created)
    _empty_html_static: internal; the mutable default is intentional and
        caches the template between calls
    """
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # remember the number on the element as well
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # read the template once and close the file right away
            with open(get_resource('epub/emptyChunk.html')) as empty_chunk:
                _empty_html_static.append(empty_chunk.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
def transform(wldoc, verbose=False,
              style=None, html_toc=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    wldoc: the source document; the function works on a deep copy of it
    verbose: show the output of the font optimizer instead of discarding it
    style: path of the CSS file stored as OPS/style.css
        (default: the epub/style.css resource)
    html_toc: also generate an HTML table of contents (OPS/toc.html)
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy

    Returns an OutputFile created from the name of a temporary .epub file
    (the file is not deleted automatically).
    """

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns the file's TOC, the next free chunk number, the set of
        characters used and the remaining sample budget.
        """
        replace_characters(wldoc.edoc.getroot())

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            epub_zip.writestr(
                'OPS/title.html',
                etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                # sample budget used up: emit a placeholder page
                chars = set()
                with open(get_resource('epub/emptyChunk.html')) as empty_chunk:
                    html_string = empty_chunk.read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            epub_zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                # the only child is the metadata: there is no text
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(
                    chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                epub_zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    document = deepcopy(wldoc)

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    # add editors info
    document.edoc.getroot().set('editors', u', '.join(sorted(
        editor.readable() for editor in document.editors())))

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    # not called `zip`, so the builtin is not shadowed
    epub_zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # the mimetype entry is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    epub_zip.writestr(mime, 'application/epub+zip')
    epub_zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                      'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                      '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                      'media-type="application/oebps-package+xml" />' \
                      '</rootfiles></container>')
    epub_zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    epub_zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    epub_zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = WLCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        epub_zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        epub_zip.writestr('OPS/cover.html', etree.tostring(
            cover_tree, method="html", pretty_print=True))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

    # filled by transform_file() / transform_chunk() through the closure
    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                                '</navMap></ncx>')
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        epub_zip.writestr('OPS/annotations.html', etree.tostring(
            html_tree, method="html", pretty_print=True))

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    epub_zip.writestr('OPS/last.html', etree.tostring(
        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts: embed only the glyphs for characters actually used
        tmpdir = mkdtemp('-librarian-epub')
        # subset.pl is run from its own directory; passing cwd= to the child
        # leaves the working directory of this process untouched
        optimizer_dir = os.path.join(
            os.path.dirname(os.path.realpath(__file__)), 'font-optimizer')
        try:
            # check_call() never reads from pipes, so PIPE could block the
            # child on a full buffer -- discard quiet output via devnull
            with open(os.devnull, 'wb') as devnull:
                for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
                              'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
                    optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
                                      get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                    if verbose:
                        print("Running font-optimizer")
                        subprocess.check_call(optimizer_call, cwd=optimizer_dir)
                    else:
                        subprocess.check_call(optimizer_call, cwd=optimizer_dir,
                                              stdout=devnull, stderr=devnull)
                    epub_zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
                    manifest.append(etree.fromstring(
                        '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
        finally:
            # remove the temporary fonts even if the optimizer failed
            rmtree(tmpdir)

    epub_zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    # fill in the first two metas (uid and depth); the others stay at '0'
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        epub_zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    epub_zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    epub_zip.close()
    # ZipFile.close() leaves the underlying file object open; close it so
    # everything is on disk before the file is handed over by name
    output_file.close()

    return OutputFile.from_filename(output_file.name)