1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
15 from tempfile import mkdtemp, NamedTemporaryFile
16 from shutil import rmtree
18 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian import functions, get_resource
22 functions.reg_person_name()
def inner_xml(node):
    """Return the node's text and child elements serialised as one string.

    The node's own start and end tags are not part of the result.  Every
    child is serialised with ``etree.tostring``, so text that follows a
    child (its tail) is emitted together with that child.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # A node without leading text has text == None; join() needs a string.
    leading_text = node.text if node.text is not None else ''
    return ''.join([leading_text] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """Replace the node's text and children with the XML fragment ``text``.

    ``text`` is wrapped in a throw-away ``<x>`` element so that a fragment
    made of text and several sibling elements can be parsed in one go; the
    wrapper's text and children are then moved onto ``node``.  The node's
    tag, attributes and tail are left alone.

    ``text`` must be well-formed XML content; it is interpolated into the
    wrapper without escaping.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    wrapper = etree.fromstring('<x>%s</x>' % text)
    node.text = wrapper.text
    # Slice assignment swaps the complete child list in one step.
    node[:] = wrapper[:]
# node_name(node) -- visible fragment of the helper that derives a node's name.
# NOTE(review): the numbering of this listing jumps (nothing between 44 and 50,
# 59 -> 63, nothing after 63), so the `def` line, the end of the docstring,
# the body of the inner loop and the return statement are not shown here;
# confirm them against the complete file.
# What the visible lines establish:
#   * the node is deep-copied first, so the steps below work on the copy
#     (`tempnode`), not on the caller's element;
#   * every descendant of the copy tagged 'pe', 'pa', 'pt', 'pr' or 'motyw'
#     is visited -- what is done to each match is elided;
#   * etree.strip_tags(tempnode, '*') is then applied to the copy with a
#     wildcard tag, which (per the lxml documentation) removes elements while
#     merging their text into the parent.
50 """ Find out a node's name
52 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
56 tempnode = deepcopy(node)
58 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
59 for e in tempnode.findall('.//%s' % p):
63 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """Apply the XSLT stylesheet stored in the file ``sheet`` to ``xml``.

    ``xml`` may be a bare element or an element tree.  A bare element is
    wrapped in an ``ElementTree`` first, because the transformation is
    started through the tree's ``xslt`` method.  The result of that call is
    returned unchanged.
    """
    tree = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    # The stylesheet file is only needed while it is being parsed and applied.
    with open(sheet) as stylesheet_file:
        return tree.xslt(etree.parse(stylesheet_file))
# replace_characters(node): typographic clean-up, applied to node.text and
# node.tail and then, recursively, to `child`.
# NOTE(review): the numbering of this listing jumps (75 -> 78, 84 -> 88,
# 89 -> 91), so the statements at those positions are not shown: whatever
# precedes the `return` inside replace_chars, the body of the 'uwaga'/'extra'
# branch, and the loop header that binds `child`.  Confirm them against the
# complete file before relying on this summary.
# replace_chars(text) applies its substitutions in this order:
#   U+FEFF  -> removed
#   '---'   -> U+2014 (em dash); done before '--' so that three hyphens are
#              not consumed as an en dash followed by a hyphen
#   '--'    -> U+2013 (en dash)
#   ',,'    -> U+201E (double low-9 quotation mark)
#   '"'     -> U+201D (right double quotation mark)
#   "'"     -> U+2019 (right single quotation mark)
74 def replace_characters(node):
75 def replace_chars(text):
78 return text.replace(u"\ufeff", u"")\
79 .replace("---", u"\u2014")\
80 .replace("--", u"\u2013")\
81 .replace(",,", u"\u201E")\
82 .replace('"', u"\u201D")\
83 .replace("'", u"\u2019")
84 if node.tag in ('uwaga', 'extra'):
88 node.text = replace_chars(node.text)
89 node.tail = replace_chars(node.tail)
91 replace_characters(child)
# find_annotations(annotations, source, part_no): collects annotation
# elements into `annotations`.
#   * An element tagged 'pe', 'pa', 'pt' or 'pr' is deep-copied; the copy gets
#     a 'number' attribute (1-based: current len(annotations) + 1) and a 'part'
#     attribute (part_no as a string) and is appended to `annotations`.
#   * The function recurses into `child` unless its tag is 'extra' or 'uwaga'.
# NOTE(review): the numbering of this listing jumps (94 -> 96, 100 -> 102,
# 102 -> 107), so the loop header that binds `child` and the statements at
# lines 101 and 103-106 are not shown; confirm what they do to `annotation`
# and `child` against the complete file.
94 def find_annotations(annotations, source, part_no):
96 if child.tag in ('pe', 'pa', 'pt', 'pr'):
97 annotation = deepcopy(child)
98 number = str(len(annotations)+1)
99 annotation.set('number', number)
100 annotation.set('part', str(part_no))
102 annotations.append(annotation)
107 if child.tag not in ('extra', 'uwaga'):
108 find_annotations(annotations, child, part_no)
def _wrap_verses(verses, verbatim_prefixes):
    """Concatenate ``verses``, wrapping each one in ``<wers_normalny>`` unless
    it already starts with one of ``verbatim_prefixes`` (a tuple of strings).
    """
    return ''.join(
        verse if verse.startswith(verbatim_prefixes)
        else ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
        for verse in verses)


def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Works in place on every ``strofa`` element found below ``tree``.  Verses
    are separated by a slash followed by a newline.  A piece that already
    starts with a ``<wers...`` element (or, at stanza level, with ``<extra``)
    is kept verbatim; any other piece is wrapped in ``<wers_normalny>``.
    Content without a separator is left untouched.
    """
    # NOTE(review): the loop headers, the two `else` branches and the
    # stanza-level `len(...) > 1` guard were elided in the listing this block
    # was restored from; the guard mirrors the visible one for foreign verses.
    for stanza in tree.findall('.//' + WLNS('strofa')):
        # Separators inside emphasised or foreign-language spans are resolved
        # first, so the stanza is split below with those spans already
        # rewritten.
        for child_node in stanza:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                foreign_verses = inner_xml(child_node).split('/\n')
                if len(foreign_verses) > 1:
                    set_inner_xml(child_node,
                                  _wrap_verses(foreign_verses, ('<wers',)))
        verses = inner_xml(stanza).split('/\n')
        if len(verses) > 1:
            set_inner_xml(stanza, _wrap_verses(verses, ('<wers', '<extra')))
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new ``item`` element describes the chunk file ``part<partno>.html``
    as XHTML content and is appended to ``manifest``.
    """
    partstr = 'part%d' % partno
    item = manifest.makeelement(OPFNS('item'), attrib={
        # NOTE(review): the 'id' entry was elided in the listing this block
        # was restored from.  It is set to the value add_to_spine() uses as
        # 'idref', since a spine itemref has to name a manifest item id.
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(item)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new ``itemref`` element refers to the item ``part<partno>`` and is
    appended to ``spine``, i.e. it becomes the last entry in reading order.
    """
    itemref = spine.makeelement(OPFNS('itemref'),
                                attrib={'idref': 'part%d' % partno})
    spine.append(itemref)
class TOC(object):
    """A node of the e-book's table-of-contents tree.

    Every node has a display ``name``, the ``part_href`` of the file it
    points to, an optional ``sub_number`` (an anchor inside that file) and an
    ordered list of ``children``.  The serialisers (``write_to_xml`` and
    ``html_part``) emit the children of the node they are called on, never
    the node itself, so the root acts as a container.
    """

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        # Set only for entries that live inside a file (see add()).
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """Create a new entry; return its anchor number, or None.

        level   -- how many levels below this node the entry belongs.  At
                   each level the most recently added child is descended
                   into; if there is no child to descend into, the entry is
                   added at the current level instead.
        is_part -- False marks an entry located inside a file.  Such an entry
                   receives a ``sub_number``, which is returned.
        index   -- position among this node's children; only allowed with
                   level 0.  The entry is appended when index is None.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)

        entry = TOC(name)
        entry.part_href = part_href
        if index is not None:
            self.children.insert(index, entry)
        else:
            self.children.append(entry)
        if not is_part:
            # Computed after the insertion, so the first anchor handed out
            # under a node is 2, not 1.
            entry.sub_number = len(self.children) + 1
            return entry.sub_number
        return None

    def append(self, toc):
        """Attach ``toc`` itself as the last child of this node."""
        self.children.append(toc)

    def extend(self, toc):
        """Attach the children of ``toc`` (not ``toc`` itself) to this node."""
        self.children.extend(toc.children)

    def depth(self):
        """Return the number of levels below this node."""
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        # max() of an empty sequence would raise.
        # NOTE(review): the leaf value was elided in the listing this class
        # was restored from; 0 is assumed -- confirm.
        return 0

    def href(self):
        """Return the link target: ``part_href`` plus ``#sub<N>`` if anchored."""
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """Append one NCX ``navPoint`` per descendant below ``nav_map``.

        ``counter`` is the number given to the next navPoint (used both in
        its id and as its playOrder).  Nested entries are written inside
        their parent's navPoint.  Returns the first unused number.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """Return the entries below this node as HTML ``div`` lines.

        Each entry is indented by ``depth`` em and followed by the rendering
        of its own children one level deeper.
        """
        from xml.sax.saxutils import escape

        texts = []
        for child in self.children:
            # Names are plain text and may contain '&' or '<'; unescaped they
            # would corrupt the generated markup.  A missing name is passed
            # through and rendered by %s as before.
            label = child.name if child.name is None else escape(child.name)
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), label))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """Return the HTML table of contents.

        The 'epub/toc.html' resource is read as UTF-8 and the rendered
        entries are substituted into it with the ``%`` operator.
        """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns the set of characters that occur in the text and tail of
    ``element`` and of all its descendants.  A missing text or tail counts
    as empty.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # Grow the one set in place instead of building a new set per child.
        chars.update(used_chars(child))
    return chars
# chop -- visible fragment of the routine that splits the main text into
# chunks (the caller iterates over `chop(main_text)`).
# What the visible lines establish:
#   * one container is built up front: a 'utwor' element with a single
#     'master' child; `main_xml_part` is that 'master' element;
#   * on a 'naglowek_czesc' element the container's content is replaced by a
#     deep copy of that element and `last_node_part` is set;
#   * on 'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' -- unless the
#     previous element was a 'naglowek_czesc' -- the content is likewise
#     replaced by a deep copy of the element;
#   * a deep copy is appended to the container and `last_node_part` is cleared
#     (presumably the `else` branch for every other element; its header is
#     not shown).
# NOTE(review): the numbering of this listing jumps (nothing between 235 and
# 240, 248 -> 250, 250 -> 252, 254 -> 256, 256 -> 258, nothing after 259), so
# the `def` line, the assignment of `name`, and every statement that hands a
# finished chunk to the caller are not shown.  Because the same container
# object is refilled for each chunk, check in the complete file whether
# callers may keep a chunk after asking for the next one.
240 """ divide main content of the XML file into chunks """
242 # prepare a container for each chunk
243 part_xml = etree.Element('utwor')
244 etree.SubElement(part_xml, 'master')
245 main_xml_part = part_xml[0] # master
247 last_node_part = False
248 for one_part in main_text:
250 if name == 'naglowek_czesc':
252 last_node_part = True
253 main_xml_part[:] = [deepcopy(one_part)]
254 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
256 main_xml_part[:] = [deepcopy(one_part)]
258 main_xml_part.append(deepcopy(one_part))
259 last_node_part = False
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml   -- the chunk; the children of its first child are scanned
                   for headings
    chunk_no    -- number of the chunk, used to build the 'part<N>.html' link
                   targets and passed on to find_annotations()
    annotations -- collection that find_annotations() adds this chunk's
                   annotations to
    empty       -- when true, the chunk's content is not rendered: the
                   'epub/emptyChunk.html' resource is returned instead and
                   the character set is empty.  TOC entries are created
                   either way.

    ``_empty_html_static`` is not meant to be passed by callers.  The mutable
    default is deliberate: it caches the placeholder page across calls so
    the resource is read only once per process.
    """
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # Record the anchor number on the element itself.
            element.set('sub', str(subnumber))

    if empty:
        if not _empty_html_static:
            # Close the resource file explicitly instead of leaving the
            # handle to the garbage collector.
            with open(get_resource('epub/emptyChunk.html')) as placeholder:
                _empty_html_static.append(placeholder.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
# transform(): builds an EPUB archive from `wldoc` and returns
# OutputFile.from_filename(<name of the temporary file it wrote>).
# NOTE(review): this listing carries its own line numbers and the numbering
# jumps in many places, so a number of statements (branch headers, `else`
# lines, counter updates, clean-up calls) are not shown.  The comments below
# describe only what the visible lines establish; hedged remarks mark places
# where an elided statement matters.  Confirm against the complete file.
# NOTE(review): the local name `zip` (line 376) shadows the builtin of the
# same name inside this function and inside the nested transform_file.
287 def transform(wldoc, verbose=False,
288 style=None, html_toc=False,
289 sample=None, cover=None, flags=None):
290 """ produces a EPUB file
292 sample=n: generate sample e-book (with at least n paragraphs)
293 cover: a cover.Cover object
294 flags: less-advertising, without-fonts
297 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
298 """ processes one input file and proceeds to its children """
# transform_file writes into the enclosing function's `zip`, `manifest`,
# `spine` and `annotations`, and returns (toc, chunk_counter, chars, sample)
# so that numbering, the collected character set and the remaining sample
# budget can be carried from one file to the next.
300 replace_characters(wldoc.edoc.getroot())
302 # every input file will have a TOC entry,
303 # pointing to starting chunk
304 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
307 # write book title page
308 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
309 chars = used_chars(html_tree.getroot())
310 zip.writestr('OPS/title.html',
311 etree.tostring(html_tree, method="html", pretty_print=True))
312 # add a title page TOC entry
313 toc.add(u"Strona tytułowa", "title.html")
314 elif wldoc.book_info.parts:
315 # write title page for every parent
316 if sample is not None and sample <= 0:
# NOTE(review): the file opened on line 318 is read but never closed
# explicitly.
318 html_string = open(get_resource('epub/emptyChunk.html')).read()
320 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
321 chars = used_chars(html_tree.getroot())
322 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
323 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
324 add_to_manifest(manifest, chunk_counter)
325 add_to_spine(spine, chunk_counter)
328 if len(wldoc.edoc.getroot()) > 1:
329 # rdf before style master
330 main_text = wldoc.edoc.getroot()[1]
332 # rdf in style master
333 main_text = wldoc.edoc.getroot()[0]
334 if main_text.tag == RDFNS('RDF'):
337 if main_text is not None:
338 for chunk_xml in chop(main_text):
340 if sample is not None:
# The sample budget is reduced by the number of 'strofa', 'akap', 'akap_cd'
# and 'akap_dialog' elements matched by the XPath below.  The assignment of
# `empty`, which is passed to transform_chunk, is not shown in this listing.
344 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
345 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
347 toc.extend(chunk_toc)
348 chars = chars.union(chunk_chars)
349 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
350 add_to_manifest(manifest, chunk_counter)
351 add_to_spine(spine, chunk_counter)
# Child documents are processed recursively; each child's TOC becomes one
# child node of this file's TOC.
354 for child in wldoc.parts():
355 child_toc, chunk_counter, chunk_chars, sample = transform_file(
356 child, chunk_counter, first=False, sample=sample)
357 toc.append(child_toc)
358 chars = chars.union(chunk_chars)
360 return toc, chunk_counter, chars, sample
# All later changes are made on a deep copy of the input document.
363 document = deepcopy(wldoc)
# Each flag is recorded as an attribute with the value 'yes' on the
# document's root element (the loop header that binds `flag` is not shown).
368 document.edoc.getroot().set(flag, 'yes')
# content.opf is produced from the book metadata by an XSLT sheet; its
# manifest, guide and spine elements are looked up so later steps can add
# entries to them.
370 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
371 manifest = opf.find('.//' + OPFNS('manifest'))
372 guide = opf.find('.//' + OPFNS('guide'))
373 spine = opf.find('.//' + OPFNS('spine'))
# The archive is written to a named temporary file created with delete=False;
# its name is what the function finally hands to OutputFile.from_filename.
375 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
376 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
378 # write static elements
# The 'mimetype' entry is the first one written and is stored uncompressed
# (ZIP_STORED), unlike the archive default set above.
379 mime = zipfile.ZipInfo()
380 mime.filename = 'mimetype'
381 mime.compress_type = zipfile.ZIP_STORED
383 zip.writestr(mime, 'application/epub+zip')
384 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
385 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
386 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
387 'media-type="application/oebps-package+xml" />' \
388 '</rootfiles></container>')
389 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
390 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
# 'epub/style.css' is the bundled stylesheet (presumably used only when the
# caller passed no `style`; the condition is not shown).
392 style = get_resource('epub/style.css')
393 zip.write(style, os.path.join('OPS', 'style.css'))
# Cover: `cover` is called with the author's readable name and the title.
# The image bytes are taken from the in-memory buffer `cover_file` and stored
# under OPS/cover.<ext>; the statement that fills the buffer is not shown.
397 cover_file = StringIO()
398 c = cover(document.book_info.author.readable(), document.book_info.title)
400 c_name = 'cover.%s' % c.ext()
401 zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
# The cover page template gets its <img> pointed at the stored image.
404 cover_tree = etree.parse(get_resource('epub/cover.html'))
405 cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
406 zip.writestr('OPS/cover.html', etree.tostring(
407 cover_tree, method="html", pretty_print=True))
# Register the cover page and image: manifest items, first spine position
# (linear="no"), a <meta name="cover"> in the first child of the OPF root,
# and a guide reference.
409 manifest.append(etree.fromstring(
410 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
411 manifest.append(etree.fromstring(
412 '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
413 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
414 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
415 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
# Collector element for annotations; transform_file passes it on to
# transform_chunk.
418 annotations = etree.Element('annotations')
# Skeleton of toc.ncx; the end of this string literal is not shown.
# `nav_map` is the last child of the parsed root.
420 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
421 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
422 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
423 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
425 nav_map = toc_file[-1]
# Registration of the HTML table of contents in manifest, spine and guide
# (presumably only when `html_toc` is set; the condition is not shown).
428 manifest.append(etree.fromstring(
429 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
430 spine.append(etree.fromstring(
431 '<itemref idref="html_toc" />'))
432 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
# Convert the document and, through transform_file's recursion, its parts.
434 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
# With fewer than two top-level entries, an entry pointing at part1.html is
# added to the TOC.
436 if len(toc.children) < 2:
437 toc.add(u"Początek utworu", "part1.html")
439 # Last modifications in container files and EPUB creation
440 if len(annotations) > 0:
441 toc.add("Przypisy", "annotations.html")
442 manifest.append(etree.fromstring(
443 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
444 spine.append(etree.fromstring(
445 '<itemref idref="annotations" />'))
446 replace_by_verse(annotations)
447 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
448 chars = chars.union(used_chars(html_tree.getroot()))
449 zip.writestr('OPS/annotations.html', etree.tostring(
450 html_tree, method="html", pretty_print=True))
# Editorial page, rendered from the document with xsltLast.xsl.
452 toc.add("Strona redakcyjna", "last.html")
453 manifest.append(etree.fromstring(
454 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
455 spine.append(etree.fromstring(
456 '<itemref idref="last" />'))
457 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
458 chars.update(used_chars(html_tree.getroot()))
459 zip.writestr('OPS/last.html', etree.tostring(
460 html_tree, method="html", pretty_print=True))
# Font embedding, skipped when the 'without-fonts' flag is given.  For each
# of the four DejaVu Serif faces, `perl subset.pl` is run from the
# 'font-optimizer' directory next to this module with the characters
# collected in `chars`; the resulting file in `tmpdir` is added to the
# archive and to the manifest.
# NOTE(review): the process-wide working directory is changed with os.chdir.
# No statement that restores it or removes `tmpdir` is visible in this
# listing; confirm that both happen, including when check_call raises.
# NOTE(review): lines 472-473 run the tool with its output visible, line 475
# runs it with stdout/stderr redirected to pipes; the condition that selects
# between them is not shown (presumably the `verbose` parameter).
462 if not flags or not 'without-fonts' in flags:
464 tmpdir = mkdtemp('-librarian-epub')
467 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
468 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
469 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
470 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
472 print "Running font-optimizer"
473 subprocess.check_call(optimizer_call)
475 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
476 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
477 manifest.append(etree.fromstring(
478 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
# content.opf is written only now, after every manifest/spine/guide entry
# above has been added.
482 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
483 title = document.book_info.title
# Four <meta> elements are appended to the first child of the NCX root, each
# with content '0'.  The first two then get real values: the title
# concatenated with 'WolneLektury.pl', and the TOC depth.
484 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
485 for st in attributes:
486 meta = toc_file.makeelement(NCXNS('meta'))
488 meta.set('content', '0')
489 toc_file[0].append(meta)
490 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
491 toc_file[0][1].set('content', str(toc.depth()))
# NOTE(review): set_inner_xml parses its argument as XML, and `title` is
# inserted here without escaping -- check titles containing '&' or '<'.
492 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
# The HTML table of contents is inserted as the second TOC entry (index=1)
# and written as UTF-8; the NCX navMap is then filled from the TOC tree.
496 toc.add(u"Spis treści", "toc.html", index=1)
497 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
498 toc.write_to_xml(nav_map)
499 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
# NOTE(review): no call that closes `zip` is visible in this listing; the
# archive must be closed before the returned file is usable -- confirm.
502 return OutputFile.from_filename(output_file.name)