1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
15 from tempfile import mkdtemp, NamedTemporaryFile
16 from shutil import rmtree
18 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
19 from librarian.cover import WLCover
21 from librarian import functions, get_resource
23 functions.reg_person_name()
# NOTE(review): the reviewed listing omits this function's ``def`` line and
# the doctest's expected output; both were restored from the doctest call
# and from how callers use the result -- confirm against the real file.
def inner_xml(node):
    """ Return node's text and children serialized as one string.

    The result is node's own leading text (or '' when it has none)
    followed by ``etree.tostring`` of every direct child.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    leading_text = node.text if node.text is not None else ''
    return ''.join([leading_text] + [etree.tostring(child) for child in node])
# NOTE(review): the reviewed listing omits the end of the docstring and the
# two assignments marked ``restored``; confirm against the real file.
def set_inner_xml(node, text):
    """ Set node's text and children from a string.

    ``text`` is parsed as the content of a throw-away ``<x>`` wrapper
    element, so it must be well-formed XML content; node's previous text
    and children are replaced by the parsed ones.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text  # restored
    node[:] = p[:]  # restored
# NOTE(review): the reviewed listing omits the ``def`` line, the doctest's
# expected output, the body of the inner loop and the final ``return``;
# they are restored below (marked) -- confirm against the real file.
def node_name(node):
    """ Find out a node's name: its text content with markup removed.

    Works on a deep copy, so ``node`` itself is not modified. Elements
    tagged 'pe', 'pa', 'pt', 'pr' or 'motyw' are emptied first, so their
    content does not leak into the name; then every remaining tag is
    stripped and the resulting text is returned.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    tempnode = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % tag):
            # Empty the element but keep the text that follows it.
            t = e.tail  # restored
            e.clear()  # restored
            e.tail = t  # restored
    etree.strip_tags(tempnode, '*')
    return tempnode.text  # restored
# NOTE(review): the ``def`` line is not visible in the reviewed listing; the
# name and parameter order are taken from the call sites in this file
# (``xslt(chunk_xml, get_resource(...))``) -- confirm against the real file.
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored in file ``sheet`` to ``xml``.

    ``xml`` may be an element or an element tree; a bare element is wrapped
    in an ``ElementTree`` first. Returns the result of ``xml.xslt(...)``.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
# NOTE(review): the reviewed listing omits the ``None`` guard, the
# statements that empty 'uwaga'/'extra' nodes and the child loop header;
# they are restored below (marked) -- confirm against the real file.
def replace_characters(node):
    """ Apply typographic replacements to all text in node's subtree.

    Modifies the tree in place. Elements tagged 'uwaga' or 'extra' are
    emptied (their tail text is kept). In every text and tail the BOM
    character is dropped and '---', '--', ',,', '"' and "'" are replaced
    by their typographic counterparts.
    """
    def replace_chars(text):
        if text is None:  # restored
            return None  # restored
        # Order matters: '---' has to be handled before '--'.
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag in ('uwaga', 'extra'):
        # clear() also resets the tail, so save and put it back.
        t = node.tail  # restored
        node.clear()  # restored
        node.tail = t  # restored
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:  # restored
        replace_characters(child)
# NOTE(review): the reviewed listing omits the loop header and the
# statements marked ``restored``; confirm against the real file.
def find_annotations(annotations, source, part_no):
    """ Move annotations found under ``source`` into ``annotations``.

    Every descendant tagged 'pe', 'pa', 'pt' or 'pr' is deep-copied,
    given 'number' (1-based position in ``annotations``) and 'part'
    (``part_no``) attributes and appended to ``annotations``. Subtrees of
    'extra' and 'uwaga' elements are not searched. Both ``annotations``
    and ``source`` are modified in place.
    """
    for child in source:  # restored
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations) + 1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            annotation.tail = ''  # restored
            annotations.append(annotation)
            # Leave only the annotation number in the running text.
            tail = child.tail  # restored
            child.clear()  # restored
            child.tail = tail  # restored
            child.text = number  # restored
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
def _wrap_verses(verses, passthrough_prefixes):
    """ Join verse fragments, wrapping bare ones in <wers_normalny>.

    Fragments starting with one of ``passthrough_prefixes`` (a tuple of
    strings) are kept as they are.
    """
    return ''.join(
        verse if verse.startswith(passthrough_prefixes)
        else ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
        for verse in verses)


# NOTE(review): the reviewed listing omits the loop headers, the ``else:``
# branches and the ``len(verses) > 1`` guard; they were restored by analogy
# with the visible ``len(foreign_verses) > 1`` branch -- confirm.
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character.

    For every 'strofa' element under ``tree`` the serialized content is
    split on '/' followed by a newline. When that yields more than one
    fragment, each fragment not already starting with '<wers' (or, at
    stanza level, '<extra') is wrapped in <wers_normalny>. The same is
    done first inside direct 'slowo_obce' and 'wyroznienie' children.
    The tree is modified in place.
    """
    stanzas = tree.findall('.//' + WLNS('strofa'))
    for node in stanzas:
        for child_node in node:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                foreign_verses = inner_xml(child_node).split('/\n')
                if len(foreign_verses) > 1:
                    set_inner_xml(
                        child_node,
                        _wrap_verses(foreign_verses, ('<wers',)))
        verses = inner_xml(node).split('/\n')
        if len(verses) > 1:
            set_inner_xml(node, _wrap_verses(verses, ('<wers', '<extra')))
# NOTE(review): the reviewed listing omits the 'id' attribute, the end of
# the ``makeelement`` call and the final ``append``. The id is restored as
# ``partstr`` because add_to_spine refers to items by 'part%d' -- confirm.
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file.

    The item describes the chunk file 'part<partno>.html'.
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        'id': partstr,  # restored
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)  # restored
# NOTE(review): the final ``append`` is not visible in the reviewed listing;
# restored by analogy with the docstring ("Adds a node") -- confirm.
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file.

    The itemref points at the manifest item 'part<partno>'.
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)  # restored
# NOTE(review): the reviewed listing omits the ``class`` line, the headers
# of depth()/href()/html() and several one-line statements. They are
# restored below (marked) from how the methods use each other and from the
# call sites in this file -- confirm against the real file.
class TOC(object):  # restored
    """ A table-of-contents entry together with its sub-entries. """

    def __init__(self, name=None, part_href=None):
        self.children = []  # restored
        self.name = name  # restored
        self.part_href = part_href
        # Set only for entries that point inside a chunk (see add()).
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Add a new entry ``level`` levels below this one.

        With level > 0 the entry is delegated to the last child (when there
        is one). Otherwise it becomes a direct child, inserted at ``index``
        or appended. For ``is_part=False`` the entry gets a sub-number,
        which is returned; otherwise nothing is returned.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)

        t = TOC(name)  # restored
        t.part_href = part_href
        if index is not None:
            self.children.insert(index, t)
        else:
            self.children.append(t)
        if not is_part:  # restored
            t.sub_number = len(self.children) + 1
            return t.sub_number  # restored

    def append(self, toc):
        """ Add ``toc`` itself as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Add all children of ``toc`` as children of this entry. """
        self.children.extend(toc.children)

    def depth(self):  # restored
        """ Number of entry levels below this one (0 without children). """
        if not self.children:  # restored
            return 0  # restored
        return max((c.depth() for c in self.children)) + 1

    def href(self):  # restored
        """ Link target: the part file, plus '#sub<n>' for sub-entries. """
        src = self.part_href  # restored
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src  # restored

    def write_to_xml(self, nav_map, counter=1):
        """ Append navPoint elements for all descendants to ``nav_map``.

        ``counter`` numbers the points (id and playOrder); the next free
        number is returned so that nested calls can continue the sequence.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter  # restored

    def html_part(self, depth=0):
        """ Render all descendants as indented HTML links, one per line. """
        texts = []  # restored
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):  # restored
        """ Fill the 'epub/toc.html' resource template with html_part(). """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
# NOTE(review): the final ``return`` is not visible in the reviewed listing;
# restored because the recursive call and all callers use the result.
def used_chars(element):
    """ Lists characters used in an ETree Element.

    Returns the set of characters occurring in the text and tail of
    ``element`` and of all its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # In-place union: no need to build a new set per child.
        chars |= used_chars(child)
    return chars  # restored
# NOTE(review): the reviewed listing omits the ``def`` line, the ``name``
# assignment and the ``yield`` statements; they are restored below (marked)
# from the call site ``for chunk_xml in chop(main_text)`` -- confirm.
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. A new chunk starts at every 'naglowek_czesc' and at every
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' that does not
    directly follow a 'naglowek_czesc'.

    The same <utwor><master/></utwor> container is yielded every time and
    refilled afterwards, so each chunk has to be processed (or copied)
    before the next one is requested. When the content begins with a
    header, the first chunk yielded is empty.
    """
    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    last_node_part = False
    for one_part in main_text:
        name = one_part.tag  # restored
        if name == 'naglowek_czesc':
            yield part_xml  # restored
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            yield part_xml  # restored
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml  # restored
# NOTE(review): the reviewed listing omits ``toc = TOC()``, the
# ``if empty:`` / ``else:`` skeleton and ``chars = set()``; they are
# restored below (marked) -- confirm against the real file.
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk container; its first child holds the content elements
    chunk_no: chunk number, used for 'part<chunk_no>.html' links
    annotations: element collecting annotations found in the chunk
    empty: when true, the content is replaced by the 'epub/emptyChunk.html'
        resource and no characters are reported; TOC entries are still made
    _empty_html_static: private cache, not meant to be passed by callers
    """
    toc = TOC()  # restored
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # Stored on the element for the stylesheet applied below.
            element.set('sub', str(subnumber))

    if empty:  # restored
        # The mutable default is deliberate: it keeps the placeholder page
        # between calls, so the resource file is read only once.
        if not _empty_html_static:
            with open(get_resource('epub/emptyChunk.html')) as f:
                _empty_html_static.append(f.read())
        chars = set()  # restored
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
# NOTE(review): the reviewed listing omits many short lines of this function
# (``else:`` branches, counters, guards such as ``if first:``). They are
# restored below and marked ``# restored``; the whole function should be
# compared with the real file before merging.
def transform(wldoc, verbose=False,
              style=None, html_toc=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    wldoc: the document to convert; a deep copy is processed
    verbose: print a message and show font-optimizer output
    style: path of a CSS file to use instead of 'epub/style.css'
    html_toc: also generate an HTML table of contents (toc.html)
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover object or True for default
    flags: less-advertising, without-fonts

    Returns an OutputFile created from the name of the temporary .epub.
    """

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns (toc, next chunk number, used characters, remaining sample).
        """

        replace_characters(wldoc.edoc.getroot())

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()  # restored
        if first:  # restored
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zipf.writestr('OPS/title.html',
                          etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()  # restored
                with open(get_resource('epub/emptyChunk.html')) as f:
                    html_string = f.read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            zipf.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1  # restored

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None  # restored

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False  # restored
                if sample is not None:
                    if sample <= 0:  # restored
                        empty = True  # restored
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zipf.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1  # restored

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    document = deepcopy(wldoc)

    if flags:  # restored
        for flag in flags:  # restored
            document.edoc.getroot().set(flag, 'yes')

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    # Named ``zipf`` so that the builtin ``zip`` is not shadowed.
    zipf = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # The mimetype entry is written first and stored uncompressed.
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    # NOTE(review): one line is missing from the reviewed listing at this
    # point (original line 383); confirm whether ``mime`` needs any further
    # configuration.
    zipf.writestr(mime, 'application/epub+zip')
    zipf.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                  'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                  '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                  'media-type="application/oebps-package+xml" />' \
                  '</rootfiles></container>')
    zipf.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    zipf.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:  # restored
        style = get_resource('epub/style.css')
    zipf.write(style, os.path.join('OPS', 'style.css'))

    if cover:  # restored
        if cover is True:  # restored
            cover = WLCover  # restored
        if cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        cover_file = StringIO()
        c = cover(document.book_info)
        c.save(cover_file)  # restored
        c_name = 'cover.%s' % c.ext()
        zipf.writestr(os.path.join('OPS', c_name), cover_file.getvalue())

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
        zipf.writestr('OPS/cover.html', etree.tostring(
            cover_tree, method="html", pretty_print=True))

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                                '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                                '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                                'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                                '</navMap></ncx>')  # closing tags restored
    nav_map = toc_file[-1]

    if html_toc:  # restored
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zipf.writestr('OPS/annotations.html', etree.tostring(
            html_tree, method="html", pretty_print=True))

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    zipf.writestr('OPS/last.html', etree.tostring(
        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        cwd = os.getcwd()  # restored
        try:
            # subset.pl is run from its own directory.
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                if verbose:  # restored
                    print("Running font-optimizer")
                    subprocess.check_call(optimizer_call)
                else:
                    # Discard the output instead of piping it: a pipe nobody
                    # reads can block the child once its buffer fills up.
                    with open(os.devnull, 'w') as devnull:
                        subprocess.check_call(optimizer_call, stdout=devnull, stderr=devnull)
                zipf.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
                manifest.append(etree.fromstring(
                    '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
        finally:
            # Restore the working directory and remove the temporary fonts
            # even when the optimizer fails.
            os.chdir(cwd)
            rmtree(tmpdir)

    zipf.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)  # restored
        meta.set('content', '0')
        toc_file[0].append(meta)
    # The first two metas are dtb:uid and dtb:depth (see ``attributes``).
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))

    # write TOC
    if html_toc:  # restored
        toc.add(u"Spis treści", "toc.html", index=1)
        zipf.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    zipf.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    zipf.close()  # restored
    # ZipFile does not close a file object it was given; close it here so
    # the data is flushed (delete=False keeps the file on disk).
    output_file.close()

    return OutputFile.from_filename(output_file.name)