1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
15 from tempfile import mkdtemp, NamedTemporaryFile
16 from shutil import rmtree
18 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
19 from librarian.cover import WLCover
21 from librarian import functions, get_resource
23 functions.reg_person_name()
27 """ returns node's text and children as a string
29 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
33 nt = node.text if node.text is not None else ''
34 return ''.join([nt] + [etree.tostring(child) for child in node])
36 def set_inner_xml(node, text):
37 """ sets node's text and children from a string
39 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
40 >>> set_inner_xml(e, 'x<b>y</b>z')
41 >>> print etree.tostring(e)
45 p = etree.fromstring('<x>%s</x>' % text)
51 """ Find out a node's name
53 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
57 tempnode = deepcopy(node)
59 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
60 for e in tempnode.findall('.//%s' % p):
64 etree.strip_tags(tempnode, '*')
69 if isinstance(xml, etree._Element):
70 xml = etree.ElementTree(xml)
71 with open(sheet) as xsltf:
72 return xml.xslt(etree.parse(xsltf))
75 def replace_characters(node):
76 def replace_chars(text):
79 return text.replace(u"\ufeff", u"")\
80 .replace("---", u"\u2014")\
81 .replace("--", u"\u2013")\
82 .replace(",,", u"\u201E")\
83 .replace('"', u"\u201D")\
84 .replace("'", u"\u2019")
85 if node.tag in ('uwaga', 'extra'):
89 node.text = replace_chars(node.text)
90 node.tail = replace_chars(node.tail)
92 replace_characters(child)
95 def find_annotations(annotations, source, part_no):
97 if child.tag in ('pe', 'pa', 'pt', 'pr'):
98 annotation = deepcopy(child)
99 number = str(len(annotations)+1)
100 annotation.set('number', number)
101 annotation.set('part', str(part_no))
103 annotations.append(annotation)
108 if child.tag not in ('extra', 'uwaga'):
109 find_annotations(annotations, child, part_no)
112 def replace_by_verse(tree):
113 """ Find stanzas and create new verses in place of a '/' character """
115 stanzas = tree.findall('.//' + WLNS('strofa'))
117 for child_node in node:
118 if child_node.tag in ('slowo_obce', 'wyroznienie'):
119 foreign_verses = inner_xml(child_node).split('/\n')
120 if len(foreign_verses) > 1:
122 for foreign_verse in foreign_verses:
123 if foreign_verse.startswith('<wers'):
124 new_foreign += foreign_verse
126 new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
127 set_inner_xml(child_node, new_foreign)
128 verses = inner_xml(node).split('/\n')
130 modified_inner_xml = ''
132 if verse.startswith('<wers') or verse.startswith('<extra'):
133 modified_inner_xml += verse
135 modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
136 set_inner_xml(node, modified_inner_xml)
139 def add_to_manifest(manifest, partno):
140 """ Adds a node to the manifest section in content.opf file """
142 partstr = 'part%d' % partno
143 e = manifest.makeelement(OPFNS('item'), attrib={
145 'href': partstr + '.html',
146 'media-type': 'application/xhtml+xml',
151 def add_to_spine(spine, partno):
152 """ Adds a node to the spine section in content.opf file """
154 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
159 def __init__(self, name=None, part_href=None):
162 self.part_href = part_href
163 self.sub_number = None
165 def add(self, name, part_href, level=0, is_part=True, index=None):
166 assert level == 0 or index is None
167 if level > 0 and self.children:
168 return self.children[-1].add(name, part_href, level-1, is_part)
171 t.part_href = part_href
172 if index is not None:
173 self.children.insert(index, t)
175 self.children.append(t)
177 t.sub_number = len(self.children) + 1
def append(self, toc):
    """Attach ``toc`` as the last entry of this node's ``children``.

    The given object is stored as-is (not copied) and nothing is returned.
    """
    # In-place concatenation of a one-element list mutates the existing
    # ``children`` sequence, exactly like a direct ``append`` call.
    self.children += [toc]
def extend(self, toc):
    """Add every entry of ``toc.children`` to the end of this node's ``children``.

    Only the children of ``toc`` are taken over; ``toc`` itself is not added
    and is left unmodified. Nothing is returned.
    """
    # In-place concatenation keeps the same ``children`` object and appends
    # the incoming entries in their original order.
    self.children += toc.children
188 return max((c.depth() for c in self.children)) + 1
194 if self.sub_number is not None:
195 src += '#sub%d' % self.sub_number
198 def write_to_xml(self, nav_map, counter=1):
199 for child in self.children:
200 nav_point = nav_map.makeelement(NCXNS('navPoint'))
201 nav_point.set('id', 'NavPoint-%d' % counter)
202 nav_point.set('playOrder', str(counter))
204 nav_label = nav_map.makeelement(NCXNS('navLabel'))
205 text = nav_map.makeelement(NCXNS('text'))
206 text.text = child.name
207 nav_label.append(text)
208 nav_point.append(nav_label)
210 content = nav_map.makeelement(NCXNS('content'))
211 content.set('src', child.href())
212 nav_point.append(content)
213 nav_map.append(nav_point)
214 counter = child.write_to_xml(nav_point, counter + 1)
217 def html_part(self, depth=0):
219 for child in self.children:
221 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
222 (depth, child.href(), child.name))
223 texts.append(child.html_part(depth+1))
224 return "\n".join(texts)
227 with open(get_resource('epub/toc.html')) as f:
228 t = unicode(f.read(), 'utf-8')
229 return t % self.html_part()
232 def used_chars(element):
233 """ Lists characters used in an ETree Element """
234 chars = set((element.text or '') + (element.tail or ''))
235 for child in element:
236 chars = chars.union(used_chars(child))
241 """ divide main content of the XML file into chunks """
243 # prepare a container for each chunk
244 part_xml = etree.Element('utwor')
245 etree.SubElement(part_xml, 'master')
246 main_xml_part = part_xml[0] # master
248 last_node_part = False
249 for one_part in main_text:
251 if name == 'naglowek_czesc':
253 last_node_part = True
254 main_xml_part[:] = [deepcopy(one_part)]
255 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
257 main_xml_part[:] = [deepcopy(one_part)]
259 main_xml_part.append(deepcopy(one_part))
260 last_node_part = False
264 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
265 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
268 for element in chunk_xml[0]:
269 if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
270 toc.add(node_name(element), "part%d.html" % chunk_no)
271 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
272 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
273 element.set('sub', str(subnumber))
275 if not _empty_html_static:
276 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
278 output_html = _empty_html_static[0]
280 find_annotations(annotations, chunk_xml, chunk_no)
281 replace_by_verse(chunk_xml)
282 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
283 chars = used_chars(html_tree.getroot())
284 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
285 return output_html, toc, chars
288 def transform(wldoc, verbose=False,
289 style=None, html_toc=False,
290 sample=None, cover=None, flags=None):
291 """ produces a EPUB file
293 sample=n: generate sample e-book (with at least n paragraphs)
294 cover: a cover.Cover factory or True for default
295 flags: less-advertising, without-fonts, working-copy
298 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
299 """ processes one input file and proceeds to its children """
301 replace_characters(wldoc.edoc.getroot())
303 # every input file will have a TOC entry,
304 # pointing to starting chunk
305 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
308 # write book title page
309 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
310 chars = used_chars(html_tree.getroot())
311 zip.writestr('OPS/title.html',
312 etree.tostring(html_tree, method="html", pretty_print=True))
313 # add a title page TOC entry
314 toc.add(u"Strona tytułowa", "title.html")
315 elif wldoc.book_info.parts:
316 # write title page for every parent
317 if sample is not None and sample <= 0:
319 html_string = open(get_resource('epub/emptyChunk.html')).read()
321 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
322 chars = used_chars(html_tree.getroot())
323 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
324 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
325 add_to_manifest(manifest, chunk_counter)
326 add_to_spine(spine, chunk_counter)
329 if len(wldoc.edoc.getroot()) > 1:
330 # rdf before style master
331 main_text = wldoc.edoc.getroot()[1]
333 # rdf in style master
334 main_text = wldoc.edoc.getroot()[0]
335 if main_text.tag == RDFNS('RDF'):
338 if main_text is not None:
339 for chunk_xml in chop(main_text):
341 if sample is not None:
345 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
346 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
348 toc.extend(chunk_toc)
349 chars = chars.union(chunk_chars)
350 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
351 add_to_manifest(manifest, chunk_counter)
352 add_to_spine(spine, chunk_counter)
355 for child in wldoc.parts():
356 child_toc, chunk_counter, chunk_chars, sample = transform_file(
357 child, chunk_counter, first=False, sample=sample)
358 toc.append(child_toc)
359 chars = chars.union(chunk_chars)
361 return toc, chunk_counter, chars, sample
364 document = deepcopy(wldoc)
369 document.edoc.getroot().set(flag, 'yes')
371 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
372 manifest = opf.find('.//' + OPFNS('manifest'))
373 guide = opf.find('.//' + OPFNS('guide'))
374 spine = opf.find('.//' + OPFNS('spine'))
376 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
377 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
379 # write static elements
380 mime = zipfile.ZipInfo()
381 mime.filename = 'mimetype'
382 mime.compress_type = zipfile.ZIP_STORED
384 zip.writestr(mime, 'application/epub+zip')
385 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
386 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
387 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
388 'media-type="application/oebps-package+xml" />' \
389 '</rootfiles></container>')
390 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
391 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
393 style = get_resource('epub/style.css')
394 zip.write(style, os.path.join('OPS', 'style.css'))
400 cover_file = StringIO()
401 bound_cover = cover(document.book_info)
402 bound_cover.save(cover_file)
403 cover_name = 'cover.%s' % bound_cover.ext()
404 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
407 cover_tree = etree.parse(get_resource('epub/cover.html'))
408 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
409 zip.writestr('OPS/cover.html', etree.tostring(
410 cover_tree, method="html", pretty_print=True))
412 if bound_cover.uses_dc_cover:
413 if document.book_info.cover_by:
414 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
415 if document.book_info.cover_source:
416 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
418 manifest.append(etree.fromstring(
419 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
420 manifest.append(etree.fromstring(
421 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
422 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
423 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
424 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
427 annotations = etree.Element('annotations')
429 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
430 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
431 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
432 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
434 nav_map = toc_file[-1]
437 manifest.append(etree.fromstring(
438 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
439 spine.append(etree.fromstring(
440 '<itemref idref="html_toc" />'))
441 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
443 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
445 if len(toc.children) < 2:
446 toc.add(u"Początek utworu", "part1.html")
448 # Last modifications in container files and EPUB creation
449 if len(annotations) > 0:
450 toc.add("Przypisy", "annotations.html")
451 manifest.append(etree.fromstring(
452 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
453 spine.append(etree.fromstring(
454 '<itemref idref="annotations" />'))
455 replace_by_verse(annotations)
456 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
457 chars = chars.union(used_chars(html_tree.getroot()))
458 zip.writestr('OPS/annotations.html', etree.tostring(
459 html_tree, method="html", pretty_print=True))
461 toc.add("Strona redakcyjna", "last.html")
462 manifest.append(etree.fromstring(
463 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
464 spine.append(etree.fromstring(
465 '<itemref idref="last" />'))
466 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
467 chars.update(used_chars(html_tree.getroot()))
468 zip.writestr('OPS/last.html', etree.tostring(
469 html_tree, method="html", pretty_print=True))
471 if not flags or not 'without-fonts' in flags:
473 tmpdir = mkdtemp('-librarian-epub')
476 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
477 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
478 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
479 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
481 print "Running font-optimizer"
482 subprocess.check_call(optimizer_call)
484 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
485 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
486 manifest.append(etree.fromstring(
487 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
491 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
492 title = document.book_info.title
493 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
494 for st in attributes:
495 meta = toc_file.makeelement(NCXNS('meta'))
497 meta.set('content', '0')
498 toc_file[0].append(meta)
499 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
500 toc_file[0][1].set('content', str(toc.depth()))
501 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
505 toc.add(u"Spis treści", "toc.html", index=1)
506 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
507 toc.write_to_xml(nav_map)
508 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
511 return OutputFile.from_filename(output_file.name)