1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
8 from copy import deepcopy
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import WLCover
22 from librarian import functions, get_resource
24 functions.reg_person_name()
28 """ returns node's text and children as a string
30 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
34 nt = node.text if node.text is not None else ''
35 return ''.join([nt] + [etree.tostring(child) for child in node])
37 def set_inner_xml(node, text):
38 """ sets node's text and children from a string
40 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
41 >>> set_inner_xml(e, 'x<b>y</b>z')
42 >>> print etree.tostring(e)
46 p = etree.fromstring('<x>%s</x>' % text)
52 """ Find out a node's name
54 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
58 tempnode = deepcopy(node)
60 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
61 for e in tempnode.findall('.//%s' % p):
65 etree.strip_tags(tempnode, '*')
70 if isinstance(xml, etree._Element):
71 xml = etree.ElementTree(xml)
72 with open(sheet) as xsltf:
73 return xml.xslt(etree.parse(xsltf))
76 def replace_characters(node):
77 def replace_chars(text):
80 return text.replace(u"\ufeff", u"")\
81 .replace("---", u"\u2014")\
82 .replace("--", u"\u2013")\
83 .replace(",,", u"\u201E")\
84 .replace('"', u"\u201D")\
85 .replace("'", u"\u2019")
86 if node.tag in ('uwaga', 'extra'):
90 node.text = replace_chars(node.text)
91 node.tail = replace_chars(node.tail)
93 replace_characters(child)
96 def find_annotations(annotations, source, part_no):
98 if child.tag in ('pe', 'pa', 'pt', 'pr'):
99 annotation = deepcopy(child)
100 number = str(len(annotations)+1)
101 annotation.set('number', number)
102 annotation.set('part', str(part_no))
104 annotations.append(annotation)
109 if child.tag not in ('extra', 'uwaga'):
110 find_annotations(annotations, child, part_no)
113 def replace_by_verse(tree):
114 """ Find stanzas and create new verses in place of a '/' character """
116 stanzas = tree.findall('.//' + WLNS('strofa'))
118 for child_node in node:
119 if child_node.tag in ('slowo_obce', 'wyroznienie'):
120 foreign_verses = inner_xml(child_node).split('/\n')
121 if len(foreign_verses) > 1:
123 for foreign_verse in foreign_verses:
124 if foreign_verse.startswith('<wers'):
125 new_foreign += foreign_verse
127 new_foreign += ''.join(('<wers_normalny>', foreign_verse, '</wers_normalny>'))
128 set_inner_xml(child_node, new_foreign)
129 verses = inner_xml(node).split('/\n')
131 modified_inner_xml = ''
133 if verse.startswith('<wers') or verse.startswith('<extra'):
134 modified_inner_xml += verse
136 modified_inner_xml += ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
137 set_inner_xml(node, modified_inner_xml)
140 def add_to_manifest(manifest, partno):
141 """ Adds a node to the manifest section in content.opf file """
143 partstr = 'part%d' % partno
144 e = manifest.makeelement(OPFNS('item'), attrib={
146 'href': partstr + '.html',
147 'media-type': 'application/xhtml+xml',
152 def add_to_spine(spine, partno):
153 """ Adds a node to the spine section in content.opf file """
155 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
160 def __init__(self, name=None, part_href=None):
163 self.part_href = part_href
164 self.sub_number = None
166 def add(self, name, part_href, level=0, is_part=True, index=None):
167 assert level == 0 or index is None
168 if level > 0 and self.children:
169 return self.children[-1].add(name, part_href, level-1, is_part)
172 t.part_href = part_href
173 if index is not None:
174 self.children.insert(index, t)
176 self.children.append(t)
178 t.sub_number = len(self.children) + 1
# Add `toc` itself as the new last entry of this node's `children`.
# Nothing is returned; the entries already held in `toc.children` are not
# touched or copied here.
# NOTE(review): callers in this file pass the TOC returned for a child
# document; the class header and the initialisation of `children` are not
# visible in this excerpt -- confirm `children` is always a list.
181 def append(self, toc):
182 self.children.append(toc)
# Add every entry of `toc.children` to the end of this node's `children`.
# `toc` itself is not inserted, only the entries it holds; nothing is returned.
# NOTE(review): the entries are added by reference, not copied, so they end
# up shared between both `children` sequences -- confirm that is intended.
184 def extend(self, toc):
185 self.children.extend(toc.children)
189 return max((c.depth() for c in self.children)) + 1
195 if self.sub_number is not None:
196 src += '#sub%d' % self.sub_number
199 def write_to_xml(self, nav_map, counter=1):
200 for child in self.children:
201 nav_point = nav_map.makeelement(NCXNS('navPoint'))
202 nav_point.set('id', 'NavPoint-%d' % counter)
203 nav_point.set('playOrder', str(counter))
205 nav_label = nav_map.makeelement(NCXNS('navLabel'))
206 text = nav_map.makeelement(NCXNS('text'))
207 text.text = child.name
208 nav_label.append(text)
209 nav_point.append(nav_label)
211 content = nav_map.makeelement(NCXNS('content'))
212 content.set('src', child.href())
213 nav_point.append(content)
214 nav_map.append(nav_point)
215 counter = child.write_to_xml(nav_point, counter + 1)
218 def html_part(self, depth=0):
220 for child in self.children:
222 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
223 (depth, child.href(), child.name))
224 texts.append(child.html_part(depth+1))
225 return "\n".join(texts)
228 with open(get_resource('epub/toc.html')) as f:
229 t = unicode(f.read(), 'utf-8')
230 return t % self.html_part()
233 def used_chars(element):
234 """ Lists characters used in an ETree Element """
235 chars = set((element.text or '') + (element.tail or ''))
236 for child in element:
237 chars = chars.union(used_chars(child))
242 """ divide main content of the XML file into chunks """
244 # prepare a container for each chunk
245 part_xml = etree.Element('utwor')
246 etree.SubElement(part_xml, 'master')
247 main_xml_part = part_xml[0] # master
249 last_node_part = False
250 for one_part in main_text:
252 if name == 'naglowek_czesc':
254 last_node_part = True
255 main_xml_part[:] = [deepcopy(one_part)]
256 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
258 main_xml_part[:] = [deepcopy(one_part)]
260 main_xml_part.append(deepcopy(one_part))
261 last_node_part = False
265 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
266 """ transforms one chunk, returns an HTML string, a TOC object and a set of used characters """
269 for element in chunk_xml[0]:
270 if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
271 toc.add(node_name(element), "part%d.html" % chunk_no)
272 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
273 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
274 element.set('sub', str(subnumber))
276 if not _empty_html_static:
277 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
279 output_html = _empty_html_static[0]
281 find_annotations(annotations, chunk_xml, chunk_no)
282 replace_by_verse(chunk_xml)
283 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
284 chars = used_chars(html_tree.getroot())
285 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
286 return output_html, toc, chars
289 def transform(wldoc, verbose=False,
290 style=None, html_toc=False,
291 sample=None, cover=None, flags=None):
292 """ produces an EPUB file
294 sample=n: generate sample e-book (with at least n paragraphs)
295 cover: a cover.Cover object or True for default
296 flags: less-advertising, without-fonts, images, not-wl
299 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
300 """ processes one input file and proceeds to its children """
302 replace_characters(wldoc.edoc.getroot())
304 # every input file will have a TOC entry,
305 # pointing to starting chunk
306 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
309 # write book title page
310 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
311 chars = used_chars(html_tree.getroot())
312 zip.writestr('OPS/title.html',
313 etree.tostring(html_tree, method="html", pretty_print=True))
314 # add a title page TOC entry
315 toc.add(u"Strona tytułowa", "title.html")
316 elif wldoc.book_info.parts:
317 # write title page for every parent
318 if sample is not None and sample <= 0:
320 html_string = open(get_resource('epub/emptyChunk.html')).read()
322 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
323 chars = used_chars(html_tree.getroot())
324 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
325 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
326 add_to_manifest(manifest, chunk_counter)
327 add_to_spine(spine, chunk_counter)
330 if len(wldoc.edoc.getroot()) > 1:
331 # rdf before style master
332 main_text = wldoc.edoc.getroot()[1]
334 # rdf in style master
335 main_text = wldoc.edoc.getroot()[0]
336 if main_text.tag == RDFNS('RDF'):
339 if main_text is not None:
340 for chunk_xml in chop(main_text):
342 if sample is not None:
346 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
347 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
349 toc.extend(chunk_toc)
350 chars = chars.union(chunk_chars)
351 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
352 add_to_manifest(manifest, chunk_counter)
353 add_to_spine(spine, chunk_counter)
356 for child in wldoc.parts():
357 child_toc, chunk_counter, chunk_chars, sample = transform_file(
358 child, chunk_counter, first=False, sample=sample)
359 toc.append(child_toc)
360 chars = chars.union(chunk_chars)
362 return toc, chunk_counter, chars, sample
365 document = deepcopy(wldoc)
370 document.edoc.getroot().set(flag, 'yes')
372 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
373 manifest = opf.find('.//' + OPFNS('manifest'))
374 guide = opf.find('.//' + OPFNS('guide'))
375 spine = opf.find('.//' + OPFNS('spine'))
377 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
379 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
381 # write static elements
382 mime = zipfile.ZipInfo()
383 mime.filename = 'mimetype'
384 mime.compress_type = zipfile.ZIP_STORED
386 zip.writestr(mime, 'application/epub+zip')
387 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
388 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
389 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
390 'media-type="application/oebps-package+xml" />' \
391 '</rootfiles></container>')
392 if not flags or 'not-wl' not in flags:
393 manifest.append(etree.fromstring(
394 '<item id="logo_wolnelektury" href="logo_wolnelektury.png" media-type="image/png" />'))
395 manifest.append(etree.fromstring(
396 '<item id="jedenprocent" href="jedenprocent.png" media-type="image/png" />'))
397 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
398 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
401 style = get_resource('epub/style.css')
402 zip.write(style, os.path.join('OPS', 'style.css'))
407 if cover.uses_dc_cover:
408 if document.book_info.cover_by:
409 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
410 if document.book_info.cover_source:
411 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
413 cover_file = StringIO()
414 c = cover(document.book_info)
416 c_name = 'cover.%s' % c.ext()
417 zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
420 cover_tree = etree.parse(get_resource('epub/cover.html'))
421 cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
422 zip.writestr('OPS/cover.html', etree.tostring(
423 cover_tree, method="html", pretty_print=True))
425 manifest.append(etree.fromstring(
426 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
427 manifest.append(etree.fromstring(
428 '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
429 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
430 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
431 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
433 if flags and 'images' in flags:
434 for ilustr in document.edoc.findall('//ilustr'):
435 src = ilustr.get('src')
436 mime = ImageCover(src)().mime_type()
437 zip.write(src, os.path.join('OPS', src))
438 manifest.append(etree.fromstring(
439 '<item id="%s" href="%s" media-type="%s" />' % (src, src, mime)))
440 # get it up to master
442 while after.getparent().tag not in ['powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp', 'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny']:
443 after = after.getparent()
444 if not(after is ilustr):
445 moved = deepcopy(ilustr)
451 for ilustr in document.edoc.findall('//ilustr'):
454 annotations = etree.Element('annotations')
456 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
457 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
458 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
459 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
461 nav_map = toc_file[-1]
464 manifest.append(etree.fromstring(
465 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
466 spine.append(etree.fromstring(
467 '<itemref idref="html_toc" />'))
468 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
470 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
472 if len(toc.children) < 2:
473 toc.add(u"Początek utworu", "part1.html")
475 # Last modifications in container files and EPUB creation
476 if len(annotations) > 0:
477 toc.add("Przypisy", "annotations.html")
478 manifest.append(etree.fromstring(
479 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
480 spine.append(etree.fromstring(
481 '<itemref idref="annotations" />'))
482 replace_by_verse(annotations)
483 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
484 chars = chars.union(used_chars(html_tree.getroot()))
485 zip.writestr('OPS/annotations.html', etree.tostring(
486 html_tree, method="html", pretty_print=True))
488 toc.add("Strona redakcyjna", "last.html")
489 manifest.append(etree.fromstring(
490 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
491 spine.append(etree.fromstring(
492 '<itemref idref="last" />'))
493 stopka = document.edoc.find('//stopka')
494 if stopka is not None:
495 stopka.tag = 'stopka_'
496 replace_by_verse(stopka)
497 html_tree = xslt(stopka, get_resource('epub/xsltScheme.xsl'))
499 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
500 chars.update(used_chars(html_tree.getroot()))
501 zip.writestr('OPS/last.html', etree.tostring(
502 html_tree, method="html", pretty_print=True))
504 if not flags or not 'without-fonts' in flags:
506 tmpdir = mkdtemp('-librarian-epub')
509 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
510 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
511 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
512 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
514 print "Running font-optimizer"
515 subprocess.check_call(optimizer_call)
517 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
518 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
519 manifest.append(etree.fromstring(
520 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
524 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
525 title = document.book_info.title
526 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
527 for st in attributes:
528 meta = toc_file.makeelement(NCXNS('meta'))
530 meta.set('content', '0')
531 toc_file[0].append(meta)
532 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
533 toc_file[0][1].set('content', str(toc.depth()))
534 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
538 toc.add(u"Spis treści", "toc.html", index=1)
539 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
540 toc.write_to_xml(nav_map)
541 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
544 return OutputFile.from_filename(output_file.name)