1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, IOFile
21 from librarian import functions, get_resource
# Runs once, when this module is imported.
# NOTE(review): presumably registers a person-name helper that the XSLT
# stylesheets applied below rely on -- confirm in librarian.functions.
functions.reg_person_name()
def inner_xml(node):
    """ Serialize the content of a node: its leading text plus all children.

    The node's own start and end tags are not part of the result.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    pieces = [node.text or '']
    pieces.extend(etree.tostring(child) for child in node)
    return ''.join(pieces)
def set_inner_xml(node, text):
    """ Replace the text and children of a node with a parsed XML fragment.

    `text` may mix character data and elements. Only the node's text and
    children are assigned; nothing else on the node is touched.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # Wrap the fragment in a throwaway root so it parses as one document.
    holder = etree.fromstring('<x>%s</x>' % text)
    node.text = holder.text
    node[:] = holder[:]
def node_name(node):
    """ Extract the plain-text name of a node.

    Works on a deep copy, so the node itself is not modified. Elements
    tagged pe, pa, pt, pr or motyw are emptied first (their tails are kept),
    then all remaining tags are stripped and the resulting text is returned.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    scratch = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for found in scratch.findall('.//%s' % tag):
            # clear() would also drop the tail, which is part of the
            # surrounding text and has to survive.
            kept_tail = found.tail
            found.clear()
            found.tail = kept_tail
    etree.strip_tags(scratch, '*')
    return scratch.text
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored in the file `sheet` to `xml`.

    `xml` may be an element or an element tree; a bare element is wrapped
    in a tree first. Returns the result of the transformation.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        stylesheet = etree.parse(sheet_file)
        return document.xslt(stylesheet)
def replace_characters(node):
    """ Apply typographic replacements to all text in a subtree, in place.

    Byte order marks are removed, runs of hyphens become dashes, and
    straight quotes become typographic ones. Elements tagged `uwaga` or
    `extra` are emptied (only their tail is kept) before processing.
    """
    # Order matters: "---" has to be handled before "--".
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for plain, typographic in substitutions:
            text = text.replace(plain, typographic)
        return text

    if node.tag in ('uwaga', 'extra'):
        # clear() drops the tail as well, so carry it over.
        kept_tail = node.tail
        node.clear()
        node.tail = kept_tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """ Collect footnotes found under `source` into `annotations`.

    Every pe/pa/pt/pr element is copied into `annotations` with a running
    `number` and the given `part` attribute. The element left in the text is
    emptied and holds only that number. Subtrees of `extra` and `uwaga`
    elements are not searched.
    """
    for child in source:
        tag = child.tag
        if tag in ('pe', 'pa', 'pt', 'pr'):
            number = str(len(annotations) + 1)
            note = deepcopy(child)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)
            # Keep the text following the footnote; clear() would drop it.
            following_text = child.tail
            child.clear()
            child.tail = following_text
            child.text = number
        if tag in ('extra', 'uwaga'):
            continue
        find_annotations(annotations, child, part_no)
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.
    Comments and processing instructions are kept inside the verse they
    occur in.

    >>> s = etree.fromstring("<strofa>a/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a</wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """

    # A verse ends with a slash, optional whitespace and a newline.
    _verse_break = re.compile(r"/\s*\n")

    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        # verse elements built so far, in document order
        self.verses = []
        # verse currently receiving text and subelements, if any
        self.open_verse = None

    def versify(self):
        """ Rebuild the stanza in place as a sequence of verse elements. """
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        # clear() drops the stanza's tail too, so carry it over.
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        """ Start a new wers_normalny element and make it the open verse. """
        # The explicit empty attrib keeps the call valid for ElementTree
        # implementations whose makeelement() requires it.
        self.open_verse = self.stanza.makeelement("wers_normalny", {})
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """ Return the open verse, starting a normal one if there is none. """
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """ Add text found directly in the stanza, splitting it into verses. """
        if not text or not text.strip():
            return
        for i, verse_text in enumerate(self._verse_break.split(text)):
            if i:
                # every piece after the first follows a verse break
                self.open_normal_verse()
            verse = self.get_open_verse()
            if len(verse):
                verse[-1].tail = (verse[-1].tail or "") + verse_text.strip()
            else:
                verse.text = (verse.text or "") + verse_text.strip()

    def push_elem(self, elem):
        """ Add a child of the stanza.

        Elements whose tag starts with "wers" become verses of their own;
        anything else is appended to the open verse.
        """
        tag = elem.tag
        # Comments and processing instructions carry a factory function as
        # their tag instead of a string, so they can never be verses.
        is_verse = not callable(tag) and tag.startswith("wers")
        copied = deepcopy(elem)
        # The tail is pushed separately as stanza text by versify().
        copied.tail = None
        if is_verse:
            self.verses.append(copied)
            self.open_verse = copied
        else:
            self.get_open_verse().append(copied)
def replace_by_verse(tree):
    """ Turn '/' verse endings into verse elements in every stanza of the tree """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
def add_to_manifest(manifest, partno):
    """ Appends an item for chunk number `partno` to the manifest section of content.opf """
    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
def add_to_spine(spine, partno):
    """ Appends an itemref for chunk number `partno` to the spine section of content.opf """
    part_id = 'part%d' % partno
    itemref = spine.makeelement(OPFNS('itemref'), attrib={'idref': part_id})
    spine.append(itemref)
class TOC(object):
    """ A node of the table of contents tree.

    A node has a display name, the href of the file it points to, an
    optional sub-number selecting an anchor inside that file, and a list of
    child nodes. The root node is only a container: the writers below emit
    the children, never the node they are called on.
    """

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        # set only for entries that point at an anchor inside a part
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Add an entry `level` levels below this node.

        For level > 0 the entry goes under the last child, recursively; if
        there is no child to descend into, it is added at this level.
        `index` inserts at a position instead of appending and may only be
        used with level 0.

        Returns the entry's sub-number when is_part is false, else None.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)

        entry = TOC(name)
        entry.part_href = part_href
        if index is not None:
            self.children.insert(index, entry)
        else:
            self.children.append(entry)
        if not is_part:
            # Counted after insertion, so the first sub-entry gets number 2.
            entry.sub_number = len(self.children) + 1
            return entry.sub_number

    def append(self, toc):
        """ Add a whole TOC node as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Adopt the children of another TOC node. """
        self.children.extend(toc.children)

    def depth(self):
        """ Number of levels below this node; 0 for a leaf. """
        if not self.children:
            return 0
        return max(child.depth() for child in self.children) + 1

    def href(self):
        """ Target of this entry: the part file plus '#subN' if sub-numbered. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Append navPoint elements for all descendants to `nav_map`.

        `counter` numbers the points in document order; the value to use
        for the next point is returned.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Render all descendants as indented HTML links, one div per entry.

        Entry names are plain text, so they are escaped before being put
        into the markup; a missing name is rendered as an empty label.
        """
        from xml.sax.saxutils import escape

        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), escape(child.name or '')))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """ Render the full HTML table of contents from the epub/toc.html template. """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
def used_chars(element):
    """ Returns the set of characters used in an ETree Element.

    Covers the text and tail of the element itself and of everything
    below it.
    """
    found = set(element.text or '')
    found.update(element.tail or '')
    for child in element:
        found |= used_chars(child)
    return found
def chop(main_text):
    """ Divide the main content of the XML file into chunks.

    Yields an `utwor` element with a single `master` child holding copies
    of the nodes of the current chunk. The SAME element is yielded every
    time and is refilled between yields, so each chunk has to be consumed
    before the next one is requested.

    A `naglowek_czesc` header always starts a new chunk. A chapter-level
    header starts one too, unless it directly follows a part header.
    """
    chunk = etree.Element('utwor')
    master = etree.SubElement(chunk, 'master')
    chapter_headers = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    after_part_header = False
    for node in main_text:
        tag = node.tag
        starts_part = tag == 'naglowek_czesc'
        starts_chapter = not after_part_header and tag in chapter_headers
        if starts_part or starts_chapter:
            # hand out what was gathered so far, then start over
            yield chunk
            master[:] = [deepcopy(node)]
            if starts_part:
                after_part_header = True
        else:
            master.append(deepcopy(node))
            after_part_header = False
    yield chunk
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=None):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk tree; headers are looked for among the children of its
        first child, and sub-headers get a `sub` attribute set
    chunk_no: number of the chunk, used to build its partN.html href
    annotations: element collecting the footnotes found in the chunk
    empty: emit the static empty-chunk page instead of the real content
    _empty_html_static: optional list used as a cache for the empty-chunk
        page; without it the page is read from disk on every empty call
    """
    if _empty_html_static is None:
        _empty_html_static = []

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # the TOC entry links to '#sub<subnumber>' inside the part
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # close the template file instead of leaving it to the GC
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
def transform(wldoc, verbose=False,
              style=None, html_toc=False,
              sample=None, cover=None, flags=None):
    """ produces a EPUB file

    wldoc: the source document; a deep copy is made and only the copy is modified
    verbose: announce the font optimizer and let its output through
    style: path of the CSS file to embed; defaults to the bundled epub/style.css
    html_toc: also write an HTML table of contents (toc.html)
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory, called with the book info
        NOTE(review): upstream documents "or True for default", but no
        handling of True is visible here -- confirm before relying on it.
    flags: less-advertising, without-fonts, working-copy

    Returns an IOFile for the temporary file the EPUB was written to.
    """

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file

        Writes the title page (for the first file) or a part title page,
        then every chunk of the main text, into the archive.
        Returns (toc, next chunk number, used characters, remaining sample).
        """

        replace_characters(wldoc.edoc.getroot())

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zip.writestr('OPS/title.html', etree.tostring(html_tree, method="html", pretty_print=True))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                # close the template file instead of leaving it to the GC
                with open(get_resource('epub/emptyChunk.html')) as empty_chunk:
                    html_string = empty_chunk.read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    # once the sample budget is used up, remaining chunks
                    # are replaced by the empty page
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        # Processing of child documents is disabled:
        # for child in wldoc.parts():
        #     child_toc, chunk_counter, chunk_chars, sample = transform_file(
        #         child, chunk_counter, first=False, sample=sample)
        #     toc.append(child_toc)
        #     chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    document = deepcopy(wldoc)
    del wldoc

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    # add editors info (disabled)
    # document.edoc.getroot().set('editors', u', '.join(sorted(
    #     editor.readable() for editor in document.editors())))

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    # write static elements
    # the mimetype entry is written first and stored uncompressed
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    zip.writestr(mime, 'application/epub+zip')
    zip.writestr(
        'META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" '
        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
        '<rootfiles><rootfile full-path="OPS/content.opf" '
        'media-type="application/oebps-package+xml" />'
        '</rootfiles></container>')
    zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
    zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        zip.writestr('OPS/cover.html', etree.tostring(
            cover_tree, method="html", pretty_print=True))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring(
        '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
        '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
        'version="2005-1"><head></head><docTitle></docTitle><navMap>'
        '</navMap></ncx>')
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zip.writestr('OPS/annotations.html', etree.tostring(
            html_tree, method="html", pretty_print=True))

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    zip.writestr('OPS/last.html', etree.tostring(
        html_tree, method="html", pretty_print=True))

    if not flags or 'without-fonts' not in flags:
        # strip fonts: embed subsets holding only the characters in use
        tmpdir = mkdtemp('-librarian-epub')
        # subset.pl is run from its own directory; handing that to the child
        # as cwd avoids changing the working directory of this process.
        optimizer_dir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer')
        used = ''.join(chars).encode('utf-8')
        try:
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                optimizer_call = ['perl', 'subset.pl', '--chars', used,
                                  get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
                if verbose:
                    print("Running font-optimizer")
                    subprocess.check_call(optimizer_call, cwd=optimizer_dir)
                else:
                    # Discard the output. A PIPE nobody reads can fill up
                    # and block the child forever.
                    with open(os.devnull, 'w') as devnull:
                        subprocess.check_call(optimizer_call, cwd=optimizer_dir,
                                              stdout=devnull, stderr=devnull)
                zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
                manifest.append(etree.fromstring(
                    '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
        finally:
            # remove the temp dir even when the optimizer fails
            rmtree(tmpdir)

    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
    toc_file[0][1].set('content', str(toc.depth()))
    # The title is plain text going into an XML fragment that gets parsed,
    # so markup characters in it have to be escaped.
    from xml.sax.saxutils import escape
    set_inner_xml(toc_file[1], ''.join(('<text>', escape(title), '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    zip.close()

    return IOFile.from_filename(output_file.name)