1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, IOFile
21 from librarian import functions, get_resource
23 functions.reg_person_name()
27 """ returns node's text and children as a string
29 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
33 nt = node.text if node.text is not None else ''
34 return ''.join([nt] + [etree.tostring(child) for child in node])
36 def set_inner_xml(node, text):
37 """ sets node's text and children from a string
39 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
40 >>> set_inner_xml(e, 'x<b>y</b>z')
41 >>> print etree.tostring(e)
45 p = etree.fromstring('<x>%s</x>' % text)
51 """ Find out a node's name
53 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
57 tempnode = deepcopy(node)
59 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
60 for e in tempnode.findall('.//%s' % p):
64 etree.strip_tags(tempnode, '*')
69 if isinstance(xml, etree._Element):
70 xml = etree.ElementTree(xml)
71 with open(sheet) as xsltf:
72 return xml.xslt(etree.parse(xsltf))
75 def replace_characters(node):
76 def replace_chars(text):
79 return text.replace(u"\ufeff", u"")\
80 .replace("---", u"\u2014")\
81 .replace("--", u"\u2013")\
82 .replace(",,", u"\u201E")\
83 .replace('"', u"\u201D")\
84 .replace("'", u"\u2019")
85 if node.tag in ('uwaga', 'extra'):
89 node.text = replace_chars(node.text)
90 node.tail = replace_chars(node.tail)
92 replace_characters(child)
95 def find_annotations(annotations, source, part_no):
97 if child.tag in ('pe', 'pa', 'pt', 'pr'):
98 annotation = deepcopy(child)
99 number = str(len(annotations)+1)
100 annotation.set('number', number)
101 annotation.set('part', str(part_no))
103 annotations.append(annotation)
108 if child.tag not in ('extra', 'uwaga'):
109 find_annotations(annotations, child, part_no)
112 class Stanza(object):
114 Converts / verse endings into verse elements in a stanza.
116 Slashes may only occur directly in the stanza. Any slashes in subelements
117 will be ignored, and the subelements will be put inside verse elements.
119 >>> s = etree.fromstring("<strofa>a/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
120 >>> Stanza(s).versify()
121 >>> print etree.tostring(s)
122 <strofa><wers_normalny>a</wers_normalny><wers_normalny>b<x>x/
123 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
126 def __init__(self, stanza_elem):
127 self.stanza = stanza_elem
129 self.open_verse = None
132 self.push_text(self.stanza.text)
133 for elem in self.stanza:
135 self.push_text(elem.tail)
136 tail = self.stanza.tail
138 self.stanza.tail = tail
139 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """Start a new ``wers_normalny`` verse and make it the open one.

    The element is created through the stanza's own ``makeelement`` and
    is recorded both as ``self.open_verse`` and at the end of
    ``self.verses``.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """Return the verse currently being filled.

    If no verse is open yet (``self.open_verse`` is ``None``), a normal
    verse is opened first via ``open_normal_verse`` and then returned.
    """
    if self.open_verse is not None:
        return self.open_verse
    self.open_normal_verse()
    return self.open_verse
150 def push_text(self, text):
151 if not text or not text.strip():
153 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
155 self.open_normal_verse()
156 verse = self.get_open_verse()
158 verse[-1].tail = (verse[-1].tail or "") + verse_text.strip()
160 verse.text = (verse.text or "") + verse_text.strip()
162 def push_elem(self, elem):
163 if elem.tag.startswith("wers"):
164 verse = deepcopy(elem)
166 self.verses.append(verse)
167 self.open_verse = verse
169 appended = deepcopy(elem)
171 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Every ``strofa`` element below *tree* is looked up (tag built with
    ``WLNS``) and versified in place through ``Stanza``; nothing is
    returned.
    """
    for stanza_elem in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza_elem).versify()
182 def add_to_manifest(manifest, partno):
183 """ Adds a node to the manifest section in content.opf file """
185 partstr = 'part%d' % partno
186 e = manifest.makeelement(OPFNS('item'), attrib={
188 'href': partstr + '.html',
189 'media-type': 'application/xhtml+xml',
194 def add_to_spine(spine, partno):
195 """ Adds a node to the spine section in content.opf file """
197 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
202 def __init__(self, name=None, part_href=None):
205 self.part_href = part_href
206 self.sub_number = None
208 def add(self, name, part_href, level=0, is_part=True, index=None):
209 assert level == 0 or index is None
210 if level > 0 and self.children:
211 return self.children[-1].add(name, part_href, level-1, is_part)
214 t.part_href = part_href
215 if index is not None:
216 self.children.insert(index, t)
218 self.children.append(t)
220 t.sub_number = len(self.children) + 1
def append(self, toc):
    """Attach *toc* itself as the last direct child of this entry."""
    siblings = self.children
    siblings.append(toc)
def extend(self, toc):
    """Adopt the direct children of *toc* (not *toc* itself).

    The children are added, in order, after this entry's existing
    children; *toc* is left unmodified.
    """
    adopted = toc.children
    self.children.extend(adopted)
231 return max((c.depth() for c in self.children)) + 1
237 if self.sub_number is not None:
238 src += '#sub%d' % self.sub_number
241 def write_to_xml(self, nav_map, counter=1):
242 for child in self.children:
243 nav_point = nav_map.makeelement(NCXNS('navPoint'))
244 nav_point.set('id', 'NavPoint-%d' % counter)
245 nav_point.set('playOrder', str(counter))
247 nav_label = nav_map.makeelement(NCXNS('navLabel'))
248 text = nav_map.makeelement(NCXNS('text'))
249 text.text = child.name
250 nav_label.append(text)
251 nav_point.append(nav_label)
253 content = nav_map.makeelement(NCXNS('content'))
254 content.set('src', child.href())
255 nav_point.append(content)
256 nav_map.append(nav_point)
257 counter = child.write_to_xml(nav_point, counter + 1)
260 def html_part(self, depth=0):
262 for child in self.children:
264 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
265 (depth, child.href(), child.name))
266 texts.append(child.html_part(depth+1))
267 return "\n".join(texts)
270 with open(get_resource('epub/toc.html')) as f:
271 t = unicode(f.read(), 'utf-8')
272 return t % self.html_part()
275 def used_chars(element):
276 """ Lists characters used in an ETree Element """
277 chars = set((element.text or '') + (element.tail or ''))
278 for child in element:
279 chars = chars.union(used_chars(child))
284 """ divide main content of the XML file into chunks """
286 # prepare a container for each chunk
287 part_xml = etree.Element('utwor')
288 etree.SubElement(part_xml, 'master')
289 main_xml_part = part_xml[0] # master
291 last_node_part = False
292 for one_part in main_text:
294 if name == 'naglowek_czesc':
296 last_node_part = True
297 main_xml_part[:] = [deepcopy(one_part)]
298 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
300 main_xml_part[:] = [deepcopy(one_part)]
302 main_xml_part.append(deepcopy(one_part))
303 last_node_part = False
307 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
308 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
311 for element in chunk_xml[0]:
312 if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
313 toc.add(node_name(element), "part%d.html" % chunk_no)
314 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
315 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
316 element.set('sub', str(subnumber))
318 if not _empty_html_static:
319 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
321 output_html = _empty_html_static[0]
323 find_annotations(annotations, chunk_xml, chunk_no)
324 replace_by_verse(chunk_xml)
325 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
326 chars = used_chars(html_tree.getroot())
327 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
328 return output_html, toc, chars
331 def transform(wldoc, verbose=False,
332 style=None, html_toc=False,
333 sample=None, cover=None, flags=None):
334 """ produces a EPUB file
336 sample=n: generate sample e-book (with at least n paragraphs)
337 cover: a cover.Cover factory or True for default
338 flags: less-advertising, without-fonts, working-copy
341 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
342 """ processes one input file and proceeds to its children """
344 replace_characters(wldoc.edoc.getroot())
346 # every input file will have a TOC entry,
347 # pointing to starting chunk
348 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
351 # write book title page
352 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
353 chars = used_chars(html_tree.getroot())
354 zip.writestr('OPS/title.html',
355 etree.tostring(html_tree, method="html", pretty_print=True))
356 # add a title page TOC entry
357 toc.add(u"Strona tytułowa", "title.html")
358 elif wldoc.book_info.parts:
359 # write title page for every parent
360 if sample is not None and sample <= 0:
362 html_string = open(get_resource('epub/emptyChunk.html')).read()
364 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
365 chars = used_chars(html_tree.getroot())
366 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
367 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
368 add_to_manifest(manifest, chunk_counter)
369 add_to_spine(spine, chunk_counter)
372 if len(wldoc.edoc.getroot()) > 1:
373 # rdf before style master
374 main_text = wldoc.edoc.getroot()[1]
376 # rdf in style master
377 main_text = wldoc.edoc.getroot()[0]
378 if main_text.tag == RDFNS('RDF'):
381 if main_text is not None:
382 for chunk_xml in chop(main_text):
384 if sample is not None:
388 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
389 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
391 toc.extend(chunk_toc)
392 chars = chars.union(chunk_chars)
393 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
394 add_to_manifest(manifest, chunk_counter)
395 add_to_spine(spine, chunk_counter)
398 # for child in wldoc.parts():
399 # child_toc, chunk_counter, chunk_chars, sample = transform_file(
400 # child, chunk_counter, first=False, sample=sample)
401 # toc.append(child_toc)
402 # chars = chars.union(chunk_chars)
404 return toc, chunk_counter, chars, sample
407 document = deepcopy(wldoc)
412 document.edoc.getroot().set(flag, 'yes')
415 # document.edoc.getroot().set('editors', u', '.join(sorted(
416 # editor.readable() for editor in document.editors())))
418 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
419 manifest = opf.find('.//' + OPFNS('manifest'))
420 guide = opf.find('.//' + OPFNS('guide'))
421 spine = opf.find('.//' + OPFNS('spine'))
423 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
424 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
426 # write static elements
427 mime = zipfile.ZipInfo()
428 mime.filename = 'mimetype'
429 mime.compress_type = zipfile.ZIP_STORED
431 zip.writestr(mime, 'application/epub+zip')
432 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
433 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
434 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
435 'media-type="application/oebps-package+xml" />' \
436 '</rootfiles></container>')
437 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
438 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
440 style = get_resource('epub/style.css')
441 zip.write(style, os.path.join('OPS', 'style.css'))
444 cover_file = StringIO()
445 bound_cover = cover(document.book_info)
446 bound_cover.save(cover_file)
447 cover_name = 'cover.%s' % bound_cover.ext()
448 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
451 cover_tree = etree.parse(get_resource('epub/cover.html'))
452 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
453 zip.writestr('OPS/cover.html', etree.tostring(
454 cover_tree, method="html", pretty_print=True))
456 if bound_cover.uses_dc_cover:
457 if document.book_info.cover_by:
458 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
459 if document.book_info.cover_source:
460 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
462 manifest.append(etree.fromstring(
463 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
464 manifest.append(etree.fromstring(
465 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
466 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
467 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
468 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
471 annotations = etree.Element('annotations')
473 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
474 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
475 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
476 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
478 nav_map = toc_file[-1]
481 manifest.append(etree.fromstring(
482 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
483 spine.append(etree.fromstring(
484 '<itemref idref="html_toc" />'))
485 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
487 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
489 if len(toc.children) < 2:
490 toc.add(u"Początek utworu", "part1.html")
492 # Last modifications in container files and EPUB creation
493 if len(annotations) > 0:
494 toc.add("Przypisy", "annotations.html")
495 manifest.append(etree.fromstring(
496 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
497 spine.append(etree.fromstring(
498 '<itemref idref="annotations" />'))
499 replace_by_verse(annotations)
500 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
501 chars = chars.union(used_chars(html_tree.getroot()))
502 zip.writestr('OPS/annotations.html', etree.tostring(
503 html_tree, method="html", pretty_print=True))
505 toc.add("Strona redakcyjna", "last.html")
506 manifest.append(etree.fromstring(
507 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
508 spine.append(etree.fromstring(
509 '<itemref idref="last" />'))
510 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
511 chars.update(used_chars(html_tree.getroot()))
512 zip.writestr('OPS/last.html', etree.tostring(
513 html_tree, method="html", pretty_print=True))
515 if not flags or not 'without-fonts' in flags:
517 tmpdir = mkdtemp('-librarian-epub')
523 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
524 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
525 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
526 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
528 print "Running font-optimizer"
529 subprocess.check_call(optimizer_call)
531 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
532 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
533 manifest.append(etree.fromstring(
534 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
539 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
540 title = document.book_info.title
541 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
542 for st in attributes:
543 meta = toc_file.makeelement(NCXNS('meta'))
545 meta.set('content', '0')
546 toc_file[0].append(meta)
547 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
548 toc_file[0][1].set('content', str(toc.depth()))
549 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
553 toc.add(u"Spis treści", "toc.html", index=1)
554 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
555 toc.write_to_xml(nav_map)
556 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
559 return IOFile.from_filename(output_file.name)