1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import WLCover
22 from librarian import functions, get_resource
24 functions.reg_person_name()
28 """ returns node's text and children as a string
30 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
34 nt = node.text if node.text is not None else ''
35 return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    `text` is parsed as the content of a temporary wrapper element, so it
    has to be a well-formed XML fragment (plain text, optionally mixed with
    elements); the parser's own error is propagated otherwise.
    The node's tag, attributes and tail are not modified.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # Wrap the fragment so that leading text and several sibling elements
    # can be parsed as one document.
    p = etree.fromstring('<x>%s</x>' % text)
    # Swap the node's previous content for the parsed text and children.
    node.text = p.text
    node[:] = p[:]
52 """ Find out a node's name
54 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
58 tempnode = deepcopy(node)
60 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
61 for e in tempnode.findall('.//%s' % p):
65 etree.strip_tags(tempnode, '*')
70 if isinstance(xml, etree._Element):
71 xml = etree.ElementTree(xml)
72 with open(sheet) as xsltf:
73 return xml.xslt(etree.parse(xsltf))
76 def replace_characters(node):
77 def replace_chars(text):
80 return text.replace(u"\ufeff", u"")\
81 .replace("---", u"\u2014")\
82 .replace("--", u"\u2013")\
83 .replace(",,", u"\u201E")\
84 .replace('"', u"\u201D")\
85 .replace("'", u"\u2019")
86 if node.tag in ('uwaga', 'extra'):
90 node.text = replace_chars(node.text)
91 node.tail = replace_chars(node.tail)
93 replace_characters(child)
96 def find_annotations(annotations, source, part_no):
98 if child.tag in ('pe', 'pa', 'pt', 'pr'):
99 annotation = deepcopy(child)
100 number = str(len(annotations)+1)
101 annotation.set('number', number)
102 annotation.set('part', str(part_no))
104 annotations.append(annotation)
109 if child.tag not in ('extra', 'uwaga'):
110 find_annotations(annotations, child, part_no)
113 class Stanza(object):
115 Converts / verse endings into verse elements in a stanza.
117 Slashes may only occur directly in the stanza. Any slashes in subelements
118 will be ignored, and the subelements will be put inside verse elements.
120 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
121 >>> Stanza(s).versify()
122 >>> print etree.tostring(s)
123 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
124 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
127 def __init__(self, stanza_elem):
128 self.stanza = stanza_elem
130 self.open_verse = None
133 self.push_text(self.stanza.text)
134 for elem in self.stanza:
136 self.push_text(elem.tail)
137 tail = self.stanza.tail
139 self.stanza.tail = tail
140 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """ Start a fresh ``wers_normalny`` element and make it the open verse.

    The element is created through the stanza's ``makeelement`` and is
    also recorded at the end of ``self.verses``.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """ Return the verse currently being filled, opening a normal one
    first when there is none.
    """
    current = self.open_verse
    if current is not None:
        return current
    # Nothing is open yet: start a normal verse and hand that one out.
    self.open_normal_verse()
    return self.open_verse
151 def push_text(self, text):
154 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
156 self.open_normal_verse()
157 verse = self.get_open_verse()
159 verse[-1].tail = (verse[-1].tail or "") + verse_text
161 verse.text = (verse.text or "") + verse_text
163 def push_elem(self, elem):
164 if elem.tag.startswith("wers"):
165 verse = deepcopy(elem)
167 self.verses.append(verse)
168 self.open_verse = verse
170 appended = deepcopy(elem)
172 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Versify every stanza found below `tree`.

    Each descendant matching the WL-namespaced 'strofa' tag is handed to
    Stanza, which rewrites it in place so that '/' verse endings become
    verse elements.
    """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
183 def add_to_manifest(manifest, partno):
184 """ Adds a node to the manifest section in content.opf file """
186 partstr = 'part%d' % partno
187 e = manifest.makeelement(OPFNS('item'), attrib={
189 'href': partstr + '.html',
190 'media-type': 'application/xhtml+xml',
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    An OPF ``itemref`` element with ``idref="part<partno>"`` is created
    and appended as the last child of `spine`.
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    # Creating the element is not enough: it has to be attached to the
    # spine to end up in the generated content.opf.
    spine.append(e)
203 def __init__(self, name=None, part_href=None):
206 self.part_href = part_href
207 self.sub_number = None
209 def add(self, name, part_href, level=0, is_part=True, index=None):
210 assert level == 0 or index is None
211 if level > 0 and self.children:
212 return self.children[-1].add(name, part_href, level-1, is_part)
215 t.part_href = part_href
216 if index is not None:
217 self.children.insert(index, t)
219 self.children.append(t)
221 t.sub_number = len(self.children) + 1
def append(self, toc):
    """ Attach `toc` as the last child entry of this one. """
    entries = self.children
    entries.append(toc)
def extend(self, toc):
    """ Add all children of `toc` (not `toc` itself) to this entry's
    children, keeping their order.
    """
    incoming = toc.children
    self.children.extend(incoming)
232 return max((c.depth() for c in self.children)) + 1
238 if self.sub_number is not None:
239 src += '#sub%d' % self.sub_number
def write_to_xml(self, nav_map, counter=1):
    """ Write this entry's children as NCX navPoint elements under `nav_map`.

    Every child becomes a ``navPoint`` holding a ``navLabel``/``text``
    with the child's name and a ``content`` element whose ``src`` is the
    child's href(); the child's own children are written recursively
    inside that navPoint.

    `counter` numbers the points (used for both ``id`` and
    ``playOrder``) and keeps increasing across the recursion.

    Returns the first counter value that has not been used yet.
    """
    for child in self.children:
        nav_point = nav_map.makeelement(NCXNS('navPoint'))
        nav_point.set('id', 'NavPoint-%d' % counter)
        nav_point.set('playOrder', str(counter))

        nav_label = nav_map.makeelement(NCXNS('navLabel'))
        text = nav_map.makeelement(NCXNS('text'))
        text.text = child.name
        nav_label.append(text)
        nav_point.append(nav_label)

        content = nav_map.makeelement(NCXNS('content'))
        content.set('src', child.href())
        nav_point.append(content)
        nav_map.append(nav_point)
        # The recursive call hands back the next free number, so that
        # siblings continue after everything nested below this child.
        counter = child.write_to_xml(nav_point, counter + 1)
    # Without this, the assignment above would set counter to None.
    return counter
261 def html_part(self, depth=0):
263 for child in self.children:
265 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
266 (depth, child.href(), child.name))
267 texts.append(child.html_part(depth+1))
268 return "\n".join(texts)
271 with open(get_resource('epub/toc.html')) as f:
272 t = unicode(f.read(), 'utf-8')
273 return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set with every character that occurs in the text or tail of
    `element` or, recursively, of any of its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars.update(used_chars(child))
    # Callers (including the recursion above) rely on the returned set.
    return chars
285 """ divide main content of the XML file into chunks """
287 # prepare a container for each chunk
288 part_xml = etree.Element('utwor')
289 etree.SubElement(part_xml, 'master')
290 main_xml_part = part_xml[0] # master
292 last_node_part = False
293 for one_part in main_text:
295 if name == 'naglowek_czesc':
297 last_node_part = True
298 main_xml_part[:] = [deepcopy(one_part)]
299 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
301 main_xml_part[:] = [deepcopy(one_part)]
303 main_xml_part.append(deepcopy(one_part))
304 last_node_part = False
308 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
309 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
312 for element in chunk_xml[0]:
313 if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
314 toc.add(node_name(element), "part%d.html" % chunk_no)
315 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
316 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
317 element.set('sub', str(subnumber))
319 if not _empty_html_static:
320 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
322 output_html = _empty_html_static[0]
324 find_annotations(annotations, chunk_xml, chunk_no)
325 replace_by_verse(chunk_xml)
326 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
327 chars = used_chars(html_tree.getroot())
328 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
329 return output_html, toc, chars
332 def transform(wldoc, verbose=False,
333 style=None, html_toc=False,
334 sample=None, cover=None, flags=None):
335 """ produces a EPUB file
337 sample=n: generate sample e-book (with at least n paragraphs)
338 cover: a cover.Cover factory or True for default
339 flags: less-advertising, without-fonts, working-copy
342 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
343 """ processes one input file and proceeds to its children """
345 replace_characters(wldoc.edoc.getroot())
347 # every input file will have a TOC entry,
348 # pointing to starting chunk
349 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
352 # write book title page
353 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
354 chars = used_chars(html_tree.getroot())
355 zip.writestr('OPS/title.html',
356 etree.tostring(html_tree, method="html", pretty_print=True))
357 # add a title page TOC entry
358 toc.add(u"Strona tytułowa", "title.html")
359 elif wldoc.book_info.parts:
360 # write title page for every parent
361 if sample is not None and sample <= 0:
363 html_string = open(get_resource('epub/emptyChunk.html')).read()
365 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
366 chars = used_chars(html_tree.getroot())
367 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
368 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
369 add_to_manifest(manifest, chunk_counter)
370 add_to_spine(spine, chunk_counter)
373 if len(wldoc.edoc.getroot()) > 1:
374 # rdf before style master
375 main_text = wldoc.edoc.getroot()[1]
377 # rdf in style master
378 main_text = wldoc.edoc.getroot()[0]
379 if main_text.tag == RDFNS('RDF'):
382 if main_text is not None:
383 for chunk_xml in chop(main_text):
385 if sample is not None:
389 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
390 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
392 toc.extend(chunk_toc)
393 chars = chars.union(chunk_chars)
394 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
395 add_to_manifest(manifest, chunk_counter)
396 add_to_spine(spine, chunk_counter)
399 for child in wldoc.parts():
400 child_toc, chunk_counter, chunk_chars, sample = transform_file(
401 child, chunk_counter, first=False, sample=sample)
402 toc.append(child_toc)
403 chars = chars.union(chunk_chars)
405 return toc, chunk_counter, chars, sample
408 document = deepcopy(wldoc)
413 document.edoc.getroot().set(flag, 'yes')
416 document.edoc.getroot().set('editors', u', '.join(sorted(
417 editor.readable() for editor in document.editors())))
418 if document.book_info.funders:
419 document.edoc.getroot().set('funders', u', '.join(
420 document.book_info.funders))
422 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
423 manifest = opf.find('.//' + OPFNS('manifest'))
424 guide = opf.find('.//' + OPFNS('guide'))
425 spine = opf.find('.//' + OPFNS('spine'))
427 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
428 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
430 # write static elements
431 mime = zipfile.ZipInfo()
432 mime.filename = 'mimetype'
433 mime.compress_type = zipfile.ZIP_STORED
435 zip.writestr(mime, 'application/epub+zip')
436 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
437 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
438 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
439 'media-type="application/oebps-package+xml" />' \
440 '</rootfiles></container>')
441 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
442 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
444 style = get_resource('epub/style.css')
445 zip.write(style, os.path.join('OPS', 'style.css'))
451 cover_file = StringIO()
452 bound_cover = cover(document.book_info)
453 bound_cover.save(cover_file)
454 cover_name = 'cover.%s' % bound_cover.ext()
455 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
458 cover_tree = etree.parse(get_resource('epub/cover.html'))
459 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
460 zip.writestr('OPS/cover.html', etree.tostring(
461 cover_tree, method="html", pretty_print=True))
463 if bound_cover.uses_dc_cover:
464 if document.book_info.cover_by:
465 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
466 if document.book_info.cover_source:
467 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
469 manifest.append(etree.fromstring(
470 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
471 manifest.append(etree.fromstring(
472 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
473 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
474 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
475 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
478 annotations = etree.Element('annotations')
480 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
481 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
482 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
483 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
485 nav_map = toc_file[-1]
488 manifest.append(etree.fromstring(
489 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
490 spine.append(etree.fromstring(
491 '<itemref idref="html_toc" />'))
492 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
494 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
496 if len(toc.children) < 2:
497 toc.add(u"Początek utworu", "part1.html")
499 # Last modifications in container files and EPUB creation
500 if len(annotations) > 0:
501 toc.add("Przypisy", "annotations.html")
502 manifest.append(etree.fromstring(
503 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
504 spine.append(etree.fromstring(
505 '<itemref idref="annotations" />'))
506 replace_by_verse(annotations)
507 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
508 chars = chars.union(used_chars(html_tree.getroot()))
509 zip.writestr('OPS/annotations.html', etree.tostring(
510 html_tree, method="html", pretty_print=True))
512 toc.add("Weprzyj Wolne Lektury", "support.html")
513 manifest.append(etree.fromstring(
514 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
515 spine.append(etree.fromstring(
516 '<itemref idref="support" />'))
517 html_string = open(get_resource('epub/support.html')).read()
518 chars.update(used_chars(etree.fromstring(html_string)))
519 zip.writestr('OPS/support.html', html_string)
521 toc.add("Strona redakcyjna", "last.html")
522 manifest.append(etree.fromstring(
523 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
524 spine.append(etree.fromstring(
525 '<itemref idref="last" />'))
526 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
527 chars.update(used_chars(html_tree.getroot()))
528 zip.writestr('OPS/last.html', etree.tostring(
529 html_tree, method="html", pretty_print=True))
531 if not flags or not 'without-fonts' in flags:
533 tmpdir = mkdtemp('-librarian-epub')
539 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
540 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
541 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
542 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
544 print "Running font-optimizer"
545 subprocess.check_call(optimizer_call)
547 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
548 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
549 manifest.append(etree.fromstring(
550 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
555 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
556 title = document.book_info.title
557 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
558 for st in attributes:
559 meta = toc_file.makeelement(NCXNS('meta'))
561 meta.set('content', '0')
562 toc_file[0].append(meta)
563 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
564 toc_file[0][1].set('content', str(toc.depth()))
565 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
569 toc.add(u"Spis treści", "toc.html", index=1)
570 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
571 toc.write_to_xml(nav_map)
572 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
575 return OutputFile.from_filename(output_file.name)