1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import WLCover
22 from librarian import functions, get_resource
24 functions.reg_person_name()
28 """ returns node's text and children as a string
30 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
34 nt = node.text if node.text is not None else ''
35 return ''.join([nt] + [etree.tostring(child) for child in node])
37 def set_inner_xml(node, text):
38 """ sets node's text and children from a string
40 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
41 >>> set_inner_xml(e, 'x<b>y</b>z')
42 >>> print etree.tostring(e)
46 p = etree.fromstring('<x>%s</x>' % text)
52 """ Find out a node's name
54 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
58 tempnode = deepcopy(node)
60 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
61 for e in tempnode.findall('.//%s' % p):
65 etree.strip_tags(tempnode, '*')
70 if isinstance(xml, etree._Element):
71 xml = etree.ElementTree(xml)
72 with open(sheet) as xsltf:
73 return xml.xslt(etree.parse(xsltf))
76 def replace_characters(node):
77 def replace_chars(text):
80 return text.replace(u"\ufeff", u"")\
81 .replace("---", u"\u2014")\
82 .replace("--", u"\u2013")\
83 .replace(",,", u"\u201E")\
84 .replace('"', u"\u201D")\
85 .replace("'", u"\u2019")
86 if node.tag in ('uwaga', 'extra'):
90 node.text = replace_chars(node.text)
91 node.tail = replace_chars(node.tail)
93 replace_characters(child)
96 def find_annotations(annotations, source, part_no):
98 if child.tag in ('pe', 'pa', 'pt', 'pr'):
99 annotation = deepcopy(child)
100 number = str(len(annotations)+1)
101 annotation.set('number', number)
102 annotation.set('part', str(part_no))
104 annotations.append(annotation)
109 if child.tag not in ('extra', 'uwaga'):
110 find_annotations(annotations, child, part_no)
113 class Stanza(object):
115 Converts / verse endings into verse elements in a stanza.
117 Slashes may only occur directly in the stanza. Any slashes in subelements
118 will be ignored, and the subelements will be put inside verse elements.
120 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
121 >>> Stanza(s).versify()
122 >>> print etree.tostring(s)
123 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
124 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
127 def __init__(self, stanza_elem):
128 self.stanza = stanza_elem
130 self.open_verse = None
133 self.push_text(self.stanza.text)
134 for elem in self.stanza:
136 self.push_text(elem.tail)
137 tail = self.stanza.tail
139 self.stanza.tail = tail
140 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """Start a new ``wers_normalny`` verse and make it the open one.

    The element is created through the stanza's ``makeelement`` and is
    recorded both as ``self.open_verse`` and as the last entry of
    ``self.verses``.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """Return the verse currently being filled.

    When no verse is open yet (``self.open_verse`` is ``None``), a normal
    verse is opened first via ``open_normal_verse`` and then returned.
    """
    if self.open_verse is not None:
        return self.open_verse
    self.open_normal_verse()
    return self.open_verse
151 def push_text(self, text):
154 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
156 self.open_normal_verse()
157 verse = self.get_open_verse()
159 verse[-1].tail = (verse[-1].tail or "") + verse_text
161 verse.text = (verse.text or "") + verse_text
163 def push_elem(self, elem):
164 if elem.tag.startswith("wers"):
165 verse = deepcopy(elem)
167 self.verses.append(verse)
168 self.open_verse = verse
170 appended = deepcopy(elem)
172 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """Rewrite every stanza below ``tree`` so its verses become elements.

    Each descendant matching ``WLNS('strofa')`` is wrapped in a ``Stanza``
    and versified in place; nothing is returned.
    """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
183 def add_to_manifest(manifest, partno):
184 """ Adds a node to the manifest section in content.opf file """
186 partstr = 'part%d' % partno
187 e = manifest.makeelement(OPFNS('item'), attrib={
189 'href': partstr + '.html',
190 'media-type': 'application/xhtml+xml',
195 def add_to_spine(spine, partno):
196 """ Adds a node to the spine section in content.opf file """
198 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
203 def __init__(self, name=None, part_href=None):
206 self.part_href = part_href
207 self.sub_number = None
209 def add(self, name, part_href, level=0, is_part=True, index=None):
210 assert level == 0 or index is None
211 if level > 0 and self.children:
212 return self.children[-1].add(name, part_href, level-1, is_part)
215 t.part_href = part_href
216 if index is not None:
217 self.children.insert(index, t)
219 self.children.append(t)
221 t.sub_number = len(self.children) + 1
def append(self, toc):
    """Add ``toc`` itself as the last child of this entry."""
    siblings = self.children
    siblings.append(toc)
def extend(self, toc):
    """Add the children of ``toc`` (not ``toc`` itself) to this entry's children."""
    adopted = toc.children
    self.children.extend(adopted)
232 return max((c.depth() for c in self.children)) + 1
238 if self.sub_number is not None:
239 src += '#sub%d' % self.sub_number
242 def write_to_xml(self, nav_map, counter=1):
243 for child in self.children:
244 nav_point = nav_map.makeelement(NCXNS('navPoint'))
245 nav_point.set('id', 'NavPoint-%d' % counter)
246 nav_point.set('playOrder', str(counter))
248 nav_label = nav_map.makeelement(NCXNS('navLabel'))
249 text = nav_map.makeelement(NCXNS('text'))
250 text.text = child.name
251 nav_label.append(text)
252 nav_point.append(nav_label)
254 content = nav_map.makeelement(NCXNS('content'))
255 content.set('src', child.href())
256 nav_point.append(content)
257 nav_map.append(nav_point)
258 counter = child.write_to_xml(nav_point, counter + 1)
261 def html_part(self, depth=0):
263 for child in self.children:
265 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
266 (depth, child.href(), child.name))
267 texts.append(child.html_part(depth+1))
268 return "\n".join(texts)
271 with open(get_resource('epub/toc.html')) as f:
272 t = unicode(f.read(), 'utf-8')
273 return t % self.html_part()
276 def used_chars(element):
277 """ Lists characters used in an ETree Element """
278 chars = set((element.text or '') + (element.tail or ''))
279 for child in element:
280 chars = chars.union(used_chars(child))
285 """ divide main content of the XML file into chunks """
287 # prepare a container for each chunk
288 part_xml = etree.Element('utwor')
289 etree.SubElement(part_xml, 'master')
290 main_xml_part = part_xml[0] # master
292 last_node_part = False
293 for one_part in main_text:
295 if name == 'naglowek_czesc':
297 last_node_part = True
298 main_xml_part[:] = [deepcopy(one_part)]
299 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
301 main_xml_part[:] = [deepcopy(one_part)]
303 main_xml_part.append(deepcopy(one_part))
304 last_node_part = False
308 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
309 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
312 for element in chunk_xml[0]:
313 if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
314 toc.add(node_name(element), "part%d.html" % chunk_no)
315 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
316 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
317 element.set('sub', str(subnumber))
319 if not _empty_html_static:
320 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
322 output_html = _empty_html_static[0]
324 find_annotations(annotations, chunk_xml, chunk_no)
325 replace_by_verse(chunk_xml)
326 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
327 chars = used_chars(html_tree.getroot())
328 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
329 return output_html, toc, chars
332 def transform(wldoc, verbose=False,
333 style=None, html_toc=False,
334 sample=None, cover=None, flags=None):
335 """ produces a EPUB file
337 sample=n: generate sample e-book (with at least n paragraphs)
338 cover: a cover.Cover factory or True for default
339 flags: less-advertising, without-fonts, working-copy
342 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
343 """ processes one input file and proceeds to its children """
345 replace_characters(wldoc.edoc.getroot())
347 # every input file will have a TOC entry,
348 # pointing to starting chunk
349 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
352 # write book title page
353 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
354 chars = used_chars(html_tree.getroot())
355 zip.writestr('OPS/title.html',
356 etree.tostring(html_tree, method="html", pretty_print=True))
357 # add a title page TOC entry
358 toc.add(u"Strona tytułowa", "title.html")
359 elif wldoc.book_info.parts:
360 # write title page for every parent
361 if sample is not None and sample <= 0:
363 html_string = open(get_resource('epub/emptyChunk.html')).read()
365 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
366 chars = used_chars(html_tree.getroot())
367 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
368 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
369 add_to_manifest(manifest, chunk_counter)
370 add_to_spine(spine, chunk_counter)
373 if len(wldoc.edoc.getroot()) > 1:
374 # rdf before style master
375 main_text = wldoc.edoc.getroot()[1]
377 # rdf in style master
378 main_text = wldoc.edoc.getroot()[0]
379 if main_text.tag == RDFNS('RDF'):
382 if main_text is not None:
383 for chunk_xml in chop(main_text):
385 if sample is not None:
389 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
390 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
392 toc.extend(chunk_toc)
393 chars = chars.union(chunk_chars)
394 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
395 add_to_manifest(manifest, chunk_counter)
396 add_to_spine(spine, chunk_counter)
399 for child in wldoc.parts():
400 child_toc, chunk_counter, chunk_chars, sample = transform_file(
401 child, chunk_counter, first=False, sample=sample)
402 toc.append(child_toc)
403 chars = chars.union(chunk_chars)
405 return toc, chunk_counter, chars, sample
408 document = deepcopy(wldoc)
413 document.edoc.getroot().set(flag, 'yes')
416 document.edoc.getroot().set('editors', u', '.join(sorted(
417 editor.readable() for editor in document.editors())))
419 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
420 manifest = opf.find('.//' + OPFNS('manifest'))
421 guide = opf.find('.//' + OPFNS('guide'))
422 spine = opf.find('.//' + OPFNS('spine'))
424 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
425 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
427 # write static elements
428 mime = zipfile.ZipInfo()
429 mime.filename = 'mimetype'
430 mime.compress_type = zipfile.ZIP_STORED
432 zip.writestr(mime, 'application/epub+zip')
433 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
434 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
435 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
436 'media-type="application/oebps-package+xml" />' \
437 '</rootfiles></container>')
438 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
439 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
441 style = get_resource('epub/style.css')
442 zip.write(style, os.path.join('OPS', 'style.css'))
448 cover_file = StringIO()
449 bound_cover = cover(document.book_info)
450 bound_cover.save(cover_file)
451 cover_name = 'cover.%s' % bound_cover.ext()
452 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
455 cover_tree = etree.parse(get_resource('epub/cover.html'))
456 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
457 zip.writestr('OPS/cover.html', etree.tostring(
458 cover_tree, method="html", pretty_print=True))
460 if bound_cover.uses_dc_cover:
461 if document.book_info.cover_by:
462 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
463 if document.book_info.cover_source:
464 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
466 manifest.append(etree.fromstring(
467 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
468 manifest.append(etree.fromstring(
469 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
470 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
471 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
472 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
475 annotations = etree.Element('annotations')
477 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
478 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
479 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
480 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
482 nav_map = toc_file[-1]
485 manifest.append(etree.fromstring(
486 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
487 spine.append(etree.fromstring(
488 '<itemref idref="html_toc" />'))
489 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
491 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
493 if len(toc.children) < 2:
494 toc.add(u"Początek utworu", "part1.html")
496 # Last modifications in container files and EPUB creation
497 if len(annotations) > 0:
498 toc.add("Przypisy", "annotations.html")
499 manifest.append(etree.fromstring(
500 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
501 spine.append(etree.fromstring(
502 '<itemref idref="annotations" />'))
503 replace_by_verse(annotations)
504 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
505 chars = chars.union(used_chars(html_tree.getroot()))
506 zip.writestr('OPS/annotations.html', etree.tostring(
507 html_tree, method="html", pretty_print=True))
509 toc.add("Weprzyj Wolne Lektury", "support.html")
510 manifest.append(etree.fromstring(
511 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
512 spine.append(etree.fromstring(
513 '<itemref idref="support" />'))
514 html_string = open(get_resource('epub/support.html')).read()
515 chars.update(used_chars(etree.fromstring(html_string)))
516 zip.writestr('OPS/support.html', html_string)
518 toc.add("Strona redakcyjna", "last.html")
519 manifest.append(etree.fromstring(
520 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
521 spine.append(etree.fromstring(
522 '<itemref idref="last" />'))
523 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
524 chars.update(used_chars(html_tree.getroot()))
525 zip.writestr('OPS/last.html', etree.tostring(
526 html_tree, method="html", pretty_print=True))
528 if not flags or not 'without-fonts' in flags:
530 tmpdir = mkdtemp('-librarian-epub')
536 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
537 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
538 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
539 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
541 print "Running font-optimizer"
542 subprocess.check_call(optimizer_call)
544 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
545 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
546 manifest.append(etree.fromstring(
547 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
552 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
553 title = document.book_info.title
554 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
555 for st in attributes:
556 meta = toc_file.makeelement(NCXNS('meta'))
558 meta.set('content', '0')
559 toc_file[0].append(meta)
560 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
561 toc_file[0][1].set('content', str(toc.depth()))
562 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
566 toc.add(u"Spis treści", "toc.html", index=1)
567 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
568 toc.write_to_xml(nav_map)
569 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
572 return OutputFile.from_filename(output_file.name)