1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from mimetypes import guess_type
16 from lxml import etree
18 from tempfile import mkdtemp, NamedTemporaryFile
19 from shutil import rmtree
21 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
22 from librarian.cover import WLCover
24 from librarian import functions, get_resource
26 functions.reg_person_name()
def inner_xml(node):
    """ Return node's text and children as a string.

    The node's own tag and attributes are not part of the result; only the
    leading text followed by every serialized child is returned.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # NOTE(review): the ``def`` line, the doctest output and the closing
    # docstring quotes were absent from the reviewed text and have been
    # restored here -- confirm against the canonical file.
    leading_text = node.text if node.text is not None else ''
    return ''.join([leading_text] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ Set node's text and children from a string.

    ``text`` is parsed as the content of a temporary ``<x>`` element, so it
    has to be well-formed once wrapped; ``etree.fromstring`` raises on
    malformed input.  The node's tag, attributes and tail are left untouched.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # Wrap the fragment in a throwaway root so mixed text/element content
    # can be parsed in one go.
    wrapper = etree.fromstring('<x>%s</x>' % text)
    # NOTE(review): the two assignments below were absent from the reviewed
    # text; they are what the docstring and doctest require -- confirm.
    node.text = wrapper.text
    node[:] = wrapper[:]
def node_name(node):
    """ Find out a node's name

    Works on a deep copy, so ``node`` itself is never modified.  Annotation
    and motif elements ('pe', 'pa', 'pt', 'pr', 'motyw') are emptied before
    all remaining markup is stripped, leaving plain text.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    tempnode = deepcopy(node)

    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for elem in tempnode.findall('.//%s' % tag):
            # NOTE(review): this loop body, the ``def`` line and the final
            # ``return`` were absent from the reviewed text.  Emptying the
            # element while keeping its tail is reconstructed from the same
            # pattern visible in Stanza.versify -- confirm.
            tail = elem.tail
            elem.clear()
            elem.tail = tail
    # Drop every remaining tag but keep its text content.
    etree.strip_tags(tempnode, '*')
    return tempnode.text
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet found at path ``sheet`` to ``xml``.

    ``xml`` may be an element or an element tree; a bare element is wrapped
    in an ``ElementTree`` first.  Returns whatever the tree's ``xslt()``
    method produces.
    """
    # NOTE(review): the ``def`` line was absent from the reviewed text; the
    # name and parameter order follow the call sites, e.g.
    # ``xslt(chunk_xml, get_resource(...))`` -- confirm.
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as stylesheet_file:
        return xml.xslt(etree.parse(stylesheet_file))
def replace_characters(node):
    """ Replace ASCII punctuation with typographic characters, in place.

    Walks ``node`` and all of its descendants, rewriting every ``text`` and
    ``tail``.  Elements tagged 'uwaga' or 'extra' are emptied first, keeping
    only their tail.
    """
    # Order matters: "---" has to be handled before "--".
    replacements = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        # NOTE(review): the guard for missing text was absent from the
        # reviewed text; without it ``None.replace`` would raise -- confirm.
        if text is None:
            return None
        for old, new in replacements:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        # NOTE(review): reconstructed -- the body of this branch was absent
        # from the reviewed text; it mirrors the clear-but-keep-tail pattern
        # visible in Stanza.versify -- confirm.
        tail = node.tail
        node.clear()
        node.tail = tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """ Collect annotation elements from ``source`` into ``annotations``.

    Every descendant tagged 'pe', 'pa', 'pt' or 'pr' is deep-copied, given a
    'number' attribute (1-based position in ``annotations``) and a 'part'
    attribute (``part_no``), and appended to ``annotations``.  Subtrees of
    'extra' and 'uwaga' elements are not searched.  ``source`` is modified
    in place.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations) + 1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # NOTE(review): the ``for`` header above, this tail reset and the
            # four statements after ``append`` were absent from the reviewed
            # text and are reconstructed -- confirm against the canonical
            # file.
            annotation.tail = ''
            annotations.append(annotation)
            # Leave only the annotation number at the original position.
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a</wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    # NOTE(review): several short lines of this class (the ``self.verses``
    # initialisation, the ``versify`` header, ``push_elem``/``clear`` calls
    # and a few ``if``/``else``/``return`` lines) were absent from the
    # reviewed text.  They are restored so that the doctest above holds --
    # confirm against the canonical file.

    # A verse ends at a slash followed by optional whitespace and a newline.
    _VERSE_END = re.compile(r"/\s*\n")

    def __init__(self, stanza_elem):
        """ Remember the stanza element; nothing is changed until versify(). """
        self.stanza = stanza_elem
        # Verse elements built so far, in document order.
        self.verses = []
        # Verse currently receiving text and subelements (None: none yet).
        self.open_verse = None

    def versify(self):
        """ Rebuild the stanza in place as a sequence of verse elements. """
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        # clear() also resets the tail, so carry it over explicitly.
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        """ Start a new 'wers_normalny' verse and make it the open one. """
        # The explicit empty attrib keeps the call valid for element
        # implementations whose makeelement() requires that argument.
        self.open_verse = self.stanza.makeelement("wers_normalny", {})
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """ Return the open verse, starting a normal one if there is none. """
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """ Add text found directly in the stanza, splitting it at verse ends. """
        if not text or not text.strip():
            return
        for i, verse_text in enumerate(self._VERSE_END.split(text)):
            if i:
                # Every piece after the first follows a verse ending.
                self.open_normal_verse()
            verse = self.get_open_verse()
            piece = verse_text.strip()
            if len(verse):
                # Text after a subelement belongs to that subelement's tail.
                verse[-1].tail = (verse[-1].tail or "") + piece
            else:
                verse.text = (verse.text or "") + piece

    def push_elem(self, elem):
        """ Add a subelement: 'wers*' elements become verses of their own. """
        copied = deepcopy(elem)
        # The tail is fed separately through push_text() by versify().
        copied.tail = None
        if elem.tag.startswith("wers"):
            self.verses.append(copied)
            self.open_verse = copied
        else:
            self.get_open_verse().append(copied)
def replace_by_verse(tree):
    """ Versify every stanza in the tree, turning '/' verse endings into verse elements """
    stanza_tag = WLNS('strofa')
    for stanza_elem in tree.findall('.//' + stanza_tag):
        Stanza(stanza_elem).versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new item describes the chunk file ``part<partno>.html``.
    """
    partstr = 'part%d' % partno
    # NOTE(review): the 'id' entry, the closing of this call and the final
    # append were absent from the reviewed text.  The id is reconstructed to
    # match the ``idref`` that add_to_spine() writes -- confirm.
    item = manifest.makeelement(OPFNS('item'), attrib={
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(item)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new ``itemref`` points at the manifest item ``part<partno>``.
    """
    itemref = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    # NOTE(review): the append was absent from the reviewed text; without it
    # the element is created and discarded, contradicting the docstring.
    spine.append(itemref)
class TOC(object):
    """ A node in the table-of-contents tree; the root represents the book. """
    # NOTE(review): the class header, the ``children``/``name``
    # initialisation, the ``depth``/``href``/``html`` headers and several
    # ``else``/``return`` lines were absent from the reviewed text.  They are
    # restored from how the visible code uses them -- confirm against the
    # canonical file.

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        # Set only for entries that point at an anchor inside a part.
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Add an entry and return its sub-number, if it gets one.

        With ``level`` > 0 and at least one child present, the request is
        passed down to the last child with ``level`` reduced by one;
        otherwise the entry is created here, at position ``index`` when
        given, or at the end.  Entries added with ``is_part=False`` get a
        ``sub_number`` which is returned; for all others the result is None.
        ``index`` may only be combined with ``level`` 0.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level - 1, is_part)

        entry = TOC(name)
        entry.part_href = part_href
        if index is not None:
            self.children.insert(index, entry)
        else:
            self.children.append(entry)
        if not is_part:
            entry.sub_number = len(self.children) + 1
            return entry.sub_number

    def append(self, toc):
        """ Attach ``toc`` itself as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Attach all children of ``toc`` (not ``toc`` itself). """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        return 0

    def href(self):
        """ Return the link target, with a ``#sub<n>`` anchor when numbered. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Append NCX navPoints for all descendants to ``nav_map``.

        Entries are numbered depth-first starting at ``counter``; the first
        unused number is returned.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Return the entries below this node as indented HTML links. """
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth + 1))
        return "\n".join(texts)

    def html(self):
        """ Return the HTML table-of-contents page built from its template. """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set with every character that occurs in the text or tail of
    ``element`` or of any of its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars.update(used_chars(child))
    # Without this return the recursive call above would receive None.
    return chars
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator.  Each yielded value is the same ``<utwor><master>`` container,
    refilled between chunks, so it has to be consumed before the next one is
    requested.  A new chunk starts at every 'naglowek_czesc', and at every
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' that does not directly
    follow a 'naglowek_czesc'.
    """
    # NOTE(review): the ``def`` line, the ``name`` assignment and the
    # ``yield`` statements were absent from the reviewed text and are
    # reconstructed from the caller (``for chunk_xml in chop(main_text)``)
    # and the visible branches -- confirm against the canonical file.

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    # Whatever has been collected since the last break is the final chunk.
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    Headers found directly in the chunk's first child become TOC entries
    pointing at ``part<chunk_no>.html``; sub-headers additionally get a 'sub'
    attribute carrying their TOC sub-number.  With ``empty`` set, the
    placeholder page is returned instead of the transformed chunk, with an
    empty character set.

    ``_empty_html_static`` is a deliberate mutable default: it caches the
    placeholder page across calls and is not meant to be passed by callers.
    """
    # NOTE(review): the ``toc = TOC()`` initialisation, the ``if empty:`` /
    # ``else:`` lines and ``chars = set()`` were absent from the reviewed
    # text and are reconstructed from the visible branches -- confirm.
    toc = TOC()
    part_href = "part%d.html" % chunk_no
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), part_href)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), part_href, level=1, is_part=False)
            element.set('sub', str(subnumber))

    if empty:
        if not _empty_html_static:
            # Read the placeholder once; close the file instead of leaking it.
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
334 def transform(wldoc, verbose=False, style=None, html_toc=False,
335 sample=None, cover=None, flags=None, ilustr_path=''):
336 """ produces a EPUB file
338 sample=n: generate sample e-book (with at least n paragraphs)
339 cover: a cover.Cover factory or True for default
340 flags: less-advertising, without-fonts, working-copy
343 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
344 """ processes one input file and proceeds to its children """
346 replace_characters(wldoc.edoc.getroot())
348 # every input file will have a TOC entry,
349 # pointing to starting chunk
350 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
353 # write book title page
354 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
355 chars = used_chars(html_tree.getroot())
356 zip.writestr('OPS/title.html',
357 etree.tostring(html_tree, method="html", pretty_print=True))
358 # add a title page TOC entry
359 toc.add(u"Strona tytułowa", "title.html")
360 elif wldoc.book_info.parts:
361 # write title page for every parent
362 if sample is not None and sample <= 0:
364 html_string = open(get_resource('epub/emptyChunk.html')).read()
366 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
367 chars = used_chars(html_tree.getroot())
368 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
369 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
370 add_to_manifest(manifest, chunk_counter)
371 add_to_spine(spine, chunk_counter)
374 if len(wldoc.edoc.getroot()) > 1:
375 # rdf before style master
376 main_text = wldoc.edoc.getroot()[1]
378 # rdf in style master
379 main_text = wldoc.edoc.getroot()[0]
380 if main_text.tag == RDFNS('RDF'):
383 if main_text is not None:
384 for chunk_xml in chop(main_text):
386 if sample is not None:
390 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
391 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
393 toc.extend(chunk_toc)
394 chars = chars.union(chunk_chars)
395 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
396 add_to_manifest(manifest, chunk_counter)
397 add_to_spine(spine, chunk_counter)
400 for child in wldoc.parts():
401 child_toc, chunk_counter, chunk_chars, sample = transform_file(
402 child, chunk_counter, first=False, sample=sample)
403 toc.append(child_toc)
404 chars = chars.union(chunk_chars)
406 return toc, chunk_counter, chars, sample
409 document = deepcopy(wldoc)
414 document.edoc.getroot().set(flag, 'yes')
417 document.edoc.getroot().set('editors', u', '.join(sorted(
418 editor.readable() for editor in document.editors())))
420 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
421 manifest = opf.find('.//' + OPFNS('manifest'))
422 guide = opf.find('.//' + OPFNS('guide'))
423 spine = opf.find('.//' + OPFNS('spine'))
425 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
426 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
428 if os.path.isdir(ilustr_path):
429 for i, filename in enumerate(os.listdir(ilustr_path)):
430 file_path = os.path.join(ilustr_path, filename)
431 zip.write(file_path, os.path.join('OPS', filename))
432 image_id = 'image%s' % i
433 manifest.append(etree.fromstring(
434 '<item id="%s" href="%s" media-type="%s" />' % (image_id, filename, guess_type(file_path)[0])))
436 # write static elements
437 mime = zipfile.ZipInfo()
438 mime.filename = 'mimetype'
439 mime.compress_type = zipfile.ZIP_STORED
441 zip.writestr(mime, 'application/epub+zip')
442 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
443 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
444 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
445 'media-type="application/oebps-package+xml" />' \
446 '</rootfiles></container>')
447 #zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
448 zip.write(get_resource('res/koedlogo.png'), os.path.join('OPS', 'logo_koed.png'))
449 #zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
451 style = get_resource('epub/style.css')
452 zip.write(style, os.path.join('OPS', 'style.css'))
460 cover_file = StringIO()
461 bound_cover = cover(document.book_info)
462 bound_cover.save(cover_file)
463 cover_name = 'cover.%s' % bound_cover.ext()
464 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
467 cover_tree = etree.parse(get_resource('epub/cover.html'))
468 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
469 zip.writestr('OPS/cover.html', etree.tostring(
470 cover_tree, method="html", pretty_print=True))
472 if bound_cover.uses_dc_cover:
473 if document.book_info.cover_by:
474 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
475 if document.book_info.cover_source:
476 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
478 manifest.append(etree.fromstring(
479 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
480 manifest.append(etree.fromstring(
481 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
482 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
483 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
484 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
487 annotations = etree.Element('annotations')
489 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
490 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
491 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
492 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
494 nav_map = toc_file[-1]
497 manifest.append(etree.fromstring(
498 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
499 spine.append(etree.fromstring(
500 '<itemref idref="html_toc" />'))
501 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
503 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
505 if len(toc.children) < 2:
506 toc.add(u"Początek utworu", "part1.html")
508 # Last modifications in container files and EPUB creation
509 if len(annotations) > 0:
510 toc.add("Przypisy", "annotations.html")
511 manifest.append(etree.fromstring(
512 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
513 spine.append(etree.fromstring(
514 '<itemref idref="annotations" />'))
515 replace_by_verse(annotations)
516 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
517 chars = chars.union(used_chars(html_tree.getroot()))
518 zip.writestr('OPS/annotations.html', etree.tostring(
519 html_tree, method="html", pretty_print=True))
521 toc.add("Strona redakcyjna", "last.html")
522 manifest.append(etree.fromstring(
523 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
524 spine.append(etree.fromstring(
525 '<itemref idref="last" />'))
526 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
527 chars.update(used_chars(html_tree.getroot()))
528 zip.writestr('OPS/last.html', etree.tostring(
529 html_tree, method="html", pretty_print=True))
531 if not flags or not 'without-fonts' in flags:
533 tmpdir = mkdtemp('-librarian-epub')
539 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
540 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
541 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
542 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
544 print "Running font-optimizer"
545 subprocess.check_call(optimizer_call)
547 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
548 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
549 manifest.append(etree.fromstring(
550 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
555 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
556 title = document.book_info.title
557 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
558 for st in attributes:
559 meta = toc_file.makeelement(NCXNS('meta'))
561 meta.set('content', '0')
562 toc_file[0].append(meta)
563 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
564 toc_file[0][1].set('content', str(toc.depth()))
565 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
569 toc.add(u"Spis treści", "toc.html", index=1)
570 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
571 toc.write_to_xml(nav_map)
572 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
575 return OutputFile.from_filename(output_file.name)