1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
20 from librarian.cover import DefaultEbookCover
22 from librarian import functions, get_resource
24 from librarian.hyphenator import Hyphenator
# Module-level setup: two registration helpers from librarian.functions are
# invoked once, at import time.
# NOTE(review): presumably these register extension functions used by the
# XSLT stylesheets applied later in this module — confirm in librarian.functions.
26 functions.reg_person_name()
27 functions.reg_lang_code_3to2()
# Build a Hyphenator for the language named in the document's dc:language.
# NOTE(review): several lines of this function are absent from this listing
# (the embedded line numbers are not contiguous), so only the visible
# statements are described here.
29 def set_hyph_language(source_tree):
# Nested helper: reads the bundled ISO-639-2 table and splits `line` on '|'.
# presumably maps a 3-letter language code to its short form — TODO confirm
# against the lines that are missing here.
30 def get_short_lng_code(text):
33 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
# NOTE(review): the local name `list` shadows the builtin.
35 list = line.strip().split('|')
# All text nodes under any dc:language element; only the first one is used.
42 bibl_lng = etree.XPath('//dc:language//text()', namespaces = {'dc':str(DCNS)})(source_tree)
43 short_lng = get_short_lng_code(bibl_lng[0])
# The dictionary resource is res/hyph-dictionaries/hyph_<short_lng>.dic.
45 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' + short_lng + '.dic'))
# Insert soft hyphens (U+00AD) into text nodes via `hyph.inserted` and replace
# the whitespace that follows a one-character word with a no-break space
# (U+00A0).
# NOTE(review): the loop headers and the code that writes `newt` back into the
# tree are not present in this listing.
49 def hyphenate_and_fix_conjunctions(source_tree, hyph):
# Text nodes below the second child of the root <utwor> element.
51 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
53 parent = t.getparent()
# Tokenise into runs of word characters and single non-word characters.
55 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
57 newt += hyph.inserted(w, u'\u00AD')
# Whitespace preceded by "whitespace + one word character" becomes U+00A0.
58 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
65 """ returns node's text and children as a string
67 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
71 nt = node.text if node.text is not None else ''
72 return ''.join([nt] + [etree.tostring(child) for child in node])
# Replace a node's text and children with the content parsed from `text`.
# NOTE(review): the end of the docstring and the statements after the parse
# are not present in this listing.
74 def set_inner_xml(node, text):
75 """ sets node's text and children from a string
77 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
78 >>> set_inner_xml(e, 'x<b>y</b>z')
79 >>> print etree.tostring(e)
# Wrap the fragment in a throwaway <x> root so it parses as a single element.
# `text` is interpolated as-is, so it must already be well-formed XML content.
83 p = etree.fromstring('<x>%s</x>' % text)
89 """ Find out a node's name
91 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
95 tempnode = deepcopy(node)
97 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
98 for e in tempnode.findall('.//%s' % p):
102 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """Apply the XSLT stylesheet stored in the file `sheet` to `xml`.

    `xml` may be a bare lxml element or an element tree; a bare element is
    wrapped in an ElementTree first.  Returns whatever the tree's `.xslt()`
    call produces.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.parse(stylesheet_file)
        # Run the transformation while the stylesheet file is still open.
        return document.xslt(stylesheet)
# Recursively normalise typographic characters in an element's text and tail.
# NOTE(review): guard clauses, the body of the 'uwaga'/'extra' branch and the
# loop header over children are not present in this listing.
113 def replace_characters(node):
114 def replace_chars(text):
# Drops U+FEFF, then maps "---" to an em dash, "--" to an en dash, ",," to a
# low double quote, '"' to a right double quote and "'" to a right single
# quote.  The chain order matters: "---" is handled before "--".
117 return text.replace(u"\ufeff", u"")\
118 .replace("---", u"\u2014")\
119 .replace("--", u"\u2013")\
120 .replace(",,", u"\u201E")\
121 .replace('"', u"\u201D")\
122 .replace("'", u"\u2019")
# 'uwaga' and 'extra' elements are special-cased (branch body not visible).
123 if node.tag in ('uwaga', 'extra'):
127 node.text = replace_chars(node.text)
128 node.tail = replace_chars(node.tail)
# Recurse into a child element.
130 replace_characters(child)
# Collect footnote elements (tags pe/pa/pt/pr) into `annotations`, which is
# mutated in place.
# NOTE(review): the loop header and some statements between the visible lines
# are not present in this listing.
133 def find_annotations(annotations, source, part_no):
135 if child.tag in ('pe', 'pa', 'pt', 'pr'):
# A copy is stored, so the collected annotation is independent of `child`.
136 annotation = deepcopy(child)
# 1-based running number across everything collected so far.
137 number = str(len(annotations)+1)
138 annotation.set('number', number)
139 annotation.set('part', str(part_no))
141 annotations.append(annotation)
# Do not descend into 'extra' and 'uwaga' subtrees.
146 if child.tag not in ('extra', 'uwaga'):
147 find_annotations(annotations, child, part_no)
# NOTE(review): the closing of the class docstring and at least one line of
# __init__ are not present in this listing.
150 class Stanza(object):
152 Converts / verse endings into verse elements in a stanza.
154 Slashes may only occur directly in the stanza. Any slashes in subelements
155 will be ignored, and the subelements will be put inside verse elements.
157 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
158 >>> Stanza(s).versify()
159 >>> print etree.tostring(s)
160 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
161 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
# Keep the stanza element being processed; no verse is open initially.
164 def __init__(self, stanza_elem):
165 self.stanza = stanza_elem
167 self.open_verse = None
170 self.push_text(self.stanza.text)
171 for elem in self.stanza:
173 self.push_text(elem.tail)
174 tail = self.stanza.tail
176 self.stanza.tail = tail
177 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """Start a new <wers_normalny> element and make it the open verse.

    The element is created through the stanza's own `makeelement` and is
    also recorded in `self.verses`.
    """
    verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = verse
    self.verses.append(verse)
def get_open_verse(self):
    """Return the currently open verse, opening a normal verse if none is open."""
    if self.open_verse is not None:
        return self.open_verse
    # Nothing open yet: start a normal verse, then hand it back.
    self.open_normal_verse()
    return self.open_verse
# Feed a piece of text into the stanza, splitting it into verses.
# NOTE(review): the guard on `text` and the `if`/`else` headers selecting the
# branches below are not present in this listing.
188 def push_text(self, text):
# A '/' followed by optional whitespace and a newline marks a verse break.
191 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
193 self.open_normal_verse()
194 verse = self.get_open_verse()
# Visible branches: append to the tail of the verse's last child, or to the
# verse's own text.
196 verse[-1].tail = (verse[-1].tail or "") + verse_text
198 verse.text = (verse.text or "") + verse_text
# Feed a child element of the stanza into the verse structure.
# NOTE(review): some lines (including the `else:` header) are not present in
# this listing.
200 def push_elem(self, elem):
# An element whose tag starts with "wers" is copied, recorded as a verse and
# becomes the open verse itself.
201 if elem.tag.startswith("wers"):
202 verse = deepcopy(elem)
204 self.verses.append(verse)
205 self.open_verse = verse
# Other visible branch: a copy of the element is appended to the open verse
# (one is opened on demand by get_open_verse).
207 appended = deepcopy(elem)
209 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """Versify every stanza found below `tree`.

    Each `strofa` element (in the WL namespace) is handed to `Stanza`, which
    turns its '/' verse endings into verse elements in place.
    """
    stanza_path = './/' + WLNS('strofa')
    # Collect all matches first; versify() rewrites the tree while we iterate.
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
# Builds an OPF <item> for chunk number `partno`: href "part<N>.html" with
# media type application/xhtml+xml.
# NOTE(review): the remaining attribute(s) of the dict literal and the
# statement attaching the element to `manifest` are not present in this
# listing.
220 def add_to_manifest(manifest, partno):
221 """ Adds a node to the manifest section in content.opf file """
223 partstr = 'part%d' % partno
224 e = manifest.makeelement(OPFNS('item'), attrib={
226 'href': partstr + '.html',
227 'media-type': 'application/xhtml+xml',
# Builds an OPF <itemref> whose idref is "part<N>" for chunk number `partno`.
# NOTE(review): the statement attaching the element to `spine` is not present
# in this listing.
232 def add_to_spine(spine, partno):
233 """ Adds a node to the spine section in content.opf file """
235 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno});
# Constructor of a table-of-contents node (other methods call it TOC).
# NOTE(review): the class header and two lines of this constructor are not
# present in this listing; sibling methods read self.children and self.name.
240 def __init__(self, name=None, part_href=None):
243 self.part_href = part_href
# Set by add() for sub-entries; None means "no #sub anchor" in href().
244 self.sub_number = None
# Add an entry to this TOC node, or `level` levels below its last child.
# NOTE(review): the creation of `t`, an `else:` header, the condition guarding
# the sub_number assignment and the return statement are not present in this
# listing.
246 def add(self, name, part_href, level=0, is_part=True, index=None):
# An explicit insertion index is only supported for direct children.
247 assert level == 0 or index is None
# Deeper levels are delegated to the most recently added child, if any.
248 if level > 0 and self.children:
249 return self.children[-1].add(name, part_href, level-1, is_part)
252 t.part_href = part_href
253 if index is not None:
254 self.children.insert(index, t)
256 self.children.append(t)
258 t.sub_number = len(self.children) + 1
def append(self, toc):
    """Attach `toc` as the last child entry of this node."""
    children = self.children
    children.append(toc)
def extend(self, toc):
    """Adopt all children of `toc` as children of this node, keeping their order."""
    incoming = toc.children
    self.children.extend(incoming)
269 return max((c.depth() for c in self.children)) + 1
275 if self.sub_number is not None:
276 src += '#sub%d' % self.sub_number
# Emit one NCX navPoint per child under `nav_map`, recursing into each child.
# `counter` is a running number used for both the navPoint id and playOrder.
# NOTE(review): an `else:` header and the final return are not present in this
# listing; the recursive call below uses the method's return value as the next
# counter.
279 def write_to_xml(self, nav_map, counter=1):
280 for child in self.children:
281 nav_point = nav_map.makeelement(NCXNS('navPoint'))
282 nav_point.set('id', 'NavPoint-%d' % counter)
283 nav_point.set('playOrder', str(counter))
285 nav_label = nav_map.makeelement(NCXNS('navLabel'))
286 text = nav_map.makeelement(NCXNS('text'))
# Newlines in an entry name are flattened to spaces for the label.
287 if child.name is not None:
288 text.text = re.sub(r'\n', ' ', child.name)
290 text.text = child.name
291 nav_label.append(text)
292 nav_point.append(nav_label)
294 content = nav_map.makeelement(NCXNS('content'))
295 content.set('src', child.href())
296 nav_point.append(content)
297 nav_map.append(nav_point)
# Children are nested inside this navPoint and continue the numbering.
298 counter = child.write_to_xml(nav_point, counter + 1)
# Render the children as nested HTML <div> lines, indented by `depth` em.
# NOTE(review): the initialisation of `texts` and the opening of the append
# call are not present in this listing.
# NOTE(review): child.href() and child.name are interpolated without HTML
# escaping — confirm that names can never contain markup characters.
301 def html_part(self, depth=0):
303 for child in self.children:
305 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
306 (depth, child.href(), child.name))
307 texts.append(child.html_part(depth+1))
308 return "\n".join(texts)
311 with open(get_resource('epub/toc.html')) as f:
312 t = unicode(f.read(), 'utf-8')
313 return t % self.html_part()
# Gathers the set of characters in an element's text and tail, merged
# recursively with those of all descendant elements.
# NOTE(review): the return statement is not present in this listing; callers
# and the recursive call below use the result as a set.
316 def used_chars(element):
317 """ Lists characters used in an ETree Element """
318 chars = set((element.text or '') + (element.tail or ''))
319 for child in element:
320 chars = chars.union(used_chars(child))
325 """ divide main content of the XML file into chunks """
327 # prepare a container for each chunk
328 part_xml = etree.Element('utwor')
329 etree.SubElement(part_xml, 'master')
330 main_xml_part = part_xml[0] # master
332 last_node_part = False
334 # the below loop are workaround for a problem with epubs in drama ebooks without acts
337 for one_part in main_text:
339 if name == 'naglowek_scena':
341 elif name == 'naglowek_akt':
344 for one_part in main_text:
346 if is_act is False and is_scene is True:
347 if name == 'naglowek_czesc':
349 last_node_part = True
350 main_xml_part[:] = [deepcopy(one_part)]
351 elif not last_node_part and name == "naglowek_scena":
353 main_xml_part[:] = [deepcopy(one_part)]
355 main_xml_part.append(deepcopy(one_part))
356 last_node_part = False
358 if name == 'naglowek_czesc':
360 last_node_part = True
361 main_xml_part[:] = [deepcopy(one_part)]
362 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
364 main_xml_part[:] = [deepcopy(one_part)]
366 main_xml_part.append(deepcopy(one_part))
367 last_node_part = False
# Render one chunk to HTML and build its TOC entries.
# NOTE(review): the creation of `toc`, the `if empty:` / `else:` headers and
# the initialisation of `chars` for the empty case are not present in this
# listing.
# `_empty_html_static` is a mutable default used deliberately as a
# process-wide cache for the empty-chunk template (filled on first use below).
371 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
372 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
# Only direct children of the chunk's first element produce TOC entries.
375 for element in chunk_xml[0]:
376 if element.tag == "naglowek_czesc":
377 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
378 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
379 toc.add(node_name(element), "part%d.html" % chunk_no)
# Sub-headings go one level deeper; the value returned by toc.add is stored on
# the element as its 'sub' attribute.
380 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
381 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
382 element.set('sub', str(subnumber))
# NOTE(review): the file opened here is never explicitly closed.
384 if not _empty_html_static:
385 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
387 output_html = _empty_html_static[0]
# Collect footnotes, turn '/' verse endings into verse elements, then apply
# the chunk stylesheet.
389 find_annotations(annotations, chunk_xml, chunk_no)
390 replace_by_verse(chunk_xml)
391 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
392 chars = used_chars(html_tree.getroot())
393 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
394 return output_html, toc, chars
# Entry point: render `wldoc` to an EPUB in a temporary file and return an
# OutputFile built from that file's name.
# NOTE(review): many short lines of this function (blank lines, `if` / `else`
# / `try` headers and similar) are not present in this listing, so the
# comments below describe only the statements that are visible.
397 def transform(wldoc, verbose=False,
398 style=None, html_toc=False,
399 sample=None, cover=None, flags=None):
400 """ produces a EPUB file
402 sample=n: generate sample e-book (with at least n paragraphs)
403 cover: a cover.Cover factory or True for default
404 flags: less-advertising, without-fonts, working-copy, with-full-fonts
# Nested worker: renders one document into the archive, then recurses into
# wldoc.parts(); returns (toc, chunk_counter, chars, sample).
407 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
408 """ processes one input file and proceeds to its children """
410 replace_characters(wldoc.edoc.getroot())
412 hyphenator = set_hyph_language(wldoc.edoc.getroot())
413 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
416 # every input file will have a TOC entry,
417 # pointing to starting chunk
418 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
421 # write book title page
422 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
423 chars = used_chars(html_tree.getroot())
424 zip.writestr('OPS/title.html',
425 etree.tostring(html_tree, method="html", pretty_print=True))
426 # add a title page TOC entry
427 toc.add(u"Strona tytułowa", "title.html")
428 elif wldoc.book_info.parts:
429 # write title page for every parent
430 if sample is not None and sample <= 0:
432 html_string = open(get_resource('epub/emptyChunk.html')).read()
434 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
435 chars = used_chars(html_tree.getroot())
436 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
437 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
438 add_to_manifest(manifest, chunk_counter)
439 add_to_spine(spine, chunk_counter)
442 if len(wldoc.edoc.getroot()) > 1:
443 # rdf before style master
444 main_text = wldoc.edoc.getroot()[1]
446 # rdf in style master
447 main_text = wldoc.edoc.getroot()[0]
448 if main_text.tag == RDFNS('RDF'):
451 if main_text is not None:
452 for chunk_xml in chop(main_text):
454 if sample is not None:
# Stanzas and paragraphs found by this XPath are deducted from the remaining
# sample budget.
458 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
459 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
461 toc.extend(chunk_toc)
462 chars = chars.union(chunk_chars)
463 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
464 add_to_manifest(manifest, chunk_counter)
465 add_to_spine(spine, chunk_counter)
# Child documents continue the chunk numbering and the sample budget.
468 for child in wldoc.parts():
469 child_toc, chunk_counter, chunk_chars, sample = transform_file(
470 child, chunk_counter, first=False, sample=sample)
471 toc.append(child_toc)
472 chars = chars.union(chunk_chars)
474 return toc, chunk_counter, chars, sample
# Work on a deep copy, so the attributes set on the root element below are not
# applied to the object passed in by the caller.
477 document = deepcopy(wldoc)
482 document.edoc.getroot().set(flag, 'yes')
485 document.edoc.getroot().set('editors', u', '.join(sorted(
486 editor.readable() for editor in document.editors())))
487 if document.book_info.funders:
488 document.edoc.getroot().set('funders', u', '.join(
489 document.book_info.funders))
490 if document.book_info.thanks:
491 document.edoc.getroot().set('thanks', document.book_info.thanks)
# content.opf is produced from the book metadata; its manifest, guide and
# spine elements are filled in incrementally below.
493 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
494 manifest = opf.find('.//' + OPFNS('manifest'))
495 guide = opf.find('.//' + OPFNS('guide'))
496 spine = opf.find('.//' + OPFNS('spine'))
# delete=False: the temporary file's name is handed to OutputFile at the end,
# so the file must remain on disk after it is closed.
# NOTE(review): the local name `zip` shadows the builtin.
498 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
499 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
501 # write static elements
# The mimetype entry is stored uncompressed and is written before every other
# visible archive entry.
502 mime = zipfile.ZipInfo()
503 mime.filename = 'mimetype'
504 mime.compress_type = zipfile.ZIP_STORED
506 zip.writestr(mime, 'application/epub+zip')
507 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
508 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
509 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
510 'media-type="application/oebps-package+xml" />' \
511 '</rootfiles></container>')
512 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
513 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
515 style = get_resource('epub/style.css')
516 zip.write(style, os.path.join('OPS', 'style.css'))
520 cover = DefaultEbookCover
# The cover image is rendered into memory and stored as OPS/cover.<ext>.
522 cover_file = StringIO()
523 bound_cover = cover(document.book_info)
524 bound_cover.save(cover_file)
525 cover_name = 'cover.%s' % bound_cover.ext()
526 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
529 cover_tree = etree.parse(get_resource('epub/cover.html'))
530 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
531 zip.writestr('OPS/cover.html', etree.tostring(
532 cover_tree, method="html", pretty_print=True))
534 if bound_cover.uses_dc_cover:
535 if document.book_info.cover_by:
536 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
537 if document.book_info.cover_source:
538 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
# NOTE(review): cover_name and the MIME type are interpolated into XML without
# escaping.
540 manifest.append(etree.fromstring(
541 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
542 manifest.append(etree.fromstring(
543 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
544 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
545 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
546 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
# Shared container that transform_chunk / find_annotations fill with footnotes.
549 annotations = etree.Element('annotations')
551 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
552 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
553 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
554 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
556 nav_map = toc_file[-1]
559 manifest.append(etree.fromstring(
560 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
561 spine.append(etree.fromstring(
562 '<itemref idref="html_toc" />'))
563 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
565 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
# Ensure the TOC has at least two entries by adding a "beginning" entry.
567 if len(toc.children) < 2:
568 toc.add(u"Początek utworu", "part1.html")
570 # Last modifications in container files and EPUB creation
571 if len(annotations) > 0:
572 toc.add("Przypisy", "annotations.html")
573 manifest.append(etree.fromstring(
574 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
575 spine.append(etree.fromstring(
576 '<itemref idref="annotations" />'))
577 replace_by_verse(annotations)
578 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
579 chars = chars.union(used_chars(html_tree.getroot()))
580 zip.writestr('OPS/annotations.html', etree.tostring(
581 html_tree, method="html", pretty_print=True))
583 toc.add("Wesprzyj Wolne Lektury", "support.html")
584 manifest.append(etree.fromstring(
585 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
586 spine.append(etree.fromstring(
587 '<itemref idref="support" />'))
# NOTE(review): the file opened here is never explicitly closed.
588 html_string = open(get_resource('epub/support.html')).read()
589 chars.update(used_chars(etree.fromstring(html_string)))
590 zip.writestr('OPS/support.html', html_string)
592 toc.add("Strona redakcyjna", "last.html")
593 manifest.append(etree.fromstring(
594 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
595 spine.append(etree.fromstring(
596 '<itemref idref="last" />'))
597 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
598 chars.update(used_chars(html_tree.getroot()))
599 zip.writestr('OPS/last.html', etree.tostring(
600 html_tree, method="html", pretty_print=True))
# Fonts: unless 'with-full-fonts' is set, each DejaVu Serif face is subset to
# the characters collected in `chars` by the Perl font-optimizer; otherwise the
# full font resource is written.
602 if not flags or not 'without-fonts' in flags:
604 tmpdir = mkdtemp('-librarian-epub')
# NOTE(review): os.chdir changes the process-wide working directory; restoring
# it and removing `tmpdir` are not visible in this listing — confirm.
610 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
611 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
612 if not flags or not 'with-full-fonts' in flags:
613 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
614 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
616 print "Running font-optimizer"
617 subprocess.check_call(optimizer_call)
619 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
620 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
622 zip.write(get_resource('fonts/' + fname), os.path.join('OPS', fname))
623 manifest.append(etree.fromstring(
624 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
628 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
629 title = document.book_info.title
# Four NCX <meta> elements are appended to <head>; the first two get their
# content overwritten with the book URL and the TOC depth.
630 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
631 for st in attributes:
632 meta = toc_file.makeelement(NCXNS('meta'))
634 meta.set('content', '0')
635 toc_file[0].append(meta)
636 toc_file[0][0].set('content', str(document.book_info.url))
637 toc_file[0][1].set('content', str(toc.depth()))
# NOTE(review): `title` is interpolated into XML unescaped and set_inner_xml
# parses it, so a title containing '&' or '<' would fail to parse — confirm.
638 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
642 toc.add(u"Spis treści", "toc.html", index=1)
643 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
644 toc.write_to_xml(nav_map)
645 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True, xml_declaration = True, encoding='UTF-8'))
648 return OutputFile.from_filename(output_file.name)