1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from mimetypes import guess_type
16 from lxml import etree
18 from tempfile import mkdtemp, NamedTemporaryFile
19 from shutil import rmtree
21 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
22 from librarian.cover import DefaultEbookCover
24 from librarian import functions, get_resource
26 from librarian.hyphenator import Hyphenator
28 functions.reg_person_name()
29 functions.reg_lang_code_3to2()
32 def set_hyph_language(source_tree):
33 def get_short_lng_code(text):
36 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
38 list = line.strip().split('|')
45 bibl_lng = etree.XPath('//dc:language//text()',
46 namespaces={'dc': str(DCNS)})(source_tree)
47 short_lng = get_short_lng_code(bibl_lng[0])
49 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
55 def hyphenate_and_fix_conjunctions(source_tree, hyph):
56 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
58 parent = t.getparent()
61 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
63 newt += hyph.inserted(w, u'\u00AD')
66 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
74 """ returns node's text and children as a string
76 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
80 nt = node.text if node.text is not None else ''
81 return ''.join([nt] + [etree.tostring(child) for child in node])
84 def set_inner_xml(node, text):
85 """ sets node's text and children from a string
87 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
88 >>> set_inner_xml(e, 'x<b>y</b>z')
89 >>> print etree.tostring(e)
93 p = etree.fromstring('<x>%s</x>' % text)
99 """ Find out a node's name
101 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
105 tempnode = deepcopy(node)
107 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
108 for e in tempnode.findall('.//%s' % p):
112 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet, **kwargs):
    """ Run the XSLT stylesheet read from the file `sheet` over `xml`.

    `xml` may be an element or a tree; a bare element is wrapped in an
    ElementTree before the transformation. Every keyword argument is handed
    to the stylesheet as a string parameter (quoted with `strparam`).
    Returns whatever the compiled stylesheet returns for the document.
    """
    document = xml
    if isinstance(document, etree._Element):
        document = etree.ElementTree(document)

    with open(sheet) as stylesheet_file:
        stylesheet = etree.XSLT(etree.parse(stylesheet_file))

    # Raw Python strings are not valid XPath expressions, so each value is
    # converted with strparam before being passed as a stylesheet parameter.
    string_params = {}
    for param_name, param_value in kwargs.iteritems():
        string_params[param_name] = stylesheet.strparam(param_value)

    return stylesheet(document, **string_params)
125 def replace_characters(node):
126 def replace_chars(text):
129 return text.replace(u"\ufeff", u"")\
130 .replace("---", u"\u2014")\
131 .replace("--", u"\u2013")\
132 .replace(",,", u"\u201E")\
133 .replace('"', u"\u201D")\
134 .replace("'", u"\u2019")
135 if node.tag in ('uwaga', 'extra'):
139 node.text = replace_chars(node.text)
140 node.tail = replace_chars(node.tail)
142 replace_characters(child)
145 def find_annotations(annotations, source, part_no):
147 if child.tag in ('pe', 'pa', 'pt', 'pr'):
148 annotation = deepcopy(child)
149 number = str(len(annotations) + 1)
150 annotation.set('number', number)
151 annotation.set('part', str(part_no))
153 annotations.append(annotation)
158 if child.tag not in ('extra', 'uwaga'):
159 find_annotations(annotations, child, part_no)
162 class Stanza(object):
164 Converts / verse endings into verse elements in a stanza.
166 Slashes may only occur directly in the stanza. Any slashes in subelements
167 will be ignored, and the subelements will be put inside verse elements.
169 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
170 >>> Stanza(s).versify()
171 >>> print etree.tostring(s)
172 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
173 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
176 def __init__(self, stanza_elem):
177 self.stanza = stanza_elem
179 self.open_verse = None
182 self.push_text(self.stanza.text)
183 for elem in self.stanza:
185 self.push_text(elem.tail)
186 tail = self.stanza.tail
188 self.stanza.tail = tail
189 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """ Start a fresh <wers_normalny> element and make it the open verse.

    The new element is created through the stanza's own `makeelement` and
    is recorded at the end of `self.verses`.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """ Return the verse currently being filled.

    When no verse is open yet, a normal verse is opened first, so the
    caller always gets an element back.
    """
    nothing_open = self.open_verse is None
    if nothing_open:
        self.open_normal_verse()
    return self.open_verse
200 def push_text(self, text):
203 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
205 self.open_normal_verse()
206 verse = self.get_open_verse()
208 verse[-1].tail = (verse[-1].tail or "") + verse_text
210 verse.text = (verse.text or "") + verse_text
212 def push_elem(self, elem):
213 if elem.tag.startswith("wers"):
214 verse = deepcopy(elem)
216 self.verses.append(verse)
217 self.open_verse = verse
219 appended = deepcopy(elem)
221 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Versify every stanza found anywhere below `tree`.

    Each matching `strofa` element is handed to `Stanza`, which turns the
    '/' verse endings into verse elements. The tree is modified in place;
    nothing is returned.
    """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
232 def add_to_manifest(manifest, partno):
233 """ Adds a node to the manifest section in content.opf file """
235 partstr = 'part%d' % partno
236 e = manifest.makeelement(
237 OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html',
238 'media-type': 'application/xhtml+xml'}
243 def add_to_spine(spine, partno):
244 """ Adds a node to the spine section in content.opf file """
246 e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
251 def __init__(self, name=None, part_href=None):
254 self.part_href = part_href
255 self.sub_number = None
257 def add(self, name, part_href, level=0, is_part=True, index=None):
258 assert level == 0 or index is None
259 if level > 0 and self.children:
260 return self.children[-1].add(name, part_href, level - 1, is_part)
263 t.part_href = part_href
264 if index is not None:
265 self.children.insert(index, t)
267 self.children.append(t)
269 t.sub_number = len(self.children) + 1
def append(self, toc):
    """ Attach `toc` itself as the last child of this node. """
    own_children = self.children
    own_children.append(toc)
def extend(self, toc):
    """ Take over the children of `toc` (not `toc` itself) as own children.

    The adopted entries are added after the existing ones, in order.
    """
    own_children = self.children
    own_children.extend(toc.children)
280 return max((c.depth() for c in self.children)) + 1
286 if self.sub_number is not None:
287 src += '#sub%d' % self.sub_number
290 def write_to_xml(self, nav_map, counter=1):
291 for child in self.children:
292 nav_point = nav_map.makeelement(NCXNS('navPoint'))
293 nav_point.set('id', 'NavPoint-%d' % counter)
294 nav_point.set('playOrder', str(counter))
296 nav_label = nav_map.makeelement(NCXNS('navLabel'))
297 text = nav_map.makeelement(NCXNS('text'))
298 if child.name is not None:
299 text.text = re.sub(r'\n', ' ', child.name)
301 text.text = child.name
302 nav_label.append(text)
303 nav_point.append(nav_label)
305 content = nav_map.makeelement(NCXNS('content'))
306 content.set('src', child.href())
307 nav_point.append(content)
308 nav_map.append(nav_point)
309 counter = child.write_to_xml(nav_point, counter + 1)
312 def html_part(self, depth=0):
314 for child in self.children:
316 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
317 (depth, child.href(), child.name))
318 texts.append(child.html_part(depth + 1))
319 return "\n".join(texts)
322 with open(get_resource('epub/toc.html')) as f:
323 t = unicode(f.read(), 'utf-8')
324 return t % self.html_part()
327 def used_chars(element):
328 """ Lists characters used in an ETree Element """
329 chars = set((element.text or '') + (element.tail or ''))
330 for child in element:
331 chars = chars.union(used_chars(child))
336 """ divide main content of the XML file into chunks """
338 # prepare a container for each chunk
339 part_xml = etree.Element('utwor')
340 etree.SubElement(part_xml, 'master')
341 main_xml_part = part_xml[0] # master
343 last_node_part = False
345 # the below loop are workaround for a problem with epubs in drama ebooks without acts
348 for one_part in main_text:
350 if name == 'naglowek_scena':
352 elif name == 'naglowek_akt':
355 for one_part in main_text:
357 if is_act is False and is_scene is True:
358 if name == 'naglowek_czesc':
360 last_node_part = True
361 main_xml_part[:] = [deepcopy(one_part)]
362 elif not last_node_part and name == "naglowek_scena":
364 main_xml_part[:] = [deepcopy(one_part)]
366 main_xml_part.append(deepcopy(one_part))
367 last_node_part = False
369 if name == 'naglowek_czesc':
371 last_node_part = True
372 main_xml_part[:] = [deepcopy(one_part)]
373 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
375 main_xml_part[:] = [deepcopy(one_part)]
377 main_xml_part.append(deepcopy(one_part))
378 last_node_part = False
382 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
383 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
386 for element in chunk_xml[0]:
387 if element.tag == "naglowek_czesc":
388 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
389 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
390 toc.add(node_name(element), "part%d.html" % chunk_no)
391 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
392 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
393 element.set('sub', str(subnumber))
395 if not _empty_html_static:
396 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
398 output_html = _empty_html_static[0]
400 find_annotations(annotations, chunk_xml, chunk_no)
401 replace_by_verse(chunk_xml)
402 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
403 chars = used_chars(html_tree.getroot())
404 output_html = etree.tostring(
405 html_tree, pretty_print=True, xml_declaration=True,
407 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
408 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
410 return output_html, toc, chars
413 def transform(wldoc, verbose=False, style=None, html_toc=False,
414 sample=None, cover=None, flags=None, hyphenate=False, ilustr_path='', output_type='epub'):
415 """ produces a EPUB file
417 sample=n: generate sample e-book (with at least n paragraphs)
418 cover: a cover.Cover factory or True for default
419 flags: less-advertising, without-fonts, working-copy
422 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
423 """ processes one input file and proceeds to its children """
425 replace_characters(wldoc.edoc.getroot())
427 hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
428 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
430 # every input file will have a TOC entry,
431 # pointing to starting chunk
432 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
435 # write book title page
436 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'), outputtype=output_type)
437 chars = used_chars(html_tree.getroot())
441 html_tree, pretty_print=True, xml_declaration=True,
443 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
444 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
447 # add a title page TOC entry
448 toc.add(u"Strona tytułowa", "title.html")
449 elif wldoc.book_info.parts:
450 # write title page for every parent
451 if sample is not None and sample <= 0:
453 html_string = open(get_resource('epub/emptyChunk.html')).read()
455 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
456 chars = used_chars(html_tree.getroot())
457 html_string = etree.tostring(
458 html_tree, pretty_print=True, xml_declaration=True,
460 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
461 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
463 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
464 add_to_manifest(manifest, chunk_counter)
465 add_to_spine(spine, chunk_counter)
468 if len(wldoc.edoc.getroot()) > 1:
469 # rdf before style master
470 main_text = wldoc.edoc.getroot()[1]
472 # rdf in style master
473 main_text = wldoc.edoc.getroot()[0]
474 if main_text.tag == RDFNS('RDF'):
477 if main_text is not None:
478 for chunk_xml in chop(main_text):
480 if sample is not None:
484 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
485 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
487 toc.extend(chunk_toc)
488 chars = chars.union(chunk_chars)
489 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
490 add_to_manifest(manifest, chunk_counter)
491 add_to_spine(spine, chunk_counter)
494 for child in wldoc.parts():
495 child_toc, chunk_counter, chunk_chars, sample = transform_file(
496 child, chunk_counter, first=False, sample=sample)
497 toc.append(child_toc)
498 chars = chars.union(chunk_chars)
500 return toc, chunk_counter, chars, sample
502 document = deepcopy(wldoc)
507 document.edoc.getroot().set(flag, 'yes')
509 document.clean_ed_note()
510 document.clean_ed_note('abstrakt')
513 editors = document.editors()
515 document.edoc.getroot().set('editors', u', '.join(sorted(
516 editor.readable() for editor in editors)))
517 if document.book_info.funders:
518 document.edoc.getroot().set('funders', u', '.join(
519 document.book_info.funders))
520 if document.book_info.thanks:
521 document.edoc.getroot().set('thanks', document.book_info.thanks)
523 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
524 manifest = opf.find('.//' + OPFNS('manifest'))
525 guide = opf.find('.//' + OPFNS('guide'))
526 spine = opf.find('.//' + OPFNS('spine'))
528 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
529 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
531 functions.reg_mathml_epub(zip)
533 if os.path.isdir(ilustr_path):
534 for i, filename in enumerate(os.listdir(ilustr_path)):
535 file_path = os.path.join(ilustr_path, filename)
536 zip.write(file_path, os.path.join('OPS', filename))
537 image_id = 'image%s' % i
538 manifest.append(etree.fromstring(
539 '<item id="%s" href="%s" media-type="%s" />' % (image_id, filename, guess_type(file_path)[0])))
541 # write static elements
542 mime = zipfile.ZipInfo()
543 mime.filename = 'mimetype'
544 mime.compress_type = zipfile.ZIP_STORED
546 zip.writestr(mime, 'application/epub+zip')
548 'META-INF/container.xml',
549 '<?xml version="1.0" ?>'
550 '<container version="1.0" '
551 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
552 '<rootfiles><rootfile full-path="OPS/content.opf" '
553 'media-type="application/oebps-package+xml" />'
554 '</rootfiles></container>'
556 zip.write(get_resource('res/wl-logo-small.png'),
557 os.path.join('OPS', 'logo_wolnelektury.png'))
558 zip.write(get_resource('res/jedenprocent.png'),
559 os.path.join('OPS', 'jedenprocent.png'))
561 style = get_resource('epub/style.css')
562 zip.write(style, os.path.join('OPS', 'style.css'))
566 cover = DefaultEbookCover
568 cover_file = StringIO()
569 bound_cover = cover(document.book_info)
570 bound_cover.save(cover_file)
571 cover_name = 'cover.%s' % bound_cover.ext()
572 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
575 cover_tree = etree.parse(get_resource('epub/cover.html'))
576 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
577 zip.writestr('OPS/cover.html', etree.tostring(
578 cover_tree, pretty_print=True, xml_declaration=True,
580 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
581 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
584 if bound_cover.uses_dc_cover:
585 if document.book_info.cover_by:
586 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
587 if document.book_info.cover_source:
588 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
590 manifest.append(etree.fromstring(
591 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
592 manifest.append(etree.fromstring(
593 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
594 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
595 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
596 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
598 annotations = etree.Element('annotations')
600 toc_file = etree.fromstring(
601 '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
602 '"-//NISO//DTD ncx 2005-1//EN" '
603 '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
604 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
605 'version="2005-1"><head></head><docTitle></docTitle><navMap>'
608 nav_map = toc_file[-1]
611 manifest.append(etree.fromstring(
612 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
613 spine.append(etree.fromstring(
614 '<itemref idref="html_toc" />'))
615 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
617 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
619 if len(toc.children) < 2:
620 toc.add(u"Początek utworu", "part1.html")
622 # Last modifications in container files and EPUB creation
623 if len(annotations) > 0:
624 toc.add("Przypisy", "annotations.html")
625 manifest.append(etree.fromstring(
626 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
627 spine.append(etree.fromstring(
628 '<itemref idref="annotations" />'))
629 replace_by_verse(annotations)
630 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
631 chars = chars.union(used_chars(html_tree.getroot()))
632 zip.writestr('OPS/annotations.html', etree.tostring(
633 html_tree, pretty_print=True, xml_declaration=True,
635 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
636 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
639 toc.add("Wesprzyj Wolne Lektury", "support.html")
640 manifest.append(etree.fromstring(
641 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
642 spine.append(etree.fromstring(
643 '<itemref idref="support" />'))
644 html_string = open(get_resource('epub/support.html')).read()
645 chars.update(used_chars(etree.fromstring(html_string)))
646 zip.writestr('OPS/support.html', html_string)
648 toc.add("Strona redakcyjna", "last.html")
649 manifest.append(etree.fromstring(
650 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
651 spine.append(etree.fromstring(
652 '<itemref idref="last" />'))
653 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'), outputtype=output_type)
654 chars.update(used_chars(html_tree.getroot()))
655 zip.writestr('OPS/last.html', etree.tostring(
656 html_tree, pretty_print=True, xml_declaration=True,
658 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
659 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
662 if not flags or 'without-fonts' not in flags:
664 tmpdir = mkdtemp('-librarian-epub')
670 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
671 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
672 optimizer_call = ['perl', 'subset.pl', '--chars',
673 ''.join(chars).encode('utf-8'),
674 get_resource('fonts/' + fname),
675 os.path.join(tmpdir, fname)]
677 print "Running font-optimizer"
678 subprocess.check_call(optimizer_call)
680 dev_null = open(os.devnull, 'w')
681 subprocess.check_call(optimizer_call, stdout=dev_null, stderr=dev_null)
682 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
683 manifest.append(etree.fromstring(
684 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
688 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
689 xml_declaration=True, encoding="utf-8"))
690 title = document.book_info.title
691 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
692 for st in attributes:
693 meta = toc_file.makeelement(NCXNS('meta'))
695 meta.set('content', '0')
696 toc_file[0].append(meta)
697 toc_file[0][0].set('content', str(document.book_info.url))
698 toc_file[0][1].set('content', str(toc.depth()))
699 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
703 toc.add(u"Spis treści", "toc.html", index=1)
704 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
705 toc.write_to_xml(nav_map)
706 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
707 xml_declaration=True, encoding="utf-8"))
710 return OutputFile.from_filename(output_file.name)