# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from __future__ import with_statement

import os
import os.path
import re
import subprocess
import zipfile
from StringIO import StringIO
from copy import deepcopy
from mimetypes import guess_type

from lxml import etree

from tempfile import mkdtemp, NamedTemporaryFile
from shutil import rmtree

from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
from librarian.cover import DefaultEbookCover

from librarian import functions, get_resource

from librarian.hyphenator import Hyphenator

functions.reg_person_name()
functions.reg_lang_code_3to2()
def set_hyph_language(source_tree):
    def get_short_lng_code(text):
        # map a three-letter ISO 639-2 code to its two-letter ISO 639-1 equivalent
        with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
            for line in f:
                fields = line.strip().split('|')
                if fields[0] == text:
                    return fields[2]
        return text

    bibl_lng = etree.XPath('//dc:language//text()',
                           namespaces={'dc': str(DCNS)})(source_tree)
    short_lng = get_short_lng_code(bibl_lng[0])
    return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
                                   short_lng + '.dic'))
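# Illustrative sketch (example values assumed, not taken from this file): for a
# document whose dc:language is "pol", get_short_lng_code() resolves it to "pl"
# via the ISO-639-2 table, so the hyphenation dictionary loaded above would be
# res/hyph-dictionaries/hyph_pl.dic.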
def hyphenate_and_fix_conjunctions(source_tree, hyph):
    texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
    for t in texts:
        parent = t.getparent()
        newt = ''
        if hyph is not None:
            wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
            for w in wlist:
                newt += hyph.inserted(w, u'\u00AD')
        else:
            newt = t
        newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
        if t.is_text:
            parent.text = newt
        elif t.is_tail:
            parent.tail = newt
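# Sketch of the intended effect (example input is an assumption): hyph.inserted()
# adds soft hyphens (U+00AD) inside words, and the re.sub above replaces the
# space after a single-letter conjunction with a no-break space, e.g.
#     u"To a kot"  ->  u"To a\u00A0kot"
# so the conjunction is not left hanging at the end of a line.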
def inner_xml(node):
    """ returns node's text and children as a string

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])


def set_inner_xml(node, text):
    """ sets node's text and children from a string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text
    node[:] = p[:]


def node_name(node):
    """ Find out a node's name

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    tempnode = deepcopy(node)

    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            t = e.getparent()
            t.remove(e)
    etree.strip_tags(tempnode, '*')
    return tempnode.text
def xslt(xml, sheet):
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
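# Usage sketch: every HTML fragment below is produced by running one of the
# stylesheets shipped with the package through this helper, e.g.
#     html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
# and then serialised with etree.tostring().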
def replace_characters(node):
    def replace_chars(text):
        if text is None:
            return None
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")
    if node.tag in ('uwaga', 'extra'):
        t = node.tail
        node.clear()
        node.tail = t
    else:
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        for child in node:
            replace_characters(child)
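# Illustrative example of the typographic replacements above (input assumed):
#     replace_chars(u',,Ala" --- kot')  ->  u'\u201eAla\u201d \u2014 kot'
# i.e. ,,..." becomes Polish quotation marks and "---" becomes an em dash.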
def find_annotations(annotations, source, part_no):
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations) + 1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            annotations.append(annotation)

        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
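# Note: pe, pa, pt and pr are the WL footnote elements (editorial, author's,
# translator's and publisher's notes respectively). Each one found anywhere in
# the chunk is copied into the shared <annotations> element together with its
# sequential number and the number of the part it came from.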
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        self.verses = []
        self.open_verse = None

    def versify(self):
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        if not text:
            return
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            if i:
                self.open_normal_verse()
            verse = self.get_open_verse()
            if len(verse):
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        if elem.tag.startswith("wers"):
            verse = deepcopy(elem)
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character """

    stanzas = tree.findall('.//' + WLNS('strofa'))
    for stanza in stanzas:
        Stanza(stanza).versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file """

    partstr = 'part%d' % partno
    e = manifest.makeelement(
        OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html',
                               'media-type': 'application/xhtml+xml'}
    )
    manifest.append(e)


def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file """

    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
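# For partno=3 the two helpers produce entries equivalent to (illustrative):
#     <item id="part3" href="part3.html" media-type="application/xhtml+xml"/>
#     <itemref idref="part3"/>
# in the OPF manifest and spine respectively.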
class TOC(object):
    def __init__(self, name=None, part_href=None):
        self.name = name
        self.part_href = part_href
        self.sub_number = None
        self.children = []

    def add(self, name, part_href, level=0, is_part=True, index=None):
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level - 1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        self.children.append(toc)

    def extend(self, toc):
        self.children.extend(toc.children)

    def depth(self):
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            if child.name is not None:
                text.text = re.sub(r'\n', ' ', child.name)
            else:
                text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth + 1))
        return "\n".join(texts)

    def html(self):
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
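# Illustrative NCX output for a single top-level entry named u"Strona tytułowa"
# pointing at title.html (values assumed for the example):
#     <navPoint id="NavPoint-1" playOrder="1">
#       <navLabel><text>Strona tytułowa</text></navLabel>
#       <content src="title.html"/>
#     </navPoint>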
def used_chars(element):
    """ Lists characters used in an ETree Element """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars = chars.union(used_chars(child))
    return chars
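# Example (illustrative): used_chars(etree.fromstring('<a>ab<b>c</b>d</a>'))
# returns set(['a', 'b', 'c', 'd']); the collected set is later passed to the
# font subsetter so the embedded fonts only carry glyphs actually used.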
def chop(main_text):
    """ divide main content of the XML file into chunks """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    last_node_part = False

    # the loop below is a workaround for a problem with EPUBs of drama texts
    # that have scenes but no acts
    is_act = False
    is_scene = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_scena':
            is_scene = True
        elif name == 'naglowek_akt':
            is_act = True

    for one_part in main_text:
        name = one_part.tag
        if is_act is False and is_scene is True:
            if name == 'naglowek_czesc':
                yield part_xml
                last_node_part = True
                main_xml_part[:] = [deepcopy(one_part)]
            elif not last_node_part and name == "naglowek_scena":
                yield part_xml
                main_xml_part[:] = [deepcopy(one_part)]
            else:
                main_xml_part.append(deepcopy(one_part))
                last_node_part = False
        else:
            if name == 'naglowek_czesc':
                yield part_xml
                last_node_part = True
                main_xml_part[:] = [deepcopy(one_part)]
            elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
                yield part_xml
                main_xml_part[:] = [deepcopy(one_part)]
            else:
                main_xml_part.append(deepcopy(one_part))
                last_node_part = False
    yield part_xml
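# Reading of the generator above: each yielded <utwor><master>...</master></utwor>
# container becomes one partN.html file. A new chunk starts at every
# naglowek_czesc, naglowek_rozdzial, naglowek_akt or srodtytul heading and, for
# dramas that have scenes but no acts, at every naglowek_scena.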
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns an HTML string, a TOC object and a set of used characters """
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        )
    return output_html, toc, chars
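# Implementation note: the mutable default argument _empty_html_static acts as a
# one-element cache, so epub/emptyChunk.html is read from disk at most once even
# when many sample chunks are emitted as empty placeholders.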
def transform(wldoc, verbose=False, style=None, html_toc=False,
              sample=None, cover=None, flags=None, hyphenate=False, ilustr_path=''):
    """ produces an EPUB file

    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy
    """
    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """

        replace_characters(wldoc.edoc.getroot())

        hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
        hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zip.writestr(
                'OPS/title.html',
                etree.tostring(
                    html_tree, pretty_print=True, xml_declaration=True,
                    encoding="utf-8",
                    doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                            ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
                )
            )
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                html_string = open(get_resource('epub/emptyChunk.html')).read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(
                    html_tree, pretty_print=True, xml_declaration=True,
                    encoding="utf-8",
                    doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                            ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
                )
            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample
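    # Note on sampling (behaviour inferred from the code above): `sample` counts
    # how many stanza/paragraph elements may still be included; once it drops to
    # zero, the remaining chunks and child title pages are rendered from
    # epub/emptyChunk.html instead of the real content.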
    document = deepcopy(wldoc)
    del wldoc

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    document.clean_ed_note()
    document.clean_ed_note('abstrakt')

    # add editors info
    editors = document.editors()
    if editors:
        document.edoc.getroot().set('editors', u', '.join(sorted(
            editor.readable() for editor in editors)))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)
    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    functions.reg_mathml_epub(zip)
    # copy illustrations into the package, if a directory was given
    if ilustr_path:
        for i, filename in enumerate(os.listdir(ilustr_path)):
            file_path = os.path.join(ilustr_path, filename)
            zip.write(file_path, os.path.join('OPS', filename))
            image_id = 'image%s' % i
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="%s" />' % (image_id, filename, guess_type(file_path)[0])))
    # write static elements
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    zip.writestr(mime, 'application/epub+zip')
    zip.writestr(
        'META-INF/container.xml',
        '<?xml version="1.0" ?>'
        '<container version="1.0" '
        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
        '<rootfiles><rootfile full-path="OPS/content.opf" '
        'media-type="application/oebps-package+xml" />'
        '</rootfiles></container>'
    )
    zip.write(get_resource('res/wl-logo-small.png'),
              os.path.join('OPS', 'logo_wolnelektury.png'))
    zip.write(get_resource('res/jedenprocent.png'),
              os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    zip.write(style, os.path.join('OPS', 'style.css'))
    if cover:
        if cover is True:
            cover = DefaultEbookCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        zip.writestr('OPS/cover.html', etree.tostring(
            cover_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        ))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
    annotations = etree.Element('annotations')

    toc_file = etree.fromstring(
        '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
        '"-//NISO//DTD ncx 2005-1//EN" '
        '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
        'version="2005-1"><head></head><docTitle></docTitle><navMap>'
        '</navMap></ncx>'
    )
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")
    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zip.writestr('OPS/annotations.html', etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        ))
    toc.add("Wesprzyj Wolne Lektury", "support.html")
    manifest.append(etree.fromstring(
        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="support" />'))
    html_string = open(get_resource('epub/support.html')).read()
    chars.update(used_chars(etree.fromstring(html_string)))
    zip.writestr('OPS/support.html', html_string)
    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    zip.writestr('OPS/last.html', etree.tostring(
        html_tree, pretty_print=True, xml_declaration=True,
        encoding="utf-8",
        doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
    ))
    if not flags or 'without-fonts' not in flags:
        # strip fonts to the characters actually used in the book
        tmpdir = mkdtemp('-librarian-epub')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None

        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            optimizer_call = ['perl', 'subset.pl', '--chars',
                              ''.join(chars).encode('utf-8'),
                              get_resource('fonts/' + fname),
                              os.path.join(tmpdir, fname)]
            if verbose:
                print "Running font-optimizer"
                subprocess.check_call(optimizer_call)
            else:
                subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
        rmtree(tmpdir)
        if cwd is not None:
            os.chdir(cwd)
    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
                 xml_declaration=True, encoding="utf-8"))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', str(document.book_info.url))
    toc_file[0][1].set('content', str(toc.depth()))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))

    # write the table of contents
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
                 xml_declaration=True, encoding="utf-8"))
    zip.close()

    return OutputFile.from_filename(output_file.name)
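# Usage sketch (hypothetical call, not part of this module): given a parsed
# WLDocument `doc`, an EPUB can be produced with
#     output = transform(doc, cover=True, hyphenate=True, html_toc=True)
# The returned OutputFile wraps the temporary .epub written above; the keyword
# arguments correspond to the options documented in transform()'s docstring.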