# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from __future__ import with_statement

import os
import os.path
import re
import subprocess
import zipfile
from StringIO import StringIO
from copy import deepcopy
from mimetypes import guess_type

from lxml import etree
from tempfile import mkdtemp, NamedTemporaryFile
from shutil import rmtree

from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
from librarian.cover import DefaultEbookCover

from librarian import functions, get_resource

from librarian.hyphenator import Hyphenator

functions.reg_person_name()
functions.reg_lang_code_3to2()

def set_hyph_language(source_tree):
    def get_short_lng_code(text):
        # map a three-letter ISO 639-2 code to its two-letter equivalent
        result = ''
        with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
            for line in f:
                list = line.strip().split('|')
                if list[0] == text:
                    result = list[2]
        return result or text

    bibl_lng = etree.XPath('//dc:language//text()',
                           namespaces={'dc': str(DCNS)})(source_tree)
    short_lng = get_short_lng_code(bibl_lng[0])
    try:
        return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
                                       short_lng + '.dic'))
    except:
        return None


def hyphenate_and_fix_conjunctions(source_tree, hyph):
    texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
    for t in texts:
        parent = t.getparent()
        if hyph is not None:
            newt = ''
            wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
            for w in wlist:
                newt += hyph.inserted(w, u'\u00AD')
        else:
            newt = t
        newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
        if t.is_text:
            parent.text = newt
        elif t.is_tail:
            parent.tail = newt
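
# Illustrative note (not part of the original module): with a hyphenation
# dictionary available, the function above scatters soft hyphens (U+00AD)
# through every word and glues single-letter words to the following word with
# a no-break space, e.g. (assuming `doc` is a parsed WL document):
#
#   >>> hyph = set_hyph_language(doc.edoc.getroot())
#   >>> hyphenate_and_fix_conjunctions(doc.edoc.getroot(), hyph)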


def inner_xml(node):
    """ returns node's text and children as a string

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])


def set_inner_xml(node, text):
    """ sets node's text and children from a string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text
    node[:] = p[:]


def node_name(node):
    """ Find out a node's name

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    tempnode = deepcopy(node)

    # drop footnotes and theme markers, keeping their tails
    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            t = e.tail
            e.clear()
            e.tail = t
    etree.strip_tags(tempnode, '*')
    return tempnode.text


def xslt(xml, sheet):
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))


def replace_characters(node):
    def replace_chars(text):
        if text is None:
            return None
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")
    if node.tag in ('uwaga', 'extra'):
        t = node.tail
        node.clear()
        node.tail = t
    else:
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        for child in node:
            replace_characters(child)
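
# Illustrative example (not part of the original module): the chained replaces
# above turn plain ASCII typography into Polish Unicode punctuation, e.g.
#
#   >>> e = etree.fromstring(u'<akap>,,Tak --- rzekł"</akap>')
#   >>> replace_characters(e)
#   >>> print e.text
#   „Tak — rzekł”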


def find_annotations(annotations, source, part_no):
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations) + 1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            annotation.tail = ''
            annotations.append(annotation)
            # replace the footnote in the text with its bare number
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
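
# Illustrative note (not part of the original module): after this pass each
# footnote element (pe/pa/pt/pr) found in `source` has been copied into the
# `annotations` container with sequential 'number' and 'part' attributes,
# e.g. (assuming `chunk_xml` is a parsed chunk with three footnotes):
#
#   >>> annotations = etree.Element('annotations')
#   >>> find_annotations(annotations, chunk_xml, 1)
#   >>> [a.get('number') for a in annotations]
#   ['1', '2', '3']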


class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """

    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        self.verses = []
        self.open_verse = None

    def versify(self):
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        if not text:
            return
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            if i:
                self.open_normal_verse()
            verse = self.get_open_verse()
            if len(verse):
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        if elem.tag.startswith("wers"):
            verse = deepcopy(elem)
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)


def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character """

    stanzas = tree.findall('.//' + WLNS('strofa'))
    for stanza in stanzas:
        Stanza(stanza).versify()


def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file """

    partstr = 'part%d' % partno
    e = manifest.makeelement(
        OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html',
                               'media-type': 'application/xhtml+xml'}
    )
    manifest.append(e)


def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file """

    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
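
# Illustrative note (not part of the original module): for partno=3 the two
# helpers above append, respectively,
#
#   <item id="part3" href="part3.html" media-type="application/xhtml+xml"/>
#   <itemref idref="part3"/>
#
# to the <manifest> and <spine> sections of content.opf.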


class TOC(object):
    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level - 1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        self.children.append(toc)

    def extend(self, toc):
        self.children.extend(toc.children)

    def depth(self):
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            if child.name is not None:
                text.text = re.sub(r'\n', ' ', child.name)
            else:
                text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth + 1))
        return "\n".join(texts)

    def html(self):
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
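
# Illustrative sketch (not part of the original module): building a small TOC
# by hand with the class above.
#
#   >>> toc = TOC()
#   >>> toc.add(u"Rozdział I", "part1.html")
#   >>> toc.add(u"Scena 1", "part1.html", level=1, is_part=False)
#   2
#   >>> toc.children[0].children[0].href()
#   'part1.html#sub2'
#   >>> toc.depth()
#   2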


def used_chars(element):
    """ Lists characters used in an ETree Element """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars = chars.union(used_chars(child))
    return chars
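
# Illustrative example (not part of the original module):
#
#   >>> sorted(used_chars(etree.fromstring('<a>ab<b>c</b></a>')))
#   ['a', 'b', 'c']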


def chop(main_text):
    """ divide main content of the XML file into chunks """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    last_node_part = False

    # the loop below is a workaround for a problem with epubs in drama ebooks without acts
    is_scene = False
    is_act = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_scena':
            is_scene = True
        elif name == 'naglowek_akt':
            is_act = True

    for one_part in main_text:
        name = one_part.tag
        if is_act is False and is_scene is True:
            if name == 'naglowek_czesc':
                yield part_xml
                last_node_part = True
                main_xml_part[:] = [deepcopy(one_part)]
            elif not last_node_part and name == "naglowek_scena":
                yield part_xml
                main_xml_part[:] = [deepcopy(one_part)]
            else:
                main_xml_part.append(deepcopy(one_part))
                last_node_part = False
        else:
            if name == 'naglowek_czesc':
                yield part_xml
                last_node_part = True
                main_xml_part[:] = [deepcopy(one_part)]
            elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
                yield part_xml
                main_xml_part[:] = [deepcopy(one_part)]
            else:
                main_xml_part.append(deepcopy(one_part))
                last_node_part = False
    yield part_xml
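
# Illustrative note (not part of the original module): `chop` is a generator
# that yields the same <utwor><master>…</master></utwor> container over and
# over, refilled with the next run of elements, so each chunk has to be
# consumed (here: transformed to HTML) before the next iteration, e.g.:
#
#   >>> for part in chop(main_text):   # assumes `main_text` is a <master> element
#   ...     html = xslt(part, get_resource('epub/xsltScheme.xsl'))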


def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns an HTML string, a TOC object and a set of used characters """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        )
    return output_html, toc, chars


def transform(wldoc, verbose=False, style=None, html_toc=False,
              sample=None, cover=None, flags=None, hyphenate=False, ilustr_path=''):
    """ produces an EPUB file

    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy
    """

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """

        replace_characters(wldoc.edoc.getroot())

        hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
        hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zip.writestr('OPS/title.html', etree.tostring(
                html_tree, pretty_print=True, xml_declaration=True,
                encoding="utf-8",
                doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                        ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
            ))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                html_string = open(get_resource('epub/emptyChunk.html')).read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(
                    html_tree, pretty_print=True, xml_declaration=True,
                    encoding="utf-8",
                    doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                            ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
                )
            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    document = deepcopy(wldoc)
    del wldoc

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    document.clean_ed_note()
    document.clean_ed_note('abstrakt')

    # add editors info
    editors = document.editors()
    if editors:
        document.edoc.getroot().set('editors', u', '.join(sorted(
            editor.readable() for editor in editors)))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    functions.reg_mathml_epub(zip)

    # copy illustrations into the package, if an illustrations directory was given
    if ilustr_path:
        for filename in os.listdir(ilustr_path):
            file_path = os.path.join(ilustr_path, filename)
            zip.write(file_path, os.path.join('OPS', filename))
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="%s" />' % (filename, filename, guess_type(file_path)[0])))

    # write static elements
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    zip.writestr(mime, 'application/epub+zip')
    zip.writestr(
        'META-INF/container.xml',
        '<?xml version="1.0" ?>'
        '<container version="1.0" '
        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
        '<rootfiles><rootfile full-path="OPS/content.opf" '
        'media-type="application/oebps-package+xml" />'
        '</rootfiles></container>'
    )
    zip.write(get_resource('res/wl-logo-small.png'),
              os.path.join('OPS', 'logo_wolnelektury.png'))
    zip.write(get_resource('res/jedenprocent.png'),
              os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = DefaultEbookCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        zip.writestr('OPS/cover.html', etree.tostring(
            cover_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        ))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring(
        '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
        '"-//NISO//DTD ncx 2005-1//EN" '
        '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
        'version="2005-1"><head></head><docTitle></docTitle><navMap>'
        '</navMap></ncx>'
    )
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zip.writestr('OPS/annotations.html', etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        ))

    toc.add("Wesprzyj Wolne Lektury", "support.html")
    manifest.append(etree.fromstring(
        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="support" />'))
    html_string = open(get_resource('epub/support.html')).read()
    chars.update(used_chars(etree.fromstring(html_string)))
    zip.writestr('OPS/support.html', html_string)

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    zip.writestr('OPS/last.html', etree.tostring(
        html_tree, pretty_print=True, xml_declaration=True,
        encoding="utf-8",
        doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
    ))

    if not flags or 'without-fonts' not in flags:
        # strip fonts to the set of characters actually used in the book
        tmpdir = mkdtemp('-librarian-epub')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None

        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            optimizer_call = ['perl', 'subset.pl', '--chars',
                              ''.join(chars).encode('utf-8'),
                              get_resource('fonts/' + fname),
                              os.path.join(tmpdir, fname)]
            if verbose:
                print "Running font-optimizer"
                subprocess.check_call(optimizer_call)
            else:
                subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
        rmtree(tmpdir)
        if cwd is not None:
            os.chdir(cwd)

    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
                 xml_declaration=True, encoding="utf-8"))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', str(document.book_info.url))
    toc_file[0][1].set('content', str(toc.depth()))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
                 xml_declaration=True, encoding="utf-8"))
    zip.close()

    return OutputFile.from_filename(output_file.name)
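
# Illustrative usage (not part of the original module), assuming a WL document
# parsed with librarian's WLDocument and that OutputFile.save_as writes the
# result to disk:
#
#   >>> from librarian.parser import WLDocument
#   >>> epub = transform(WLDocument.from_file('book.xml'), cover=True, hyphenate=True)
#   >>> epub.save_as('book.epub')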