# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from __future__ import with_statement

import os
import os.path
import re
import subprocess
import zipfile
from StringIO import StringIO
from copy import deepcopy
from mimetypes import guess_type
from tempfile import mkdtemp, NamedTemporaryFile
from shutil import rmtree

from lxml import etree

from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
from librarian.cover import DefaultEbookCover
from librarian import functions, get_resource
from librarian.hyphenator import Hyphenator

functions.reg_person_name()
functions.reg_lang_code_3to2()

def set_hyph_language(source_tree):
    def get_short_lng_code(text):
        # map an ISO 639-2 (three-letter) code to its two-letter equivalent
        with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
            for line in f:
                list = line.strip().split('|')
                if list[0] == text:
                    return list[2]
        return text

    bibl_lng = etree.XPath('//dc:language//text()',
                           namespaces={'dc': str(DCNS)})(source_tree)
    short_lng = get_short_lng_code(bibl_lng[0])
    try:
        return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
                                       short_lng + '.dic'))
    except IOError:
        # no dictionary for this language: hyphenation is silently skipped
        return None

def hyphenate_and_fix_conjunctions(source_tree, hyph):
    texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
    for t in texts:
        parent = t.getparent()
        if hyph is not None:
            newt = ''
            # insert soft hyphens (U+00AD) into every word
            wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
            for w in wlist:
                newt += hyph.inserted(w, u'\u00AD')
        else:
            newt = t
        # tie single-letter conjunctions to the next word with a no-break space
        newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
        if t.is_text:
            parent.text = newt
        elif t.is_tail:
            parent.tail = newt

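# A minimal usage sketch for the two helpers above, as transform_file() does
# further down: build a Hyphenator from the book's dc:language, then
# soft-hyphenate the text tree (`doc` here is a hypothetical parsed WL document).
#
#     hyph = set_hyph_language(doc.edoc.getroot())
#     hyphenate_and_fix_conjunctions(doc.edoc.getroot(), hyph)
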
def inner_xml(node):
    """ returns node's text and children as a string

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])

def set_inner_xml(node, text):
    """ sets node's text and children from a string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text
    node[:] = p[:]

def node_name(node):
    """ Find out a node's name

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """

    tempnode = deepcopy(node)

    # drop footnotes and theme markers, keeping only their tails
    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            t = e.tail
            e.clear()
            e.tail = t
    etree.strip_tags(tempnode, '*')
    return tempnode.text

def xslt(xml, sheet):
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))

def replace_characters(node):
    def replace_chars(text):
        if text is None:
            return None
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag in ('uwaga', 'extra'):
        t = node.tail
        node.clear()
        node.tail = t
    else:
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        for child in node:
            replace_characters(child)

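# A small illustrative sketch of the effect (the element below is made up):
#
#     elem = etree.fromstring(u'<akap>,,Rety" --- czy to aby -- tak?</akap>')
#     replace_characters(elem)
#     # elem.text now uses U+201E/U+201D quotes, an em dash and an en dash
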
def find_annotations(annotations, source, part_no):
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations) + 1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            annotation.tail = ''
            annotations.append(annotation)
            # leave only the footnote's number in the main text
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)

class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        self.verses = []
        self.open_verse = None

    def versify(self):
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        if not text:
            return
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            if i:
                self.open_normal_verse()
            if not verse_text.strip():
                continue
            verse = self.get_open_verse()
            if len(verse):
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        if elem.tag.startswith("wers"):
            verse = deepcopy(elem)
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)

def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character """

    stanzas = tree.findall('.//' + WLNS('strofa'))
    for stanza in stanzas:
        Stanza(stanza).versify()

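# A minimal sketch, assuming WLNS resolves to the empty namespace used by
# WL documents (so plain tag names match):
#
#     tree = etree.fromstring('<utwor><strofa>Raz/\nDwa</strofa></utwor>')
#     replace_by_verse(tree)
#     # the <strofa> now holds two <wers_normalny> children
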
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file """

    partstr = 'part%d' % partno
    e = manifest.makeelement(
        OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html',
                               'media-type': 'application/xhtml+xml'}
    )
    manifest.append(e)


def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file """

    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)

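# For partno=3 the two helpers above add entries roughly equivalent to these
# content.opf fragments (namespaces omitted for brevity):
#
#     <item id="part3" href="part3.html" media-type="application/xhtml+xml" />
#     <itemref idref="part3" />
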
class TOC(object):
    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level - 1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        self.children.append(toc)

    def extend(self, toc):
        self.children.extend(toc.children)

    def depth(self):
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            if child.name is not None:
                text.text = re.sub(r'\n', ' ', child.name)
            else:
                text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        texts = []
        for child in self.children:
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), child.name))
            texts.append(child.html_part(depth + 1))
        return "\n".join(texts)

    def html(self):
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()

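# A small usage sketch; the hrefs are illustrative:
#
#     toc = TOC()
#     toc.add(u"Rozdział I", "part1.html")
#     toc.add(u"Scena 1", "part1.html", level=1, is_part=False)
#     # toc.depth() == 2 and the sub-entry's href() ends with '#sub2'
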
def used_chars(element):
    """ Lists characters used in an ETree Element """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars = chars.union(used_chars(child))
    return chars

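# For example:
#
#     used_chars(etree.fromstring('<a>ab<b>c</b></a>'))  # -> {'a', 'b', 'c'}
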
def chop(main_text):
    """ divide main content of the XML file into chunks """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    last_node_part = False

    # the loop below is a workaround for a problem with epubs of dramas
    # which have scenes, but no acts
    is_scene = False
    is_act = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_scena':
            is_scene = True
        elif name == 'naglowek_akt':
            is_act = True

    for one_part in main_text:
        name = one_part.tag
        if is_act is False and is_scene is True:
            if name == 'naglowek_czesc':
                yield part_xml
                last_node_part = True
                main_xml_part[:] = [deepcopy(one_part)]
            elif not last_node_part and name == "naglowek_scena":
                yield part_xml
                main_xml_part[:] = [deepcopy(one_part)]
            else:
                main_xml_part.append(deepcopy(one_part))
                last_node_part = False
        else:
            if name == 'naglowek_czesc':
                yield part_xml
                last_node_part = True
                main_xml_part[:] = [deepcopy(one_part)]
            elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
                yield part_xml
                main_xml_part[:] = [deepcopy(one_part)]
            else:
                main_xml_part.append(deepcopy(one_part))
                last_node_part = False
    yield part_xml

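# chop() yields the same <utwor><master>...</master></utwor> container over
# and over with its contents swapped out, so each chunk has to be processed
# before the next one is requested -- roughly as transform_file() does below
# (main_text and annotations stand for the variables used there):
#
#     for i, chunk_xml in enumerate(chop(main_text), start=1):
#         html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, i, annotations)
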
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns an HTML string, a TOC object and a set of used characters """

    # _empty_html_static is deliberately a mutable default: it caches the
    # empty-chunk template between calls.
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        )
    return output_html, toc, chars

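# A single-chunk sketch: `annotations` collects footnotes as a side effect,
# and the returned character set later feeds the font subsetter
# (chunk_xml stands for one container produced by chop()):
#
#     annotations = etree.Element('annotations')
#     html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, 1, annotations)
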
def transform(wldoc, verbose=False, style=None, html_toc=False,
              sample=None, cover=None, flags=None, hyphenate=False, ilustr_path=''):
    """ produces an EPUB file

    sample=n: generate a sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy
    """
    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """

        replace_characters(wldoc.edoc.getroot())

        hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
        hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)

        # every input file will have a TOC entry,
        # pointing to its starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zip.writestr(
                'OPS/title.html',
                etree.tostring(
                    html_tree, pretty_print=True, xml_declaration=True,
                    encoding="utf-8",
                    doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                            ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
                )
            )
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                html_string = open(get_resource('epub/emptyChunk.html')).read()
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(
                    html_tree, pretty_print=True, xml_declaration=True,
                    encoding="utf-8",
                    doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
                            ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
                )
            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    document = deepcopy(wldoc)

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    document.clean_ed_note()
    document.clean_ed_note('abstrakt')

    # add editors info
    editors = document.editors()
    if editors:
        document.edoc.getroot().set('editors', u', '.join(sorted(
            editor.readable() for editor in editors)))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    functions.reg_mathml_epub(zip)

    if os.path.isdir(ilustr_path):
        for i, filename in enumerate(os.listdir(ilustr_path)):
            file_path = os.path.join(ilustr_path, filename)
            zip.write(file_path, os.path.join('OPS', filename))
            image_id = 'image%s' % i
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="%s" />' % (image_id, filename, guess_type(file_path)[0])))

    # write static elements
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    zip.writestr(mime, 'application/epub+zip')
    zip.writestr(
        'META-INF/container.xml',
        '<?xml version="1.0" ?>'
        '<container version="1.0" '
        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
        '<rootfiles><rootfile full-path="OPS/content.opf" '
        'media-type="application/oebps-package+xml" />'
        '</rootfiles></container>'
    )
    zip.write(get_resource('res/wl-logo-small.png'),
              os.path.join('OPS', 'logo_wolnelektury.png'))
    zip.write(get_resource('res/jedenprocent.png'),
              os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = DefaultEbookCover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        zip.writestr('OPS/cover.html', etree.tostring(
            cover_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        ))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring(
        '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
        '"-//NISO//DTD ncx 2005-1//EN" '
        '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
        'version="2005-1"><head></head><docTitle></docTitle><navMap>'
        '</navMap></ncx>'
    )
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zip.writestr('OPS/annotations.html', etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        ))

    toc.add("Wesprzyj Wolne Lektury", "support.html")
    manifest.append(etree.fromstring(
        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="support" />'))
    html_string = open(get_resource('epub/support.html')).read()
    chars.update(used_chars(etree.fromstring(html_string)))
    zip.writestr('OPS/support.html', html_string)

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
    chars.update(used_chars(html_tree.getroot()))
    zip.writestr('OPS/last.html', etree.tostring(
        html_tree, pretty_print=True, xml_declaration=True,
        encoding="utf-8",
        doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
    ))

    if not flags or 'without-fonts' not in flags:
        # subset the fonts to the characters actually used
        tmpdir = mkdtemp('-librarian-epub')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None

        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
        for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
            optimizer_call = ['perl', 'subset.pl', '--chars',
                              ''.join(chars).encode('utf-8'),
                              get_resource('fonts/' + fname),
                              os.path.join(tmpdir, fname)]
            if verbose:
                print "Running font-optimizer"
                subprocess.check_call(optimizer_call)
            else:
                subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
            zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
        rmtree(tmpdir)
        if cwd is not None:
            os.chdir(cwd)

    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
                 xml_declaration=True, encoding="utf-8"))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    toc_file[0][0].set('content', str(document.book_info.url))
    toc_file[0][1].set('content', str(toc.depth()))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
                 xml_declaration=True, encoding="utf-8"))
    zip.close()

    return OutputFile.from_filename(output_file.name)
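

# A typical call, assuming a WLDocument loaded elsewhere (e.g. with
# librarian.parser.WLDocument.from_file); save_as() on the returned
# OutputFile is an assumption based on how other Librarian writers are used.
#
#     epub = transform(wldoc, verbose=True, html_toc=True, cover=True)
#     epub.save_as('output.epub')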