1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from mimetypes import guess_type
16 from lxml import etree
18 from tempfile import mkdtemp, NamedTemporaryFile
19 from shutil import rmtree
21 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
22 from librarian.cover import make_cover
24 from librarian import functions, get_resource
26 from librarian.hyphenator import Hyphenator
# Module-level setup, executed once when this module is imported.
# NOTE(review): presumably these register the person-name and 3-to-2-letter
# language-code helpers used by the EPUB stylesheets -- confirm in
# librarian.functions.
functions.reg_person_name()
functions.reg_lang_code_3to2()
def set_hyph_language(source_tree):
    """Build a Hyphenator for the language declared in the document.

    The first ``dc:language`` text found in *source_tree* is translated
    with the ``res/ISO-639-2_8859-1.txt`` table and the dictionary
    ``res/hyph-dictionaries/hyph_<code>.dic`` is loaded for it.

    Returns the Hyphenator, or None when the document declares no
    language or the dictionary cannot be loaded (callers then skip
    hyphenation).
    """
    def get_short_lng_code(text):
        # Rows of the table are '|'-separated: the first column is compared
        # with the declared code, the third one is its replacement.
        # NOTE(review): assumes codes are unique in the table -- confirm.
        text = ''.join(text)
        with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
            for line in f:
                fields = line.strip().split('|')
                if fields[0] == text:
                    # An empty replacement means "keep the original code".
                    return fields[2] or text
        return text

    bibl_lng = etree.XPath('//dc:language//text()',
                           namespaces={'dc': str(DCNS)})(source_tree)
    if not bibl_lng:
        # No language declared: nothing to hyphenate with.
        return None
    short_lng = get_short_lng_code(bibl_lng[0])
    try:
        return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
                                       short_lng + '.dic'))
    except Exception:
        # Best effort: a missing or broken dictionary disables hyphenation.
        return None
def hyphenate_and_fix_conjunctions(source_tree, hyph):
    """Insert soft hyphens and bind one-letter words to the following word.

    Works in place on every text node below the second child of the
    ``utwor`` root.

    source_tree: lxml tree whose text nodes are rewritten.
    hyph: object with an ``inserted(word, hyphen)`` method, or None to
        skip hyphenation and only fix the spacing.
    """
    # Compiled once; splits text into words and single non-word characters.
    token_re = re.compile(r'\w+|[^\w]', re.UNICODE)
    texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
    for t in texts:
        parent = t.getparent()
        if hyph is not None:
            newt = ''.join(hyph.inserted(w, u'\u00AD')
                           for w in token_re.findall(t))
        else:
            newt = t
        # Whitespace after a single-character word becomes a no-break space.
        newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
        if t.is_text:
            parent.text = newt
        elif t.is_tail:
            parent.tail = newt
def inner_xml(node):
    """Serialize a node's content: its leading text followed by its children.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    pieces = [node.text if node.text is not None else '']
    pieces.extend(etree.tostring(child) for child in node)
    return ''.join(pieces)
def set_inner_xml(node, text):
    """Replace a node's text and children with a parsed XML fragment.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # Wrap the fragment in a throw-away element so that it parses as one
    # document, then move its content over.
    wrapper = etree.fromstring('<x>%s</x>' % text)
    node.text = wrapper.text
    node[:] = wrapper[:]
def node_name(node):
    """Return a node's plain-text name, without footnotes and theme markers.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    scratch = deepcopy(node)

    # Empty the footnote/theme elements but keep the text that follows them.
    for tag in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for found in scratch.findall('.//%s' % tag):
            kept_tail = found.tail
            found.clear()
            found.tail = kept_tail
    # Flatten all remaining markup into the root's text.
    etree.strip_tags(scratch, '*')
    return scratch.text
def xslt(xml, sheet, **kwargs):
    """Apply the XSLT stylesheet stored at path *sheet* to *xml*.

    xml: an lxml element or element tree; a bare element is wrapped in an
        ElementTree first.
    sheet: filesystem path of the stylesheet.
    kwargs: stylesheet parameters; every value is passed as an XSLT string.

    Returns the result of the transformation.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        transform = etree.XSLT(etree.parse(xsltf))
        # items() rather than the Python 2-only iteritems().
        params = dict((key, transform.strparam(value))
                      for key, value in kwargs.items())
        return transform(xml, **params)
def replace_characters(node):
    """Apply typographic substitutions to all text of a subtree, in place.

    Elements tagged ``uwaga`` or ``extra`` are emptied first (their tail is
    kept).  The text and tail of *node* and of every descendant then get
    the byte-order mark removed and ASCII dashes, double commas and quotes
    replaced with their typographic counterparts.
    """
    # Order matters: '---' has to be handled before '--'.
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        kept_tail = node.tail
        node.clear()
        node.tail = kept_tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """Collect footnotes from *source* into *annotations*, numbering them.

    Every ``pe``/``pa``/``pt``/``pr`` descendant is copied into
    *annotations* with ``number`` and ``part`` attributes, and the element
    left in the text is reduced to that number.  Subtrees of ``extra`` and
    ``uwaga`` elements are not searched.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            number = str(len(annotations) + 1)
            note = deepcopy(child)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)
            # Only the footnote number stays in the text; the tail is kept.
            kept_tail = child.tail
            child.clear()
            child.tail = kept_tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.
    Children whose tag starts with "wers" become verses of their own; nodes
    without a string tag (comments, processing instructions) are handled
    like any other subelement.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        # Verse elements built so far, in document order.
        self.verses = []
        # Verse currently receiving text and elements (None: none open yet).
        self.open_verse = None

    def versify(self):
        """Rebuild the stanza in place as a sequence of verse elements."""
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        # clear() also resets the tail, so it is saved and restored.
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        """Start a new ``wers_normalny`` verse and make it the open one."""
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """Return the open verse, starting a normal one if none is open."""
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """Add text to the open verse; each '/' + newline starts a new verse."""
        if not text:
            return
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            if i:
                # Every piece after the first follows a verse break.
                self.open_normal_verse()
            if not verse_text.strip():
                continue
            verse = self.get_open_verse()
            if len(verse):
                # Text after the verse's last child goes into that child's tail.
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        """Add a copy of a child element (without its tail) to the output."""
        tag = elem.tag
        # Comments and processing instructions have a non-string tag, so
        # guard the prefix test instead of failing on startswith.
        if hasattr(tag, 'startswith') and tag.startswith("wers"):
            verse = deepcopy(elem)
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """Versify every stanza in *tree*: '/' line endings become verse elements."""
    for strofa in tree.findall('.//' + WLNS('strofa')):
        Stanza(strofa).versify()
def add_to_manifest(manifest, partno):
    """Append the item describing chunk *partno* to the content.opf manifest."""
    part_id = 'part%d' % partno
    attributes = {
        'id': part_id,
        'href': part_id + '.html',
        'media-type': 'application/xhtml+xml',
    }
    manifest.append(manifest.makeelement(OPFNS('item'), attrib=attributes))
def add_to_spine(spine, partno):
    """Append the itemref for chunk *partno* to the content.opf spine."""
    attributes = {'idref': 'part%d' % partno}
    spine.append(spine.makeelement(OPFNS('itemref'), attrib=attributes))
class TOC(object):
    """A node of the table-of-contents tree (the root or one entry)."""

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        # Set for entries that point at an anchor inside a part ('#sub<N>').
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """Add an entry and return its sub-number (None for whole parts).

        level: how many levels below this node the entry belongs; while
            positive and children exist, the entry goes under the last child.
        is_part: False marks an in-part anchor, which gets a sub-number.
        index: position among this node's children; only valid with level 0.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level - 1, is_part)
        t = TOC(name)
        t.part_href = part_href
        if index is not None:
            self.children.insert(index, t)
        else:
            self.children.append(t)
        if not is_part:
            # Computed after insertion, so the first anchor gets number 2.
            t.sub_number = len(self.children) + 1
            return t.sub_number

    def append(self, toc):
        """Attach *toc* as the last child of this node."""
        self.children.append(toc)

    def extend(self, toc):
        """Adopt all children of *toc* as children of this node."""
        self.children.extend(toc.children)

    def depth(self):
        """Return the number of levels below this node (0 for a leaf)."""
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """Return the link target, including the '#sub<N>' anchor if any."""
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """Write the subtree as NCX navPoints under *nav_map*.

        counter: play order of the first navPoint written.
        Returns the next unused play-order number.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            if child.name is not None:
                text.text = re.sub(r'\n', ' ', child.name)
            else:
                text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """Return the subtree as nested, indented XHTML link lines.

        Entry names are XML-escaped so that '&' or '<' in a title cannot
        break the document; a missing name gives an empty link text.
        """
        from xml.sax.saxutils import escape

        texts = []
        for child in self.children:
            label = u'' if child.name is None else escape(child.name)
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), label))
            texts.append(child.html_part(depth + 1))
        return "\n".join(texts)

    def html(self):
        """Return the HTML table of contents built from epub/toc.html."""
        import io

        # newline='' keeps the template's line endings untouched.
        with io.open(get_resource('epub/toc.html'), encoding='utf-8',
                     newline='') as f:
            t = f.read()
        return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns the set of characters occurring in the text and tail of
    *element* and of all its descendants.
    """
    chars = set()
    # One in-place pass over the subtree instead of recursing and copying
    # the set for every child.
    for node in element.iter():
        chars.update(node.text or '')
        chars.update(node.tail or '')
    return chars
def chop(main_text):
    """Divide the main content of the XML file into chunks.

    Yields an ``utwor`` element whose ``master`` child holds copies of the
    current chunk's elements.  The same container is yielded every time and
    refilled afterwards, so a chunk has to be consumed before the next one
    is requested.  A ``naglowek_czesc`` always starts a new chunk; the other
    chunk-starting headers do so unless they directly follow one.
    """
    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    main_xml_part = etree.SubElement(part_xml, 'master')

    # Workaround for dramas without acts: there the scenes start new
    # chunks, otherwise chapters, acts and section titles do.
    tags = [one_part.tag for one_part in main_text]
    if 'naglowek_scena' in tags and 'naglowek_akt' not in tags:
        break_tags = ("naglowek_scena",)
    else:
        break_tags = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in break_tags:
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """Transform one chunk into HTML.

    chunk_xml: chunk container; the headers among the children of its
        first child become TOC entries.
    chunk_no: number of the chunk, used in the ``part<N>.html`` links.
    annotations: element that collects the footnotes found in the chunk.
    empty: when true, the static ``epub/emptyChunk.html`` page is returned
        instead of the transformed chunk.
    _empty_html_static: private cache for that page; the shared mutable
        default is intentional, so the file is read at most once.

    Returns a tuple: HTML string, TOC object, set of used characters.
    """
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # Stored on the element as well as in the TOC entry.
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # Closed deterministically instead of leaking the handle.
            with open(get_resource('epub/emptyChunk.html')) as f:
                _empty_html_static.append(f.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        )
    return output_html, toc, chars
415 def transform(wldoc, verbose=False, style=None, html_toc=False,
416 sample=None, cover=None, flags=None, hyphenate=False, ilustr_path='', output_type='epub'):
417 """ produces a EPUB file
419 sample=n: generate sample e-book (with at least n paragraphs)
420 cover: a cover.Cover factory or True for default
421 flags: less-advertising, without-fonts, working-copy
424 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
425 """ processes one input file and proceeds to its children """
427 replace_characters(wldoc.edoc.getroot())
429 hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
430 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
432 # every input file will have a TOC entry,
433 # pointing to starting chunk
434 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
437 # write book title page
438 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'), outputtype=output_type)
439 chars = used_chars(html_tree.getroot())
443 html_tree, pretty_print=True, xml_declaration=True,
445 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
446 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
449 # add a title page TOC entry
450 toc.add(u"Strona tytułowa", "title.html")
451 elif wldoc.book_info.parts:
452 # write title page for every parent
453 if sample is not None and sample <= 0:
455 html_string = open(get_resource('epub/emptyChunk.html')).read()
457 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
458 chars = used_chars(html_tree.getroot())
459 html_string = etree.tostring(
460 html_tree, pretty_print=True, xml_declaration=True,
462 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
463 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
465 html_string = re.sub(ur'([^\r])\n', ur'\1\r\n', html_string)
466 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
467 add_to_manifest(manifest, chunk_counter)
468 add_to_spine(spine, chunk_counter)
471 if len(wldoc.edoc.getroot()) > 1:
472 # rdf before style master
473 main_text = wldoc.edoc.getroot()[1]
475 # rdf in style master
476 main_text = wldoc.edoc.getroot()[0]
477 if main_text.tag == RDFNS('RDF'):
480 if main_text is not None:
481 for chunk_xml in chop(main_text):
483 if sample is not None:
487 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
488 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
490 toc.extend(chunk_toc)
491 chars = chars.union(chunk_chars)
492 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
493 add_to_manifest(manifest, chunk_counter)
494 add_to_spine(spine, chunk_counter)
497 for child in wldoc.parts():
498 child_toc, chunk_counter, chunk_chars, sample = transform_file(
499 child, chunk_counter, first=False, sample=sample)
500 toc.append(child_toc)
501 chars = chars.union(chunk_chars)
503 return toc, chunk_counter, chars, sample
505 document = deepcopy(wldoc)
510 document.edoc.getroot().set(flag, 'yes')
512 document.clean_ed_note()
513 document.clean_ed_note('abstrakt')
516 editors = document.editors()
518 document.edoc.getroot().set('editors', u', '.join(sorted(
519 editor.readable() for editor in editors)))
520 if document.book_info.funders:
521 document.edoc.getroot().set('funders', u', '.join(
522 document.book_info.funders))
523 if document.book_info.thanks:
524 document.edoc.getroot().set('thanks', document.book_info.thanks)
526 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
527 manifest = opf.find('.//' + OPFNS('manifest'))
528 guide = opf.find('.//' + OPFNS('guide'))
529 spine = opf.find('.//' + OPFNS('spine'))
531 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
532 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
534 functions.reg_mathml_epub(zip)
536 if os.path.isdir(ilustr_path):
537 for i, filename in enumerate(os.listdir(ilustr_path)):
538 file_path = os.path.join(ilustr_path, filename)
539 zip.write(file_path, os.path.join('OPS', filename))
540 image_id = 'image%s' % i
541 manifest.append(etree.fromstring(
542 '<item id="%s" href="%s" media-type="%s" />' % (image_id, filename, guess_type(file_path)[0])))
544 # write static elements
545 mime = zipfile.ZipInfo()
546 mime.filename = 'mimetype'
547 mime.compress_type = zipfile.ZIP_STORED
549 zip.writestr(mime, 'application/epub+zip')
551 'META-INF/container.xml',
552 '<?xml version="1.0" ?>'
553 '<container version="1.0" '
554 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
555 '<rootfiles><rootfile full-path="OPS/content.opf" '
556 'media-type="application/oebps-package+xml" />'
557 '</rootfiles></container>'
559 zip.write(get_resource('res/wl-logo-small.png'),
560 os.path.join('OPS', 'logo_wolnelektury.png'))
561 zip.write(get_resource('res/jedenprocent.png'),
562 os.path.join('OPS', 'jedenprocent.png'))
564 style = get_resource('epub/style.css')
565 zip.write(style, os.path.join('OPS', 'style.css'))
571 cover_file = StringIO()
572 bound_cover = cover(document.book_info)
573 bound_cover.save(cover_file)
574 cover_name = 'cover.%s' % bound_cover.ext()
575 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
578 cover_tree = etree.parse(get_resource('epub/cover.html'))
579 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
580 zip.writestr('OPS/cover.html', etree.tostring(
581 cover_tree, pretty_print=True, xml_declaration=True,
583 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
584 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
587 if bound_cover.uses_dc_cover:
588 if document.book_info.cover_by:
589 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
590 if document.book_info.cover_source:
591 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
593 manifest.append(etree.fromstring(
594 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
595 manifest.append(etree.fromstring(
596 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
597 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
598 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
599 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
601 annotations = etree.Element('annotations')
603 toc_file = etree.fromstring(
604 '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
605 '"-//NISO//DTD ncx 2005-1//EN" '
606 '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
607 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
608 'version="2005-1"><head></head><docTitle></docTitle><navMap>'
611 nav_map = toc_file[-1]
614 manifest.append(etree.fromstring(
615 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
616 spine.append(etree.fromstring(
617 '<itemref idref="html_toc" />'))
618 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
620 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
622 if len(toc.children) < 2:
623 toc.add(u"Początek utworu", "part1.html")
625 # Last modifications in container files and EPUB creation
626 if len(annotations) > 0:
627 toc.add("Przypisy", "annotations.html")
628 manifest.append(etree.fromstring(
629 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
630 spine.append(etree.fromstring(
631 '<itemref idref="annotations" />'))
632 replace_by_verse(annotations)
633 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
634 chars = chars.union(used_chars(html_tree.getroot()))
635 zip.writestr('OPS/annotations.html', etree.tostring(
636 html_tree, pretty_print=True, xml_declaration=True,
638 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
639 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
642 toc.add("Wesprzyj Wolne Lektury", "support.html")
643 manifest.append(etree.fromstring(
644 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
645 spine.append(etree.fromstring(
646 '<itemref idref="support" />'))
647 html_string = open(get_resource('epub/support.html')).read()
648 chars.update(used_chars(etree.fromstring(html_string)))
649 zip.writestr('OPS/support.html', html_string)
651 toc.add("Strona redakcyjna", "last.html")
652 manifest.append(etree.fromstring(
653 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
654 spine.append(etree.fromstring(
655 '<itemref idref="last" />'))
656 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'), outputtype=output_type)
657 chars.update(used_chars(html_tree.getroot()))
658 zip.writestr('OPS/last.html', etree.tostring(
659 html_tree, pretty_print=True, xml_declaration=True,
661 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
662 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
665 if not flags or 'without-fonts' not in flags:
667 tmpdir = mkdtemp('-librarian-epub')
673 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
674 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
675 optimizer_call = ['perl', 'subset.pl', '--chars',
676 ''.join(chars).encode('utf-8'),
677 get_resource('fonts/' + fname),
678 os.path.join(tmpdir, fname)]
679 env = {"PERL_USE_UNSAFE_INC": "1"}
681 print "Running font-optimizer"
682 subprocess.check_call(optimizer_call, env=env)
684 dev_null = open(os.devnull, 'w')
685 subprocess.check_call(optimizer_call, stdout=dev_null, stderr=dev_null, env=env)
686 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
687 manifest.append(etree.fromstring(
688 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
692 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
693 xml_declaration=True, encoding="utf-8"))
694 title = document.book_info.title
695 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
696 for st in attributes:
697 meta = toc_file.makeelement(NCXNS('meta'))
699 meta.set('content', '0')
700 toc_file[0].append(meta)
701 toc_file[0][0].set('content', str(document.book_info.url))
702 toc_file[0][1].set('content', str(toc.depth()))
703 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
707 toc.add(u"Spis treści", "toc.html", index=1)
708 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
709 toc.write_to_xml(nav_map)
710 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
711 xml_declaration=True, encoding="utf-8"))
714 return OutputFile.from_filename(output_file.name)