# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from __future__ import with_statement

import os
import os.path
import re
import subprocess
import zipfile
from StringIO import StringIO
from copy import deepcopy
from mimetypes import guess_type
from shutil import rmtree
from tempfile import mkdtemp, NamedTemporaryFile

from lxml import etree

from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
from librarian import functions, get_resource
from librarian.cover import make_cover
from librarian.hyphenator import Hyphenator

# Module-level registration of helpers from librarian.functions.
# NOTE(review): judging by their names these register a person-name formatter
# and a 3-letter -> 2-letter language-code converter for the stylesheets used
# below -- confirm against librarian.functions.
functions.reg_person_name()
functions.reg_lang_code_3to2()


def squeeze_whitespace(s):
    """Collapse every run of whitespace characters in ``s`` into one space."""
    whitespace_run = re.compile(r'\s+')
    return whitespace_run.sub(' ', s)


def set_hyph_language(source_tree):
    """Create a Hyphenator for the language declared in the document.

    The first ``dc:language`` text node found in ``source_tree`` is mapped to
    a short code through ``res/ISO-639-2_8859-1.txt`` and used to pick the
    ``res/hyph-dictionaries/hyph_<code>.dic`` dictionary.

    Returns the Hyphenator, or None when the document declares no language
    or the dictionary cannot be loaded.
    """
    def get_short_lng_code(text):
        # Each line of the resource is a '|'-separated record: column 0 is
        # matched against the declared code, column 2 supplies the result.
        result = ''
        text = ''.join(text)
        with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
            for line in f:
                fields = line.strip().split('|')
                if fields[0] == text:
                    result = fields[2]
        # No (or empty) mapping: fall back to the declared code itself.
        return result or text

    bibl_lng = etree.XPath('//dc:language//text()',
                           namespaces={'dc': str(DCNS)})(source_tree)
    if not bibl_lng:
        # Nothing declared, so there is no dictionary to choose.
        return None
    short_lng = get_short_lng_code(bibl_lng[0])
    try:
        return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
                                       short_lng + '.dic'))
    except Exception:
        # Hyphenation is best-effort: a missing or unreadable dictionary
        # must not abort the conversion.
        return None


def hyphenate_and_fix_conjunctions(source_tree, hyph):
    """Insert soft hyphens and glue one-letter words to the next word.

    Every text node below the second child of ``/utwor`` is rewritten in
    place:

    * when ``hyph`` is not None, each token is passed through
      ``hyph.inserted(token, SOFT HYPHEN)``;
    * whitespace following a single word character that itself follows
      whitespace is replaced with a no-break space (U+00A0).
    """
    # Hoisted out of the loop: words, or single non-word characters.
    token_re = re.compile(r'\w+|[^\w]', re.UNICODE)
    texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
    for t in texts:
        parent = t.getparent()
        if hyph is not None:
            newt = ''.join(hyph.inserted(w, u'\u00AD')
                           for w in token_re.findall(t))
        else:
            newt = t
        newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
        # lxml text results know whether they were the element's text or
        # its tail; write the new value back to the same slot.
        if t.is_text:
            parent.text = newt
        elif t.is_tail:
            parent.tail = newt


def inner_xml(node):
    """ returns node's text and children as a string

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # Leading text first, then every child serialized (tails included).
    parts = [node.text or '']
    parts.extend(etree.tostring(child) for child in node)
    return ''.join(parts)


def set_inner_xml(node, text):
    """ sets node's text and children from a string

    ``text`` must be a well-formed XML fragment; it is parsed inside a
    temporary wrapper element.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    p = etree.fromstring('<x>%s</x>' % text)
    # Replace both the leading text and the whole child list of the node.
    node.text = p.text
    node[:] = p[:]


def node_name(node):
    """ Find out a node's name

    Works on a copy of ``node``: the content of footnote ('pe', 'pa', 'pt',
    'pr') and 'motyw' descendants is discarded, all remaining markup is
    stripped and the resulting plain text is returned.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    tempnode = deepcopy(node)

    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            # clear() also drops the tail, which belongs to the surrounding
            # text and has to survive.
            t = e.tail
            e.clear()
            e.tail = t
    etree.strip_tags(tempnode, '*')
    return tempnode.text


def xslt(xml, sheet, **kwargs):
    """Apply the XSLT stylesheet stored at path ``sheet`` to ``xml``.

    ``xml`` may be an element (it is wrapped in an ElementTree first) or a
    tree. Keyword arguments are passed to the stylesheet as string
    parameters. Returns the transformation result.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        stylesheet = etree.XSLT(etree.parse(stylesheet_file))
    string_params = {}
    for param_name, param_value in kwargs.iteritems():
        # strparam() quotes the value so it is passed as an XPath string.
        string_params[param_name] = stylesheet.strparam(param_value)
    return stylesheet(document, **string_params)


def replace_characters(node):
    """Recursively apply typographic replacements to an element tree.

    In the text and tail of ``node`` and all of its descendants:
    BOM characters are removed, ``---`` and ``--`` become em/en dashes,
    ``,,`` becomes a low double quote, ``"`` a right double quote and ``'``
    a right single quote. The content of 'uwaga' and 'extra' elements is
    dropped (their tail is kept). The tree is modified in place.
    """
    def replace_chars(text):
        if text is None:
            return None
        # Order matters: '---' has to be handled before '--'.
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")
    if node.tag in ('uwaga', 'extra'):
        # clear() wipes the tail too, so save and restore it.
        t = node.tail
        node.clear()
        node.tail = t
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)


def find_annotations(annotations, source, part_no):
    """Collect footnotes from ``source`` into ``annotations``.

    Every 'pe', 'pa', 'pt' or 'pr' element found (recursively) is copied
    into ``annotations`` with ``number`` (1-based position in the
    collection) and ``part`` (``part_no``) attributes; the element left in
    ``source`` is emptied and its text set to the footnote number.
    Subtrees of 'extra' and 'uwaga' elements are not searched.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations) + 1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # The tail is running text of the source, not of the footnote.
            annotation.tail = ''
            annotations.append(annotation)
            # Leave only the reference number in place of the footnote.
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)


class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    Whitespace-only text never starts a verse, but it is kept inside a verse
    that already has content, so inline elements stay separated.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        # Verse elements built so far, in document order.
        self.verses = []
        # Verse currently receiving content; None until one is needed.
        self.open_verse = None

    def versify(self):
        """Rebuild the stanza in place as a sequence of verse elements."""
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        # clear() wipes the tail as well, so it is saved and restored.
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        # Verses opened by a trailing slash but never filled are dropped.
        self.stanza.extend(
            [verse for verse in self.verses if verse.text or len(verse) > 0])

    def open_normal_verse(self):
        """Start a new 'wers_normalny' verse and make it the open one."""
        # The explicit empty attrib is accepted by lxml and is required by
        # xml.etree elements.
        self.open_verse = self.stanza.makeelement("wers_normalny", {})
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """Return the verse being filled, opening a normal one if needed."""
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """Append stanza-level text, starting a new verse at every slash
        that ends a line."""
        if not text:
            return
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            if i:
                self.open_normal_verse()
            if not verse_text.strip():
                verse = self.open_verse
                # Blank text is dropped unless it sits inside a verse that
                # already has content (e.g. the space between two inline
                # elements, as in the doctest above).
                if (not verse_text or verse is None
                        or not (verse.text or len(verse))):
                    continue
            verse = self.get_open_verse()
            if len(verse):
                # Text after a child element lives in that child's tail.
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        """Append a copy of a stanza child: explicit 'wers*' elements become
        verses themselves, anything else goes into the open verse."""
        if elem.tag.startswith("wers"):
            verse = deepcopy(elem)
            # The tail is stanza-level text and is pushed separately.
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)


def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        # Each stanza is rewritten in place.
        Stanza(stanza_elem).versify()


def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new ``item`` gets the id ``part<partno>`` and points at
    ``part<partno>.html``.
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(
        OPFNS('item'), attrib={'id': partstr, 'href': partstr + '.html',
                               'media-type': 'application/xhtml+xml'}
    )
    manifest.append(e)


def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new ``itemref`` refers to the manifest item ``part<partno>``.
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)


class TOC(object):
    """A node of the table-of-contents tree.

    The root usually has no name; every node keeps its entry label
    (``name``), the file it points at (``part_href``), an optional
    in-file anchor number (``sub_number``) and its child entries.
    """

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """Add an entry ``level`` levels below this node.

        For ``level > 0`` the entry is delegated to the most recently added
        child (if any). ``index`` inserts at a given position instead of
        appending and may only be used with ``level == 0``. When ``is_part``
        is false the entry gets a sub-number, which is returned; otherwise
        None is returned.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level - 1, is_part)
        else:
            t = TOC(name)
            t.part_href = part_href
            if index is not None:
                self.children.insert(index, t)
            else:
                self.children.append(t)
            if not is_part:
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """Attach ``toc`` itself as the last child."""
        self.children.append(toc)

    def extend(self, toc):
        """Attach all children of ``toc`` (but not ``toc`` itself)."""
        self.children.extend(toc.children)

    def depth(self):
        """Return the number of levels below this node (0 for a leaf)."""
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """Return the link target, with a ``#sub<N>`` anchor if numbered."""
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """Append NCX ``navPoint`` elements for all descendants to
        ``nav_map``; returns the next free play-order number."""
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            if child.name is not None:
                text.text = re.sub(r'\n', ' ', child.name)
            else:
                text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """Return the entries below this node as indented HTML ``div``s."""
        from xml.sax.saxutils import escape

        texts = []
        for child in self.children:
            # Names are plain text, so markup characters ('&', '<', '>')
            # must be escaped to keep the generated page well-formed.
            label = escape("%s" % (child.name,))
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), label))
            texts.append(child.html_part(depth + 1))
        return "\n".join(texts)

    def html(self):
        """Return the complete HTML table of contents, built by filling
        the ``epub/toc.html`` template with ``html_part()``."""
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()


def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns the set of characters occurring in the text and tail of
    ``element`` and of all its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # In-place update avoids copying the growing set for every child.
        chars.update(used_chars(child))
    return chars


def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator yielding ``<utwor><master>...</master></utwor>`` containers.
    A new chunk starts at every 'naglowek_czesc' and at every chapter-level
    header, unless that header directly follows a 'naglowek_czesc'.

    The same container object is yielded every time and refilled in place,
    so each chunk must be processed before the generator is advanced. The
    first chunk may be empty when the text starts with a header.
    """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    last_node_part = False

    # Workaround for dramas that have scenes but no acts: there the scene
    # headers are the only chapter-level division available.
    is_scene = any(one_part.tag == 'naglowek_scena' for one_part in main_text)
    is_act = any(one_part.tag == 'naglowek_akt' for one_part in main_text)
    if is_scene and not is_act:
        break_tags = ("naglowek_scena",)
    else:
        break_tags = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in break_tags:
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml


def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: a container produced by chop(); its first child holds the
        chunk's elements
    chunk_no: number used in the chunk's file name (``part<chunk_no>.html``)
    annotations: element collecting footnotes found in the chunk
    empty: when true, the static ``epub/emptyChunk.html`` page is returned
        instead of the transformed content (TOC entries are still created)
    _empty_html_static: internal cache, do not pass
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
            toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            # The number is stored on the element for use as an anchor.
            element.set('sub', str(subnumber))
    if empty:
        # The mutable default is deliberate: it caches the template across
        # calls so the file is read only once.
        if not _empty_html_static:
            with open(get_resource('epub/emptyChunk.html')) as f:
                _empty_html_static.append(f.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8",
            doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
                    '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
        )
    return output_html, toc, chars


def transform(wldoc, verbose=False, style=None, html_toc=False,
              sample=None, cover=None, flags=None, hyphenate=False, ilustr_path='', output_type='epub'):
    """ produces a EPUB file

    verbose: let the font optimizer write to the console
    style: path to a CSS file to use instead of the bundled epub/style.css
    html_toc: also generate an HTML table of contents (toc.html)
    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy
    hyphenate: insert soft hyphens using a dictionary for the book language
    ilustr_path: directory whose files are all added to the package
    output_type: passed to the title and last page stylesheets

    Works on a deep copy of ``wldoc`` and returns an OutputFile pointing at
    a temporary .epub file.
    """

    xhtml_doctype = ('<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" '
                     '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">')

    def serialize_xhtml(tree):
        """ serializes a tree as UTF-8 XHTML 1.1 with an XML declaration """
        return etree.tostring(
            tree, pretty_print=True, xml_declaration=True,
            encoding="utf-8", doctype=xhtml_doctype)

    def read_resource(name):
        """ returns the content of a bundled resource file """
        with open(get_resource(name)) as f:
            return f.read()

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """

        replace_characters(wldoc.edoc.getroot())

        hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
        hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'), outputtype=output_type)
            chars = used_chars(html_tree.getroot())
            html_string = serialize_xhtml(html_tree)
            zip.writestr('OPS/title.html', squeeze_whitespace(html_string))
            # add a title page TOC entry
            toc.add(u"Strona tytułowa", "title.html")
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                # sample quota used up: emit the placeholder page
                chars = set()
                html_string = read_resource('epub/emptyChunk.html')
            else:
                html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = serialize_xhtml(html_tree)
            zip.writestr('OPS/part%d.html' % chunk_counter, squeeze_whitespace(html_string))
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
        else:
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, squeeze_whitespace(chunk_html))
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    # Everything below modifies the document, so work on a private copy.
    document = deepcopy(wldoc)
    del wldoc

    if flags:
        for flag in flags:
            document.edoc.getroot().set(flag, 'yes')

    document.clean_ed_note()
    document.clean_ed_note('abstrakt')

    # add editors info
    editors = document.editors()
    if editors:
        document.edoc.getroot().set('editors', u', '.join(sorted(
            editor.readable() for editor in editors)))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
    manifest = opf.find('.//' + OPFNS('manifest'))
    guide = opf.find('.//' + OPFNS('guide'))
    spine = opf.find('.//' + OPFNS('spine'))

    # delete=False: the file has to outlive this function, it is the result.
    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)

    functions.reg_mathml_epub(zip)

    if os.path.isdir(ilustr_path):
        for i, filename in enumerate(os.listdir(ilustr_path)):
            file_path = os.path.join(ilustr_path, filename)
            zip.write(file_path, os.path.join('OPS', filename))
            image_id = 'image%s' % i
            manifest.append(etree.fromstring(
                '<item id="%s" href="%s" media-type="%s" />' % (image_id, filename, guess_type(file_path)[0])))

    # write static elements
    # The mimetype entry is stored uncompressed.
    mime = zipfile.ZipInfo()
    mime.filename = 'mimetype'
    mime.compress_type = zipfile.ZIP_STORED
    mime.extra = ''
    zip.writestr(mime, 'application/epub+zip')
    zip.writestr(
        'META-INF/container.xml',
        '<?xml version="1.0" ?>'
        '<container version="1.0" '
        'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
        '<rootfiles><rootfile full-path="OPS/content.opf" '
        'media-type="application/oebps-package+xml" />'
        '</rootfiles></container>'
    )
    zip.write(get_resource('res/wl-logo-small.png'),
              os.path.join('OPS', 'logo_wolnelektury.png'))
    zip.write(get_resource('res/jedenprocent.png'),
              os.path.join('OPS', 'jedenprocent.png'))
    if not style:
        style = get_resource('epub/style.css')
    zip.write(style, os.path.join('OPS', 'style.css'))

    if cover:
        if cover is True:
            cover = make_cover

        cover_file = StringIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
        del cover_file

        cover_tree = etree.parse(get_resource('epub/cover.html'))
        cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
        zip.writestr('OPS/cover.html', serialize_xhtml(cover_tree))

        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)

        manifest.append(etree.fromstring(
            '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
        manifest.append(etree.fromstring(
            '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
        spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
        opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
        guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))

    annotations = etree.Element('annotations')

    toc_file = etree.fromstring(
        '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
        '"-//NISO//DTD ncx 2005-1//EN" '
        '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
        '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
        'version="2005-1"><head></head><docTitle></docTitle><navMap>'
        '</navMap></ncx>'
    )
    nav_map = toc_file[-1]

    if html_toc:
        manifest.append(etree.fromstring(
            '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="html_toc" />'))
        guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))

    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)

    if len(toc.children) < 2:
        toc.add(u"Początek utworu", "part1.html")

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        toc.add("Przypisy", "annotations.html")
        manifest.append(etree.fromstring(
            '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
        spine.append(etree.fromstring(
            '<itemref idref="annotations" />'))
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        zip.writestr('OPS/annotations.html', serialize_xhtml(html_tree))

    toc.add("Wesprzyj Wolne Lektury", "support.html")
    manifest.append(etree.fromstring(
        '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="support" />'))
    html_string = read_resource('epub/support.html')
    chars.update(used_chars(etree.fromstring(html_string)))
    zip.writestr('OPS/support.html', squeeze_whitespace(html_string))

    toc.add("Strona redakcyjna", "last.html")
    manifest.append(etree.fromstring(
        '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
    spine.append(etree.fromstring(
        '<itemref idref="last" />'))
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'), outputtype=output_type)
    chars.update(used_chars(html_tree.getroot()))
    zip.writestr('OPS/last.html', squeeze_whitespace(serialize_xhtml(html_tree)))

    if not flags or 'without-fonts' not in flags:
        # strip fonts
        tmpdir = mkdtemp('-librarian-epub')
        try:
            cwd = os.getcwd()
        except OSError:
            cwd = None

        try:
            # subset.pl is invoked by relative name, so it has to be run
            # from its own directory.
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                # Keep only the glyphs for characters used in the book.
                optimizer_call = ['perl', 'subset.pl', '--chars',
                                  ''.join(chars).encode('utf-8'),
                                  get_resource('fonts/' + fname),
                                  os.path.join(tmpdir, fname)]
                env = {"PERL_USE_UNSAFE_INC": "1"}
                if verbose:
                    print("Running font-optimizer")
                    subprocess.check_call(optimizer_call, env=env)
                else:
                    with open(os.devnull, 'w') as dev_null:
                        subprocess.check_call(optimizer_call, stdout=dev_null, stderr=dev_null, env=env)
                zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
                manifest.append(etree.fromstring(
                    '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
        finally:
            # Clean up even when the optimizer fails, so the caller is not
            # left in another directory with a stray temp dir.
            rmtree(tmpdir)
            if cwd is not None:
                os.chdir(cwd)
    zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
                 xml_declaration=True, encoding="utf-8"))
    title = document.book_info.title
    attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
    for st in attributes:
        meta = toc_file.makeelement(NCXNS('meta'))
        meta.set('name', st)
        meta.set('content', '0')
        toc_file[0].append(meta)
    # The first two meta entries (uid, depth) get real values.
    toc_file[0][0].set('content', str(document.book_info.url))
    toc_file[0][1].set('content', str(toc.depth()))
    set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))

    # write TOC
    if html_toc:
        toc.add(u"Spis treści", "toc.html", index=1)
        zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
    toc.write_to_xml(nav_map)
    zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
                 xml_declaration=True, encoding="utf-8"))
    zip.close()
    # ZipFile does not close a file object it was handed; release it here.
    output_file.close()

    return OutputFile.from_filename(output_file.name)