1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import print_function, unicode_literals
12 from six import BytesIO
13 from copy import deepcopy
14 from mimetypes import guess_type
16 from lxml import etree
18 from tempfile import mkdtemp, NamedTemporaryFile
19 from shutil import rmtree
21 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
22 from librarian.cover import make_cover
24 from librarian import functions, get_resource
26 from librarian.hyphenator import Hyphenator
28 functions.reg_person_name()
29 functions.reg_lang_code_3to2()
def squeeze_whitespace(s):
    """ Collapse every run of ASCII whitespace in `s` into a single space.

    Accepts both bytes (e.g. the output of ``etree.tostring``) and text
    (e.g. a template read from a file opened in text mode) and returns
    the same type it was given.

    Only ASCII whitespace is collapsed, also for text input, so that
    characters such as U+00A0 (no-break space) are left untouched.

    >>> squeeze_whitespace(b'a  \\n b') == b'a b'
    True
    """
    if isinstance(s, bytes):
        # For a bytes pattern, \s matches ASCII whitespace only.
        return re.sub(b'\\s+', b' ', s)
    # An explicit character class keeps the text branch ASCII-only as
    # well; a plain \s on text would also swallow no-break spaces.
    return re.sub(r'[ \t\n\r\f\v]+', ' ', s)
36 def set_hyph_language(source_tree):
37 def get_short_lng_code(text):
40 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
41 for line in f.read().decode('latin1').split('\n'):
42 list = line.strip().split('|')
49 bibl_lng = etree.XPath('//dc:language//text()',
50 namespaces={'dc': str(DCNS)})(source_tree)
51 short_lng = get_short_lng_code(bibl_lng[0])
53 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
59 def hyphenate_and_fix_conjunctions(source_tree, hyph):
60 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
62 parent = t.getparent()
65 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
67 newt += hyph.inserted(w, u'\u00AD')
70 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
78 """ returns node's text and children as a string
80 >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
84 nt = node.text if node.text is not None else ''
86 [nt] + [etree.tostring(child, encoding='unicode') for child in node]
90 def set_inner_xml(node, text):
91 """ sets node's text and children from a string
93 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
94 >>> set_inner_xml(e, 'x<b>y</b>z')
95 >>> print(etree.tostring(e, encoding='unicode'))
99 p = etree.fromstring('<x>%s</x>' % text)
105 """ Find out a node's name
107 >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
111 tempnode = deepcopy(node)
113 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
114 for e in tempnode.findall('.//%s' % p):
118 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet, **kwargs):
    """ Apply the XSLT stylesheet stored at path `sheet` to `xml`.

    `xml` may be an element or an element tree; a bare element is
    wrapped in an ``ElementTree`` first.  Every keyword argument is
    passed to the stylesheet as a string parameter.

    Returns the result of calling the compiled transform.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    # Binary mode lets the XML parser decode the stylesheet according to
    # its own encoding declaration instead of the locale's default.
    with open(sheet, 'rb') as xsltf:
        transform = etree.XSLT(etree.parse(xsltf))
    # strparam() quotes the value so it reaches the stylesheet as a
    # string literal rather than being evaluated as an XPath expression.
    params = {
        key: transform.strparam(value)
        for key, value in kwargs.items()
    }
    return transform(xml, **params)
134 def replace_characters(node):
135 def replace_chars(text):
138 return text.replace(u"\ufeff", u"")\
139 .replace("---", u"\u2014")\
140 .replace("--", u"\u2013")\
141 .replace(",,", u"\u201E")\
142 .replace('"', u"\u201D")\
143 .replace("'", u"\u2019")
144 if node.tag in ('uwaga', 'extra'):
148 node.text = replace_chars(node.text)
149 node.tail = replace_chars(node.tail)
151 replace_characters(child)
154 def find_annotations(annotations, source, part_no):
156 if child.tag in ('pe', 'pa', 'pt', 'pr'):
157 annotation = deepcopy(child)
158 number = str(len(annotations) + 1)
159 annotation.set('number', number)
160 annotation.set('part', str(part_no))
162 annotations.append(annotation)
167 if child.tag not in ('extra', 'uwaga'):
168 find_annotations(annotations, child, part_no)
171 class Stanza(object):
173 Converts / verse endings into verse elements in a stanza.
175 Slashes may only occur directly in the stanza. Any slashes in subelements
176 will be ignored, and the subelements will be put inside verse elements.
178 >>> s = etree.fromstring(
179 ... "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"
181 >>> Stanza(s).versify()
182 >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())
184 <wers_normalny>a <b>c</b><b>c</b></wers_normalny>
185 <wers_normalny>b<x>x/
186 y</x>c</wers_normalny>
187 <wers_normalny>d</wers_normalny>
191 def __init__(self, stanza_elem):
192 self.stanza = stanza_elem
194 self.open_verse = None
197 self.push_text(self.stanza.text)
198 for elem in self.stanza:
200 self.push_text(elem.tail)
201 tail = self.stanza.tail
203 self.stanza.tail = tail
205 verse for verse in self.verses
206 if verse.text or len(verse) > 0
def open_normal_verse(self):
    """ Start a new ``wers_normalny`` verse and make it the open one.

    The new element is created through the stanza element's factory and
    recorded at the end of ``self.verses``.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """ Return the verse currently being filled, opening one on demand. """
    current = self.open_verse
    if current is not None:
        return current
    # Nothing is open yet: start a normal verse and hand that one out.
    self.open_normal_verse()
    return self.open_verse
218 def push_text(self, text):
221 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
223 self.open_normal_verse()
224 if not verse_text.strip():
226 verse = self.get_open_verse()
228 verse[-1].tail = (verse[-1].tail or "") + verse_text
230 verse.text = (verse.text or "") + verse_text
232 def push_elem(self, elem):
233 if elem.tag.startswith("wers"):
234 verse = deepcopy(elem)
236 self.verses.append(verse)
237 self.open_verse = verse
239 appended = deepcopy(elem)
241 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Versify every stanza found anywhere below `tree`.

    Each ``strofa`` descendant is handed to ``Stanza``, which turns the
    '/' verse endings into verse elements.  The tree is modified in
    place; nothing is returned.
    """
    stanza_path = './/' + WLNS('strofa')
    for stanza_elem in tree.findall(stanza_path):
        Stanza(stanza_elem).versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new ``item`` gets the id ``part<partno>``, points at
    ``part<partno>.html`` and is declared as XHTML.
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(
        OPFNS('item'),
        attrib={
            'id': partstr,
            'href': partstr + '.html',
            'media-type': 'application/xhtml+xml',
        }
    )
    # makeelement() only creates the node; it still has to be attached.
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new ``itemref`` refers to the manifest item ``part<partno>``.
    """
    e = spine.makeelement(
        OPFNS('itemref'),
        attrib={'idref': 'part%d' % partno}
    )
    # makeelement() only creates the node; it still has to be attached.
    spine.append(e)
274 def __init__(self, name=None, part_href=None):
277 self.part_href = part_href
278 self.sub_number = None
280 def add(self, name, part_href, level=0, is_part=True, index=None):
281 assert level == 0 or index is None
282 if level > 0 and self.children:
283 return self.children[-1].add(name, part_href, level - 1, is_part)
286 t.part_href = part_href
287 if index is not None:
288 self.children.insert(index, t)
290 self.children.append(t)
292 t.sub_number = len(self.children) + 1
def append(self, toc):
    """ Attach `toc` itself as the last child of this node. """
    siblings = self.children
    siblings.append(toc)
def extend(self, toc):
    """ Adopt the children of `toc` (not `toc` itself) as children of this node. """
    adopted = toc.children
    self.children.extend(adopted)
303 return max((c.depth() for c in self.children)) + 1
309 if self.sub_number is not None:
310 src += '#sub%d' % self.sub_number
313 def write_to_xml(self, nav_map, counter=1):
314 for child in self.children:
315 nav_point = nav_map.makeelement(NCXNS('navPoint'))
316 nav_point.set('id', 'NavPoint-%d' % counter)
317 nav_point.set('playOrder', str(counter))
319 nav_label = nav_map.makeelement(NCXNS('navLabel'))
320 text = nav_map.makeelement(NCXNS('text'))
321 if child.name is not None:
322 text.text = re.sub(r'\n', ' ', child.name)
324 text.text = child.name
325 nav_label.append(text)
326 nav_point.append(nav_label)
328 content = nav_map.makeelement(NCXNS('content'))
329 content.set('src', child.href())
330 nav_point.append(content)
331 nav_map.append(nav_point)
332 counter = child.write_to_xml(nav_point, counter + 1)
335 def html_part(self, depth=0):
337 for child in self.children:
339 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
340 (depth, child.href(), child.name))
341 texts.append(child.html_part(depth + 1))
342 return "\n".join(texts)
345 with open(get_resource('epub/toc.html'), 'rb') as f:
346 t = f.read().decode('utf-8')
347 return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Collects the characters of the element's own text and tail and,
    recursively, of all of its descendants.  Returns them as a set.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # Grow the one set in place instead of building a new union
        # for every child.
        chars.update(used_chars(child))
    return chars
359 """ divide main content of the XML file into chunks """
361 # prepare a container for each chunk
362 part_xml = etree.Element('utwor')
363 etree.SubElement(part_xml, 'master')
364 main_xml_part = part_xml[0] # master
366 last_node_part = False
368 # The below loop are workaround for a problem with epubs
369 # in drama ebooks without acts.
372 for one_part in main_text:
374 if name == 'naglowek_scena':
376 elif name == 'naglowek_akt':
379 for one_part in main_text:
381 if is_act is False and is_scene is True:
382 if name == 'naglowek_czesc':
384 last_node_part = True
385 main_xml_part[:] = [deepcopy(one_part)]
386 elif not last_node_part and name == "naglowek_scena":
388 main_xml_part[:] = [deepcopy(one_part)]
390 main_xml_part.append(deepcopy(one_part))
391 last_node_part = False
393 if name == 'naglowek_czesc':
395 last_node_part = True
396 main_xml_part[:] = [deepcopy(one_part)]
397 elif (not last_node_part
399 "naglowek_rozdzial", "naglowek_akt", "srodtytul"
402 main_xml_part[:] = [deepcopy(one_part)]
404 main_xml_part.append(deepcopy(one_part))
405 last_node_part = False
409 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,
410 _empty_html_static=[]):
412 Transforms one chunk, returns a HTML string, a TOC object
413 and a set of used characters.
417 for element in chunk_xml[0]:
418 if element.tag == "naglowek_czesc":
419 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
420 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
421 toc.add(node_name(element), "part%d.html" % chunk_no)
422 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
423 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no,
424 level=1, is_part=False)
425 element.set('sub', str(subnumber))
427 if not _empty_html_static:
428 with open(get_resource('epub/emptyChunk.html')) as f:
429 _empty_html_static.append(f.read())
431 output_html = _empty_html_static[0]
433 find_annotations(annotations, chunk_xml, chunk_no)
434 replace_by_verse(chunk_xml)
435 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
436 chars = used_chars(html_tree.getroot())
437 output_html = etree.tostring(
438 html_tree, pretty_print=True, xml_declaration=True,
440 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
441 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
443 return output_html, toc, chars
446 def transform(wldoc, verbose=False, style=None, html_toc=False,
447 sample=None, cover=None, flags=None, hyphenate=False,
448 ilustr_path='', output_type='epub'):
449 """ produces a EPUB file
451 sample=n: generate sample e-book (with at least n paragraphs)
452 cover: a cover.Cover factory or True for default
453 flags: less-advertising, without-fonts, working-copy
456 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
457 """ processes one input file and proceeds to its children """
459 replace_characters(wldoc.edoc.getroot())
461 hyphenator = set_hyph_language(
463 ) if hyphenate else None
464 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
466 # every input file will have a TOC entry,
467 # pointing to starting chunk
468 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
471 # write book title page
472 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
473 outputtype=output_type)
474 chars = used_chars(html_tree.getroot())
475 html_string = etree.tostring(
476 html_tree, pretty_print=True, xml_declaration=True,
478 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
479 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
481 zip.writestr('OPS/title.html', squeeze_whitespace(html_string))
482 # add a title page TOC entry
483 toc.add(u"Strona tytułowa", "title.html")
484 elif wldoc.book_info.parts:
485 # write title page for every parent
486 if sample is not None and sample <= 0:
488 html_string = open(get_resource('epub/emptyChunk.html')).read()
490 html_tree = xslt(wldoc.edoc,
491 get_resource('epub/xsltChunkTitle.xsl'))
492 chars = used_chars(html_tree.getroot())
493 html_string = etree.tostring(
494 html_tree, pretty_print=True, xml_declaration=True,
496 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"'
497 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
499 zip.writestr('OPS/part%d.html' % chunk_counter,
500 squeeze_whitespace(html_string))
501 add_to_manifest(manifest, chunk_counter)
502 add_to_spine(spine, chunk_counter)
505 if len(wldoc.edoc.getroot()) > 1:
506 # rdf before style master
507 main_text = wldoc.edoc.getroot()[1]
509 # rdf in style master
510 main_text = wldoc.edoc.getroot()[0]
511 if main_text.tag == RDFNS('RDF'):
514 if main_text is not None:
515 for chunk_xml in chop(main_text):
517 if sample is not None:
521 sample -= len(chunk_xml.xpath(
522 '//strofa|//akap|//akap_cd|//akap_dialog'
524 chunk_html, chunk_toc, chunk_chars = transform_chunk(
525 chunk_xml, chunk_counter, annotations, empty)
527 toc.extend(chunk_toc)
528 chars = chars.union(chunk_chars)
529 zip.writestr('OPS/part%d.html' % chunk_counter,
530 squeeze_whitespace(chunk_html))
531 add_to_manifest(manifest, chunk_counter)
532 add_to_spine(spine, chunk_counter)
535 for child in wldoc.parts():
536 child_toc, chunk_counter, chunk_chars, sample = transform_file(
537 child, chunk_counter, first=False, sample=sample)
538 toc.append(child_toc)
539 chars = chars.union(chunk_chars)
541 return toc, chunk_counter, chars, sample
543 document = deepcopy(wldoc)
548 document.edoc.getroot().set(flag, 'yes')
550 document.clean_ed_note()
551 document.clean_ed_note('abstrakt')
554 editors = document.editors()
556 document.edoc.getroot().set('editors', u', '.join(sorted(
557 editor.readable() for editor in editors)))
558 if document.book_info.funders:
559 document.edoc.getroot().set('funders', u', '.join(
560 document.book_info.funders))
561 if document.book_info.thanks:
562 document.edoc.getroot().set('thanks', document.book_info.thanks)
564 opf = xslt(document.book_info.to_etree(),
565 get_resource('epub/xsltContent.xsl'))
566 manifest = opf.find('.//' + OPFNS('manifest'))
567 guide = opf.find('.//' + OPFNS('guide'))
568 spine = opf.find('.//' + OPFNS('spine'))
570 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
572 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
574 functions.reg_mathml_epub(zip)
576 if os.path.isdir(ilustr_path):
577 ilustr_elements = set(ilustr.get('src')
578 for ilustr in document.edoc.findall('//ilustr'))
579 for i, filename in enumerate(os.listdir(ilustr_path)):
580 if filename not in ilustr_elements:
582 file_path = os.path.join(ilustr_path, filename)
583 zip.write(file_path, os.path.join('OPS', filename))
584 image_id = 'image%s' % i
585 manifest.append(etree.fromstring(
586 '<item id="%s" href="%s" media-type="%s" />' % (
587 image_id, filename, guess_type(file_path)[0])
590 # write static elements
591 mime = zipfile.ZipInfo()
592 mime.filename = 'mimetype'
593 mime.compress_type = zipfile.ZIP_STORED
595 zip.writestr(mime, b'application/epub+zip')
597 'META-INF/container.xml',
598 b'<?xml version="1.0" ?>'
599 b'<container version="1.0" '
600 b'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
601 b'<rootfiles><rootfile full-path="OPS/content.opf" '
602 b'media-type="application/oebps-package+xml" />'
603 b'</rootfiles></container>'
605 zip.write(get_resource('res/wl-logo-small.png'),
606 os.path.join('OPS', 'logo_wolnelektury.png'))
607 zip.write(get_resource('res/jedenprocent.png'),
608 os.path.join('OPS', 'jedenprocent.png'))
610 style = get_resource('epub/style.css')
611 zip.write(style, os.path.join('OPS', 'style.css'))
617 cover_file = BytesIO()
618 bound_cover = cover(document.book_info)
619 bound_cover.save(cover_file)
620 cover_name = 'cover.%s' % bound_cover.ext()
621 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
624 cover_tree = etree.parse(get_resource('epub/cover.html'))
625 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
626 zip.writestr('OPS/cover.html', etree.tostring(
627 cover_tree, pretty_print=True, xml_declaration=True,
629 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
630 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
633 if bound_cover.uses_dc_cover:
634 if document.book_info.cover_by:
635 document.edoc.getroot().set('data-cover-by',
636 document.book_info.cover_by)
637 if document.book_info.cover_source:
638 document.edoc.getroot().set('data-cover-source',
639 document.book_info.cover_source)
641 manifest.append(etree.fromstring(
642 '<item id="cover" href="cover.html" '
643 'media-type="application/xhtml+xml" />'
645 manifest.append(etree.fromstring(
646 '<item id="cover-image" href="%s" media-type="%s" />' % (
647 cover_name, bound_cover.mime_type()
650 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
651 opf.getroot()[0].append(etree.fromstring(
652 '<meta name="cover" content="cover-image"/>'
654 guide.append(etree.fromstring(
655 '<reference href="cover.html" type="cover" title="Okładka"/>'
658 annotations = etree.Element('annotations')
660 toc_file = etree.fromstring(
661 b'<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
662 b'"-//NISO//DTD ncx 2005-1//EN" '
663 b'"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
664 b'<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
665 b'version="2005-1"><head></head><docTitle></docTitle><navMap>'
668 nav_map = toc_file[-1]
671 manifest.append(etree.fromstring(
672 '<item id="html_toc" href="toc.html" '
673 'media-type="application/xhtml+xml" />'
675 spine.append(etree.fromstring(
676 '<itemref idref="html_toc" />'))
677 guide.append(etree.fromstring(
678 '<reference href="toc.html" type="toc" title="Spis treści"/>'
681 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
683 if len(toc.children) < 2:
684 toc.add(u"Początek utworu", "part1.html")
686 # Last modifications in container files and EPUB creation
687 if len(annotations) > 0:
688 toc.add("Przypisy", "annotations.html")
689 manifest.append(etree.fromstring(
690 '<item id="annotations" href="annotations.html" '
691 'media-type="application/xhtml+xml" />'
693 spine.append(etree.fromstring(
694 '<itemref idref="annotations" />'))
695 replace_by_verse(annotations)
696 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
697 chars = chars.union(used_chars(html_tree.getroot()))
698 zip.writestr('OPS/annotations.html', etree.tostring(
699 html_tree, pretty_print=True, xml_declaration=True,
701 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
702 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
705 toc.add("Wesprzyj Wolne Lektury", "support.html")
706 manifest.append(etree.fromstring(
707 '<item id="support" href="support.html" '
708 'media-type="application/xhtml+xml" />'
710 spine.append(etree.fromstring(
711 '<itemref idref="support" />'))
712 html_string = open(get_resource('epub/support.html'), 'rb').read()
713 chars.update(used_chars(etree.fromstring(html_string)))
714 zip.writestr('OPS/support.html', squeeze_whitespace(html_string))
716 toc.add("Strona redakcyjna", "last.html")
717 manifest.append(etree.fromstring(
718 '<item id="last" href="last.html" '
719 'media-type="application/xhtml+xml" />'
721 spine.append(etree.fromstring(
722 '<itemref idref="last" />'))
723 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),
724 outputtype=output_type)
725 chars.update(used_chars(html_tree.getroot()))
726 zip.writestr('OPS/last.html', squeeze_whitespace(etree.tostring(
727 html_tree, pretty_print=True, xml_declaration=True,
729 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
730 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
733 if not flags or 'without-fonts' not in flags:
735 tmpdir = mkdtemp('-librarian-epub')
741 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),
743 for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
744 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
745 optimizer_call = ['perl', 'subset.pl', '--chars',
746 ''.join(chars).encode('utf-8'),
747 get_resource('fonts/' + fname),
748 os.path.join(tmpdir, fname)]
749 env = {"PERL_USE_UNSAFE_INC": "1"}
751 print("Running font-optimizer")
752 subprocess.check_call(optimizer_call, env=env)
754 dev_null = open(os.devnull, 'w')
755 subprocess.check_call(optimizer_call, stdout=dev_null,
756 stderr=dev_null, env=env)
757 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
758 manifest.append(etree.fromstring(
759 '<item id="%s" href="%s" '
760 'media-type="application/x-font-truetype" />'
766 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
767 xml_declaration=True, encoding="utf-8"))
768 title = document.book_info.title
769 attributes = ("dtb:uid", "dtb:depth", "dtb:totalPageCount",
771 for st in attributes:
772 meta = toc_file.makeelement(NCXNS('meta'))
774 meta.set('content', '0')
775 toc_file[0].append(meta)
776 toc_file[0][0].set('content', str(document.book_info.url))
777 toc_file[0][1].set('content', str(toc.depth()))
778 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
782 toc.add(u"Spis treści", "toc.html", index=1)
783 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
784 toc.write_to_xml(nav_map)
785 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
786 xml_declaration=True, encoding="utf-8"))
789 return OutputFile.from_filename(output_file.name)