1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import print_function, unicode_literals
13 from copy import deepcopy
14 from mimetypes import guess_type
16 from ebooklib import epub
17 from lxml import etree
19 from tempfile import mkdtemp, NamedTemporaryFile
20 from shutil import rmtree
22 from librarian import RDFNS, WLNS, DCNS, OutputFile
23 from librarian.cover import make_cover
25 from librarian import functions, get_resource
27 from librarian.hyphenator import Hyphenator
# Module-level side effect: register the person-name helper from
# librarian.functions so it is available to XPath/XSLT evaluation.
functions.reg_person_name()
def squeeze_whitespace(s):
    """Collapse each run of whitespace in the bytes string *s* to one space."""
    collapsed = re.sub(rb'\s+', b' ', s)
    return collapsed
def set_hyph_language(source_tree):
    """Build a Hyphenator for the document's language.

    Reads dc:language metadata text from *source_tree* and loads the
    matching hyphenation dictionary resource.
    """
    bibl_lng = etree.XPath('//dc:language//text()',
                           namespaces={'dc': str(DCNS)})(source_tree)
    # Dictionary resources are keyed by two-letter codes; metadata uses
    # three-letter codes, hence the conversion.
    short_lng = functions.lang_code_3to2(bibl_lng[0])
    # NOTE(review): the remainder of this expression (dictionary filename
    # suffix and any error handling) is not visible in this view.
    return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
def hyphenate_and_fix_conjunctions(source_tree, hyph):
    """Insert soft hyphens into the document text and keep single-letter
    words attached to the following word with a no-break space."""
    # Text nodes of the second child of <utwor> (the main text master).
    texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
    # NOTE(review): the loop headers iterating over texts/words are not
    # visible in this view.
    parent = t.getparent()
    # Split into word and non-word runs.
    wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
    # U+00AD SOFT HYPHEN marks allowed hyphenation points.
    newt += hyph.inserted(w, u'\u00AD')
    # After whitespace + a single word character, replace the following
    # whitespace with U+00A0 NO-BREAK SPACE so the conjunction does not
    # end a line.
    newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
66 """ returns node's text and children as a string
68 >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
72 nt = node.text if node.text is not None else ''
74 [nt] + [etree.tostring(child, encoding='unicode') for child in node]
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print(etree.tostring(e, encoding='unicode'))
    p = etree.fromstring('<x>%s</x>' % text)
    # NOTE(review): the docstring closer and the lines copying p's text
    # and children back onto *node* are not visible in this view.
93 """ Find out a node's name
95 >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
99 tempnode = deepcopy(node)
101 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
102 for e in tempnode.findall('.//%s' % p):
106 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet, **kwargs):
    """Apply the XSLT stylesheet at path *sheet* to *xml*.

    Keyword arguments are passed to the transform as string parameters.
    """
    # etree.XSLT needs an ElementTree, not a bare element.
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        transform = etree.XSLT(etree.parse(xsltf))
        # NOTE(review): the `params = dict(` wrapper around this generator
        # and its closing paren are not visible in this view.
        (key, transform.strparam(value))
        for key, value in kwargs.items()
        return transform(xml, **params)
def replace_characters(node):
    """Recursively replace typewriter sequences in text and tails with
    typographic characters (dashes, Polish quotation marks)."""
    def replace_chars(text):
        # Strip BOM, then substitute; '---' must be handled before '--'.
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")
    # Editorial nodes are skipped — NOTE(review): the body of this branch
    # is not visible in this view.
    if node.tag in ('uwaga', 'extra'):
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    replace_characters(child)
def find_annotations(annotations, source, part_no):
    """Collect footnote elements (pe/pa/pt/pr) from *source* into
    *annotations*, numbering them and recording the chunk they came from."""
    # NOTE(review): the loop iterating over source's children is not
    # visible in this view.
    if child.tag in ('pe', 'pa', 'pt', 'pr'):
        annotation = deepcopy(child)
        # Sequential footnote number across the whole document.
        number = str(len(annotations) + 1)
        annotation.set('number', number)
        # Which chunk file the footnote reference lives in.
        annotation.set('part', str(part_no))
        annotations.append(annotation)
    # Recurse everywhere except editorial notes.
    if child.tag not in ('extra', 'uwaga'):
        find_annotations(annotations, child, part_no)
class Stanza(object):
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring(
    ...     "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"
    >>> Stanza(s).versify()
    >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())
    <wers_normalny>a <b>c</b><b>c</b></wers_normalny>
    <wers_normalny>b<x>x/
    y</x>c</wers_normalny>
    <wers_normalny>d</wers_normalny>

    def __init__(self, stanza_elem):
        # The <strofa> element being rewritten in place.
        self.stanza = stanza_elem
        # Currently open verse element, if any.
        self.open_verse = None

        # NOTE(review): the `def versify(self):` header is not visible in
        # this view; the doctest above calls it, and the lines below
        # (pushing text/tails, filtering empty verses) appear to be its body.
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_text(elem.tail)
        tail = self.stanza.tail
        self.stanza.tail = tail
        verse for verse in self.verses
        if verse.text or len(verse) > 0

    def open_normal_verse(self):
        # Start a fresh <wers_normalny> and make it the current verse.
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        # Lazily open a verse if none is current.
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        # Split on '/' followed by a newline; each piece belongs to its
        # own verse.
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            self.open_normal_verse()
            if not verse_text.strip():
            verse = self.get_open_verse()
            # Append after the last child if the verse has children,
            # otherwise to the verse's own text.
            verse[-1].tail = (verse[-1].tail or "") + verse_text
            verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        # Verse-like elements (tag starting with "wers") become verses
        # themselves; other elements go inside the open verse.
        if elem.tag.startswith("wers"):
            verse = deepcopy(elem)
            self.verses.append(verse)
            self.open_verse = verse
            appended = deepcopy(elem)
            self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """Rewrite every stanza under *tree*, turning '/' line endings into
    explicit verse elements."""
    for stanza_node in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza_node).versify()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns the set of characters appearing in the element's text, its
    tail, and (recursively) all of its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars = chars.union(used_chars(child))
    # Bug fix: the accumulated set was never returned, so callers
    # (e.g. `chars = used_chars(html_tree.getroot())`) received None.
    return chars
249 """ divide main content of the XML file into chunks """
251 # prepare a container for each chunk
252 part_xml = etree.Element('utwor')
253 etree.SubElement(part_xml, 'master')
254 main_xml_part = part_xml[0] # master
256 last_node_part = False
258 # The below loop are workaround for a problem with epubs
259 # in drama ebooks without acts.
262 for one_part in main_text:
264 if name == 'naglowek_scena':
266 elif name == 'naglowek_akt':
269 for one_part in main_text:
271 if is_act is False and is_scene is True:
272 if name == 'naglowek_czesc':
274 last_node_part = True
275 main_xml_part[:] = [deepcopy(one_part)]
276 elif not last_node_part and name == "naglowek_scena":
278 main_xml_part[:] = [deepcopy(one_part)]
280 main_xml_part.append(deepcopy(one_part))
281 last_node_part = False
283 if name == 'naglowek_czesc':
285 last_node_part = True
286 main_xml_part[:] = [deepcopy(one_part)]
287 elif (not last_node_part
289 "naglowek_rozdzial", "naglowek_akt", "srodtytul"
292 main_xml_part[:] = [deepcopy(one_part)]
294 main_xml_part.append(deepcopy(one_part))
295 last_node_part = False
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,
                    _empty_html_static=[]):
    # NOTE(review): `_empty_html_static=[]` is a deliberate mutable-default
    # cache for the empty-chunk template, filled once below.
    Transforms one chunk, returns a HTML string, a TOC object
    and a set of used characters.

    # Build TOC entries from the headers present in this chunk.
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
                "part%d.xhtml#book-text" % chunk_no,
                "part%d-text" % chunk_no
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
                "part%d.xhtml" % chunk_no,
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
                "part%d.xhtml" % chunk_no,
            subnumber = len(toc[-1][1])
                "part%d.xhtml#sub%d" % (chunk_no, subnumber),
                "part%d-sub%d" % (chunk_no, subnumber)
            # Mark the element with its sub-anchor number for the XSLT.
            element.set('sub', six.text_type(subnumber))
    # Empty chunks (sample cut-off) reuse one cached placeholder page.
    if not _empty_html_static:
        with open(get_resource('epub/emptyChunk.xhtml')) as f:
            _empty_html_static.append(f.read())
    output_html = _empty_html_static[0]
    find_annotations(annotations, chunk_xml, chunk_no)
    replace_by_verse(chunk_xml)
    html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
    chars = used_chars(html_tree.getroot())
    output_html = etree.tostring(
        html_tree, pretty_print=True, xml_declaration=True,
        doctype='<!DOCTYPE html>'
    return output_html, toc, chars
def remove_empty_lists_from_toc(toc):
    """Recursively walk (title, children) tuples in *toc*.

    NOTE(review): the branch handling tuples with empty child lists is not
    visible in this view.
    """
    for i, e in enumerate(toc):
        if isinstance(e, tuple):
            remove_empty_lists_from_toc(e[1])
def transform(wldoc, verbose=False, style=None,
              sample=None, cover=None, flags=None, hyphenate=False,
              base_url='file://./', output_type='epub'):
    """ produces a EPUB file

    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """
        # Normalize typewriter punctuation before any rendering.
        replace_characters(wldoc.edoc.getroot())
        # Hyphenation is optional; without it no soft hyphens are inserted.
        hyphenator = set_hyph_language(
        ) if hyphenate else None
        hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)

        # every input file will have a TOC entry,
        # pointing to starting chunk
                "part%d.xhtml" % chunk_counter,
                wldoc.book_info.title,
                "path%d-start" % chunk_counter
        # write book title page
        html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
                         outputtype=output_type)
        chars = used_chars(html_tree.getroot())
        html_string = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            doctype='<!DOCTYPE html>'
        item = epub.EpubItem(
            file_name="title.xhtml",
            media_type="application/xhtml+xml",
            content=squeeze_whitespace(html_string)
        output.add_item(item)
        # add a title page TOC entry
        item = epub.EpubNav()
        output.add_item(item)
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                    get_resource('epub/emptyChunk.xhtml')).read()
                html_tree = xslt(wldoc.edoc,
                                 get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(
                    html_tree, pretty_print=True, xml_declaration=True,
                    doctype='<!DOCTYPE html>'
                item = epub.EpubItem(
                    uid="part%d" % chunk_counter,
                    file_name="part%d.xhtml" % chunk_counter,
                    media_type="application/xhtml+xml",
                    content=squeeze_whitespace(html_string)
                output.add_item(item)

        # Locate the main text master element, skipping a leading RDF block.
        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):

        if main_text is not None:
            for chunk_xml in chop(main_text):
                if sample is not None:
                        # Count stanzas/paragraphs against the sample budget.
                        sample -= len(chunk_xml.xpath(
                            '//strofa|//akap|//akap_cd|//akap_dialog'
                chunk_html, chunk_toc, chunk_chars = transform_chunk(
                    chunk_xml, chunk_counter, annotations, empty)
                toc[-1][1].extend(chunk_toc)
                chars = chars.union(chunk_chars)
                item = epub.EpubItem(
                    uid="part%d" % chunk_counter,
                    file_name="part%d.xhtml" % chunk_counter,
                    media_type="application/xhtml+xml",
                    content=squeeze_whitespace(chunk_html)
                output.add_item(item)

        # Recurse into child documents, threading the chunk counter and
        # remaining sample budget through every call.
        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc[-1][1].extend(child_toc)
            chars = chars.union(chunk_chars)
        return toc, chunk_counter, chars, sample

    # Work on a deep copy so the caller's document is left untouched.
    document = deepcopy(wldoc)
            document.edoc.getroot().set(flag, 'yes')
    document.clean_ed_note()
    document.clean_ed_note('abstrakt')

    # Expose selected metadata as root attributes for the XSLT stylesheets.
    editors = document.editors()
        document.edoc.getroot().set('editors', u', '.join(sorted(
            editor.readable() for editor in editors)))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    # Assemble the ebooklib book with Dublin Core metadata.
    output = epub.EpubBook()
    output.set_identifier(six.text_type(document.book_info.url))
    output.set_language(functions.lang_code_3to2(document.book_info.language))
    output.set_title(document.book_info.title)
    for i, author in enumerate(document.book_info.authors):
            file_as=six.text_type(author),
            uid='creator{}'.format(i)
    for translator in document.book_info.translators:
            translator.readable(),
            file_as=six.text_type(translator),
            uid='translator{}'.format(i)
    for publisher in document.book_info.publisher:
        output.add_metadata("DC", "publisher", publisher)
    output.add_metadata("DC", "date", document.book_info.created_at)

    output.guide.append({
        "href": "part1.xhtml"
    output.add_item(epub.EpubNcx())
        functions.reg_mathml_epub(output)

    # Fetch illustrations, scale them and embed each as an EPUB item.
    for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
        url = six.moves.urllib.parse.urljoin(
        with six.moves.urllib.request.urlopen(url) as imgfile:
            img = Image.open(imgfile)
        # Keep GIF/PNG formats; everything else is re-encoded as JPEG.
        th_format, ext, media_type = {
            'GIF': ('GIF', 'gif', 'image/gif'),
            'PNG': ('PNG', 'png', 'image/png'),
        }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
        if img.size[0] < width:
            # Resize preserving the aspect ratio.
            th = img.resize((width, round(width * img.size[1] / img.size[0])))
        buffer = six.BytesIO()
        th.save(buffer, format=th_format)
        file_name = 'image%d.%s' % (i, ext)
        # Point the XML at the embedded file instead of the remote URL.
        ilustr.set('src', file_name)
            media_type=media_type,
            content=buffer.getvalue()

    # write static elements
    with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
            uid="logo_wolnelektury.png",
            file_name="logo_wolnelektury.png",
            media_type="image/png",
    with open(get_resource('res/jedenprocent.png'), 'rb') as f:
            file_name="jedenprocent.png",
            media_type="image/png",
        # Fall back to the bundled default stylesheet.
        style = get_resource('epub/style.css')
    with open(style, 'rb') as f:
            file_name="style.css",
            media_type="text/css",

        # Render the cover and register it in spine and guide.
        cover_file = six.BytesIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
            file_name=cover_name,
            content=cover_file.getvalue(),
        spine.append('cover')
        output.guide.append({
            "href": "cover.xhtml",
        # Record cover credits when the cover design uses DC metadata.
        if bound_cover.uses_dc_cover:
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by',
                                            document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source',
                                            document.book_info.cover_source)

    # Footnotes collected from every chunk accumulate in this element.
    annotations = etree.Element('annotations')
    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
    output.toc = toc[0][1]

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        item = epub.EpubItem(
            file_name="annotations.xhtml",
            media_type="application/xhtml+xml",
            content=etree.tostring(
                html_tree, pretty_print=True, xml_declaration=True,
                doctype='<!DOCTYPE html>'
        output.add_item(item)

        "Wesprzyj Wolne Lektury",
    with open(get_resource('epub/support.xhtml'), 'rb') as f:
        html_string = f.read()
    chars.update(used_chars(etree.fromstring(html_string)))
    item = epub.EpubItem(
        file_name="support.xhtml",
        media_type="application/xhtml+xml",
        content=squeeze_whitespace(html_string)
    output.add_item(item)

    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),
                     outputtype=output_type)
    chars.update(used_chars(html_tree.getroot()))
    item = epub.EpubItem(
        file_name="last.xhtml",
        media_type="application/xhtml+xml",
        content=squeeze_whitespace(etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            doctype='<!DOCTYPE html>'
    output.add_item(item)

    # Subset the embedded fonts to the characters actually used, via the
    # bundled Perl font-optimizer script.
    if not flags or 'without-fonts' not in flags:
        tmpdir = mkdtemp('-librarian-epub')
        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),
        for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
                      'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
            optimizer_call = ['perl', 'subset.pl', '--chars',
                              ''.join(chars).encode('utf-8'),
                              get_resource('fonts/' + fname),
                              os.path.join(tmpdir, fname)]
            # The optimizer script relies on the old Perl @INC behaviour.
            env = {"PERL_USE_UNSAFE_INC": "1"}
                print("Running font-optimizer")
                subprocess.check_call(optimizer_call, env=env)
                dev_null = open(os.devnull, 'w')
                subprocess.check_call(optimizer_call, stdout=dev_null,
                                      stderr=dev_null, env=env)
            with open(os.path.join(tmpdir, fname), 'rb') as f:
                    media_type="font/ttf",

    remove_empty_lists_from_toc(output.toc)

    # Write to a named temp file and hand it back as an OutputFile.
    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
    epub.write_epub(output_file.name, output, {'epub3_landmark': False})
    return OutputFile.from_filename(output_file.name)