1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import print_function, unicode_literals
13 from copy import deepcopy
14 from mimetypes import guess_type
16 from ebooklib import epub
17 from lxml import etree
19 from tempfile import mkdtemp, NamedTemporaryFile
20 from shutil import rmtree
22 from librarian import RDFNS, WLNS, DCNS, OutputFile
23 from librarian.cover import make_cover
25 from librarian import functions, get_resource
27 from librarian.hyphenator import Hyphenator
# Module-import side effect: registers the person-name helper with
# librarian's `functions` registry (presumably an XSLT extension
# function used by the stylesheets below — confirm in functions.py).
29 functions.reg_person_name()
def squeeze_whitespace(s):
    """Collapse every run of whitespace in *s* into a single space.

    Accepts either ``bytes`` or ``str`` and returns the same type.
    The original implementation used a bytes pattern only, so it
    raised ``TypeError`` when handed text; the ``str`` branch is a
    backward-compatible generalization (all existing callers pass
    bytes and are unaffected).
    """
    if isinstance(s, str):
        return re.sub(r'\s+', ' ', s)
    # bytes input: pattern and replacement must also be bytes
    return re.sub(b'\\s+', b' ', s)
# Build a Hyphenator for the document's language: read dc:language
# from the metadata, map the 3-letter code to a 2-letter one, and
# load the matching bundled hyphenation dictionary.
36 def set_hyph_language(source_tree):
37 bibl_lng = etree.XPath('//dc:language//text()',
38 namespaces={'dc': str(DCNS)})(source_tree)
39 short_lng = functions.lang_code_3to2(bibl_lng[0])
# NOTE(review): assumes at least one dc:language element is present;
# bibl_lng[0] raises IndexError otherwise — confirm callers guarantee it.
41 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
# Walk all text nodes of the main body (/utwor second child) and
# (a) insert soft hyphens (U+00AD) into words so readers can break
# them, (b) glue single-letter words to the following word with a
# no-break space (U+00A0) so they never end a line.
47 def hyphenate_and_fix_conjunctions(source_tree, hyph):
48 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
50 parent = t.getparent()
# split into alternating word / non-word tokens
53 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
# hyph.inserted marks every legal hyphenation point with a soft hyphen
55 newt += hyph.inserted(w, u'\u00AD')
# after a whitespace-preceded single character, swap the trailing
# whitespace for a no-break space
58 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
# (fragment of inner_xml — the `def` line is not visible here)
# Serializes a node's text plus the XML of all its children into one
# string, without the node's own tag.
66 """ returns node's text and children as a string
68 >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
72 nt = node.text if node.text is not None else ''
74 [nt] + [etree.tostring(child, encoding='unicode') for child in node]
# Replace a node's text and children with content parsed from a
# markup string (inverse of inner_xml).
78 def set_inner_xml(node, text):
79 """ sets node's text and children from a string
81 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
82 >>> set_inner_xml(e, 'x<b>y</b>z')
83 >>> print(etree.tostring(e, encoding='unicode'))
# parse the replacement content inside a throwaway <x> wrapper, then
# (in elided lines) copy its text/children onto *node*
87 p = etree.fromstring('<x>%s</x>' % text)
# (fragment of node_name — the `def` line is not visible here)
# Returns the plain-text "name" of a node: its text content with
# footnote and theme marker elements removed and all markup flattened.
93 """ Find out a node's name
95 >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
# work on a copy so the original tree is untouched
99 tempnode = deepcopy(node)
# strip footnotes (pe/pa/pt/pr) and theme markers (motyw)
101 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
102 for e in tempnode.findall('.//%s' % p):
# flatten any remaining child tags, keeping only their text
106 etree.strip_tags(tempnode, '*')
# Apply the XSLT stylesheet at path *sheet* to *xml* (an Element or
# ElementTree); every keyword argument is forwarded to the transform
# as a quoted XSLT string parameter. Returns the result tree.
110 def xslt(xml, sheet, **kwargs):
111 if isinstance(xml, etree._Element):
112 xml = etree.ElementTree(xml)
113 with open(sheet) as xsltf:
114 transform = etree.XSLT(etree.parse(xsltf))
# strparam() escapes each value so it is safe as an XSLT parameter
116 (key, transform.strparam(value))
117 for key, value in kwargs.items()
119 return transform(xml, **params)
# Recursively apply typographic substitutions (em/en dashes, Polish
# quotation marks, apostrophe, BOM removal) to all text and tails in
# the subtree, skipping editorial <uwaga>/<extra> nodes.
122 def replace_characters(node):
123 def replace_chars(text):
126 return text.replace(u"\ufeff", u"")\
127 .replace("---", u"\u2014")\
128 .replace("--", u"\u2013")\
129 .replace(",,", u"\u201E")\
130 .replace('"', u"\u201D")\
131 .replace("'", u"\u2019")
# editorial notes keep their literal characters
132 if node.tag in ('uwaga', 'extra'):
136 node.text = replace_chars(node.text)
137 node.tail = replace_chars(node.tail)
139 replace_characters(child)
# Harvest footnote elements (pe/pa/pt/pr) from *source* into the
# shared *annotations* element: each copy gets a sequential 'number'
# attribute and a 'part' attribute recording the chunk it came from.
# Mutates *annotations* in place; recurses into children.
142 def find_annotations(annotations, source, part_no):
144 if child.tag in ('pe', 'pa', 'pt', 'pr'):
145 annotation = deepcopy(child)
# numbering is global across the whole book, 1-based
146 number = str(len(annotations) + 1)
147 annotation.set('number', number)
148 annotation.set('part', str(part_no))
150 annotations.append(annotation)
# do not collect footnotes inside editorial notes
155 if child.tag not in ('extra', 'uwaga'):
156 find_annotations(annotations, child, part_no)
# Splits a <strofa> (stanza) into explicit <wers_normalny> verse
# elements wherever a '/' + newline separator occurs directly in the
# stanza's own text. Slashes inside subelements are left alone.
159 class Stanza(object):
161 Converts / verse endings into verse elements in a stanza.
163 Slashes may only occur directly in the stanza. Any slashes in subelements
164 will be ignored, and the subelements will be put inside verse elements.
166 >>> s = etree.fromstring(
167 ... "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"
169 >>> Stanza(s).versify()
170 >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())
172 <wers_normalny>a <b>c</b><b>c</b></wers_normalny>
173 <wers_normalny>b<x>x/
174 y</x>c</wers_normalny>
175 <wers_normalny>d</wers_normalny>
179 def __init__(self, stanza_elem):
180 self.stanza = stanza_elem
# currently-open verse element that new content is appended to
182 self.open_verse = None
# (versify, partially elided) feed stanza text and children through
# push_text/push_elem, then keep only non-empty verses
185 self.push_text(self.stanza.text)
186 for elem in self.stanza:
188 self.push_text(elem.tail)
189 tail = self.stanza.tail
191 self.stanza.tail = tail
193 verse for verse in self.verses
194 if verse.text or len(verse) > 0
# start a fresh <wers_normalny> and make it the open verse
197 def open_normal_verse(self):
198 self.open_verse = self.stanza.makeelement("wers_normalny")
199 self.verses.append(self.open_verse)
# lazily create a verse if none is open yet
201 def get_open_verse(self):
202 if self.open_verse is None:
203 self.open_normal_verse()
204 return self.open_verse
# split text on '/'+newline; each split after the first opens a new
# verse, and text lands either in the verse's text or in the tail of
# its last child
206 def push_text(self, text):
209 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
211 self.open_normal_verse()
212 if not verse_text.strip():
214 verse = self.get_open_verse()
216 verse[-1].tail = (verse[-1].tail or "") + verse_text
218 verse.text = (verse.text or "") + verse_text
# elements already tagged as verses ("wers*") become verses of their
# own; anything else is appended inside the current open verse
220 def push_elem(self, elem):
221 if elem.tag.startswith("wers"):
222 verse = deepcopy(elem)
224 self.verses.append(verse)
225 self.open_verse = verse
227 appended = deepcopy(elem)
229 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """Split every stanza in *tree* into explicit verse elements.

    Walks all WL <strofa> elements and lets Stanza turn the '/'
    separators inside each one into verse sub-elements, in place.
    """
    for stanza in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza).versify()
# Recursively collect the set of characters appearing in an element's
# text/tail and in all descendants (used later for font subsetting).
240 def used_chars(element):
241 """ Lists characters used in an ETree Element """
242 chars = set((element.text or '') + (element.tail or ''))
243 for child in element:
244 chars = chars.union(used_chars(child))
# (generator `chop` — its `def` line is not visible here)
# Splits the master text into chapter-sized <utwor> chunks; each
# yielded chunk is a fresh container with deep copies of the nodes,
# cut at header elements.
249 """ divide main content of the XML file into chunks """
251 # prepare a container for each chunk
252 part_xml = etree.Element('utwor')
253 etree.SubElement(part_xml, 'master')
254 main_xml_part = part_xml[0] # master
256 last_node_part = False
258 # The loops below are a workaround for a problem with epubs
259 # in drama ebooks without acts.
# first pass: detect whether the text has scene headers but no acts
262 for one_part in main_text:
264 if name == 'naglowek_scena':
266 elif name == 'naglowek_akt':
269 for one_part in main_text:
# act-less drama: cut chunks on part and scene headers
271 if is_act is False and is_scene is True:
272 if name == 'naglowek_czesc':
274 last_node_part = True
275 main_xml_part[:] = [deepcopy(one_part)]
276 elif not last_node_part and name == "naglowek_scena":
278 main_xml_part[:] = [deepcopy(one_part)]
280 main_xml_part.append(deepcopy(one_part))
281 last_node_part = False
# default: cut chunks on part / chapter / act / inner-title headers
283 if name == 'naglowek_czesc':
285 last_node_part = True
286 main_xml_part[:] = [deepcopy(one_part)]
287 elif (not last_node_part
289 "naglowek_rozdzial", "naglowek_akt", "srodtytul"
292 main_xml_part[:] = [deepcopy(one_part)]
294 main_xml_part.append(deepcopy(one_part))
295 last_node_part = False
# Render one chunk to XHTML. Returns (output_html, toc, chars):
# the serialized page, the TOC entries found in the chunk, and the
# set of characters used (for font subsetting).
# NOTE: `_empty_html_static=[]` is a deliberate mutable-default
# cache — the empty-chunk template is read from disk only once per
# process and reused for every sampled-out chunk.
299 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,
300 _empty_html_static=[]):
302 Transforms one chunk, returns a HTML string, a TOC object
303 and a set of used characters.
# scan top-level elements of the chunk's master for headers that
# should become TOC entries
307 for element in chunk_xml[0]:
308 if element.tag == "naglowek_czesc":
312 "part%d.xhtml#book-text" % chunk_no,
314 "part%d-text" % chunk_no
319 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
323 "part%d.xhtml" % chunk_no,
# sub-headers nest under the last top-level TOC entry and get an
# in-page anchor (#subN)
330 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
335 "part%d.xhtml" % chunk_no,
343 subnumber = len(toc[-1][1])
346 "part%d.xhtml#sub%d" % (chunk_no, subnumber),
348 "part%d-sub%d" % (chunk_no, subnumber)
# mark the element so the stylesheet can emit the matching anchor
351 element.set('sub', six.text_type(subnumber))
# empty=True: emit the cached placeholder page instead of rendering
353 if not _empty_html_static:
354 with open(get_resource('epub/emptyChunk.xhtml')) as f:
355 _empty_html_static.append(f.read())
357 output_html = _empty_html_static[0]
359 find_annotations(annotations, chunk_xml, chunk_no)
360 replace_by_verse(chunk_xml)
361 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
362 chars = used_chars(html_tree.getroot())
363 output_html = etree.tostring(
364 html_tree, pretty_print=True, xml_declaration=True,
366 doctype='<!DOCTYPE html>'
368 return output_html, toc, chars
# Recursively prune the nested TOC structure: entries are either
# plain links or (link, children) tuples; descend into each tuple's
# child list (elided lines presumably flatten tuples whose child
# list is empty — confirm against the missing branch).
371 def remove_empty_lists_from_toc(toc):
372 for i, e in enumerate(toc):
373 if isinstance(e, tuple):
375 remove_empty_lists_from_toc(e[1])
# Top-level entry point: convert a WL document (plus its parts) into
# an EPUB and return it as an OutputFile.
380 def transform(wldoc, verbose=False, style=None,
381 sample=None, cover=None, flags=None, hyphenate=False,
382 base_url='file://./', output_type='epub'):
383 """ produces an EPUB file
385 sample=n: generate sample e-book (with at least n paragraphs)
386 cover: a cover.Cover factory or True for default
387 flags: less-advertising, without-fonts, working-copy
# Nested worker: renders one document (title page + chunks), recurses
# into child documents, and returns (toc, chunk_counter, chars, sample).
390 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
391 """ processes one input file and proceeds to its children """
393 replace_characters(wldoc.edoc.getroot())
395 hyphenator = set_hyph_language(
397 ) if hyphenate else None
398 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
400 # every input file will have a TOC entry,
401 # pointing to starting chunk
405 "part%d.xhtml" % chunk_counter,
406 wldoc.book_info.title,
# NOTE(review): "path%d-start" looks like a typo for "part%d-start",
# but it is a runtime identifier — do not change without checking
# existing EPUB anchors.
407 "path%d-start" % chunk_counter
414 # write book title page
415 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
416 outputtype=output_type)
417 chars = used_chars(html_tree.getroot())
418 html_string = etree.tostring(
419 html_tree, pretty_print=True, xml_declaration=True,
421 doctype='<!DOCTYPE html>'
423 item = epub.EpubItem(
425 file_name="title.xhtml",
426 media_type="application/xhtml+xml",
427 content=squeeze_whitespace(html_string)
430 output.add_item(item)
431 # add a title page TOC entry
440 item = epub.EpubNav()
448 output.add_item(item)
# child documents that themselves have parts get a chunk-title page
# (or the empty placeholder once the sample budget is exhausted)
451 elif wldoc.book_info.parts:
452 # write title page for every parent
453 if sample is not None and sample <= 0:
456 get_resource('epub/emptyChunk.xhtml')).read()
458 html_tree = xslt(wldoc.edoc,
459 get_resource('epub/xsltChunkTitle.xsl'))
460 chars = used_chars(html_tree.getroot())
461 html_string = etree.tostring(
462 html_tree, pretty_print=True, xml_declaration=True,
464 doctype='<!DOCTYPE html>'
466 item = epub.EpubItem(
467 uid="part%d" % chunk_counter,
468 file_name="part%d.xhtml" % chunk_counter,
469 media_type="application/xhtml+xml",
470 content=squeeze_whitespace(html_string)
472 output.add_item(item)
# locate the master text: its position depends on whether the RDF
# metadata block precedes it or lives inside it
477 if len(wldoc.edoc.getroot()) > 1:
478 # rdf before style master
479 main_text = wldoc.edoc.getroot()[1]
481 # rdf in style master
482 main_text = wldoc.edoc.getroot()[0]
483 if main_text.tag == RDFNS('RDF'):
486 if main_text is not None:
487 for chunk_xml in chop(main_text):
# sample mode: count paragraphs/stanzas against the remaining budget
489 if sample is not None:
493 sample -= len(chunk_xml.xpath(
494 '//strofa|//akap|//akap_cd|//akap_dialog'
496 chunk_html, chunk_toc, chunk_chars = transform_chunk(
497 chunk_xml, chunk_counter, annotations, empty)
499 toc[-1][1].extend(chunk_toc)
500 chars = chars.union(chunk_chars)
501 item = epub.EpubItem(
502 uid="part%d" % chunk_counter,
503 file_name="part%d.xhtml" % chunk_counter,
504 media_type="application/xhtml+xml",
505 content=squeeze_whitespace(chunk_html)
507 output.add_item(item)
# recurse into child documents, threading the chunk counter and
# remaining sample budget through
511 for child in wldoc.parts():
512 child_toc, chunk_counter, chunk_chars, sample = transform_file(
513 child, chunk_counter, first=False, sample=sample)
514 toc[-1][1].extend(child_toc)
515 chars = chars.union(chunk_chars)
517 return toc, chunk_counter, chars, sample
# --- main body: work on a copy so the caller's document is untouched
519 document = deepcopy(wldoc)
524 document.edoc.getroot().set(flag, 'yes')
526 document.clean_ed_note()
527 document.clean_ed_note('abstrakt')
# expose editors / funders / thanks as root attributes for the XSLT
530 editors = document.editors()
532 document.edoc.getroot().set('editors', u', '.join(sorted(
533 editor.readable() for editor in editors)))
534 if document.book_info.funders:
535 document.edoc.getroot().set('funders', u', '.join(
536 document.book_info.funders))
537 if document.book_info.thanks:
538 document.edoc.getroot().set('thanks', document.book_info.thanks)
# assemble EPUB metadata from book_info
540 output = epub.EpubBook()
541 output.set_identifier(six.text_type(document.book_info.url))
542 output.set_language(functions.lang_code_3to2(document.book_info.language))
543 output.set_title(document.book_info.title)
544 for author in document.book_info.authors:
547 file_as=six.text_type(author)
549 for translator in document.book_info.translators:
551 translator.readable(),
552 file_as=six.text_type(translator),
555 for publisher in document.book_info.publisher:
556 output.add_metadata("DC", "publisher", publisher)
557 output.add_metadata("DC", "date", document.book_info.created_at)
559 output.guide.append({
562 "href": "part1.xhtml"
565 output.add_item(epub.EpubNcx())
569 functions.reg_mathml_epub(output)
# download every illustration, scale it down, and embed it
572 for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
573 url = six.moves.urllib.parse.urljoin(
577 with six.moves.urllib.request.urlopen(url) as imgfile:
578 img = Image.open(imgfile)
# choose output format/extension by source format; default JPEG
580 th_format, ext, media_type = {
581 'GIF': ('GIF', 'gif', 'image/gif'),
582 'PNG': ('PNG', 'png', 'image/png'),
583 }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
586 if img.size[0] < width:
# resize keeping the aspect ratio
589 th = img.resize((width, round(width * img.size[1] / img.size[0])))
591 buffer = six.BytesIO()
592 th.save(buffer, format=th_format)
594 file_name = 'image%d.%s' % (i, ext)
595 ilustr.set('src', file_name)
600 media_type=media_type,
601 content=buffer.getvalue()
605 # write static elements
607 with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
610 uid="logo_wolnelektury.png",
611 file_name="logo_wolnelektury.png",
612 media_type="image/png",
616 with open(get_resource('res/jedenprocent.png'), 'rb') as f:
620 file_name="jedenprocent.png",
621 media_type="image/png",
# stylesheet: caller-supplied path or the bundled default
627 style = get_resource('epub/style.css')
628 with open(style, 'rb') as f:
632 file_name="style.css",
633 media_type="text/css",
# render the cover (if requested) and register it in spine and guide
642 cover_file = six.BytesIO()
643 bound_cover = cover(document.book_info)
644 bound_cover.save(cover_file)
645 cover_name = 'cover.%s' % bound_cover.ext()
648 file_name=cover_name,
649 content=cover_file.getvalue(),
651 spine.append('cover')
652 output.guide.append({
654 "href": "cover.xhtml",
# if the cover design uses DC metadata, expose attribution on the root
660 if bound_cover.uses_dc_cover:
661 if document.book_info.cover_by:
662 document.edoc.getroot().set('data-cover-by',
663 document.book_info.cover_by)
664 if document.book_info.cover_source:
665 document.edoc.getroot().set('data-cover-source',
666 document.book_info.cover_source)
# shared accumulator filled by find_annotations during chunking
668 annotations = etree.Element('annotations')
670 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
671 output.toc = toc[0][1]
682 # Last modifications in container files and EPUB creation
# footnotes page, only when any annotations were collected
683 if len(annotations) > 0:
691 replace_by_verse(annotations)
692 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
693 chars = chars.union(used_chars(html_tree.getroot()))
695 item = epub.EpubItem(
697 file_name="annotations.xhtml",
698 media_type="application/xhtml+xml",
699 content=etree.tostring(
700 html_tree, pretty_print=True, xml_declaration=True,
702 doctype='<!DOCTYPE html>'
705 output.add_item(item)
# "support us" page
711 "Wesprzyj Wolne Lektury",
715 with open(get_resource('epub/support.xhtml'), 'rb') as f:
716 html_string = f.read()
717 chars.update(used_chars(etree.fromstring(html_string)))
718 item = epub.EpubItem(
720 file_name="support.xhtml",
721 media_type="application/xhtml+xml",
722 content=squeeze_whitespace(html_string)
724 output.add_item(item)
# closing page (colophon / license info)
734 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),
735 outputtype=output_type)
736 chars.update(used_chars(html_tree.getroot()))
737 item = epub.EpubItem(
739 file_name="last.xhtml",
740 media_type="application/xhtml+xml",
741 content=squeeze_whitespace(etree.tostring(
742 html_tree, pretty_print=True, xml_declaration=True,
744 doctype='<!DOCTYPE html>'
747 output.add_item(item)
# font subsetting: run the bundled perl font-optimizer over each
# DejaVu face, keeping only the characters actually used
750 if not flags or 'without-fonts' not in flags:
752 tmpdir = mkdtemp('-librarian-epub')
758 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),
760 for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
761 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
762 optimizer_call = ['perl', 'subset.pl', '--chars',
763 ''.join(chars).encode('utf-8'),
764 get_resource('fonts/' + fname),
765 os.path.join(tmpdir, fname)]
766 env = {"PERL_USE_UNSAFE_INC": "1"}
768 print("Running font-optimizer")
769 subprocess.check_call(optimizer_call, env=env)
# NOTE(review): dev_null is opened but never explicitly closed here;
# consider subprocess.DEVNULL instead — confirm against elided lines.
771 dev_null = open(os.devnull, 'w')
772 subprocess.check_call(optimizer_call, stdout=dev_null,
773 stderr=dev_null, env=env)
774 with open(os.path.join(tmpdir, fname), 'rb') as f:
779 media_type="font/ttf",
787 remove_empty_lists_from_toc(output.toc)
# write to a named temp file and hand it back as an OutputFile
789 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
792 epub.write_epub(output_file.name, output, {'epub3_landmark': False})
793 return OutputFile.from_filename(output_file.name)