1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import print_function, unicode_literals
13 from copy import deepcopy
14 from mimetypes import guess_type
16 from ebooklib import epub
17 from lxml import etree
19 from tempfile import mkdtemp, NamedTemporaryFile
20 from shutil import rmtree
22 from librarian import RDFNS, WLNS, DCNS, OutputFile
23 from librarian.cover import make_cover
25 from librarian import functions, get_resource
27 from librarian.hyphenator import Hyphenator
29 functions.reg_person_name()
def squeeze_whitespace(s):
    """Replace every run of whitespace in the byte string *s* with one space.

    :param s: bytes to process.
    :returns: new bytes object; leading and trailing whitespace runs are
        collapsed to a single space as well, not stripped.
    """
    whitespace_run = re.compile(b'\\s+')
    return whitespace_run.sub(b' ', s)
37 def set_hyph_language(source_tree):
38 bibl_lng = etree.XPath('//dc:language//text()',
39 namespaces={'dc': str(DCNS)})(source_tree)
40 short_lng = functions.lang_code_3to2(bibl_lng[0])
42 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
def hyphenate_and_fix_conjunctions(source_tree, hyph):
    """Insert soft hyphens and bind one-letter words to the following word.

    Every text node found under the second child of the ``/utwor`` root is
    rewritten in place.

    :param source_tree: lxml tree to modify.
    :param hyph: object providing ``inserted(word, hyphen)``, or ``None``
        to leave the words unhyphenated.
    """
    find_texts = etree.XPath('/utwor/*[2]//text()')
    token_re = re.compile(r'\w+|[^\w]', re.UNICODE)
    for node_text in find_texts(source_tree):
        owner = node_text.getparent()
        if hyph is None:
            fixed = node_text
        else:
            # Each token is either a whole word or one non-word character;
            # U+00AD is the soft hyphen put at the allowed break points.
            fixed = ''.join(
                hyph.inserted(token, u'\u00AD')
                for token in token_re.findall(node_text)
            )
        # Whitespace after a single word character that itself follows
        # whitespace becomes a no-break space (U+00A0).
        fixed = re.sub(r'(?<=\s\w)\s+', u'\u00A0', fixed)
        if node_text.is_text:
            owner.text = fixed
        elif node_text.is_tail:
            owner.tail = fixed
def node_name(node):
    """ Find out a node's name

    The name is the text content of the node with all markup removed;
    the content of annotation and theme elements is left out.
    The node passed in is not modified.

    >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
    XYZ
    """
    stripped = deepcopy(node)

    skipped_paths = ['.//%s' % tag for tag in ('pe', 'pa', 'pt', 'pr', 'motyw')]
    for path in skipped_paths:
        for found in stripped.findall(path):
            # Empty the element but keep the text that follows it.
            kept_tail = found.tail
            found.clear()
            found.tail = kept_tail
    # With all tags stripped, the whole content ends up in the root's text.
    etree.strip_tags(stripped, '*')
    return stripped.text
def xslt(xml, sheet, **kwargs):
    """Apply the XSLT stylesheet stored in the file *sheet* to *xml*.

    :param xml: lxml element or element tree; a bare element is wrapped
        in an element tree first.
    :param sheet: path of the stylesheet file.
    :param kwargs: stylesheet parameters; every value is passed through
        ``strparam`` so it reaches the stylesheet as a string.
    :returns: the result of the transformation.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as sheet_file:
        transform = etree.XSLT(etree.parse(sheet_file))
    string_params = {
        name: transform.strparam(value)
        for name, value in kwargs.items()
    }
    return transform(document, **string_params)
def replace_characters(node):
    """Apply typographic character replacements to a whole subtree, in place.

    The text and tail of *node* and of all its descendants are rewritten.
    ``uwaga`` and ``extra`` elements are emptied first (only their tail
    survives).

    :param node: root element of the subtree to process.
    """
    # Order matters: '---' has to be handled before '--'.
    substitutions = (
        (u"\ufeff", u""),
        ("---", u"\u2014"),
        ("--", u"\u2013"),
        (",,", u"\u201E"),
        ('"', u"\u201D"),
        ("'", u"\u2019"),
    )

    def replace_chars(text):
        if text is None:
            return None
        for old, new in substitutions:
            text = text.replace(old, new)
        return text

    if node.tag in ('uwaga', 'extra'):
        kept_tail = node.tail
        node.clear()
        node.tail = kept_tail
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """Collect annotations from *source* and replace them with their numbers.

    Every ``pe``, ``pa``, ``pt`` or ``pr`` element found is copied into
    *annotations* with ``number`` and ``part`` attributes set; the element
    left in the tree keeps only its tail and gets the number as its text.
    The content of ``extra`` and ``uwaga`` elements is not searched.

    :param annotations: container element the copies are appended to;
        its current length determines the next number.
    :param source: element whose descendants are searched.
    :param part_no: number of the part, stored in the ``part`` attribute.
    """
    annotation_tags = ('pe', 'pa', 'pt', 'pr')
    for child in source:
        if child.tag in annotation_tags:
            number = str(len(annotations) + 1)
            note = deepcopy(child)
            note.set('number', number)
            note.set('part', str(part_no))
            note.tail = ''
            annotations.append(note)
            # In the text only the number stays in place of the annotation.
            kept_tail = child.tail
            child.clear()
            child.tail = kept_tail
            child.text = number
        if child.tag in ('extra', 'uwaga'):
            continue
        find_annotations(annotations, child, part_no)
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring(
    ...         "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"
    ...     )
    >>> Stanza(s).versify()
    >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())
    <strofa>
      <wers_normalny>a <b>c</b><b>c</b></wers_normalny>
      <wers_normalny>b<x>x/
    y</x>c</wers_normalny>
      <wers_normalny>d</wers_normalny>
    </strofa>

    """
    def __init__(self, stanza_elem):
        # Element being converted; its content is replaced by versify().
        self.stanza = stanza_elem
        # Verse elements collected so far, in document order.
        self.verses = []
        # Verse currently receiving content, or None when none is open.
        self.open_verse = None

    def versify(self):
        """Replace the stanza's content with a sequence of verse elements."""
        stanza = self.stanza
        self.push_text(stanza.text)
        for child in stanza:
            self.push_elem(child)
            self.push_text(child.tail)
        # clear() would drop the tail too, so carry it over.
        kept_tail = stanza.tail
        stanza.clear()
        stanza.tail = kept_tail
        # Verses with neither text nor children are left out.
        filled = [
            verse for verse in self.verses
            if verse.text or len(verse) > 0
        ]
        stanza.extend(filled)

    def open_normal_verse(self):
        """Start a new ``wers_normalny`` verse and make it the open one."""
        verse = self.stanza.makeelement("wers_normalny")
        self.open_verse = verse
        self.verses.append(verse)

    def get_open_verse(self):
        """Return the open verse, starting a normal one if none is open."""
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """Add text to the verses; a slash ending a line starts a new verse."""
        if not text:
            return
        pieces = re.split(r"/\s*\n", text)
        for index, piece in enumerate(pieces):
            if index:
                # Every piece after the first one follows a verse ending.
                self.open_normal_verse()
            if piece.strip():
                verse = self.get_open_verse()
                if len(verse):
                    # Text after a subelement belongs to that element's tail.
                    last_child = verse[-1]
                    last_child.tail = (last_child.tail or "") + piece
                else:
                    verse.text = (verse.text or "") + piece

    def push_elem(self, elem):
        """Add a copy of a stanza subelement, without its tail.

        Elements whose tag starts with ``wers`` become verses themselves
        (and the open verse); anything else goes into the open verse.
        """
        copied = deepcopy(elem)
        copied.tail = None
        if elem.tag.startswith("wers"):
            self.verses.append(copied)
            self.open_verse = copied
        else:
            self.get_open_verse().append(copied)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    All ``strofa`` descendants of *tree* are converted in place.
    """
    for stanza_elem in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza_elem).versify()
def used_chars(element):
    """ Lists characters used in an ETree Element

    :param element: element to inspect, together with all its descendants.
    :returns: set of the characters occurring in any text or tail.
    """
    found = set(element.text or '')
    found.update(element.tail or '')
    for child in element:
        found |= used_chars(child)
    return found
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. Each value yielded is the same ``utwor`` element with a
    single ``master`` child holding copies of the elements of the current
    chunk; its content is replaced when the generator is resumed, so
    a chunk has to be used before the next one is requested.
    The first chunk yielded may be empty.

    A ``naglowek_czesc`` element always starts a new chunk. Other headers
    start a new chunk unless they directly follow a ``naglowek_czesc``.
    """
    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    main_xml_part = etree.SubElement(part_xml, 'master')

    # Workaround for a problem with epubs in drama ebooks without acts:
    # there, scene headers are the ones that start new chunks.
    tags = [one_part.tag for one_part in main_text]
    if 'naglowek_akt' not in tags and 'naglowek_scena' in tags:
        chunk_headers = ("naglowek_scena",)
    else:
        chunk_headers = ("naglowek_rozdzial", "naglowek_akt", "srodtytul")

    # True right after a naglowek_czesc started the current chunk.
    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in chunk_headers:
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
273 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,
274 _empty_html_static=[]):
276 Transforms one chunk, returns an HTML string, a TOC object
277 and a set of used characters.
281 for element in chunk_xml[0]:
282 if element.tag == "naglowek_czesc":
286 "part%d.xhtml#book-text" % chunk_no,
288 "part%d-text" % chunk_no
293 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
297 "part%d.xhtml" % chunk_no,
304 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
309 "part%d.xhtml" % chunk_no,
317 subnumber = len(toc[-1][1])
320 "part%d.xhtml#sub%d" % (chunk_no, subnumber),
322 "part%d-sub%d" % (chunk_no, subnumber)
325 element.set('sub', six.text_type(subnumber))
327 if not _empty_html_static:
328 with open(get_resource('epub/emptyChunk.xhtml')) as f:
329 _empty_html_static.append(f.read())
331 output_html = _empty_html_static[0]
333 find_annotations(annotations, chunk_xml, chunk_no)
334 replace_by_verse(chunk_xml)
335 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
336 chars = used_chars(html_tree.getroot())
337 output_html = etree.tostring(
338 html_tree, pretty_print=True, xml_declaration=True,
340 doctype='<!DOCTYPE html>'
342 return output_html, toc, chars
def remove_empty_lists_from_toc(toc):
    """Replace TOC sections that have no children with their bare entry.

    Works in place and recursively: every tuple in *toc* is treated as
    ``(entry, children)``; when ``children`` is empty the tuple is replaced
    by ``entry`` alone, otherwise ``children`` is processed the same way.

    :param toc: list of TOC items, modified in place.
    """
    for index, item in enumerate(toc):
        if not isinstance(item, tuple):
            continue
        if item[1]:
            remove_empty_lists_from_toc(item[1])
        else:
            toc[index] = item[0]
355 def transform_file(wldoc, chunk_counter=1, first=True, sample=None, hyphenate=False, output_type='epub', spine=None, output=None, annotations=None):
356 """ processes one input file and proceeds to its children """
358 replace_characters(wldoc.edoc.getroot())
360 hyphenator = set_hyph_language(
362 ) if hyphenate else None
363 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
365 # every input file will have a TOC entry,
366 # pointing to starting chunk
370 "part%d.xhtml" % chunk_counter,
371 wldoc.book_info.title,
372 "path%d-start" % chunk_counter
379 # write book title page
380 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
381 outputtype=output_type)
382 chars = used_chars(html_tree.getroot())
383 html_string = etree.tostring(
384 html_tree, pretty_print=True, xml_declaration=True,
386 doctype='<!DOCTYPE html>'
388 item = epub.EpubItem(
390 file_name="title.xhtml",
391 media_type="application/xhtml+xml",
392 content=squeeze_whitespace(html_string)
395 output.add_item(item)
396 # add a title page TOC entry
405 item = epub.EpubNav()
413 output.add_item(item)
424 elif wldoc.book_info.parts:
425 # write title page for every parent
426 if sample is not None and sample <= 0:
429 get_resource('epub/emptyChunk.xhtml')).read()
431 html_tree = xslt(wldoc.edoc,
432 get_resource('epub/xsltChunkTitle.xsl'))
433 chars = used_chars(html_tree.getroot())
434 html_string = etree.tostring(
435 html_tree, pretty_print=True, xml_declaration=True,
437 doctype='<!DOCTYPE html>'
439 item = epub.EpubItem(
440 uid="part%d" % chunk_counter,
441 file_name="part%d.xhtml" % chunk_counter,
442 media_type="application/xhtml+xml",
443 content=squeeze_whitespace(html_string)
445 output.add_item(item)
450 if len(wldoc.edoc.getroot()) > 1:
451 # rdf before style master
452 main_text = wldoc.edoc.getroot()[1]
454 # rdf in style master
455 main_text = wldoc.edoc.getroot()[0]
456 if main_text.tag == RDFNS('RDF'):
459 if main_text is not None:
460 for chunk_xml in chop(main_text):
462 if sample is not None:
466 sample -= len(chunk_xml.xpath(
467 '//strofa|//akap|//akap_cd|//akap_dialog'
469 chunk_html, chunk_toc, chunk_chars = transform_chunk(
470 chunk_xml, chunk_counter, annotations, empty)
472 toc[-1][1].extend(chunk_toc)
473 chars = chars.union(chunk_chars)
474 item = epub.EpubItem(
475 uid="part%d" % chunk_counter,
476 file_name="part%d.xhtml" % chunk_counter,
477 media_type="application/xhtml+xml",
478 content=squeeze_whitespace(chunk_html)
480 output.add_item(item)
484 for child in wldoc.parts():
485 child_toc, chunk_counter, chunk_chars, sample = transform_file(
486 child, chunk_counter, first=False, sample=sample,
487 hyphenate=hyphenate, output_type=output_type,
488 spine=spine, output=output, annotations=annotations,
490 toc[-1][1].extend(child_toc)
491 chars = chars.union(chunk_chars)
493 return toc, chunk_counter, chars, sample
496 def transform(wldoc, verbose=False, style=None,
497 sample=None, cover=None, flags=None, hyphenate=False,
498 base_url='file://./', output_type='epub'):
499 """ produces an EPUB file
501 sample=n: generate sample e-book (with at least n paragraphs)
502 cover: a cover.Cover factory or True for default
503 flags: less-advertising, without-fonts, working-copy
507 document = deepcopy(wldoc)
512 document.edoc.getroot().set(flag, 'yes')
514 document.clean_ed_note()
515 document.clean_ed_note('abstrakt')
518 editors = document.editors()
520 document.edoc.getroot().set('editors', u', '.join(sorted(
521 editor.readable() for editor in editors)))
522 if document.book_info.funders:
523 document.edoc.getroot().set('funders', u', '.join(
524 document.book_info.funders))
525 if document.book_info.thanks:
526 document.edoc.getroot().set('thanks', document.book_info.thanks)
528 output = epub.EpubBook()
529 output.set_identifier(six.text_type(document.book_info.url))
530 output.set_language(functions.lang_code_3to2(document.book_info.language))
531 output.set_title(document.book_info.title)
532 for i, author in enumerate(document.book_info.authors):
535 file_as=six.text_type(author),
536 uid='creator{}'.format(i)
538 for translator in document.book_info.translators:
540 translator.readable(),
541 file_as=six.text_type(translator),
543 uid='translator{}'.format(i)
545 for publisher in document.book_info.publisher:
546 output.add_metadata("DC", "publisher", publisher)
547 output.add_metadata("DC", "date", document.book_info.created_at)
549 output.guide.append({
552 "href": "part1.xhtml"
555 output.add_item(epub.EpubNcx())
559 functions.reg_mathml_epub(output)
562 for i, ilustr in enumerate(document.edoc.findall('//ilustr')):
563 url = six.moves.urllib.parse.urljoin(
567 imgfile = six.moves.urllib.request.urlopen(url)
568 img = Image.open(imgfile)
570 th_format, ext, media_type = {
571 'GIF': ('GIF', 'gif', 'image/gif'),
572 'PNG': ('PNG', 'png', 'image/png'),
573 }.get(img.format, ('JPEG', 'jpg', 'image/jpeg'))
576 if img.size[0] < width:
579 th = img.resize((width, round(width * img.size[1] / img.size[0])))
583 buffer = six.BytesIO()
584 th.save(buffer, format=th_format)
586 file_name = 'image%d.%s' % (i, ext)
587 ilustr.set('src', file_name)
592 media_type=media_type,
593 content=buffer.getvalue()
597 # write static elements
599 with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
602 uid="logo_wolnelektury.png",
603 file_name="logo_wolnelektury.png",
604 media_type="image/png",
608 with open(get_resource('res/jedenprocent.png'), 'rb') as f:
612 file_name="jedenprocent.png",
613 media_type="image/png",
619 style = get_resource('epub/style.css')
620 with open(style, 'rb') as f:
624 file_name="style.css",
625 media_type="text/css",
634 cover_file = six.BytesIO()
635 bound_cover = cover(document.book_info)
636 bound_cover.save(cover_file)
637 cover_name = 'cover.%s' % bound_cover.ext()
640 file_name=cover_name,
641 content=cover_file.getvalue(),
643 spine.append('cover')
644 output.guide.append({
646 "href": "cover.xhtml",
652 if bound_cover.uses_dc_cover:
653 if document.book_info.cover_by:
654 document.edoc.getroot().set('data-cover-by',
655 document.book_info.cover_by)
656 if document.book_info.cover_source:
657 document.edoc.getroot().set('data-cover-source',
658 document.book_info.cover_source)
660 annotations = etree.Element('annotations')
662 toc, chunk_counter, chars, sample = transform_file(
663 document, sample=sample,
664 hyphenate=hyphenate, output_type=output_type,
665 spine=spine, output=output, annotations=annotations
667 output.toc = toc[0][1]
669 # Last modifications in container files and EPUB creation
670 if len(annotations) > 0:
678 replace_by_verse(annotations)
679 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
680 chars = chars.union(used_chars(html_tree.getroot()))
682 item = epub.EpubItem(
684 file_name="annotations.xhtml",
685 media_type="application/xhtml+xml",
686 content=etree.tostring(
687 html_tree, pretty_print=True, xml_declaration=True,
689 doctype='<!DOCTYPE html>'
692 output.add_item(item)
698 "Wesprzyj Wolne Lektury",
702 with open(get_resource('epub/support.xhtml'), 'rb') as f:
703 html_string = f.read()
704 chars.update(used_chars(etree.fromstring(html_string)))
705 item = epub.EpubItem(
707 file_name="support.xhtml",
708 media_type="application/xhtml+xml",
709 content=squeeze_whitespace(html_string)
711 output.add_item(item)
721 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),
722 outputtype=output_type)
723 chars.update(used_chars(html_tree.getroot()))
724 item = epub.EpubItem(
726 file_name="last.xhtml",
727 media_type="application/xhtml+xml",
728 content=squeeze_whitespace(etree.tostring(
729 html_tree, pretty_print=True, xml_declaration=True,
731 doctype='<!DOCTYPE html>'
734 output.add_item(item)
737 if not flags or 'without-fonts' not in flags:
739 tmpdir = mkdtemp('-librarian-epub')
745 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),
747 for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
748 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
749 optimizer_call = ['perl', 'subset.pl', '--chars',
750 ''.join(chars).encode('utf-8'),
751 get_resource('fonts/' + fname),
752 os.path.join(tmpdir, fname)]
753 env = {"PERL_USE_UNSAFE_INC": "1"}
755 print("Running font-optimizer")
756 subprocess.check_call(optimizer_call, env=env)
758 dev_null = open(os.devnull, 'w')
759 subprocess.check_call(optimizer_call, stdout=dev_null,
760 stderr=dev_null, env=env)
761 with open(os.path.join(tmpdir, fname), 'rb') as f:
766 media_type="font/ttf",
774 remove_empty_lists_from_toc(output.toc)
777 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
780 epub.write_epub(output_file.name, output, {'epub3_landmark': False})
781 return OutputFile.from_filename(output_file.name)