1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import print_function, unicode_literals
13 from copy import deepcopy
14 from mimetypes import guess_type
16 from ebooklib import epub
17 from lxml import etree
18 from tempfile import mkdtemp, NamedTemporaryFile
19 from shutil import rmtree
21 from librarian import RDFNS, WLNS, DCNS, OutputFile
22 from librarian.cover import make_cover
24 from librarian import functions, get_resource
26 from librarian.hyphenator import Hyphenator
# Module-level side effect: register the person-name helper from
# librarian.functions so the XSLT sheets used below can call it.
functions.reg_person_name()
def squeeze_whitespace(s):
    """Collapse every run of whitespace in the bytes string *s* to one space."""
    squeezed = re.sub(rb"\s+", b" ", s)
    return squeezed
def set_hyph_language(source_tree):
    """Build a Hyphenator for the document's language.

    Reads the dc:language metadata from *source_tree*, maps the
    three-letter code to a two-letter one, and loads the matching
    dictionary from res/hyph-dictionaries.
    """
    # All dc:language text nodes; only the first is used below.
    bibl_lng = etree.XPath('//dc:language//text()',
                           namespaces={'dc': str(DCNS)})(source_tree)
    # NOTE(review): raises IndexError when the document has no
    # dc:language element - confirm callers guarantee one.
    short_lng = functions.lang_code_3to2(bibl_lng[0])
    return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
def hyphenate_and_fix_conjunctions(source_tree, hyph):
    """Insert soft hyphens into body text and keep single-letter words
    attached to the following word with a non-breaking space.
    """
    # Text nodes of the document body (second child of /utwor).
    texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
        parent = t.getparent()
        # Split into words and individual non-word characters.
        wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
            # U+00AD SOFT HYPHEN marks allowed hyphenation points.
            newt += hyph.inserted(w, u'\u00AD')
        # After a lone single-letter word, replace the following space
        # with U+00A0 so conjunctions never end a line.
        newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
    """ returns node's text and children as a string

    >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
    """
    # Text that precedes the first child ('' when there is none).
    nt = node.text if node.text is not None else ''
        [nt] + [etree.tostring(child, encoding='unicode') for child in node]
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print(etree.tostring(e, encoding='unicode'))
    """
    # Parse the fragment inside a dummy wrapper element so mixed
    # text/element content is accepted.
    p = etree.fromstring('<x>%s</x>' % text)
    """ Find out a node's name

    >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
    """
    # Work on a copy so the caller's tree is not modified.
    tempnode = deepcopy(node)

    # Remove footnote (pe/pa/pt/pr) and theme (motyw) markers first.
    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
    # Flatten all remaining markup, leaving bare text.
    etree.strip_tags(tempnode, '*')
def xslt(xml, sheet, **kwargs):
    """Apply the XSLT stylesheet file *sheet* to *xml*.

    *xml* may be an Element or an ElementTree; extra keyword arguments
    are passed through to the transform as XSLT string parameters.
    """
    if isinstance(xml, etree._Element):
        # XSLT wants a document, not a bare element.
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        transform = etree.XSLT(etree.parse(xsltf))
            # Quote each kwarg with strparam so it is a safe XSLT
            # string parameter.
            (key, transform.strparam(value))
            for key, value in kwargs.items()
        return transform(xml, **params)
def replace_characters(node):
    """Recursively apply typographic character substitutions to a subtree."""
    def replace_chars(text):
        # Strip BOM; then: --- -> em dash, -- -> en dash,
        # ,, -> low-9 opening quote, " -> right double quote,
        # ' -> right single quote (apostrophe).
        return text.replace(u"\ufeff", u"")\
            .replace("---", u"\u2014")\
            .replace("--", u"\u2013")\
            .replace(",,", u"\u201E")\
            .replace('"', u"\u201D")\
            .replace("'", u"\u2019")

    # Editorial notes keep their raw text.
    if node.tag in ('uwaga', 'extra'):
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """Collect footnote elements (pe/pa/pt/pr) from *source* into *annotations*."""
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            # Footnotes are numbered sequentially across the whole book.
            number = str(len(annotations) + 1)
            annotation.set('number', number)
            # Remember which chunk the footnote came from, for back-links.
            annotation.set('part', str(part_no))
            annotations.append(annotation)
        # Do not descend into editorial nodes.
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring(
    ...     "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"
    >>> Stanza(s).versify()
    >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())
    <wers_normalny>a <b>c</b><b>c</b></wers_normalny>
    <wers_normalny>b<x>x/
    y</x>c</wers_normalny>
    <wers_normalny>d</wers_normalny>
    """

    def __init__(self, stanza_elem):
        # The <strofa> element being rewritten, and the verse currently
        # accepting content (None until the first verse is opened).
        self.stanza = stanza_elem
        self.open_verse = None

        # Walk the stanza's text and children in document order,
        # splitting text on '/' line endings, then replace the stanza's
        # content with the collected non-empty verses.
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_text(elem.tail)
        tail = self.stanza.tail
        self.stanza.tail = tail
            verse for verse in self.verses
            if verse.text or len(verse) > 0

    def open_normal_verse(self):
        # Start a fresh <wers_normalny> and make it the open verse.
        self.open_verse = self.stanza.makeelement("wers_normalny")
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        # Return the verse currently accepting content, opening a
        # normal one if none is open yet.
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        # Split on '/'-plus-newline verse breaks; each break after the
        # first fragment opens a new verse. Blank fragments are skipped.
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            self.open_normal_verse()
            if not verse_text.strip():
            verse = self.get_open_verse()
                # Append after the verse's last child element ...
                verse[-1].tail = (verse[-1].tail or "") + verse_text
                # ... or to the verse's own leading text.
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        if elem.tag.startswith("wers"):
            # An explicit verse element becomes the open verse itself.
            verse = deepcopy(elem)
            self.verses.append(verse)
            self.open_verse = verse
            # Any other element is copied into the open verse.
            appended = deepcopy(elem)
            self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character """

    for stanza_node in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza_node).versify()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns the set of characters appearing in the element's text and
    tail, and - recursively - in all of its descendants.
    """
    # Characters from this element's own text and trailing tail text.
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars = chars.union(used_chars(child))
    # Without this return the function yields None, and the recursive
    # chars.union(used_chars(child)) above would raise TypeError.
    return chars
    """ divide main content of the XML file into chunks """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    last_node_part = False

    # The loop below is a workaround for a problem with epubs
    # in drama ebooks without acts: detect whether the text contains
    # scene headers and/or act headers before chunking.
    for one_part in main_text:
        if name == 'naglowek_scena':
        elif name == 'naglowek_akt':

    for one_part in main_text:
        if is_act is False and is_scene is True:
            # Drama without acts: scene headers start new chunks.
            if name == 'naglowek_czesc':
                last_node_part = True
                main_xml_part[:] = [deepcopy(one_part)]
            elif not last_node_part and name == "naglowek_scena":
                main_xml_part[:] = [deepcopy(one_part)]
                main_xml_part.append(deepcopy(one_part))
                last_node_part = False
            # Regular text: part/chapter/act headers and section titles
            # start new chunks.
            if name == 'naglowek_czesc':
                last_node_part = True
                main_xml_part[:] = [deepcopy(one_part)]
            elif (not last_node_part
                  "naglowek_rozdzial", "naglowek_akt", "srodtytul"
                main_xml_part[:] = [deepcopy(one_part)]
                main_xml_part.append(deepcopy(one_part))
                last_node_part = False
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,
                    _empty_html_static=[]):
    """
    Transforms one chunk, returns a HTML string, a TOC object
    and a set of used characters.

    NOTE: _empty_html_static is an intentional mutable default, used as
    a one-element cache for the empty-chunk template - do not pass it.
    """
    # Build TOC entries from the header elements present in this chunk.
    for element in chunk_xml[0]:
        if element.tag == "naglowek_czesc":
                "part%d.xhtml#book-text" % chunk_no,
                "part%d-text" % chunk_no
        elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
                "part%d.xhtml" % chunk_no,
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
                # Sub-headers become nested TOC entries anchored at
                # #sub<N> within the chunk page.
                "part%d.xhtml" % chunk_no,
            subnumber = len(toc[-1][1])
                "part%d.xhtml#sub%d" % (chunk_no, subnumber),
                "part%d-sub%d" % (chunk_no, subnumber)
            element.set('sub', six.text_type(subnumber))
        if not _empty_html_static:
            # Lazily read the empty-chunk template once per process.
            with open(get_resource('epub/emptyChunk.xhtml')) as f:
                _empty_html_static.append(f.read())
        output_html = _empty_html_static[0]
        # Collect footnotes, split stanzas into verses, then render the
        # chunk to XHTML via the main stylesheet.
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            doctype='<!DOCTYPE html>'
    return output_html, toc, chars
def remove_empty_lists_from_toc(toc):
    """Recursively walk a nested TOC list.

    Tuple entries appear to be (link, children) pairs; the child list
    is processed recursively. (The pruning of empty child lists that
    the name implies is handled alongside this recursion.)
    """
    for i, e in enumerate(toc):
        if isinstance(e, tuple):
            remove_empty_lists_from_toc(e[1])
def transform(wldoc, verbose=False, style=None,
              sample=None, cover=None, flags=None, hyphenate=False,
              ilustr_path='', output_type='epub'):
    """ produces a EPUB file

    sample=n: generate sample e-book (with at least n paragraphs)
    cover: a cover.Cover factory or True for default
    flags: less-advertising, without-fonts, working-copy
    """

    def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children """
        # Typographic clean-up and optional hyphenation happen before
        # any XSLT transform is applied.
        replace_characters(wldoc.edoc.getroot())
        hyphenator = set_hyph_language(
        ) if hyphenate else None
        hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)

        # every input file will have a TOC entry,
        # pointing to its starting chunk
            "part%d.xhtml" % chunk_counter,
            wldoc.book_info.title,
            # NOTE(review): "path%d-start" is inconsistent with the
            # "part%d"-style identifiers used everywhere else in this
            # file - looks like a typo for "part%d-start"; confirm.
            "path%d-start" % chunk_counter

            # write book title page
            html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
                             outputtype=output_type)
            chars = used_chars(html_tree.getroot())
            html_string = etree.tostring(
                html_tree, pretty_print=True, xml_declaration=True,
                doctype='<!DOCTYPE html>'
            item = epub.EpubItem(
                file_name="title.xhtml",
                media_type="application/xhtml+xml",
                content=squeeze_whitespace(html_string)
            output.add_item(item)
            # add a title page TOC entry
            item = epub.EpubNav()
            output.add_item(item)
        elif wldoc.book_info.parts:
            # write title page for every parent
            if sample is not None and sample <= 0:
                # Sample budget exhausted: emit the empty chunk instead.
                    get_resource('epub/emptyChunk.xhtml')).read()
                html_tree = xslt(wldoc.edoc,
                                 get_resource('epub/xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(
                    html_tree, pretty_print=True, xml_declaration=True,
                    doctype='<!DOCTYPE html>'
            item = epub.EpubItem(
                uid="part%d" % chunk_counter,
                file_name="part%d.xhtml" % chunk_counter,
                media_type="application/xhtml+xml",
                content=squeeze_whitespace(html_string)
            output.add_item(item)

        # Locate the main text (master) element; the RDF metadata block
        # may come either before it or inside it.
        if len(wldoc.edoc.getroot()) > 1:
            # rdf before style master
            main_text = wldoc.edoc.getroot()[1]
            # rdf in style master
            main_text = wldoc.edoc.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
        if main_text is not None:
            for chunk_xml in chop(main_text):
                if sample is not None:
                        # Count stanzas/paragraphs against the
                        # remaining sample budget.
                        sample -= len(chunk_xml.xpath(
                            '//strofa|//akap|//akap_cd|//akap_dialog'
                chunk_html, chunk_toc, chunk_chars = transform_chunk(
                    chunk_xml, chunk_counter, annotations, empty)
                toc[-1][1].extend(chunk_toc)
                chars = chars.union(chunk_chars)
                item = epub.EpubItem(
                    uid="part%d" % chunk_counter,
                    file_name="part%d.xhtml" % chunk_counter,
                    media_type="application/xhtml+xml",
                    content=squeeze_whitespace(chunk_html)
                output.add_item(item)

        # Recurse into child documents, threading the chunk counter,
        # TOC, used characters and remaining sample budget through.
        for child in wldoc.parts():
            child_toc, chunk_counter, chunk_chars, sample = transform_file(
                child, chunk_counter, first=False, sample=sample)
            toc[-1][1].extend(child_toc)
            chars = chars.union(chunk_chars)
        return toc, chunk_counter, chars, sample

    # Work on a copy so the caller's document stays untouched.
    document = deepcopy(wldoc)
        document.edoc.getroot().set(flag, 'yes')
    document.clean_ed_note()
    document.clean_ed_note('abstrakt')

    # Expose editors / funders / thanks as root attributes so the XSLT
    # sheets can render them.
    editors = document.editors()
        document.edoc.getroot().set('editors', u', '.join(sorted(
            editor.readable() for editor in editors)))
    if document.book_info.funders:
        document.edoc.getroot().set('funders', u', '.join(
            document.book_info.funders))
    if document.book_info.thanks:
        document.edoc.getroot().set('thanks', document.book_info.thanks)

    # Dublin Core metadata for the EPUB package.
    output = epub.EpubBook()
    output.set_identifier(six.text_type(document.book_info.url))
    output.set_language(functions.lang_code_3to2(document.book_info.language))
    output.set_title(document.book_info.title)
    for author in document.book_info.authors:
            file_as=six.text_type(author)
    for translator in document.book_info.translators:
            translator.readable(),
            file_as=six.text_type(translator),
    for publisher in document.book_info.publisher:
        output.add_metadata("DC", "publisher", publisher)
    output.add_metadata("DC", "date", document.book_info.created_at)

    output.guide.append({
        "href": "part1.xhtml"
    output.add_item(epub.EpubNcx())
        functions.reg_mathml_epub(output)

    # Attach illustration files referenced by <ilustr> elements.
    if os.path.isdir(ilustr_path):
        ilustr_elements = set(ilustr.get('src')
                              for ilustr in document.edoc.findall('//ilustr'))
        for i, filename in enumerate(os.listdir(ilustr_path)):
            if filename not in ilustr_elements:
            file_path = os.path.join(ilustr_path, filename)
            with open(file_path, 'rb') as f:
                media_type=guess_type(file_path)[0],

    # write static elements
    with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
        uid="logo_wolnelektury.png",
        file_name="logo_wolnelektury.png",
        media_type="image/png",
    with open(get_resource('res/jedenprocent.png'), 'rb') as f:
        file_name="jedenprocent.png",
        media_type="image/png",

    # Stylesheet: explicit *style* argument or the bundled default.
        style = get_resource('epub/style.css')
    with open(style, 'rb') as f:
        file_name="style.css",
        media_type="text/css",

    # Optional cover page generated by the cover factory.
        cover_file = six.BytesIO()
        bound_cover = cover(document.book_info)
        bound_cover.save(cover_file)
        cover_name = 'cover.%s' % bound_cover.ext()
        file_name=cover_name,
        content=cover_file.getvalue(),
        spine.append('cover')
        output.guide.append({
            "href": "cover.xhtml",
        if bound_cover.uses_dc_cover:
            # Propagate cover credits into the document so the closing
            # pages can render them.
            if document.book_info.cover_by:
                document.edoc.getroot().set('data-cover-by',
                                            document.book_info.cover_by)
            if document.book_info.cover_source:
                document.edoc.getroot().set('data-cover-source',
                                            document.book_info.cover_source)

    # Main content pass; *annotations* is filled as a side effect of
    # transform_chunk -> find_annotations.
    annotations = etree.Element('annotations')
    toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
    output.toc = toc[0][1]

    # Last modifications in container files and EPUB creation
    if len(annotations) > 0:
        # Render the collected footnotes into their own chapter.
        replace_by_verse(annotations)
        html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
        chars = chars.union(used_chars(html_tree.getroot()))
        item = epub.EpubItem(
            file_name="annotations.xhtml",
            media_type="application/xhtml+xml",
            content=etree.tostring(
                html_tree, pretty_print=True, xml_declaration=True,
                doctype='<!DOCTYPE html>'
        output.add_item(item)

        "Wesprzyj Wolne Lektury",
    with open(get_resource('epub/support.xhtml'), 'rb') as f:
        html_string = f.read()
    chars.update(used_chars(etree.fromstring(html_string)))
    item = epub.EpubItem(
        file_name="support.xhtml",
        media_type="application/xhtml+xml",
        content=squeeze_whitespace(html_string)
    output.add_item(item)

    # Closing page (editorial information).
    html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),
                     outputtype=output_type)
    chars.update(used_chars(html_tree.getroot()))
    item = epub.EpubItem(
        file_name="last.xhtml",
        media_type="application/xhtml+xml",
        content=squeeze_whitespace(etree.tostring(
            html_tree, pretty_print=True, xml_declaration=True,
            doctype='<!DOCTYPE html>'
    output.add_item(item)

    # Subset the DejaVu fonts to only the characters actually used,
    # via the bundled perl font-optimizer.
    if not flags or 'without-fonts' not in flags:
        tmpdir = mkdtemp('-librarian-epub')
        # NOTE(review): os.chdir changes process-global state; the code
        # visible here does not restore the working directory - confirm
        # it is restored elsewhere.
        os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),
        for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
                      'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
            optimizer_call = ['perl', 'subset.pl', '--chars',
                              ''.join(chars).encode('utf-8'),
                              get_resource('fonts/' + fname),
                              os.path.join(tmpdir, fname)]
            # subset.pl needs @INC to include '.' on modern perl.
            env = {"PERL_USE_UNSAFE_INC": "1"}
                print("Running font-optimizer")
                subprocess.check_call(optimizer_call, env=env)
                dev_null = open(os.devnull, 'w')
                subprocess.check_call(optimizer_call, stdout=dev_null,
                                      stderr=dev_null, env=env)
            with open(os.path.join(tmpdir, fname), 'rb') as f:
                media_type="font/ttf",

    remove_empty_lists_from_toc(output.toc)

    # Write to a named temporary file and hand it back as an OutputFile.
    output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
    epub.write_epub(output_file.name, output, {'epub3_landmark': False})
    return OutputFile.from_filename(output_file.name)