1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import print_function, unicode_literals
13 from copy import deepcopy
14 from mimetypes import guess_type
16 from ebooklib import epub
17 from lxml import etree
18 from tempfile import mkdtemp, NamedTemporaryFile
19 from shutil import rmtree
21 from librarian import RDFNS, WLNS, DCNS, OutputFile
22 from librarian.cover import make_cover
24 from librarian import functions, get_resource
26 from librarian.hyphenator import Hyphenator
28 functions.reg_person_name()
def squeeze_whitespace(s):
    """Collapse every run of whitespace in *s* to a single space.

    Accepts either ``bytes`` or ``str`` and returns the same type.
    Chunk content passed here is usually ``bytes`` (``etree.tostring``),
    but the empty-chunk template is read in text mode and arrives as
    ``str``; a bytes-only pattern would raise TypeError for it.
    """
    if isinstance(s, str):
        return re.sub(r'\s+', ' ', s)
    return re.sub(b'\\s+', b' ', s)
35 def set_hyph_language(source_tree):
36 bibl_lng = etree.XPath('//dc:language//text()',
37 namespaces={'dc': str(DCNS)})(source_tree)
38 short_lng = functions.lang_code_3to2(bibl_lng[0])
40 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
46 def hyphenate_and_fix_conjunctions(source_tree, hyph):
47 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
49 parent = t.getparent()
52 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
54 newt += hyph.inserted(w, u'\u00AD')
57 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
65 """ returns node's text and children as a string
67 >>> print(inner_xml(etree.fromstring('<a>x<b>y</b>z</a>')))
71 nt = node.text if node.text is not None else ''
73 [nt] + [etree.tostring(child, encoding='unicode') for child in node]
77 def set_inner_xml(node, text):
78 """ sets node's text and children from a string
80 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
81 >>> set_inner_xml(e, 'x<b>y</b>z')
82 >>> print(etree.tostring(e, encoding='unicode'))
86 p = etree.fromstring('<x>%s</x>' % text)
92 """ Find out a node's name
94 >>> print(node_name(etree.fromstring('<a>X<b>Y</b>Z</a>')))
98 tempnode = deepcopy(node)
100 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
101 for e in tempnode.findall('.//%s' % p):
105 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet, **kwargs):
    """Apply the XSLT stylesheet at path *sheet* to *xml*.

    *xml* may be an Element or an ElementTree; extra keyword arguments
    are passed to the stylesheet as string parameters.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        transform = etree.XSLT(etree.parse(xsltf))
        # XSLT string parameters must be escaped via strparam().
        params = {
            key: transform.strparam(value)
            for key, value in kwargs.items()
        }
        return transform(xml, **params)
121 def replace_characters(node):
122 def replace_chars(text):
125 return text.replace(u"\ufeff", u"")\
126 .replace("---", u"\u2014")\
127 .replace("--", u"\u2013")\
128 .replace(",,", u"\u201E")\
129 .replace('"', u"\u201D")\
130 .replace("'", u"\u2019")
131 if node.tag in ('uwaga', 'extra'):
135 node.text = replace_chars(node.text)
136 node.tail = replace_chars(node.tail)
138 replace_characters(child)
141 def find_annotations(annotations, source, part_no):
143 if child.tag in ('pe', 'pa', 'pt', 'pr'):
144 annotation = deepcopy(child)
145 number = str(len(annotations) + 1)
146 annotation.set('number', number)
147 annotation.set('part', str(part_no))
149 annotations.append(annotation)
154 if child.tag not in ('extra', 'uwaga'):
155 find_annotations(annotations, child, part_no)
158 class Stanza(object):
160 Converts / verse endings into verse elements in a stanza.
162 Slashes may only occur directly in the stanza. Any slashes in subelements
163 will be ignored, and the subelements will be put inside verse elements.
165 >>> s = etree.fromstring(
166 ... "<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>"
168 >>> Stanza(s).versify()
169 >>> print(etree.tostring(s, encoding='unicode', pretty_print=True).strip())
171 <wers_normalny>a <b>c</b><b>c</b></wers_normalny>
172 <wers_normalny>b<x>x/
173 y</x>c</wers_normalny>
174 <wers_normalny>d</wers_normalny>
178 def __init__(self, stanza_elem):
179 self.stanza = stanza_elem
181 self.open_verse = None
184 self.push_text(self.stanza.text)
185 for elem in self.stanza:
187 self.push_text(elem.tail)
188 tail = self.stanza.tail
190 self.stanza.tail = tail
192 verse for verse in self.verses
193 if verse.text or len(verse) > 0
196 def open_normal_verse(self):
197 self.open_verse = self.stanza.makeelement("wers_normalny")
198 self.verses.append(self.open_verse)
200 def get_open_verse(self):
201 if self.open_verse is None:
202 self.open_normal_verse()
203 return self.open_verse
205 def push_text(self, text):
208 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
210 self.open_normal_verse()
211 if not verse_text.strip():
213 verse = self.get_open_verse()
215 verse[-1].tail = (verse[-1].tail or "") + verse_text
217 verse.text = (verse.text or "") + verse_text
219 def push_elem(self, elem):
220 if elem.tag.startswith("wers"):
221 verse = deepcopy(elem)
223 self.verses.append(verse)
224 self.open_verse = verse
226 appended = deepcopy(elem)
228 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character """
    for stanza in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza).versify()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns the set of characters appearing in the element's text,
    its tail, and (recursively) all of its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars = chars.union(used_chars(child))
    # Without this return the recursive union above would receive None
    # and raise TypeError.
    return chars
248 """ divide main content of the XML file into chunks """
250 # prepare a container for each chunk
251 part_xml = etree.Element('utwor')
252 etree.SubElement(part_xml, 'master')
253 main_xml_part = part_xml[0] # master
255 last_node_part = False
257 # The below loop are workaround for a problem with epubs
258 # in drama ebooks without acts.
261 for one_part in main_text:
263 if name == 'naglowek_scena':
265 elif name == 'naglowek_akt':
268 for one_part in main_text:
270 if is_act is False and is_scene is True:
271 if name == 'naglowek_czesc':
273 last_node_part = True
274 main_xml_part[:] = [deepcopy(one_part)]
275 elif not last_node_part and name == "naglowek_scena":
277 main_xml_part[:] = [deepcopy(one_part)]
279 main_xml_part.append(deepcopy(one_part))
280 last_node_part = False
282 if name == 'naglowek_czesc':
284 last_node_part = True
285 main_xml_part[:] = [deepcopy(one_part)]
286 elif (not last_node_part
288 "naglowek_rozdzial", "naglowek_akt", "srodtytul"
291 main_xml_part[:] = [deepcopy(one_part)]
293 main_xml_part.append(deepcopy(one_part))
294 last_node_part = False
298 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False,
299 _empty_html_static=[]):
301 Transforms one chunk, returns a HTML string, a TOC object
302 and a set of used characters.
306 for element in chunk_xml[0]:
307 if element.tag == "naglowek_czesc":
311 "part%d.xhtml#book-text" % chunk_no,
313 "part%d-text" % chunk_no
318 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
322 "part%d.xhtml" % chunk_no,
329 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
330 subnumber = len(toc[-1][1])
333 "part%d.xhtml#sub%d" % (chunk_no, subnumber),
335 "part%d-sub%d" % (chunk_no, subnumber)
338 element.set('sub', six.text_type(subnumber))
340 if not _empty_html_static:
341 with open(get_resource('epub/emptyChunk.xhtml')) as f:
342 _empty_html_static.append(f.read())
344 output_html = _empty_html_static[0]
346 find_annotations(annotations, chunk_xml, chunk_no)
347 replace_by_verse(chunk_xml)
348 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
349 chars = used_chars(html_tree.getroot())
350 output_html = etree.tostring(
351 html_tree, pretty_print=True, xml_declaration=True,
353 doctype='<!DOCTYPE html>'
355 return output_html, toc, chars
358 def remove_empty_lists_from_toc(toc):
359 for i, e in enumerate(toc):
360 if isinstance(e, tuple):
362 remove_empty_lists_from_toc(e[1])
367 def transform(wldoc, verbose=False, style=None,
368 sample=None, cover=None, flags=None, hyphenate=False,
369 ilustr_path='', output_type='epub'):
370 """ produces a EPUB file
372 sample=n: generate sample e-book (with at least n paragraphs)
373 cover: a cover.Cover factory or True for default
374 flags: less-advertising, without-fonts, working-copy
377 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
378 """ processes one input file and proceeds to its children """
380 replace_characters(wldoc.edoc.getroot())
382 hyphenator = set_hyph_language(
384 ) if hyphenate else None
385 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
387 # every input file will have a TOC entry,
388 # pointing to starting chunk
392 "part%d.xhtml" % chunk_counter,
393 wldoc.book_info.title,
394 "path%d-start" % chunk_counter
401 # write book title page
402 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'),
403 outputtype=output_type)
404 chars = used_chars(html_tree.getroot())
405 html_string = etree.tostring(
406 html_tree, pretty_print=True, xml_declaration=True,
408 doctype='<!DOCTYPE html>'
410 item = epub.EpubItem(
412 file_name="title.xhtml",
413 media_type="application/xhtml+xml",
414 content=squeeze_whitespace(html_string)
417 output.add_item(item)
418 # add a title page TOC entry
427 item = epub.EpubNav()
435 output.add_item(item)
438 elif wldoc.book_info.parts:
439 # write title page for every parent
440 if sample is not None and sample <= 0:
443 get_resource('epub/emptyChunk.xhtml')).read()
445 html_tree = xslt(wldoc.edoc,
446 get_resource('epub/xsltChunkTitle.xsl'))
447 chars = used_chars(html_tree.getroot())
448 html_string = etree.tostring(
449 html_tree, pretty_print=True, xml_declaration=True,
451 doctype='<!DOCTYPE html>'
453 item = epub.EpubItem(
454 uid="part%d" % chunk_counter,
455 file_name="part%d.xhtml" % chunk_counter,
456 media_type="application/xhtml+xml",
457 content=squeeze_whitespace(html_string)
459 output.add_item(item)
464 if len(wldoc.edoc.getroot()) > 1:
465 # rdf before style master
466 main_text = wldoc.edoc.getroot()[1]
468 # rdf in style master
469 main_text = wldoc.edoc.getroot()[0]
470 if main_text.tag == RDFNS('RDF'):
473 if main_text is not None:
474 for chunk_xml in chop(main_text):
476 if sample is not None:
480 sample -= len(chunk_xml.xpath(
481 '//strofa|//akap|//akap_cd|//akap_dialog'
483 chunk_html, chunk_toc, chunk_chars = transform_chunk(
484 chunk_xml, chunk_counter, annotations, empty)
486 toc[-1][1].extend(chunk_toc)
487 chars = chars.union(chunk_chars)
488 item = epub.EpubItem(
489 uid="part%d" % chunk_counter,
490 file_name="part%d.xhtml" % chunk_counter,
491 media_type="application/xhtml+xml",
492 content=squeeze_whitespace(chunk_html)
494 output.add_item(item)
498 for child in wldoc.parts():
499 child_toc, chunk_counter, chunk_chars, sample = transform_file(
500 child, chunk_counter, first=False, sample=sample)
501 toc[-1][1].extend(child_toc)
502 chars = chars.union(chunk_chars)
504 return toc, chunk_counter, chars, sample
506 document = deepcopy(wldoc)
511 document.edoc.getroot().set(flag, 'yes')
513 document.clean_ed_note()
514 document.clean_ed_note('abstrakt')
517 editors = document.editors()
519 document.edoc.getroot().set('editors', u', '.join(sorted(
520 editor.readable() for editor in editors)))
521 if document.book_info.funders:
522 document.edoc.getroot().set('funders', u', '.join(
523 document.book_info.funders))
524 if document.book_info.thanks:
525 document.edoc.getroot().set('thanks', document.book_info.thanks)
527 output = epub.EpubBook()
528 output.set_identifier(six.text_type(document.book_info.url))
529 output.set_language(functions.lang_code_3to2(document.book_info.language))
530 output.set_title(document.book_info.title)
531 for author in document.book_info.authors:
534 file_as=six.text_type(author)
536 for translator in document.book_info.translators:
538 translator.readable(),
539 file_as=six.text_type(translator),
542 for publisher in document.book_info.publisher:
543 output.add_metadata("DC", "publisher", publisher)
544 output.add_metadata("DC", "date", document.book_info.created_at)
546 output.guide.append({
549 "href": "part1.xhtml"
552 output.add_item(epub.EpubNcx())
556 functions.reg_mathml_epub(zip)
558 if os.path.isdir(ilustr_path):
559 ilustr_elements = set(ilustr.get('src')
560 for ilustr in document.edoc.findall('//ilustr'))
561 for i, filename in enumerate(os.listdir(ilustr_path)):
562 if filename not in ilustr_elements:
564 file_path = os.path.join(ilustr_path, filename)
565 with open(file_path, 'rb') as f:
570 media_type=guess_type(file_path)[0],
575 # write static elements
577 with open(get_resource('res/wl-logo-small.png'), 'rb') as f:
580 uid="logo_wolnelektury.png",
581 file_name="logo_wolnelektury.png",
582 media_type="image/png",
586 with open(get_resource('res/jedenprocent.png'), 'rb') as f:
590 file_name="jedenprocent.png",
591 media_type="image/png",
597 style = get_resource('epub/style.css')
598 with open(style, 'rb') as f:
602 file_name="style.css",
603 media_type="text/css",
612 cover_file = six.BytesIO()
613 bound_cover = cover(document.book_info)
614 bound_cover.save(cover_file)
615 cover_name = 'cover.%s' % bound_cover.ext()
618 file_name=cover_name,
619 content=cover_file.getvalue(),
621 spine.append('cover')
622 output.guide.append({
624 "href": "cover.xhtml",
630 if bound_cover.uses_dc_cover:
631 if document.book_info.cover_by:
632 document.edoc.getroot().set('data-cover-by',
633 document.book_info.cover_by)
634 if document.book_info.cover_source:
635 document.edoc.getroot().set('data-cover-source',
636 document.book_info.cover_source)
638 annotations = etree.Element('annotations')
640 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
641 output.toc = toc[0][1]
652 # Last modifications in container files and EPUB creation
653 if len(annotations) > 0:
661 replace_by_verse(annotations)
662 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
663 chars = chars.union(used_chars(html_tree.getroot()))
665 item = epub.EpubItem(
667 file_name="annotations.xhtml",
668 media_type="application/xhtml+xml",
669 content=etree.tostring(
670 html_tree, pretty_print=True, xml_declaration=True,
672 doctype='<!DOCTYPE html>'
675 output.add_item(item)
681 "Wesprzyj Wolne Lektury",
685 with open(get_resource('epub/support.xhtml'), 'rb') as f:
686 html_string = f.read()
687 chars.update(used_chars(etree.fromstring(html_string)))
688 item = epub.EpubItem(
690 file_name="support.xhtml",
691 media_type="application/xhtml+xml",
692 content=squeeze_whitespace(html_string)
694 output.add_item(item)
704 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'),
705 outputtype=output_type)
706 chars.update(used_chars(html_tree.getroot()))
707 item = epub.EpubItem(
709 file_name="last.xhtml",
710 media_type="application/xhtml+xml",
711 content=squeeze_whitespace(etree.tostring(
712 html_tree, pretty_print=True, xml_declaration=True,
714 doctype='<!DOCTYPE html>'
717 output.add_item(item)
720 if not flags or 'without-fonts' not in flags:
722 tmpdir = mkdtemp('-librarian-epub')
728 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)),
730 for fname in ('DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf',
731 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf'):
732 optimizer_call = ['perl', 'subset.pl', '--chars',
733 ''.join(chars).encode('utf-8'),
734 get_resource('fonts/' + fname),
735 os.path.join(tmpdir, fname)]
736 env = {"PERL_USE_UNSAFE_INC": "1"}
738 print("Running font-optimizer")
739 subprocess.check_call(optimizer_call, env=env)
741 dev_null = open(os.devnull, 'w')
742 subprocess.check_call(optimizer_call, stdout=dev_null,
743 stderr=dev_null, env=env)
744 with open(os.path.join(tmpdir, fname), 'rb') as f:
749 media_type="font/ttf",
757 remove_empty_lists_from_toc(output.toc)
759 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub',
762 epub.write_epub(output_file.name, output, {'epub3_landmark': False})
763 return OutputFile.from_filename(output_file.name)