1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from mimetypes import guess_type
16 from lxml import etree
18 from tempfile import mkdtemp, NamedTemporaryFile
19 from shutil import rmtree
21 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, DCNS, OutputFile
22 from librarian.cover import make_cover
24 from librarian import functions, get_resource
26 from librarian.hyphenator import Hyphenator
28 functions.reg_person_name()
29 functions.reg_lang_code_3to2()
32 def set_hyph_language(source_tree):
33 def get_short_lng_code(text):
36 with open(get_resource('res/ISO-639-2_8859-1.txt'), 'rb') as f:
38 list = line.strip().split('|')
45 bibl_lng = etree.XPath('//dc:language//text()',
46 namespaces={'dc': str(DCNS)})(source_tree)
47 short_lng = get_short_lng_code(bibl_lng[0])
49 return Hyphenator(get_resource('res/hyph-dictionaries/hyph_' +
55 def hyphenate_and_fix_conjunctions(source_tree, hyph):
56 texts = etree.XPath('/utwor/*[2]//text()')(source_tree)
58 parent = t.getparent()
61 wlist = re.compile(r'\w+|[^\w]', re.UNICODE).findall(t)
63 newt += hyph.inserted(w, u'\u00AD')
66 newt = re.sub(r'(?<=\s\w)\s+', u'\u00A0', newt)
74 """ returns node's text and children as a string
76 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
80 nt = node.text if node.text is not None else ''
81 return ''.join([nt] + [etree.tostring(child) for child in node])
84 def set_inner_xml(node, text):
85 """ sets node's text and children from a string
87 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
88 >>> set_inner_xml(e, 'x<b>y</b>z')
89 >>> print etree.tostring(e)
93 p = etree.fromstring('<x>%s</x>' % text)
99 """ Find out a node's name
101 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
105 tempnode = deepcopy(node)
107 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
108 for e in tempnode.findall('.//%s' % p):
112 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet, **kwargs):
    """Apply the XSLT stylesheet stored at path ``sheet`` to ``xml``.

    ``xml`` may be an element or a whole tree; a bare element is wrapped in
    an ``ElementTree`` before the transform runs.  Each keyword argument is
    converted with ``strparam`` and handed to the stylesheet as a parameter.

    Returns the result of calling the compiled transform.
    """
    document = etree.ElementTree(xml) if isinstance(xml, etree._Element) else xml
    with open(sheet) as stylesheet_file:
        apply_sheet = etree.XSLT(etree.parse(stylesheet_file))
        string_params = {}
        for param_name, param_value in kwargs.iteritems():
            string_params[param_name] = apply_sheet.strparam(param_value)
        return apply_sheet(document, **string_params)
125 def replace_characters(node):
126 def replace_chars(text):
129 return text.replace(u"\ufeff", u"")\
130 .replace("---", u"\u2014")\
131 .replace("--", u"\u2013")\
132 .replace(",,", u"\u201E")\
133 .replace('"', u"\u201D")\
134 .replace("'", u"\u2019")
135 if node.tag in ('uwaga', 'extra'):
139 node.text = replace_chars(node.text)
140 node.tail = replace_chars(node.tail)
142 replace_characters(child)
145 def find_annotations(annotations, source, part_no):
147 if child.tag in ('pe', 'pa', 'pt', 'pr'):
148 annotation = deepcopy(child)
149 number = str(len(annotations) + 1)
150 annotation.set('number', number)
151 annotation.set('part', str(part_no))
153 annotations.append(annotation)
158 if child.tag not in ('extra', 'uwaga'):
159 find_annotations(annotations, child, part_no)
162 class Stanza(object):
164 Converts / verse endings into verse elements in a stanza.
166 Slashes may only occur directly in the stanza. Any slashes in subelements
167 will be ignored, and the subelements will be put inside verse elements.
169 >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
170 >>> Stanza(s).versify()
171 >>> print etree.tostring(s)
172 <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
173 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
176 def __init__(self, stanza_elem):
177 self.stanza = stanza_elem
179 self.open_verse = None
182 self.push_text(self.stanza.text)
183 for elem in self.stanza:
185 self.push_text(elem.tail)
186 tail = self.stanza.tail
188 self.stanza.tail = tail
189 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """Start a new ``wers_normalny`` verse and make it the open one.

    The element is created through the stanza's ``makeelement`` and is
    recorded at the end of ``self.verses``.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """Return the verse currently being filled.

    If no verse is open yet, a normal verse is opened first and returned.
    """
    if self.open_verse is not None:
        return self.open_verse
    self.open_normal_verse()
    return self.open_verse
200 def push_text(self, text):
203 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
204 if not verse_text.strip():
207 self.open_normal_verse()
208 verse = self.get_open_verse()
210 verse[-1].tail = (verse[-1].tail or "") + verse_text
212 verse.text = (verse.text or "") + verse_text
214 def push_elem(self, elem):
215 if elem.tag.startswith("wers"):
216 verse = deepcopy(elem)
218 self.verses.append(verse)
219 self.open_verse = verse
221 appended = deepcopy(elem)
223 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """Versify every stanza found below ``tree``.

    Each element matching ``'.//' + WLNS('strofa')`` is wrapped in a
    ``Stanza`` and versified, which turns '/' verse endings into verse
    elements.
    """
    for strofa in tree.findall('.//' + WLNS('strofa')):
        Stanza(strofa).versify()
def add_to_manifest(manifest, partno):
    """Add an item for chunk ``partno`` to the manifest of content.opf.

    The item gets the id ``part<partno>``, the href ``part<partno>.html``
    and the XHTML media type, and is appended as the last child of
    ``manifest``.
    """
    partstr = 'part%d' % partno
    item = manifest.makeelement(
        OPFNS('item'),
        attrib={
            'id': partstr,
            'href': partstr + '.html',
            'media-type': 'application/xhtml+xml',
        }
    )
    # makeelement() only creates the node; it has to be attached explicitly.
    manifest.append(item)
def add_to_spine(spine, partno):
    """Add an itemref for chunk ``partno`` to the spine of content.opf.

    The itemref points at the id ``part<partno>`` and is appended as the
    last child of ``spine``.
    """
    itemref = spine.makeelement(
        OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    # makeelement() only creates the node; it has to be attached explicitly.
    spine.append(itemref)
253 def __init__(self, name=None, part_href=None):
256 self.part_href = part_href
257 self.sub_number = None
259 def add(self, name, part_href, level=0, is_part=True, index=None):
260 assert level == 0 or index is None
261 if level > 0 and self.children:
262 return self.children[-1].add(name, part_href, level - 1, is_part)
265 t.part_href = part_href
266 if index is not None:
267 self.children.insert(index, t)
269 self.children.append(t)
271 t.sub_number = len(self.children) + 1
def append(self, toc):
    """Add ``toc`` itself as the last child entry of this node."""
    entries = self.children
    entries.append(toc)
def extend(self, toc):
    """Add all children of ``toc`` to this node's children.

    ``toc`` itself is not added; only its child entries are taken over,
    in their existing order.
    """
    entries = self.children
    entries.extend(toc.children)
282 return max((c.depth() for c in self.children)) + 1
288 if self.sub_number is not None:
289 src += '#sub%d' % self.sub_number
292 def write_to_xml(self, nav_map, counter=1):
293 for child in self.children:
294 nav_point = nav_map.makeelement(NCXNS('navPoint'))
295 nav_point.set('id', 'NavPoint-%d' % counter)
296 nav_point.set('playOrder', str(counter))
298 nav_label = nav_map.makeelement(NCXNS('navLabel'))
299 text = nav_map.makeelement(NCXNS('text'))
300 if child.name is not None:
301 text.text = re.sub(r'\n', ' ', child.name)
303 text.text = child.name
304 nav_label.append(text)
305 nav_point.append(nav_label)
307 content = nav_map.makeelement(NCXNS('content'))
308 content.set('src', child.href())
309 nav_point.append(content)
310 nav_map.append(nav_point)
311 counter = child.write_to_xml(nav_point, counter + 1)
314 def html_part(self, depth=0):
316 for child in self.children:
318 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
319 (depth, child.href(), child.name))
320 texts.append(child.html_part(depth + 1))
321 return "\n".join(texts)
324 with open(get_resource('epub/toc.html')) as f:
325 t = unicode(f.read(), 'utf-8')
326 return t % self.html_part()
def used_chars(element):
    """Collect the characters used in an ETree element and its descendants.

    Looks at the ``text`` and ``tail`` of ``element`` and, recursively, of
    every child; a missing ``text`` or ``tail`` counts as empty.  Tag names
    and attribute values are not inspected.

    Returns a set of single-character strings (empty when there is no text).
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # Update in place instead of building a new set for every child.
        chars.update(used_chars(child))
    # Callers and the recursion above rely on the collected set being returned.
    return chars
338 """ divide main content of the XML file into chunks """
340 # prepare a container for each chunk
341 part_xml = etree.Element('utwor')
342 etree.SubElement(part_xml, 'master')
343 main_xml_part = part_xml[0] # master
345 last_node_part = False
347 # the loops below are a workaround for a problem with epubs in drama ebooks without acts
350 for one_part in main_text:
352 if name == 'naglowek_scena':
354 elif name == 'naglowek_akt':
357 for one_part in main_text:
359 if is_act is False and is_scene is True:
360 if name == 'naglowek_czesc':
362 last_node_part = True
363 main_xml_part[:] = [deepcopy(one_part)]
364 elif not last_node_part and name == "naglowek_scena":
366 main_xml_part[:] = [deepcopy(one_part)]
368 main_xml_part.append(deepcopy(one_part))
369 last_node_part = False
371 if name == 'naglowek_czesc':
373 last_node_part = True
374 main_xml_part[:] = [deepcopy(one_part)]
375 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
377 main_xml_part[:] = [deepcopy(one_part)]
379 main_xml_part.append(deepcopy(one_part))
380 last_node_part = False
384 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
385 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
388 for element in chunk_xml[0]:
389 if element.tag == "naglowek_czesc":
390 toc.add(node_name(element), "part%d.html#book-text" % chunk_no)
391 elif element.tag in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
392 toc.add(node_name(element), "part%d.html" % chunk_no)
393 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
394 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
395 element.set('sub', str(subnumber))
397 if not _empty_html_static:
398 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
400 output_html = _empty_html_static[0]
402 find_annotations(annotations, chunk_xml, chunk_no)
403 replace_by_verse(chunk_xml)
404 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
405 chars = used_chars(html_tree.getroot())
406 output_html = etree.tostring(
407 html_tree, pretty_print=True, xml_declaration=True,
409 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
410 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
412 return output_html, toc, chars
415 def transform(wldoc, verbose=False, style=None, html_toc=False,
416 sample=None, cover=None, flags=None, hyphenate=False, ilustr_path='', output_type='epub'):
417 """ produces a EPUB file
419 sample=n: generate sample e-book (with at least n paragraphs)
420 cover: a cover.Cover factory or True for default
421 flags: less-advertising, without-fonts, working-copy
424 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
425 """ processes one input file and proceeds to its children """
427 replace_characters(wldoc.edoc.getroot())
429 hyphenator = set_hyph_language(wldoc.edoc.getroot()) if hyphenate else None
430 hyphenate_and_fix_conjunctions(wldoc.edoc.getroot(), hyphenator)
432 # every input file will have a TOC entry,
433 # pointing to starting chunk
434 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
437 # write book title page
438 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'), outputtype=output_type)
439 chars = used_chars(html_tree.getroot())
443 html_tree, pretty_print=True, xml_declaration=True,
445 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
446 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
449 # add a title page TOC entry
450 toc.add(u"Strona tytułowa", "title.html")
451 elif wldoc.book_info.parts:
452 # write title page for every parent
453 if sample is not None and sample <= 0:
455 html_string = open(get_resource('epub/emptyChunk.html')).read()
457 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
458 chars = used_chars(html_tree.getroot())
459 html_string = etree.tostring(
460 html_tree, pretty_print=True, xml_declaration=True,
462 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN"' +
463 ' "http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
465 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
466 add_to_manifest(manifest, chunk_counter)
467 add_to_spine(spine, chunk_counter)
470 if len(wldoc.edoc.getroot()) > 1:
471 # rdf before style master
472 main_text = wldoc.edoc.getroot()[1]
474 # rdf in style master
475 main_text = wldoc.edoc.getroot()[0]
476 if main_text.tag == RDFNS('RDF'):
479 if main_text is not None:
480 for chunk_xml in chop(main_text):
482 if sample is not None:
486 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
487 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
489 toc.extend(chunk_toc)
490 chars = chars.union(chunk_chars)
491 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
492 add_to_manifest(manifest, chunk_counter)
493 add_to_spine(spine, chunk_counter)
496 for child in wldoc.parts():
497 child_toc, chunk_counter, chunk_chars, sample = transform_file(
498 child, chunk_counter, first=False, sample=sample)
499 toc.append(child_toc)
500 chars = chars.union(chunk_chars)
502 return toc, chunk_counter, chars, sample
504 document = deepcopy(wldoc)
509 document.edoc.getroot().set(flag, 'yes')
511 document.clean_ed_note()
512 document.clean_ed_note('abstrakt')
515 editors = document.editors()
517 document.edoc.getroot().set('editors', u', '.join(sorted(
518 editor.readable() for editor in editors)))
519 if document.book_info.funders:
520 document.edoc.getroot().set('funders', u', '.join(
521 document.book_info.funders))
522 if document.book_info.thanks:
523 document.edoc.getroot().set('thanks', document.book_info.thanks)
525 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
526 manifest = opf.find('.//' + OPFNS('manifest'))
527 guide = opf.find('.//' + OPFNS('guide'))
528 spine = opf.find('.//' + OPFNS('spine'))
530 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
531 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
533 functions.reg_mathml_epub(zip)
535 if os.path.isdir(ilustr_path):
536 for i, filename in enumerate(os.listdir(ilustr_path)):
537 file_path = os.path.join(ilustr_path, filename)
538 zip.write(file_path, os.path.join('OPS', filename))
539 image_id = 'image%s' % i
540 manifest.append(etree.fromstring(
541 '<item id="%s" href="%s" media-type="%s" />' % (image_id, filename, guess_type(file_path)[0])))
543 # write static elements
544 mime = zipfile.ZipInfo()
545 mime.filename = 'mimetype'
546 mime.compress_type = zipfile.ZIP_STORED
548 zip.writestr(mime, 'application/epub+zip')
550 'META-INF/container.xml',
551 '<?xml version="1.0" ?>'
552 '<container version="1.0" '
553 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">'
554 '<rootfiles><rootfile full-path="OPS/content.opf" '
555 'media-type="application/oebps-package+xml" />'
556 '</rootfiles></container>'
558 zip.write(get_resource('res/wl-logo-small.png'),
559 os.path.join('OPS', 'logo_wolnelektury.png'))
560 zip.write(get_resource('res/jedenprocent.png'),
561 os.path.join('OPS', 'jedenprocent.png'))
563 style = get_resource('epub/style.css')
564 zip.write(style, os.path.join('OPS', 'style.css'))
570 cover_file = StringIO()
571 bound_cover = cover(document.book_info)
572 bound_cover.save(cover_file)
573 cover_name = 'cover.%s' % bound_cover.ext()
574 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
577 cover_tree = etree.parse(get_resource('epub/cover.html'))
578 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
579 zip.writestr('OPS/cover.html', etree.tostring(
580 cover_tree, pretty_print=True, xml_declaration=True,
582 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
583 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
586 if bound_cover.uses_dc_cover:
587 if document.book_info.cover_by:
588 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
589 if document.book_info.cover_source:
590 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
592 manifest.append(etree.fromstring(
593 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
594 manifest.append(etree.fromstring(
595 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
596 spine.insert(0, etree.fromstring('<itemref idref="cover"/>'))
597 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
598 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
600 annotations = etree.Element('annotations')
602 toc_file = etree.fromstring(
603 '<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC '
604 '"-//NISO//DTD ncx 2005-1//EN" '
605 '"http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">'
606 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" '
607 'version="2005-1"><head></head><docTitle></docTitle><navMap>'
610 nav_map = toc_file[-1]
613 manifest.append(etree.fromstring(
614 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
615 spine.append(etree.fromstring(
616 '<itemref idref="html_toc" />'))
617 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
619 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
621 if len(toc.children) < 2:
622 toc.add(u"Początek utworu", "part1.html")
624 # Last modifications in container files and EPUB creation
625 if len(annotations) > 0:
626 toc.add("Przypisy", "annotations.html")
627 manifest.append(etree.fromstring(
628 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
629 spine.append(etree.fromstring(
630 '<itemref idref="annotations" />'))
631 replace_by_verse(annotations)
632 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
633 chars = chars.union(used_chars(html_tree.getroot()))
634 zip.writestr('OPS/annotations.html', etree.tostring(
635 html_tree, pretty_print=True, xml_declaration=True,
637 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
638 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
641 toc.add("Wesprzyj Wolne Lektury", "support.html")
642 manifest.append(etree.fromstring(
643 '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
644 spine.append(etree.fromstring(
645 '<itemref idref="support" />'))
646 html_string = open(get_resource('epub/support.html')).read()
647 chars.update(used_chars(etree.fromstring(html_string)))
648 zip.writestr('OPS/support.html', html_string)
650 toc.add("Strona redakcyjna", "last.html")
651 manifest.append(etree.fromstring(
652 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
653 spine.append(etree.fromstring(
654 '<itemref idref="last" />'))
655 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'), outputtype=output_type)
656 chars.update(used_chars(html_tree.getroot()))
657 zip.writestr('OPS/last.html', etree.tostring(
658 html_tree, pretty_print=True, xml_declaration=True,
660 doctype='<!DOCTYPE html PUBLIC "-//W3C//DTD XHTML 1.1//EN" ' +
661 '"http://www.w3.org/TR/xhtml11/DTD/xhtml11.dtd">'
664 if not flags or 'without-fonts' not in flags:
666 tmpdir = mkdtemp('-librarian-epub')
672 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
673 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
674 optimizer_call = ['perl', 'subset.pl', '--chars',
675 ''.join(chars).encode('utf-8'),
676 get_resource('fonts/' + fname),
677 os.path.join(tmpdir, fname)]
679 print "Running font-optimizer"
680 subprocess.check_call(optimizer_call)
682 dev_null = open(os.devnull, 'w')
683 subprocess.check_call(optimizer_call, stdout=dev_null, stderr=dev_null)
684 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
685 manifest.append(etree.fromstring(
686 '<item id="%s" href="%s" media-type="application/x-font-truetype" />' % (fname, fname)))
690 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True,
691 xml_declaration=True, encoding="utf-8"))
692 title = document.book_info.title
693 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
694 for st in attributes:
695 meta = toc_file.makeelement(NCXNS('meta'))
697 meta.set('content', '0')
698 toc_file[0].append(meta)
699 toc_file[0][0].set('content', str(document.book_info.url))
700 toc_file[0][1].set('content', str(toc.depth()))
701 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
705 toc.add(u"Spis treści", "toc.html", index=1)
706 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
707 toc.write_to_xml(nav_map)
708 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True,
709 xml_declaration=True, encoding="utf-8"))
712 return OutputFile.from_filename(output_file.name)