1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
18 from mimetypes import guess_type
20 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
21 from librarian.cover import WLCover, FutureOfCopyrightCover
22 from librarian.latex import LatexFragment
23 from librarian import functions, get_resource
# Import-time side effect: calls reg_person_name() from librarian.functions once,
# before any of the definitions below can be used.
# NOTE(review): presumably this registers a helper needed by the XSLT sheets
# applied in this module; confirm in librarian.functions.
25 functions.reg_person_name()
def inner_xml(node):
    """ returns node's text and children as a string

    The node's own tag is left out: the result is the node's leading text
    followed by the serialization of every child element.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # node.text is None (not '') when the element has no leading text.
    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    `text` must be well-formed XML content (text mixed with elements).
    The node's tag, attributes and tail are left untouched.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # Wrap the content in a throwaway root so that mixed text/element
    # content can be parsed as a single document.
    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text
    node[:] = p[:]
def node_name(node):
    """ Find out a node's name

    The name is the node's text content with all markup stripped and with
    the content of annotation ('pe', 'pa', 'pt', 'pr') and 'motyw' elements
    left out. The node itself is not modified.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    # Work on a copy: the cleanup below is destructive.
    tempnode = deepcopy(node)

    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            # clear() also drops the tail, which belongs to the surrounding
            # text and must survive.
            t = e.tail
            e.clear()
            e.tail = t
    # Merge the text of all remaining descendants into tempnode.text.
    etree.strip_tags(tempnode, '*')
    return tempnode.text
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored in the file `sheet` to `xml`.

    `xml` may be an element or an element tree; a bare element is wrapped
    in an ElementTree first. Returns the result of the transformation.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
def replace_characters(node):
    """ Apply typographic replacements to the text of `node` and its subtree.

    Works in place: byte-order marks are removed, '---' and '--' become em
    and en dashes, ',,' becomes a low opening quote, and straight double and
    single quotes become their curly closing forms. The content of 'uwaga'
    and 'extra' elements is discarded (their tail text is kept).
    """
    def replace_chars(text):
        # Elements without text/tail carry None; leave it untouched.
        if text is None:
            return None
        # Order matters: '---' has to be handled before '--'.
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag in ('uwaga', 'extra'):
        # clear() wipes the tail as well, but the tail is part of the
        # surrounding text, so save and restore it.
        t = node.tail
        node.clear()
        node.tail = t
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """ Move annotations found under `source` into `annotations`.

    Every 'pe', 'pa', 'pt' or 'pr' element is copied into `annotations`
    with two extra attributes: 'number' (1-based position in `annotations`)
    and 'part' (`part_no`). The element left in the text is emptied and
    keeps only that number as its text. Subtrees of 'extra' and 'uwaga'
    elements are not searched.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations)+1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # The tail is text following the annotation in the source and
            # does not belong to the annotation itself.
            annotation.tail = ''
            annotations.append(annotation)
            # Reduce the in-text element to a reference carrying the number;
            # clear() drops the tail too, so it is saved and restored.
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
class Stanza(object):
    """
    Converts / verse endings into verse elements in a stanza.

    Slashes may only occur directly in the stanza. Any slashes in subelements
    will be ignored, and the subelements will be put inside verse elements.

    >>> s = etree.fromstring("<strofa>a <b>c</b> <b>c</b>/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
    >>> Stanza(s).versify()
    >>> print etree.tostring(s)
    <strofa><wers_normalny>a <b>c</b> <b>c</b></wers_normalny><wers_normalny>b<x>x/
    y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>

    """
    def __init__(self, stanza_elem):
        self.stanza = stanza_elem
        # Verse elements built so far, in document order.
        self.verses = []
        # The verse currently receiving content; None until one is needed.
        self.open_verse = None

    def versify(self):
        """ Rebuild the stanza in place so that it contains only verses. """
        self.push_text(self.stanza.text)
        for elem in self.stanza:
            self.push_elem(elem)
            self.push_text(elem.tail)
        # clear() also resets the stanza's tail, so keep it across the call.
        tail = self.stanza.tail
        self.stanza.clear()
        self.stanza.tail = tail
        self.stanza.extend(self.verses)

    def open_normal_verse(self):
        """ Start a new 'wers_normalny' verse and make it the open one. """
        # An explicit (empty) attrib keeps the call valid for every
        # ElementTree-compatible element implementation.
        self.open_verse = self.stanza.makeelement("wers_normalny", {})
        self.verses.append(self.open_verse)

    def get_open_verse(self):
        """ Return the open verse, starting a normal one if there is none. """
        if self.open_verse is None:
            self.open_normal_verse()
        return self.open_verse

    def push_text(self, text):
        """ Add text to the stanza, opening a new verse at every '/' line end. """
        if not text:
            return
        for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
            # Every piece after the first follows a verse break.
            if i:
                self.open_normal_verse()
            verse = self.get_open_verse()
            if len(verse):
                # Text after child elements lives in the last child's tail.
                verse[-1].tail = (verse[-1].tail or "") + verse_text
            else:
                verse.text = (verse.text or "") + verse_text

    def push_elem(self, elem):
        """ Add a copy of a stanza child: as a verse itself or into the open verse. """
        if elem.tag.startswith("wers"):
            # An explicit verse element becomes the open verse as it is.
            verse = deepcopy(elem)
            # The tail is pushed separately as text by versify().
            verse.tail = None
            self.verses.append(verse)
            self.open_verse = verse
        else:
            appended = deepcopy(elem)
            appended.tail = None
            self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Every 'strofa' element below `tree` (tag name resolved through WLNS)
    is rewritten in place by Stanza.versify().
    """
    for stanza in tree.findall('.//' + WLNS('strofa')):
        Stanza(stanza).versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The item describes the HTML file of part number `partno`; its id is
    'part<partno>', the same value add_to_spine() uses as idref.
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The itemref points at the manifest item with id 'part<partno>'.
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
class TOC(object):
    """ A node of the table-of-contents tree.

    A node has a display name, the href of the HTML part it points to and,
    for entries located inside a part, a sub-number that becomes a '#subN'
    anchor. A node created without a name serves as a plain container.
    """

    def __init__(self, name=None, part_href=None):
        self.children = []
        self.name = name
        self.part_href = part_href
        self.sub_number = None

    def add(self, name, part_href, level=0, is_part=True, index=None):
        """ Create a new entry and return its sub-number, if it gets one.

        With level > 0 the entry is delegated to the most recently added
        child (one level down); if there are no children it is created on
        this level. `index` (allowed only with level == 0) inserts the
        entry at that position instead of appending it. Entries with
        is_part=False get a sub-number, which is returned; otherwise the
        result is None.
        """
        assert level == 0 or index is None
        if level > 0 and self.children:
            return self.children[-1].add(name, part_href, level-1, is_part)

        t = TOC(name)
        t.part_href = part_href
        if index is not None:
            self.children.insert(index, t)
        else:
            self.children.append(t)
        if not is_part:
            t.sub_number = len(self.children) + 1
            return t.sub_number

    def append(self, toc):
        """ Add `toc` itself as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Add all children of `toc` (not `toc` itself) as children. """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def href(self):
        """ Return the link target: part href plus '#subN' when numbered. """
        src = self.part_href
        if self.sub_number is not None:
            src += '#sub%d' % self.sub_number
        return src

    def write_to_xml(self, nav_map, counter=1):
        """ Append NCX navPoints for all descendants to `nav_map`.

        `counter` is the id/playOrder of the first navPoint written; the
        value for the next navPoint to be written is returned.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            content.set('src', child.href())
            nav_point.append(content)
            nav_map.append(nav_point)
            # Descendants are nested inside this navPoint and continue
            # the same numbering.
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter

    def html_part(self, depth=0):
        """ Return the HTML list of all descendants, indented by level. """
        from xml.sax.saxutils import escape

        texts = []
        for child in self.children:
            # Names are plain text, so markup characters must be escaped
            # before they are put into HTML. A missing name is passed
            # through unchanged.
            label = child.name if child.name is None else escape(child.name)
            texts.append(
                "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
                (depth, child.href(), label))
            texts.append(child.html_part(depth+1))
        return "\n".join(texts)

    def html(self):
        """ Return the complete TOC page built from the epub/toc.html template. """
        with open(get_resource('epub/toc.html')) as f:
            t = unicode(f.read(), 'utf-8')
        return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns the set of characters occurring in the text and tail of
    `element` and of all of its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # Grow the set in place instead of building a new one per child.
        chars |= used_chars(child)
    return chars
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. A new chunk starts at every 'naglowek_czesc' and at every
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' that does not
    directly follow a 'naglowek_czesc'. Each chunk is yielded as an 'utwor'
    element whose single 'master' child holds copies of the chunk's nodes.

    The same container object is yielded every time and is refilled when
    the generator is resumed, so a chunk has to be processed (or copied)
    before the next one is requested.
    """
    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0]  # master

    # True right after a part header, so that the chapter header that
    # follows stays in the same chunk.
    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    Headers found in the chunk's first child become TOC entries pointing at
    'part<chunk_no>.html'; sub-headers are additionally marked with a 'sub'
    attribute holding their TOC sub-number. With empty=True the chunk
    content is not rendered and the epub/emptyChunk.html placeholder is
    returned instead, with an empty character set.

    `_empty_html_static` is not meant to be passed by callers: the shared
    default list is a deliberate cache for the placeholder, which is read
    only once.
    """
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), "part%d.html" % chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # Close the template file right after reading it.
            with open(get_resource('epub/emptyChunk.html')) as f:
                _empty_html_static.append(f.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
def flatten_image_paths(wldoc):
    """ Strip directories from the 'src' of every 'ilustr' element, in place.

    'ilustr' elements without a 'src' attribute are left untouched.
    """
    root = wldoc.edoc.getroot()
    for node in root.findall(".//ilustr"):
        src = node.get('src')
        if src is not None:
            node.set('src', os.path.basename(src))
# Replaces the content of every <latex> node in the document with the path of
# an image rendered from it.
# For each node a LatexFragment is built from the node's text (resize=40) and
# the pair (<prefix>/<fragment.filename>, fragment) is added to `images`; the
# node's text is then overwritten with that same relative path.
# NOTE(review): the creation of `images`, whatever else is done to the node
# between the two visible statements of the loop, and the return statement are
# not visible in this view. The docstring promises (changed_wldoc, images);
# confirm against the complete file.
# NOTE(review): the document is modified in place (ln.text is assigned on the
# nodes of wldoc.edoc), so "changed_wldoc" is presumably the same object that
# was passed in; confirm.
339 def render_latex(wldoc, prefix="latex"):
341 Renders <latex>CODE</latex> as images and returns
342 (changed_wldoc, [ (epub_filepath1, latexfragment_object1), ... ]
344 root = wldoc.edoc.getroot()
345 latex_nodes = root.findall(".//latex")
347 for ln in latex_nodes:
348 fragment = LatexFragment(ln.text, resize=40)
349 images.append((os.path.join(prefix, fragment.filename), fragment))
351 ln.text = os.path.join(prefix, fragment.filename)
# Builds an EPUB from `wldoc` and returns it as an OutputFile created from the
# name of a temporary '.epub' file.
#
# Visible steps: deep-copy the document; render content.opf through
# epub/xsltContent.xsl; open a zip on a NamedTemporaryFile; write the mimetype
# entry, container.xml, logos and the stylesheet; add rendered LaTeX images and
# files from `resources`; add the cover; convert the document and its parts
# into partN.html chunks (transform_file); add annotations, the first/last
# information pages and subsetted fonts; finally write content.opf, toc.html
# and toc.ncx.
#
# Parameters as far as the visible code shows:
#   style      - path of the CSS file stored as OPS/style.css
#   resources  - directory whose files are added (flat) to OPS and the manifest
#   intro_file - path of the intro page; epub/intro.html is used when falsy
#   sample, cover, flags - see the docstring below
# NOTE(review): many short lines of this function (conditions, `else:`,
# counters, cleanup) are not visible in this view; the notes below mark the
# places where the visible statements do not form a complete construct.
356 def transform(wldoc, verbose=False,
357 style=None, html_toc=False,
358 sample=None, cover=None, flags=None, resources=None,
359 intro_file=None, cover_file=None):
360 """ produces a EPUB file
362 sample=n: generate sample e-book (with at least n paragraphs)
363 cover: a cover.Cover factory or True for default
364 flags: less-advertising, without-fonts, working-copy
# Nested helper: converts one document and, recursively, its child parts.
# It writes into `zip`, `manifest`, `spine` and `annotations` of the enclosing
# scope and returns (toc, next chunk_counter, used chars, remaining sample).
367 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
368 """ processes one input file and proceeds to its children """
# Typographic replacements are applied in place before anything is rendered.
370 replace_characters(wldoc.edoc.getroot())
372 # every input file will have a TOC entry,
373 # pointing to starting chunk
374 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
377 # write book title page
378 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
379 chars = used_chars(html_tree.getroot())
380 zip.writestr('OPS/title.html',
381 etree.tostring(html_tree, method="html", pretty_print=True))
382 # add a title page TOC entry
383 toc.add(u"Tytuł", "title.html")
# NOTE(review): the `if` this `elif` belongs to is not visible; presumably the
# title page above is written only when `first` is true. Confirm.
384 elif wldoc.book_info.parts:
385 # write title page for every parent
386 if sample is not None and sample <= 0:
# NOTE(review): open(...).read() leaves closing the file to the garbage
# collector; a `with` block would close it deterministically.
388 html_string = open(get_resource('epub/emptyChunk.html')).read()
390 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
391 chars = used_chars(html_tree.getroot())
392 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
393 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
394 add_to_manifest(manifest, chunk_counter)
395 add_to_spine(spine, chunk_counter)
# Locate the main text: the second child of the root when there is more than
# one child, otherwise the first child.
398 if len(wldoc.edoc.getroot()) > 1:
399 # rdf before style master
400 main_text = wldoc.edoc.getroot()[1]
402 # rdf in style master
403 main_text = wldoc.edoc.getroot()[0]
# NOTE(review): the body of this test is not visible; since `main_text is not
# None` is checked below, presumably main_text is set to None here. Confirm.
404 if main_text.tag == RDFNS('RDF'):
407 flatten_image_paths(wldoc)
409 if main_text is not None:
410 for chunk_xml in chop(main_text):
# Sample mode: the budget is reduced by the number of paragraph-like elements
# ('strofa', 'akap', 'akap_cd', 'akap_dialog') matched by the xpath below.
# NOTE(review): the assignment of `empty`, which is passed to transform_chunk,
# is not visible in this view.
412 if sample is not None:
416 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
417 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
419 toc.extend(chunk_toc)
420 chars = chars.union(chunk_chars)
421 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
422 add_to_manifest(manifest, chunk_counter)
423 add_to_spine(spine, chunk_counter)
# NOTE(review): no increment of chunk_counter is visible, although every chunk
# is written to 'OPS/part%d.html' % chunk_counter. Confirm it is advanced.
# Child documents continue the chunk numbering and share the sample budget.
426 for child in wldoc.parts():
427 child_toc, chunk_counter, chunk_chars, sample = transform_file(
428 child, chunk_counter, first=False, sample=sample)
429 toc.append(child_toc)
430 chars = chars.union(chunk_chars)
432 return toc, chunk_counter, chars, sample
# Everything below works on a private copy, so the caller's document is not
# changed by the in-place edits made during conversion.
435 document = deepcopy(wldoc)
# Each flag becomes an attribute with value 'yes' on the root element.
# NOTE(review): the loop that binds `flag` is not visible in this view.
440 document.edoc.getroot().set(flag, 'yes')
443 document.edoc.getroot().set('editors', u', '.join(sorted(
444 editor.readable() for editor in document.editors())))
# content.opf is generated from the book metadata; the manifest, guide and
# spine elements are looked up once and filled in step by step below.
446 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
447 manifest = opf.find('.//' + OPFNS('manifest'))
448 guide = opf.find('.//' + OPFNS('guide'))
449 spine = opf.find('.//' + OPFNS('spine'))
# delete=False: the temporary file has to outlive this function, because only
# its name is handed to OutputFile at the end.
451 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
# NOTE(review): the local name `zip` shadows the builtin inside this function
# and inside transform_file.
452 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
454 # write static elements
# The 'mimetype' entry is written first and uncompressed (ZIP_STORED), while
# the archive default is ZIP_DEFLATED.
455 mime = zipfile.ZipInfo()
456 mime.filename = 'mimetype'
457 mime.compress_type = zipfile.ZIP_STORED
459 zip.writestr(mime, 'application/epub+zip')
460 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
461 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
462 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
463 'media-type="application/oebps-package+xml" />' \
464 '</rootfiles></container>')
465 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
466 zip.write(get_resource('res/logo.png'), os.path.join('OPS', 'logo.png'))
467 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
# NOTE(review): as visible here the `style` argument is always overwritten;
# presumably a guard such as `if not style:` precedes this line. Confirm.
469 style = get_resource('epub/style.css')
470 zip.write(style, os.path.join('OPS', 'style.css'))
# LaTeX fragments are rendered to images; image[0] is the path inside OPS and
# image[1] the fragment object, whose `.path` is the file on disk.
472 document, latex_images = render_latex(document)
473 for image in latex_images:
474 zip.write(image[1].path, os.path.join('OPS', image[0]))
# Extra resources: every regular file below `resources` is stored directly
# under OPS/ by its base name, so equal names from different subdirectories
# end up at the same archive path.
# NOTE(review): id, href and media-type are put into the XML by string
# formatting; a file name containing '"', '<' or '&' breaks the parse, and
# guess_type() may return None as the type.
478 if os.path.isdir(resources):
479 for dp, dirs, files in os.walk(resources):
481 fpath = os.path.join(dp, fname)
482 if os.path.isfile(fpath):
483 zip.write(fpath, os.path.join('OPS', fname))
484 manifest.append(etree.fromstring(
485 '<item id="%s" href="%s" media-type="%s" />' % (os.path.splitext(fname)[0], fname, guess_type(fpath)[0])))
# NOTE(review): Python 2 print statement; this is a syntax error on Python 3.
488 print "resources path %s is not directory" % resources
# Cover: the docstring says `cover` is a factory or True for the default.
493 cover = FutureOfCopyrightCover
# NOTE(review): the `cover_file` argument is replaced by a fresh StringIO in
# the visible code, so a value passed by the caller would be ignored. No call
# that writes the cover into the buffer is visible before getvalue() is read;
# presumably the cover object saves itself into it. Confirm.
495 cover_file = StringIO()
496 c = cover(document.book_info)
498 c_name = 'cover.%s' % c.ext()
499 zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
# The cover page template gets its <img> pointed at the cover image.
502 cover_tree = etree.parse(get_resource('epub/cover.html'))
503 cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
504 zip.writestr('OPS/cover.html', etree.tostring(
505 cover_tree, method="html", pretty_print=True))
# Cover credits are passed on as data-* attributes on the document root.
508 if document.book_info.cover_by:
509 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
510 if document.book_info.cover_source:
511 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
# The cover is registered in the manifest, put first in the spine as a
# non-linear item, and referenced from the metadata and the guide.
513 manifest.append(etree.fromstring(
514 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
515 manifest.append(etree.fromstring(
516 '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
517 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
518 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
519 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
# Container collecting the annotations of all chunks (filled by
# find_annotations through transform_chunk).
523 annotations = etree.Element('annotations')
# Skeleton of toc.ncx: <head>, <docTitle> and <navMap>.
# NOTE(review): the end of this string literal (closing tags and parenthesis)
# is not visible in this view.
525 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
526 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
527 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
528 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
# The last child of <ncx> is <navMap>.
530 nav_map = toc_file[-1]
# Opening information page (first.html), rendered with epub/xsltFirst.xsl.
532 manifest.append(etree.fromstring(
533 '<item id="first" href="first.html" media-type="application/xhtml+xml" />'))
534 spine.append(etree.fromstring(
535 '<itemref idref="first" />'))
536 html_tree = xslt(document.edoc, get_resource('epub/xsltFirst.xsl'))
# NOTE(review): the characters of first.html are not added to `chars` (the
# update below is commented out), so they are not passed to the font subsetter.
537 # chars.update(used_chars(html_tree.getroot()))
538 zip.writestr('OPS/first.html', etree.tostring(
539 html_tree, method="html", pretty_print=True))
# Intro page: a caller-supplied file, or the bundled epub/intro.html.
542 manifest.append(etree.fromstring(
543 '<item id="intro" href="intro.html" media-type="application/xhtml+xml" />'))
544 spine.append(etree.fromstring(
545 '<itemref idref="intro" />'))
# NOTE(review): open(...).read() is not closed explicitly here either.
546 zip.writestr('OPS/intro.html', open(intro_file or get_resource('epub/intro.html')).read())
# HTML table of contents page.
# NOTE(review): presumably guarded by `html_toc`; the condition is not visible.
550 manifest.append(etree.fromstring(
551 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
552 spine.append(etree.fromstring(
553 '<itemref idref="html_toc" />'))
554 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Table of Contents"/>'))
# Main conversion: writes title.html and all partN.html files and returns the
# TOC tree together with the set of characters used.
556 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
558 toc.add("Informacje redakcyjne", "first.html", index=0)
# Fewer than two entries: add an explicit "beginning of the book" entry.
560 if len(toc.children) < 2:
561 toc.add(u"Początek książki", "part1.html")
563 # Last modifications in container files and EPUB creation
# The annotations page is produced only when annotations were collected.
564 if len(annotations) > 0:
565 toc.add("Przypisy", "annotations.html")
566 manifest.append(etree.fromstring(
567 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
568 spine.append(etree.fromstring(
569 '<itemref idref="annotations" />'))
570 replace_by_verse(annotations)
571 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
572 chars = chars.union(used_chars(html_tree.getroot()))
573 zip.writestr('OPS/annotations.html', etree.tostring(
574 html_tree, method="html", pretty_print=True))
# Disabled "support" page, kept as commented-out code.
576 # toc.add("Weprzyj Wolne Lektury", "support.html")
577 # manifest.append(etree.fromstring(
578 # '<item id="support" href="support.html" media-type="application/xhtml+xml" />'))
579 # spine.append(etree.fromstring(
580 # '<itemref idref="support" />'))
581 # html_string = open(get_resource('epub/support.html')).read()
582 # chars.update(used_chars(etree.fromstring(html_string)))
583 # zip.writestr('OPS/support.html', html_string)
# Closing information page (last.html). Its TOC label is the same string as
# the one used for first.html above.
585 toc.add("Informacje redakcyjne", "last.html")
586 manifest.append(etree.fromstring(
587 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
588 spine.append(etree.fromstring(
589 '<itemref idref="last" />'))
590 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
591 chars.update(used_chars(html_tree.getroot()))
592 zip.writestr('OPS/last.html', etree.tostring(
593 html_tree, method="html", pretty_print=True))
# Fonts are embedded unless the 'without-fonts' flag is given. Each DejaVu
# Serif face is subsetted by font-optimizer/subset.pl (run with perl) to the
# characters collected in `chars`, then stored in OPS and the manifest.
# NOTE(review): `not 'without-fonts' in flags` is normally spelled
# `'without-fonts' not in flags`.
595 if not flags or not 'without-fonts' in flags:
597 tmpdir = mkdtemp('-librarian-epub')
# NOTE(review): os.chdir() changes the working directory of the whole process.
# No restore and no removal of `tmpdir` is visible in this view; if they
# exist, they should also run when check_call() raises (try/finally).
603 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
604 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
# All used characters are passed to the script as one UTF-8 encoded argument.
605 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
606 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
# NOTE(review): the condition choosing between the two calls below is not
# visible; presumably `verbose`. Also a Python 2 print statement.
608 print "Running font-optimizer"
609 subprocess.check_call(optimizer_call)
# NOTE(review): the subprocess documentation warns against stdout=PIPE or
# stderr=PIPE with check_call(): the pipes are never read, so the child can
# block once a pipe buffer fills up. Redirecting to os.devnull avoids that.
611 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
612 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
613 manifest.append(etree.fromstring(
614 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
# The manifest and spine are complete now, so content.opf can be written.
619 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
620 title = document.book_info.title
# One <meta> per NCX attribute is appended to <head> (toc_file[0]), each with
# content '0' initially.
# NOTE(review): setting the meta's name from `st` is not visible in this view.
621 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
622 for st in attributes:
623 meta = toc_file.makeelement(NCXNS('meta'))
625 meta.set('content', '0')
626 toc_file[0].append(meta)
# <head> starts out empty, so the metas sit at the positions of `attributes`:
# [0] is dtb:uid and [1] is dtb:depth. toc_file[1] is <docTitle>.
# NOTE(review): the title is inserted into XML text without escaping; a title
# containing '<' or '&' would make set_inner_xml() fail to parse.
627 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
628 toc_file[0][1].set('content', str(toc.depth()))
629 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
# The HTML TOC lists itself as the second entry (index=1).
# NOTE(review): presumably only done when `html_toc` is set; the condition is
# not visible in this view.
633 toc.add(u"Spis treści", "toc.html", index=1)
634 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
635 toc.write_to_xml(nav_map)
636 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
# NOTE(review): no zip.close() is visible before the file name is handed on;
# the archive's central directory is only written on close. Confirm.
639 return OutputFile.from_filename(output_file.name)