1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
15 from tempfile import mkdtemp
16 from shutil import rmtree
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
21 from librarian.dcparser import BookInfo
23 from librarian import functions, get_resource
25 functions.reg_person_name()
def inner_xml(node):
    """ Return a node's text and its serialized children as one string.

    The node's own tag and attributes are not part of the result.  Children
    are serialized with etree.tostring, which by default emits each child's
    tail text together with the child.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # node.text is None (not '') when the node has no leading text.
    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ Replace a node's text and children with content parsed from a string.

    text is wrapped in a temporary <x> element and parsed, so it has to be a
    well-formed XML fragment; etree.fromstring raises otherwise.  The node's
    tag, attributes and tail are not touched.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    p = etree.fromstring('<x>%s</x>' % text)
    # NOTE(review): the two assignments below were missing from the listing
    # and are restored from the docstring ("text and children") - confirm
    # against version control.
    node.text = p.text
    node[:] = p[:]
def node_name(node):
    """ Find out a node's name

    Works on a deep copy, so the node passed in is not modified.  Elements
    tagged pe, pa, pt, pr or motyw are emptied before all remaining markup
    is stripped, leaving plain text.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    tempnode = deepcopy(node)

    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            # NOTE(review): this loop body was missing from the listing; it
            # is restored as "empty the element but keep the text that
            # follows it" - confirm against version control.
            # clear() also resets the tail, hence the save/restore.
            t = e.tail
            e.clear()
            e.tail = t
    # Dissolve every remaining tag into its parent's text.
    etree.strip_tags(tempnode, '*')
    return tempnode.text
def xslt(xml, sheet):
    """ Apply the XSLT stylesheet stored in the file `sheet` to `xml`.

    xml: an lxml element or element tree; a bare element is wrapped in an
         ElementTree first
    sheet: path to the stylesheet file

    Returns whatever the tree's xslt() method returns for the parsed sheet.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
def replace_characters(node):
    """ Apply typographic character replacements to a whole subtree, in place.

    Text and tail of the node and of all its descendants get: the BOM
    character removed, '---' and '--' turned into em and en dashes, ',,'
    into a low opening quote, '"' into a closing quote and "'" into a
    typographic apostrophe.  Nodes tagged uwaga or extra are emptied.
    """
    def replace_chars(text):
        # Missing text/tail is represented by None; pass it through.
        if text is None:
            return None
        # Order matters: '---' has to be consumed before '--'.
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")
    if node.tag in ('uwaga', 'extra'):
        # NOTE(review): the body of this branch was missing from the
        # listing; restored as "drop the content, keep the following text" -
        # confirm against version control.
        t = node.tail
        node.clear()
        node.tail = t
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """ Collect annotation nodes from `source` into `annotations`.

    annotations: element that receives a deep copy of every annotation found
    source: element whose descendants are searched (modified in place)
    part_no: number stored on each copy in its 'part' attribute

    Annotations are the pe, pa, pt and pr elements.  Each copy gets a
    'number' attribute, counted from 1 over everything already collected.
    Subtrees rooted at extra or uwaga elements are not searched.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations)+1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # NOTE(review): the tail reset and the four lines after append()
            # were missing from the listing; restored so that the in-text
            # node is reduced to its reference number while the text that
            # follows it stays in place - confirm against version control.
            annotation.tail = ''
            annotations.append(annotation)
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
def _wrap_loose_verses(fragments, verbatim_prefixes):
    """ Join fragments, wrapping those not starting with a given prefix in <wers_normalny> """
    return ''.join(
        fragment if fragment.startswith(verbatim_prefixes)
        else ''.join(('<wers_normalny>', fragment, '</wers_normalny>'))
        for fragment in fragments)


def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Every strofa element below `tree` is rewritten in place.  Its inner XML
    is split on '/' followed by a newline; each piece that is not already
    markup starting with '<wers' (or '<extra') is wrapped in a
    <wers_normalny> element.  The slowo_obce and wyroznienie children of a
    stanza are handled the same way first, with only '<wers' passed through.
    Content without a verse separator is left alone.
    """
    # NOTE(review): loop headers, else: branches and the accumulator
    # initialisations were missing from the listing and are restored from
    # the visible structure - confirm against version control.
    stanzas = tree.findall('.//' + WLNS('strofa'))
    for node in stanzas:
        for child_node in node:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                foreign_verses = inner_xml(child_node).split('/\n')
                if len(foreign_verses) > 1:
                    set_inner_xml(child_node,
                                  _wrap_loose_verses(foreign_verses, ('<wers',)))
        # Serialized only after the children were rewritten above.
        verses = inner_xml(node).split('/\n')
        if len(verses) > 1:
            set_inner_xml(node, _wrap_loose_verses(verses, ('<wers', '<extra')))
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    manifest: the manifest element to extend
    partno: integer part number; the item points at 'part<partno>.html'
    """
    partstr = 'part%d' % partno
    # NOTE(review): the 'id' entry and the append() call were missing from
    # the listing; 'id' is restored to match the 'part%d' idref written by
    # add_to_spine - confirm against version control.
    e = manifest.makeelement(OPFNS('item'), attrib={
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    spine: the spine element to extend
    partno: integer part number; the itemref refers to id 'part<partno>'
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    # NOTE(review): the append() call was missing from the listing; restored
    # from the docstring - confirm against version control.
    spine.append(e)
class TOC(object):
    """ A table-of-contents tree.

    Each node has a name, the number of the part file it points to, an
    optional anchor number inside that file (sub_number) and a list of
    child nodes.  The root node usually carries no name of its own.
    """
    # NOTE(review): the class header, the depth() header, several
    # else:/return lines and two attribute initialisations were missing from
    # the listing; they are restored from how the attributes and return
    # values are used - confirm against version control.

    def __init__(self, name=None, part_number=None):
        self.children = []
        self.name = name
        self.part_number = part_number
        self.sub_number = None

    def add(self, name, part_number, level=0, is_part=True):
        """ Add an entry `level` levels below this node.

        The entry goes under the most recently added child at each level.
        If there is no child to descend into, it is added right here.
        For is_part=False the new entry gets a sub_number, which is returned;
        otherwise the result is None.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)
        else:
            t = TOC(name)
            t.part_number = part_number
            self.children.append(t)
            if not is_part:
                # Computed after the append, so the first entry gets 2.
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """ Add `toc` itself as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Add the children of `toc` (not `toc` itself) as last children. """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def write_to_xml(self, nav_map, counter):
        """ Write the children as nested navPoint elements under `nav_map`.

        counter is used for the id and playOrder of the first navPoint and
        increased by one for each navPoint written, depth first.  Returns
        the first counter value that was not used.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            src = 'part%d.html' % child.part_number
            if child.sub_number is not None:
                src += '#sub%d' % child.sub_number
            content.set('src', src)
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns the set of all characters found in the text and tail of the
    element and of every descendant.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # Grow the one set in place instead of building a new one per child.
        chars.update(used_chars(child))
    return chars
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator.  Each chunk is an <utwor><master>...</master></utwor> tree
    holding deep copies of consecutive children of main_text.  A new chunk
    starts at every naglowek_czesc, and at every naglowek_rozdzial,
    naglowek_akt or srodtytul that does not directly follow a
    naglowek_czesc.

    The same tree object is yielded every time and refilled afterwards, so
    a chunk has to be consumed before the next one is requested.  If
    main_text starts with a header, the first chunk yielded is empty.
    """
    # NOTE(review): the tag lookup and the three yield statements were
    # missing from the listing and are restored from the visible control
    # flow - confirm against version control.

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0] # master

    # True while the current chunk holds nothing but a part header.
    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk tree; its first child holds the content elements
    chunk_no: number of the part file this chunk becomes
    annotations: element collecting annotations (see find_annotations)
    empty: if true, the placeholder page is returned instead of the
           transformed content; TOC entries are still collected
    _empty_html_static: private cache, not meant to be passed by callers
    """
    # NOTE(review): the TOC creation, the if empty:/else: lines and the
    # empty character set were missing from the listing and are restored
    # from the visible control flow - confirm against version control.
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # Stored on the element for the stylesheet to pick up.
            element.set('sub', str(subnumber))
    if empty:
        # The mutable default is deliberate: it keeps the placeholder page
        # between calls, so the file is read only once.
        if not _empty_html_static:
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
269 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
271 sample=None, cover=None, flags=None):
272 """ produces a EPUB file
274 provider: a DocProvider
275 slug: slug of file to process, available by provider
276 output_file: file-like object or path to output file
277 output_dir: path to directory to save output file to; either this or output_file must be present
278 make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
279 sample=n: generate sample e-book (with at least n paragraphs)
280 cover: a cover.Cover object
281 flags: less-advertising, without-fonts
284 def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
285 """ processes one input file and proceeds to its children """
287 replace_characters(input_xml.getroot())
289 children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
291 # every input file will have a TOC entry,
292 # pointing to starting chunk
293 toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
296 # write book title page
297 html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
298 chars = used_chars(html_tree.getroot())
299 zip.writestr('OPS/title.html',
300 etree.tostring(html_tree, method="html", pretty_print=True))
302 # write title page for every parent
303 if sample is not None and sample <= 0:
305 html_string = open(get_resource('epub/emptyChunk.html')).read()
307 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
308 chars = used_chars(html_tree.getroot())
309 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
310 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
311 add_to_manifest(manifest, chunk_counter)
312 add_to_spine(spine, chunk_counter)
315 if len(input_xml.getroot()) > 1:
316 # rdf before style master
317 main_text = input_xml.getroot()[1]
319 # rdf in style master
320 main_text = input_xml.getroot()[0]
321 if main_text.tag == RDFNS('RDF'):
324 if main_text is not None:
325 for chunk_xml in chop(main_text):
327 if sample is not None:
331 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
332 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
334 toc.extend(chunk_toc)
335 chars = chars.union(chunk_chars)
336 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
337 add_to_manifest(manifest, chunk_counter)
338 add_to_spine(spine, chunk_counter)
342 for child in children:
343 child_xml = etree.parse(provider.by_uri(child))
344 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
345 toc.append(child_toc)
346 chars = chars.union(chunk_chars)
348 return toc, chunk_counter, chars, sample
350 # read metadata from the first file
353 raise ValueError('slug or file_path should be specified, not both')
354 f = open(file_path, 'r')
355 input_xml = etree.parse(f)
359 raise ValueError('either slug or file_path should be specified')
360 input_xml = etree.parse(provider[slug])
364 input_xml.getroot().set(flag, 'yes')
366 metadata = input_xml.find('.//'+RDFNS('Description'))
368 raise NoDublinCore('Document has no DublinCore - which is required.')
369 book_info = BookInfo.from_element(input_xml)
370 metadata = etree.ElementTree(metadata)
372 # if output to dir, create the file
373 if output_dir is not None:
375 author = unicode(book_info.author)
376 output_dir = os.path.join(output_dir, author)
378 os.makedirs(output_dir)
382 output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
384 output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
386 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
388 # write static elements
389 mime = zipfile.ZipInfo()
390 mime.filename = 'mimetype'
391 mime.compress_type = zipfile.ZIP_STORED
393 zip.writestr(mime, 'application/epub+zip')
394 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
395 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
396 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
397 'media-type="application/oebps-package+xml" />' \
398 '</rootfiles></container>')
399 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
400 zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
402 style = get_resource('epub/style.css')
403 zip.write(style, os.path.join('OPS', 'style.css'))
405 opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
406 manifest = opf.find('.//' + OPFNS('manifest'))
407 spine = opf.find('.//' + OPFNS('spine'))
410 cover_file = StringIO()
411 c = cover(book_info.author.readable(), book_info.title)
413 c_name = 'cover.%s' % c.ext()
414 zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
417 cover_tree = etree.parse(get_resource('epub/cover.html'))
418 cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
419 zip.writestr('OPS/cover.html', etree.tostring(
420 cover_tree, method="html", pretty_print=True))
422 manifest.append(etree.fromstring(
423 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
424 manifest.append(etree.fromstring(
425 '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
426 spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
427 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
428 opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
431 annotations = etree.Element('annotations')
433 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
434 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
435 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
436 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
437 '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
438 '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
439 '</navPoint></navMap></ncx>')
440 nav_map = toc_file[-1]
442 toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
445 toc.add(u"Początek utworu", 1)
446 toc_counter = toc.write_to_xml(nav_map, 2)
448 # Last modifications in container files and EPUB creation
449 if len(annotations) > 0:
450 nav_map.append(etree.fromstring(
451 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
452 '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
454 manifest.append(etree.fromstring(
455 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
456 spine.append(etree.fromstring(
457 '<itemref idref="annotations" />'))
458 replace_by_verse(annotations)
459 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
460 chars = chars.union(used_chars(html_tree.getroot()))
461 zip.writestr('OPS/annotations.html', etree.tostring(
462 html_tree, method="html", pretty_print=True))
464 nav_map.append(etree.fromstring(
465 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
466 '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
467 manifest.append(etree.fromstring(
468 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
469 spine.append(etree.fromstring(
470 '<itemref idref="last" />'))
471 html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
472 chars.update(used_chars(html_tree.getroot()))
473 zip.writestr('OPS/last.html', etree.tostring(
474 html_tree, method="html", pretty_print=True))
476 if not flags or not 'without-fonts' in flags:
478 tmpdir = mkdtemp('-librarian-epub')
481 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
482 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
483 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
484 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
486 print "Running font-optimizer"
487 subprocess.check_call(optimizer_call)
489 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
490 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
491 manifest.append(etree.fromstring(
492 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
496 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
498 title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
499 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
500 for st in attributes:
501 meta = toc_file.makeelement(NCXNS('meta'))
503 meta.set('content', '0')
504 toc_file[0].append(meta)
505 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
506 toc_file[0][1].set('content', str(toc.depth()))
507 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
508 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))