1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
15 from tempfile import mkdtemp
16 from shutil import rmtree
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
21 from librarian.dcparser import BookInfo
23 from librarian import functions, get_resource
25 functions.reg_person_name()
def inner_xml(node):
    """ returns node's text and children as a string

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # The text before the first child is stored on the node itself.
    nt = node.text if node.text is not None else ''
    # Every child is serialized in document order and appended to that text.
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    node: element whose content is replaced in place
    text: XML fragment (text and/or elements); it has to be well-formed
          once wrapped in a single root element

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # Wrap the fragment in a throwaway root so that mixed text/element
    # content can be parsed as one document.
    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text
    node[:] = p[:]
def node_name(node):
    """ Find out a node's name

    Returns the text content of the node with all markup removed; the
    contents of 'pe', 'pa', 'pt', 'pr' and 'motyw' descendants are left out.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    # Work on a copy, the caller's tree must stay untouched.
    tempnode = deepcopy(node)

    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            # Empty the element but keep the text that follows it.
            t = e.tail
            e.clear()
            e.tail = t
    # Drop all remaining tags so only text is left on the copy.
    etree.strip_tags(tempnode, '*')
    return tempnode.text
def xslt(xml, sheet):
    """ applies an XSLT stylesheet read from a file

    xml: an lxml element or element tree
    sheet: path to the stylesheet file
    returns: whatever xml.xslt() returns for the parsed stylesheet
    """
    # .xslt() is called on a tree, so a bare element gets wrapped first.
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
def replace_characters(node):
    """ replaces ASCII typography with Unicode punctuation in a subtree

    Text and tail of the node and of all its descendants are modified in
    place. An 'extra' element is cleared instead and is not descended into.
    """
    def replace_chars(text):
        # Elements without text or tail carry None there.
        if text is None:
            return None
        # Order matters: "---" has to be handled before "--".
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")
    if node.tag == 'extra':
        node.clear()
    else:
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        for child in node:
            replace_characters(child)
def find_annotations(annotations, source, part_no):
    """ collects annotation elements from a subtree

    annotations: element that receives a copy of every annotation found
    source: element whose descendants are searched
    part_no: number of the current part, stored on each copy as 'part'

    Every 'pe', 'pa', 'pt' or 'pr' element is copied to annotations with
    a running 'number' attribute; the original is reduced to that number
    as its text. Children of 'extra' elements are not searched.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            # Numbering continues across calls: it is based on what has
            # already been collected.
            number = str(len(annotations)+1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # The text following the annotation belongs to the main flow.
            annotation.tail = ''
            annotations.append(annotation)
            # Leave only the number in place of the annotation content.
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra',):
            find_annotations(annotations, child, part_no)
def _wrap_verses(xml_text, passthrough):
    """ Wraps verses separated by a slash at the end of a line.

    xml_text: serialized element content
    passthrough: tuple of prefixes; a piece starting with one of them is
                 kept as it is instead of being wrapped in <wers_normalny>
    returns: the rewritten content, or None if there is nothing to split
    """
    verses = xml_text.split('/\n')
    if len(verses) <= 1:
        return None
    return ''.join(
        verse if verse.startswith(passthrough)
        else ''.join(('<wers_normalny>', verse, '</wers_normalny>'))
        for verse in verses)


def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character """
    for node in tree.findall('.//' + WLNS('strofa')):
        # Inline markup can span several verses, so it is split up first.
        for child_node in node:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                new_foreign = _wrap_verses(inner_xml(child_node), ('<wers',))
                if new_foreign is not None:
                    set_inner_xml(child_node, new_foreign)
        # Then the stanza itself; pieces that already start with a verse
        # or an 'extra' element are passed through unchanged.
        modified_inner_xml = _wrap_verses(inner_xml(node), ('<wers', '<extra'))
        if modified_inner_xml is not None:
            set_inner_xml(node, modified_inner_xml)
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    manifest: the manifest element to append to
    partno: number of the part; the item points to part<partno>.html
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
                                 # must match the idref used in the spine
                                 'id': partstr,
                                 'href': partstr + '.html',
                                 'media-type': 'application/xhtml+xml',
                             })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    spine: the spine element to append to
    partno: number of the part; the itemref refers to id part<partno>
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
class TOC(object):
    """ A tree of table-of-contents entries.

    name: label of the entry (None for a bare container)
    part_number: number of the part<N>.html file the entry points to
    sub_number: number of the #sub<N> anchor inside that file, or None
    children: list of nested TOC entries
    """

    def __init__(self, name=None, part_number=None):
        self.children = []
        self.name = name
        self.part_number = part_number
        self.sub_number = None

    def add(self, name, part_number, level=0, is_part=True):
        """ Adds a new entry `level` levels below this one.

        On each level the most recently added child is descended into; if
        there is no child to descend into, the entry is added right here.
        Returns the assigned sub_number when is_part is false, else None.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)
        else:
            t = TOC(name)
            t.part_number = part_number
            self.children.append(t)
            if not is_part:
                # Computed after the append, so the first one gets 2.
                t.sub_number = len(self.children) + 1
                return t.sub_number

    def append(self, toc):
        """ Adds a whole TOC as a single child entry. """
        self.children.append(toc)

    def extend(self, toc):
        """ Adds all children of another TOC to this one. """
        self.children.extend(toc.children)

    def depth(self):
        """ Returns the number of levels below this entry (0 for a leaf). """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        else:
            return 0

    def write_to_xml(self, nav_map, counter):
        """ Writes the children as nested navPoint elements.

        nav_map: element the navPoints are appended to
        counter: number used for the first navPoint's id and playOrder
        returns: the next unused counter value
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            src = 'part%d.html' % child.part_number
            if child.sub_number is not None:
                src += '#sub%d' % child.sub_number
            content.set('src', src)
            nav_point.append(content)
            nav_map.append(nav_point)
            # Descendants are numbered right after their parent.
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set of the characters found in text and tail of the element
    and of all its descendants; tag names and attributes are not counted.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # Grow the set in place rather than building a new one per child.
        chars.update(used_chars(child))
    return chars
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. A new chunk is started at every 'naglowek_czesc' and at
    every 'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' that does not
    directly follow a 'naglowek_czesc'.

    The same <utwor><master/></utwor> container is yielded every time and
    refilled afterwards, so each chunk has to be processed before the next
    one is requested. The first chunk may be empty.
    """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0] # master

    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    # whatever has been gathered since the last heading
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk container; its first child holds the content
    chunk_no: number of the chunk, used for TOC entries and annotations
    annotations: element collecting the annotations found in the chunk
    empty: if true, the placeholder page is returned instead of the content
    _empty_html_static: internal cache for the placeholder page; not meant
                        to be passed by callers
    """
    toc = TOC()
    # TOC entries are gathered even if the chunk itself is left out.
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # remembered on the element itself, next to the TOC entry
            element.set('sub', str(subnumber))
    if empty:
        # The mutable default is shared between calls on purpose, so the
        # placeholder file is read only once.
        if not _empty_html_static:
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
268 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
269 sample=None, cover=None, flags=None):
270 """ produces a EPUB file
272 provider: a DocProvider
273 slug: slug of file to process, available by provider
274 output_file: file-like object or path to output file
275 output_dir: path to directory to save output file to; either this or output_file must be present
276 make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
277 sample=n: generate sample e-book (with at least n paragraphs)
278 cover: a cover.Cover object
279 flags: less-advertising,
282 def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
283 """ processes one input file and proceeds to its children """
285 replace_characters(input_xml.getroot())
287 children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
289 # every input file will have a TOC entry,
290 # pointing to starting chunk
291 toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
294 # write book title page
295 html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
296 chars = used_chars(html_tree.getroot())
297 zip.writestr('OPS/title.html',
298 etree.tostring(html_tree, method="html", pretty_print=True))
300 # write title page for every parent
301 if sample is not None and sample <= 0:
303 html_string = open(get_resource('epub/emptyChunk.html')).read()
305 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
306 chars = used_chars(html_tree.getroot())
307 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
308 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
309 add_to_manifest(manifest, chunk_counter)
310 add_to_spine(spine, chunk_counter)
313 if len(input_xml.getroot()) > 1:
314 # rdf before style master
315 main_text = input_xml.getroot()[1]
317 # rdf in style master
318 main_text = input_xml.getroot()[0]
319 if main_text.tag == RDFNS('RDF'):
322 if main_text is not None:
323 for chunk_xml in chop(main_text):
325 if sample is not None:
329 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
330 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
332 toc.extend(chunk_toc)
333 chars = chars.union(chunk_chars)
334 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
335 add_to_manifest(manifest, chunk_counter)
336 add_to_spine(spine, chunk_counter)
340 for child in children:
341 child_xml = etree.parse(provider.by_uri(child))
342 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
343 toc.append(child_toc)
344 chars = chars.union(chunk_chars)
346 return toc, chunk_counter, chars, sample
348 # read metadata from the first file
351 raise ValueError('slug or file_path should be specified, not both')
352 f = open(file_path, 'r')
353 input_xml = etree.parse(f)
357 raise ValueError('either slug or file_path should be specified')
358 input_xml = etree.parse(provider[slug])
362 input_xml.getroot().set(flag, 'yes')
364 metadata = input_xml.find('.//'+RDFNS('Description'))
366 raise NoDublinCore('Document has no DublinCore - which is required.')
367 book_info = BookInfo.from_element(input_xml)
368 metadata = etree.ElementTree(metadata)
370 # if output to dir, create the file
371 if output_dir is not None:
373 author = unicode(book_info.author)
374 output_dir = os.path.join(output_dir, author)
376 os.makedirs(output_dir)
380 output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
382 output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
384 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
386 # write static elements
387 mime = zipfile.ZipInfo()
388 mime.filename = 'mimetype'
389 mime.compress_type = zipfile.ZIP_STORED
391 zip.writestr(mime, 'application/epub+zip')
392 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
393 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
394 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
395 'media-type="application/oebps-package+xml" />' \
396 '</rootfiles></container>')
397 zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
398 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
400 opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
401 manifest = opf.find('.//' + OPFNS('manifest'))
402 spine = opf.find('.//' + OPFNS('spine'))
405 cover_file = StringIO()
406 c = cover(book_info.author.readable(), book_info.title)
408 c_name = 'cover.%s' % c.ext()
409 zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
412 cover_tree = etree.parse(get_resource('epub/cover.html'))
413 cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
414 zip.writestr('OPS/cover.html', etree.tostring(
415 cover_tree, method="html", pretty_print=True))
417 manifest.append(etree.fromstring(
418 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
419 manifest.append(etree.fromstring(
420 '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
421 spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
422 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
423 opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
426 annotations = etree.Element('annotations')
428 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
429 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
430 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
431 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
432 '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
433 '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
434 '</navPoint></navMap></ncx>')
435 nav_map = toc_file[-1]
437 toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
440 toc.add(u"Początek utworu", 1)
441 toc_counter = toc.write_to_xml(nav_map, 2)
443 # Last modifications in container files and EPUB creation
444 if len(annotations) > 0:
445 nav_map.append(etree.fromstring(
446 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
447 '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
449 manifest.append(etree.fromstring(
450 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
451 spine.append(etree.fromstring(
452 '<itemref idref="annotations" />'))
453 replace_by_verse(annotations)
454 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
455 chars = chars.union(used_chars(html_tree.getroot()))
456 zip.writestr('OPS/annotations.html', etree.tostring(
457 html_tree, method="html", pretty_print=True))
459 nav_map.append(etree.fromstring(
460 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
461 '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
462 manifest.append(etree.fromstring(
463 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
464 spine.append(etree.fromstring(
465 '<itemref idref="last" />'))
466 html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
467 chars.update(used_chars(html_tree.getroot()))
468 zip.writestr('OPS/last.html', etree.tostring(
469 html_tree, method="html", pretty_print=True))
472 tmpdir = mkdtemp('-librarian-epub')
475 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
476 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
477 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
478 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
480 print "Running font-optimizer"
481 subprocess.check_call(optimizer_call)
483 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
484 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
488 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
490 title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
491 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
492 for st in attributes:
493 meta = toc_file.makeelement(NCXNS('meta'))
495 meta.set('content', '0')
496 toc_file[0].append(meta)
497 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
498 toc_file[0][1].set('content', str(toc.depth()))
499 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
500 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))