1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from StringIO import StringIO
12 from copy import deepcopy
13 from lxml import etree
15 from tempfile import mkdtemp
16 from shutil import rmtree
20 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
21 from librarian.dcparser import BookInfo
23 from librarian import functions, get_resource
25 functions.reg_person_name()
def inner_xml(node):
    """ returns node's text and children as a string

    The result is the node's own leading text followed by the
    serialization of each child element.  Text placed between and after
    the children is part of those serializations (see the doctest).

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """

    # node.text is None when the element starts directly with a child
    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    `text` is parsed as the content of a temporary wrapper element, so it
    has to be a well-formed XML fragment (text and/or elements).  The
    node's previous text and children are replaced; the node itself (tag,
    attributes, tail) is not touched.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    p = etree.fromstring('<x>%s</x>' % text)
    # move the parsed content from the throw-away wrapper into the node
    node.text = p.text
    node[:] = p[:]
53 """ Find out a node's name
55 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
59 tempnode = deepcopy(node)
61 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
62 for e in tempnode.findall('.//%s' % p):
66 etree.strip_tags(tempnode, '*')
def xslt(xml, sheet):
    """ applies an XSLT stylesheet to a document

    xml: an etree element or element tree; a bare element is wrapped
         in an ElementTree first
    sheet: path to the stylesheet file
    Returns whatever the tree's xslt() call produces.
    """
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
def replace_characters(node):
    """ replaces ASCII punctuation with typographic characters, in place

    Works on the text and tail of `node` and, recursively, of all its
    descendants.  An 'extra' element is skipped entirely: neither its
    text, its tail nor anything below it is modified.
    """
    def replace_chars(text):
        # elements without text/tail hold None there - nothing to replace
        if text is None:
            return None
        # order matters: '---' has to be consumed before '--'
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag == 'extra':
        return
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """ collects annotation elements found below `source`

    Every 'pe', 'pa', 'pt' or 'pr' descendant is deep-copied, given a
    'number' attribute (1-based, continuing from the current length of
    `annotations`) and a 'part' attribute (`part_no`), and appended to
    `annotations`.  The search does not descend into 'extra' elements.

    NOTE(review): the elements inside `source` are left as they are;
    confirm whether the in-text element should be turned into a
    reference marker here.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations)+1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # the tail is the text following the element in the source;
            # it must not travel with the copy
            annotation.tail = ''
            annotations.append(annotation)
        if child.tag not in ('extra',):
            find_annotations(annotations, child, part_no)
def _wrap_verses(fragments, passthrough):
    """ wraps each fragment in <wers_normalny>, except those starting with
    one of the `passthrough` prefixes, and joins the results """
    wrapped = []
    for fragment in fragments:
        if fragment.startswith(passthrough):
            wrapped.append(fragment)
        else:
            wrapped.append(''.join(('<wers_normalny>', fragment, '</wers_normalny>')))
    return ''.join(wrapped)


def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Works in place on every 'strofa' element below `tree`.  The content
    is split on a '/' followed by a newline; each piece becomes a
    <wers_normalny> element unless it already starts with a verse tag
    ('<wers...') or, at stanza level, with '<extra'.  Content without
    such a separator is left unchanged.
    """

    stanzas = tree.findall('.//' + WLNS('strofa'))
    for node in stanzas:
        # first split verses that are enclosed in inline markup, so that
        # the stanza-level split below sees them already wrapped
        for child_node in node:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                foreign_verses = inner_xml(child_node).split('/\n')
                if len(foreign_verses) > 1:
                    set_inner_xml(child_node,
                                  _wrap_verses(foreign_verses, ('<wers',)))
        verses = inner_xml(node).split('/\n')
        if len(verses) > 1:
            set_inner_xml(node, _wrap_verses(verses, ('<wers', '<extra')))
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The new 'item' element describes the file part<partno>.html; its id,
    'part<partno>', is the value add_to_spine() uses as 'idref'.
    """

    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
                 'id': partstr,
                 'href': partstr + '.html',
                 'media-type': 'application/xhtml+xml',
             })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The new 'itemref' element points, through 'idref', at the item
    identified as 'part<partno>'.
    """

    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
class TOC(object):
    """ A node of the table-of-contents tree.

    name: label of the entry (used as the navLabel text)
    part_number: number of the part<N>.html file the entry points to
    sub_number: None, or the number used in the '#sub<N>' fragment of
                the entry's target
    children: nested TOC entries, in document order
    """

    def __init__(self, name=None, part_number=None):
        self.children = []
        self.name = name
        self.part_number = part_number
        self.sub_number = None

    def add(self, name, part_number, level=0, is_part=True):
        """ Adds a new entry `level` levels below this node.

        The entry goes under the last child at each level; when a node on
        the way has no children the entry is attached to that node.
        For is_part=False the entry gets a sub_number, which is returned;
        otherwise the result is None.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)
        t = TOC(name)
        t.part_number = part_number
        self.children.append(t)
        if not is_part:
            t.sub_number = len(self.children) + 1
            return t.sub_number

    def append(self, toc):
        """ Attaches `toc` as the last child of this node. """
        self.children.append(toc)

    def extend(self, toc):
        """ Attaches all children of `toc` (not `toc` itself) to this node. """
        self.children.extend(toc.children)

    def depth(self):
        """ Returns the number of levels below this node (0 for a leaf). """
        # max() of an empty sequence raises, so a leaf is handled explicitly
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        return 0

    def write_to_xml(self, nav_map, counter):
        """ Writes the children of this node as navPoint elements.

        nav_map: element the navPoints are appended to
        counter: number given to the first navPoint (id and playOrder)
        Returns the next unused counter value.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            src = 'part%d.html' % child.part_number
            if child.sub_number is not None:
                src += '#sub%d' % child.sub_number
            content.set('src', src)
            nav_point.append(content)
            nav_map.append(nav_point)
            # nested entries continue the numbering of their parent
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set with every character that occurs in the text or tail
    of `element` or of any of its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        chars |= used_chars(child)
    return chars
221 """ divide main content of the XML file into chunks """
223 # prepare a container for each chunk
224 part_xml = etree.Element('utwor')
225 etree.SubElement(part_xml, 'master')
226 main_xml_part = part_xml[0] # master
228 last_node_part = False
229 for one_part in main_text:
231 if name == 'naglowek_czesc':
233 last_node_part = True
234 main_xml_part[:] = [deepcopy(one_part)]
235 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
237 main_xml_part[:] = [deepcopy(one_part)]
239 main_xml_part.append(deepcopy(one_part))
240 last_node_part = False
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk document; the elements to process are the children
               of its first child
    chunk_no: number of the chunk, stored in the TOC entries
    annotations: element collecting the annotations found in the chunk
    empty: if true, the chunk content is replaced with the static
           'epub/emptyChunk.html' resource and no characters are reported
    _empty_html_static: internal; the mutable default is used on purpose
           as a cache of the empty-chunk HTML shared between calls
    """

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # remembered on the element itself for the later transformation
            element.set('sub', str(subnumber))
    if empty:
        if not _empty_html_static:
            # read the resource once and close the file right away
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
268 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, sample=None, cover_fn=None):
269 """ produces a EPUB file
271 provider: a DocProvider
272 slug: slug of file to process, available by provider
273 output_file: file-like object or path to output file
274 output_dir: path to directory to save output file to; either this or output_file must be present
275 make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
276 sample=n: generate sample e-book (with at least n paragraphs)
277 cover_fn: function(author, title) -> cover image
280 def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
281 """ processes one input file and proceeds to its children """
283 replace_characters(input_xml.getroot())
285 children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
287 # every input file will have a TOC entry,
288 # pointing to starting chunk
289 toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
292 # write book title page
293 html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
294 chars = used_chars(html_tree.getroot())
295 zip.writestr('OPS/title.html',
296 etree.tostring(html_tree, method="html", pretty_print=True))
298 # write title page for every parent
299 if sample is not None and sample <= 0:
301 html_string = open(get_resource('epub/emptyChunk.html')).read()
303 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
304 chars = used_chars(html_tree.getroot())
305 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
306 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
307 add_to_manifest(manifest, chunk_counter)
308 add_to_spine(spine, chunk_counter)
311 if len(input_xml.getroot()) > 1:
312 # rdf before style master
313 main_text = input_xml.getroot()[1]
315 # rdf in style master
316 main_text = input_xml.getroot()[0]
317 if main_text.tag == RDFNS('RDF'):
320 if main_text is not None:
321 for chunk_xml in chop(main_text):
323 if sample is not None:
327 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
328 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
330 toc.extend(chunk_toc)
331 chars = chars.union(chunk_chars)
332 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
333 add_to_manifest(manifest, chunk_counter)
334 add_to_spine(spine, chunk_counter)
338 for child in children:
339 child_xml = etree.parse(provider.by_uri(child))
340 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
341 toc.append(child_toc)
342 chars = chars.union(chunk_chars)
344 return toc, chunk_counter, chars, sample
346 # read metadata from the first file
349 raise ValueError('slug or file_path should be specified, not both')
350 f = open(file_path, 'r')
351 input_xml = etree.parse(f)
355 raise ValueError('either slug or file_path should be specified')
356 input_xml = etree.parse(provider[slug])
358 metadata = input_xml.find('.//'+RDFNS('Description'))
360 raise NoDublinCore('Document has no DublinCore - which is required.')
361 book_info = BookInfo.from_element(input_xml)
362 metadata = etree.ElementTree(metadata)
364 # if output to dir, create the file
365 if output_dir is not None:
367 author = unicode(book_info.author)
368 output_dir = os.path.join(output_dir, author)
370 os.makedirs(output_dir)
374 output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
376 output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
378 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
380 # write static elements
381 mime = zipfile.ZipInfo()
382 mime.filename = 'mimetype'
383 mime.compress_type = zipfile.ZIP_STORED
385 zip.writestr(mime, 'application/epub+zip')
386 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
387 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
388 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
389 'media-type="application/oebps-package+xml" />' \
390 '</rootfiles></container>')
391 zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
392 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
394 opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
395 manifest = opf.find('.//' + OPFNS('manifest'))
396 spine = opf.find('.//' + OPFNS('spine'))
400 cover_fn(book_info.author.readable(), book_info.title).save(cover, format='JPEG')
401 zip.writestr(os.path.join('OPS', 'cover.jpg'), cover.getvalue())
403 zip.writestr('OPS/cover.html', open(get_resource('epub/cover.html')).read())
404 manifest.append(etree.fromstring(
405 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
406 manifest.append(etree.fromstring(
407 '<item id="cover-image" href="cover.jpg" media-type="image/jpeg" />'))
408 spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
409 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
410 opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
413 annotations = etree.Element('annotations')
415 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
416 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
417 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
418 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
419 '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
420 '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
421 '</navPoint></navMap></ncx>')
422 nav_map = toc_file[-1]
424 toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
427 toc.add(u"Początek utworu", 1)
428 toc_counter = toc.write_to_xml(nav_map, 2)
430 # Last modifications in container files and EPUB creation
431 if len(annotations) > 0:
432 nav_map.append(etree.fromstring(
433 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
434 '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
436 manifest.append(etree.fromstring(
437 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
438 spine.append(etree.fromstring(
439 '<itemref idref="annotations" />'))
440 replace_by_verse(annotations)
441 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
442 chars = chars.union(used_chars(html_tree.getroot()))
443 zip.writestr('OPS/annotations.html', etree.tostring(
444 html_tree, method="html", pretty_print=True))
446 nav_map.append(etree.fromstring(
447 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
448 '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
449 manifest.append(etree.fromstring(
450 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
451 spine.append(etree.fromstring(
452 '<itemref idref="last" />'))
453 html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
454 chars.update(used_chars(html_tree.getroot()))
455 zip.writestr('OPS/last.html', etree.tostring(
456 html_tree, method="html", pretty_print=True))
459 tmpdir = mkdtemp('-librarian-epub')
462 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
463 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
464 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
465 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
467 print "Running font-optimizer"
468 subprocess.check_call(optimizer_call)
470 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
471 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
475 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
477 title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
478 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
479 for st in attributes:
480 meta = toc_file.makeelement(NCXNS('meta'))
482 meta.set('content', '0')
483 toc_file[0].append(meta)
484 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
485 toc_file[0][1].set('content', str(toc.depth()))
486 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
487 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))