1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
8 from copy import deepcopy
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp
17 from shutil import rmtree
21 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, XHTMLNS, NoDublinCore
22 from librarian.dcparser import BookInfo
23 from librarian.cover import ImageCover
25 from librarian import functions, get_resource
# Executed once, as a side effect of importing this module.
# NOTE(review): presumably registers a person-name helper that the XSLT
# stylesheets used below rely on -- confirm in librarian.functions.
27 functions.reg_person_name()
def inner_xml(node):
    """ returns node's text and children as a string

    The result is the node's leading text followed by the serialization of
    every child; each child's serialization includes its tail text.

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # NOTE(review): the ``def`` line and the end of the docstring were absent
    # from the listing this was restored from; the name and the parameter are
    # taken from the doctest and from the body.
    nt = node.text if node.text is not None else ''
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    ``text`` must be a well-formed XML fragment: it is wrapped in a temporary
    ``<x>`` element and parsed, so a malformed fragment raises the parser's
    syntax error.  The node's tag, attributes and tail are left untouched.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    p = etree.fromstring('<x>%s</x>' % text)
    # NOTE(review): the two assignments below were missing from the listing;
    # they are what the docstring and the doctest above require.
    node.text = p.text
    node[:] = p[:]
def node_name(node):
    """ Find out a node's name

    Works on a deep copy, so ``node`` itself is not modified.  Returns the
    text left after annotation ('pe', 'pa', 'pt', 'pr') and 'motyw'
    descendants have been emptied and all remaining markup flattened.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    tempnode = deepcopy(node)

    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            # NOTE(review): this loop body was not visible in the listing.
            # Emptying the element while keeping its tail is a reconstruction
            # -- confirm against the project history.
            t = e.tail
            e.clear()
            e.tail = t
    # Flatten every remaining tag so that only text is left in the copy.
    etree.strip_tags(tempnode, '*')
    return tempnode.text
def xslt(xml, sheet):
    """ applies an XSLT stylesheet to a document

    xml: an lxml element or element tree; a bare element is wrapped in an
        ElementTree first
    sheet: path to the stylesheet file

    Returns the result of ``ElementTree.xslt``.
    """
    # NOTE(review): the ``def`` line was absent from the listing; the name and
    # the parameter order follow the calls ``xslt(tree, get_resource(...))``.
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
def replace_characters(node):
    """ Replaces ASCII punctuation with typographic characters, in place

    Applied recursively to the text and tail of ``node`` and of all its
    descendants.  Returns None.
    """
    def replace_chars(text):
        # text/tail of an element may be None; leave it as it is
        if text is None:
            return None
        # Order matters: "---" has to be handled before "--".
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag in ('uwaga', 'extra'):
        # NOTE(review): the body of this branch was not visible in the
        # listing.  Dropping the element's content while keeping its tail is
        # a reconstruction -- confirm against the project history.
        t = node.tail
        node.clear()
        node.tail = t
    node.text = replace_chars(node.text)
    node.tail = replace_chars(node.tail)
    for child in node:
        replace_characters(child)
def find_annotations(annotations, source, part_no):
    """ Collects annotations found below ``source``, numbering them

    annotations: element receiving a copy of every annotation found; its
        current length determines the next number
    source: element whose descendants are searched; 'extra' and 'uwaga'
        subtrees are not entered
    part_no: number of the chunk, stored on every copy as the 'part' attribute

    Each copy gets 'number' and 'part' attributes.  Returns None.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations)+1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # NOTE(review): resetting the copy's tail and the in-place
            # replacement of the original by its number (the four statements
            # after ``append``) were not visible in the listing; they are a
            # reconstruction -- confirm against the project history.
            annotation.tail = ''
            annotations.append(annotation)
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'uwaga'):
            find_annotations(annotations, child, part_no)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    For every stanza ('strofa') below ``tree`` the inner XML is split on a
    slash followed by a newline.  When that yields more than one piece, each
    piece is wrapped in <wers_normalny> unless it already starts with a verse
    tag (or, at stanza level, with <extra>).  'slowo_obce' and 'wyroznienie'
    children of a stanza are processed the same way first.  Modifies the tree
    in place and returns None.
    """
    def wrap_verses(pieces, untouched_prefixes):
        # Joins the pieces back, wrapping those that are not verses already.
        return ''.join(
            piece if piece.startswith(untouched_prefixes)
            else ''.join(('<wers_normalny>', piece, '</wers_normalny>'))
            for piece in pieces)

    # NOTE(review): the loop headers, the ``else:`` lines and the second
    # ``len(...) > 1`` guard were absent from the listing; they are restored
    # from the structure of the statements that were visible.
    for node in tree.findall('.//' + WLNS('strofa')):
        for child_node in node:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                foreign_verses = inner_xml(child_node).split('/\n')
                if len(foreign_verses) > 1:
                    set_inner_xml(child_node,
                                  wrap_verses(foreign_verses, ('<wers',)))
        verses = inner_xml(node).split('/\n')
        if len(verses) > 1:
            set_inner_xml(node, wrap_verses(verses, ('<wers', '<extra')))
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    manifest: the manifest element to extend
    partno: number of the part; the item points at 'part<partno>.html'
    """
    partstr = 'part%d' % partno
    # NOTE(review): the 'id' entry, the end of the dict literal and the
    # append were absent from the listing.  The id value mirrors the
    # 'part%d' idref that add_to_spine() writes -- confirm.
    e = manifest.makeelement(OPFNS('item'), attrib={
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    spine: the spine element to extend
    partno: number of the part; the itemref refers to id 'part<partno>'
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    # NOTE(review): the append was absent from the listing; without it the
    # element would be created and then dropped.
    spine.append(e)
class TOC(object):
    """ A node of the table of contents tree

    name: label of the entry (None for a container without a label)
    part_number: number of the 'part<N>.html' file the entry points to
    sub_number: number of the '#sub<N>' anchor inside that file, or None
    children: nested TOC entries, in document order

    NOTE(review): the class line, two statements of __init__, the branches of
    add(), the header and fallback of depth() and the return of
    write_to_xml() were absent from the listing this was restored from; they
    are reconstructed from how the visible statements and the callers use
    them.
    """

    def __init__(self, name=None, part_number=None):
        self.children = []
        self.name = name
        self.part_number = part_number
        self.sub_number = None

    def add(self, name, part_number, level=0, is_part=True):
        """ Adds an entry ``level`` levels below this one

        The entry goes under the last child on each level; when there is no
        child to descend into, it is added right here.  For is_part=False the
        entry gets a sub_number, which is also returned; otherwise None is
        returned.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level-1, is_part)
        t = TOC(name)
        t.part_number = part_number
        self.children.append(t)
        if not is_part:
            # counted after the append, so the first sub-entry gets number 2
            t.sub_number = len(self.children) + 1
            return t.sub_number

    def append(self, toc):
        """ Adds ``toc`` as the last child """
        self.children.append(toc)

    def extend(self, toc):
        """ Adds all children of ``toc`` as children of this entry """
        self.children.extend(toc.children)

    def depth(self):
        """ Returns the number of levels below this entry (0 for a leaf) """
        if self.children:
            return max((c.depth() for c in self.children)) + 1
        return 0

    def write_to_xml(self, nav_map, counter):
        """ Writes children as navPoint elements appended to ``nav_map``

        counter: number used for the id and playOrder of the first navPoint

        Nested entries are written recursively.  Returns the first unused
        number.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            src = 'part%d.html' % child.part_number
            if child.sub_number is not None:
                src += '#sub%d' % child.sub_number
            content.set('src', src)
            nav_point.append(content)
            nav_map.append(nav_point)
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set with every character found in the text and tail of
    ``element`` and of all its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # in-place update: no need to build a new set for every child
        chars.update(used_chars(child))
    # The recursion above depends on this return value; the statement was
    # absent from the listing this was restored from.
    return chars
224 """ divide main content of the XML file into chunks """
226 # prepare a container for each chunk
227 part_xml = etree.Element('utwor')
228 etree.SubElement(part_xml, 'master')
229 main_xml_part = part_xml[0] # master
231 last_node_part = False
232 for one_part in main_text:
234 if name == 'naglowek_czesc':
236 last_node_part = True
237 main_xml_part[:] = [deepcopy(one_part)]
238 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
240 main_xml_part[:] = [deepcopy(one_part)]
242 main_xml_part.append(deepcopy(one_part))
243 last_node_part = False
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk container; the headers are read from its first child
    chunk_no: number of the chunk, used for the TOC entries and annotations
    annotations: element collecting the annotations found in the chunk
    empty: if true, the static empty-chunk page is returned instead of the
        transformed content (the TOC is built either way)
    _empty_html_static: private cache of the empty-chunk page; the mutable
        default is intentional, so the file is read only once per process
    """
    # NOTE(review): the creation of ``toc``, the ``if empty:`` / ``else:``
    # lines and the ``chars`` value of the empty branch were absent from the
    # listing this was restored from; they follow from the visible statements.
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # the stylesheet can use it for the '#sub<N>' anchor
            element.set('sub', str(subnumber))

    if empty:
        if not _empty_html_static:
            # close the resource file instead of leaving it to the GC
            with open(get_resource('epub/emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
# transform(): builds an EPUB archive for one document and, through the nested
# transform_file(), for the documents listed in its relation.hasPart metadata.
# NOTE(review): this listing is an extract. The leading numbers are residue of
# the original line numbering, indentation is lost, and wherever the numbering
# skips, original lines (possibly if/else/try headers) are not shown.
271 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False,
272 sample=None, cover=None, flags=None):
273 """ produces a EPUB file
275 provider: a DocProvider
276 slug: slug of file to process, available by provider
277 output_file: file-like object or path to output file
278 output_dir: path to directory to save output file to; either this or output_file must be present
279 make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
280 sample=n: generate sample e-book (with at least n paragraphs)
281 cover: a cover.Cover object
282 flags: less-advertising, images, not-wl
# Nested worker: writes the pages of one parsed document into the archive,
# recurses into its children and returns (toc, chunk_counter, chars, sample).
# It uses zip, manifest, spine and annotations from the enclosing function.
285 def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
286 """ processes one input file and proceeds to its children """
# Typographic clean-up of the whole tree happens before anything is rendered.
288 replace_characters(input_xml.getroot())
# Texts of the relation.hasPart entries; passed to provider.by_uri() below.
290 children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
292 # every input file will have a TOC entry,
293 # pointing to starting chunk
294 toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
# NOTE(review): the conditions selecting between the book title page and the
# per-parent title page are not visible here (presumably ``first`` and
# ``children``) -- confirm.
297 # write book title page
298 html_tree = xslt(input_xml, get_resource('epub/xsltTitle.xsl'))
299 chars = used_chars(html_tree.getroot())
300 zip.writestr('OPS/title.html',
301 etree.tostring(html_tree, method="html", pretty_print=True))
303 # write title page for every parent
# Once the sample budget is used up, the static empty page is written instead.
304 if sample is not None and sample <= 0:
306 html_string = open(get_resource('epub/emptyChunk.html')).read()
308 html_tree = xslt(input_xml, get_resource('epub/xsltChunkTitle.xsl'))
309 chars = used_chars(html_tree.getroot())
310 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
311 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
312 add_to_manifest(manifest, chunk_counter)
313 add_to_spine(spine, chunk_counter)
# NOTE(review): no increment of chunk_counter is visible after the two calls
# above -- confirm it happens in an omitted line.
# Main text: second child of the root if the root has more than one child,
# otherwise its first child.
316 if len(input_xml.getroot()) > 1:
317 # rdf before style master
318 main_text = input_xml.getroot()[1]
320 # rdf in style master
321 main_text = input_xml.getroot()[0]
# NOTE(review): the statement guarded by this check is not visible; the
# "main_text is not None" test below suggests it clears main_text -- confirm.
322 if main_text.tag == RDFNS('RDF'):
325 if main_text is not None:
326 for chunk_xml in chop(main_text):
# Sample mode: the number of stanzas/paragraphs of the chunk is subtracted
# from the remaining budget. NOTE(review): how ``empty`` is derived from the
# budget is not visible here.
328 if sample is not None:
332 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
333 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
335 toc.extend(chunk_toc)
336 chars = chars.union(chunk_chars)
337 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
338 add_to_manifest(manifest, chunk_counter)
339 add_to_spine(spine, chunk_counter)
# Children are parsed and processed recursively; the chunk counter and the
# sample budget are threaded through the calls.
343 for child in children:
344 child_xml = etree.parse(provider.by_uri(child))
345 child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
346 toc.append(child_toc)
347 chars = chars.union(chunk_chars)
349 return toc, chunk_counter, chars, sample
# --- body of transform() proper ---
# The input is given either as file_path or as slug, never both.
351 # read metadata from the first file
354 raise ValueError('slug or file_path should be specified, not both')
# NOTE(review): no close() of ``f`` is visible in this extract; if the omitted
# line after the parse does not close it, the handle leaks -- confirm.
355 f = open(file_path, 'r')
356 input_xml = etree.parse(f)
360 raise ValueError('either slug or file_path should be specified')
361 input_xml = etree.parse(provider[slug])
# Every flag is exposed to the stylesheets as a root attribute set to 'yes'.
365 input_xml.getroot().set(flag, 'yes')
367 metadata = input_xml.find('.//'+RDFNS('Description'))
369 raise NoDublinCore('Document has no DublinCore - which is required.')
370 book_info = BookInfo.from_element(input_xml)
371 metadata = etree.ElementTree(metadata)
373 # if output to dir, create the file
374 if output_dir is not None:
376 author = unicode(book_info.author)
377 output_dir = os.path.join(output_dir, author)
379 os.makedirs(output_dir)
# NOTE(review): the archive file is opened in text mode ('w'); zip output is
# binary and is normally opened with 'wb' -- confirm the intent.
383 output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
385 output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
# content.opf is generated from the metadata; its manifest and spine elements
# are extended while pages are written.
387 opf = xslt(metadata, get_resource('epub/xsltContent.xsl'))
388 manifest = opf.find('.//' + OPFNS('manifest'))
389 spine = opf.find('.//' + OPFNS('spine'))
# Note: the local name ``zip`` shadows the builtin within this function.
391 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
393 # write static elements
# 'mimetype' is the first entry written and is stored without compression.
394 mime = zipfile.ZipInfo()
395 mime.filename = 'mimetype'
396 mime.compress_type = zipfile.ZIP_STORED
398 zip.writestr(mime, 'application/epub+zip')
399 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
400 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
401 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
402 'media-type="application/oebps-package+xml" />' \
403 '</rootfiles></container>')
404 zip.write(get_resource('epub/style.css'), os.path.join('OPS', 'style.css'))
# The logo is added unless the 'not-wl' flag is given.
405 if not flags or 'not-wl' not in flags:
406 manifest.append(etree.fromstring(
407 '<item id="logo_wolnelektury" href="logo_wolnelektury.png" media-type="image/png" />'))
408 zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
# Cover: ``cover`` is called with the author and the title; the image bytes
# are taken from cover_file. NOTE(review): the statement that renders the
# cover into cover_file and the enclosing condition are not visible here.
411 cover_file = StringIO()
412 c = cover(book_info.author.readable(), book_info.title)
414 c_name = 'cover.%s' % c.ext()
415 zip.writestr(os.path.join('OPS', c_name), cover_file.getvalue())
418 cover_tree = etree.parse(get_resource('epub/cover.html'))
419 cover_tree.find('//' + XHTMLNS('img')).set('src', c_name)
420 zip.writestr('OPS/cover.html', etree.tostring(
421 cover_tree, method="html", pretty_print=True))
423 manifest.append(etree.fromstring(
424 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
425 manifest.append(etree.fromstring(
426 '<item id="cover-image" href="%s" media-type="%s" />' % (c_name, c.mime_type())))
427 spine.insert(0, etree.fromstring('<itemref idref="cover" />'))
428 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
429 opf.getroot().append(etree.fromstring('<guide><reference href="cover.html" type="cover" title="Okładka"/></guide>'))
# With the 'images' flag every <ilustr> source file is copied into the archive
# and registered in the manifest. NOTE(review): ``src`` is interpolated into
# XML attributes unescaped and also used as the item id -- confirm that the
# values are always safe.
431 if flags and 'images' in flags:
432 for ilustr in input_xml.findall('//ilustr'):
433 src = ilustr.get('src')
434 mime = ImageCover(src)().mime_type()
435 zip.write(src, os.path.join('OPS', src))
436 manifest.append(etree.fromstring(
437 '<item id="%s" href="%s" media-type="%s" />' % (src, src, mime)))
438 # get it up to master
# NOTE(review): the initialisation of ``after`` and what happens to ``moved``
# and to the original element are not visible in this extract.
440 while after.getparent().tag not in ['powiesc', 'opowiadanie', 'liryka_l', 'liryka_lp', 'dramat_wierszowany_l', 'dramat_wierszowany_lp', 'dramat_wspolczesny']:
441 after = after.getparent()
442 if not(after is ilustr):
443 moved = deepcopy(ilustr)
# NOTE(review): the body of this second loop (presumably the branch without
# the 'images' flag) is not visible.
449 for ilustr in input_xml.findall('//ilustr'):
# Container for the annotations; transform_file() hands it to transform_chunk().
452 annotations = etree.Element('annotations')
# Skeleton of toc.ncx with the title page already present as NavPoint-1.
454 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
455 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
456 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
457 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
458 '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
459 '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
460 '</navPoint></navMap></ncx>')
# The last child of <ncx> in the skeleton above is <navMap>.
461 nav_map = toc_file[-1]
463 toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)
# NOTE(review): the condition guarding this fallback entry is not visible.
466 toc.add(u"Początek utworu", 1)
# Numbering starts at 2 because NavPoint-1 is the title page.
467 toc_counter = toc.write_to_xml(nav_map, 2)
469 # Last modifications in container files and EPUB creation
# The annotations page is only added when any annotation was collected.
470 if len(annotations) > 0:
471 nav_map.append(etree.fromstring(
472 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
473 '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
# NOTE(review): this navPoint and the 'last.html' one below are both numbered
# from toc_counter; an increment between them is not visible in this extract
# -- confirm that ids and playOrder values stay unique.
475 manifest.append(etree.fromstring(
476 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
477 spine.append(etree.fromstring(
478 '<itemref idref="annotations" />'))
479 replace_by_verse(annotations)
480 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
481 chars = chars.union(used_chars(html_tree.getroot()))
482 zip.writestr('OPS/annotations.html', etree.tostring(
483 html_tree, method="html", pretty_print=True))
# Final page (last.html): registered in the NCX, the manifest and the spine.
485 nav_map.append(etree.fromstring(
486 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Strona redakcyjna</text>'\
487 '</navLabel><content src="last.html" /></navPoint>' % {'i': toc_counter}))
488 manifest.append(etree.fromstring(
489 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
490 spine.append(etree.fromstring(
491 '<itemref idref="last" />'))
# A <stopka> element, when present, is renamed and rendered with the chunk
# stylesheet; the other visible rendering uses xsltLast.xsl on the whole input
# (the ``else:`` between them is not visible).
492 stopka = input_xml.find('//stopka')
493 if stopka is not None:
494 stopka.tag = 'stopka_'
495 replace_by_verse(stopka)
496 html_tree = xslt(stopka, get_resource('epub/xsltScheme.xsl'))
498 html_tree = xslt(input_xml, get_resource('epub/xsltLast.xsl'))
499 chars.update(used_chars(html_tree.getroot()))
500 zip.writestr('OPS/last.html', etree.tostring(
501 html_tree, method="html", pretty_print=True))
# Font subsetting: every font is passed through font-optimizer's subset.pl
# with the set of characters used by the generated pages.
# NOTE(review): the process-wide working directory is changed below; neither
# restoring it nor removing tmpdir is visible in this extract -- confirm.
504 tmpdir = mkdtemp('-librarian-epub')
507 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
508 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
509 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
510 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
# Two call variants follow: one leaves the tool's output on the console, the
# other redirects stdout/stderr to pipes; the selecting condition is not visible.
512 print "Running font-optimizer"
513 subprocess.check_call(optimizer_call)
515 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
516 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
# content.opf is written only now, after the manifest and spine are complete.
520 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
# NCX head: one <meta> per attribute name, content initialised to '0'; the
# first two are then overwritten with the uid and the TOC depth.
# NOTE(review): the statement setting each meta's name is not visible here.
522 title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
523 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
524 for st in attributes:
525 meta = toc_file.makeelement(NCXNS('meta'))
527 meta.set('content', '0')
528 toc_file[0].append(meta)
529 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
530 toc_file[0][1].set('content', str(toc.depth()))
# NOTE(review): ``title`` is inserted as XML markup without escaping; a title
# containing '&' or '<' would not parse -- confirm.
531 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
532 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
# NOTE(review): no zip.close() is visible after the last write; the archive is
# only valid once it has been closed -- confirm it happens in an omitted line.