# -*- coding: utf-8 -*-
#
# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from __future__ import with_statement

import os
import subprocess
import zipfile
from copy import deepcopy
from shutil import rmtree
from tempfile import mkdtemp

from lxml import etree

from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
from librarian import functions
from librarian.dcparser import BookInfo
# Import-time side effect: run librarian.functions.reg_person_name().
# NOTE(review): presumably this registers an extension function used by the
# XSLT sheets applied below -- confirm in librarian.functions.
functions.reg_person_name()
def inner_xml(node):
    """ Return node's text and children serialized as a string.

    The node's own tag is not part of the result; every child is serialized
    with etree.tostring().

    >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
    x<b>y</b>z
    """
    # A node without leading text has text == None; treat it as empty.
    nt = node.text or ''
    return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ Set node's text and children from a string.

    `text` is parsed as the content of a temporary wrapper element, so it
    must be well-formed XML content; node's previous text and children are
    replaced by the parsed ones.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """
    # The <x> wrapper only exists to give the fragment a single root.
    p = etree.fromstring('<x>%s</x>' % text)
    node.text = p.text
    node[:] = p[:]
def node_name(node):
    """ Find out a node's name: its text content without any markup.

    Footnotes ('pe', 'pa', 'pt', 'pr') and 'motyw' elements are emptied
    first, so their content does not end up in the name. The node passed in
    is not modified; the work is done on a deep copy.

    >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
    XYZ
    """
    tempnode = deepcopy(node)

    for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
        for e in tempnode.findall('.//%s' % p):
            # clear() also drops the tail, which belongs to the surrounding
            # text, so keep it and put it back.
            t = e.tail
            e.clear()
            e.tail = t
    # Flatten all remaining tags, leaving just the text.
    etree.strip_tags(tempnode, '*')
    return tempnode.text
def xslt(xml, sheet):
    """ Apply an XSLT stylesheet to a document and return the result tree.

    xml: an lxml element or element tree
    sheet: path to the stylesheet file
    """
    # .xslt() is called on a tree, so wrap a bare element first.
    if isinstance(xml, etree._Element):
        xml = etree.ElementTree(xml)
    with open(sheet) as xsltf:
        return xml.xslt(etree.parse(xsltf))
# Directory with EPUB resources: the 'epub' directory next to this module.
_resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')


def res(fname):
    """ Return the path of resource file `fname` inside the 'epub' directory. """
    return os.path.join(_resdir, fname)
def replace_characters(node):
    """ Replace ASCII punctuation with typographic characters, in place.

    Walks `node` and its descendants, rewriting text and tail of every
    element. An 'extra' element is not rewritten: it is cleared instead and
    its subtree is not visited.
    """
    def replace_chars(text):
        # text/tail are None when there is no text at that position.
        if text is None:
            return None
        # Order matters: '---' has to be handled before '--'.
        return text.replace(u"\ufeff", u"")\
                   .replace("---", u"\u2014")\
                   .replace("--", u"\u2013")\
                   .replace(",,", u"\u201E")\
                   .replace('"', u"\u201D")\
                   .replace("'", u"\u2019")

    if node.tag == 'extra':
        node.clear()
    else:
        node.text = replace_chars(node.text)
        node.tail = replace_chars(node.tail)
        for child in node:
            replace_characters(child)
def find_annotations(annotations, source, part_no):
    """ Collect footnotes from `source` into `annotations`, in place.

    Every 'pe', 'pa', 'pt' or 'pr' descendant of `source` is copied into
    `annotations` with a 'number' attribute (1-based position in
    `annotations`) and a 'part' attribute (`part_no`). The element left in
    the text is emptied and gets the number as its only content.
    Subtrees of 'extra' and 'podtytul' elements are not searched.
    """
    for child in source:
        if child.tag in ('pe', 'pa', 'pt', 'pr'):
            annotation = deepcopy(child)
            number = str(len(annotations) + 1)
            annotation.set('number', number)
            annotation.set('part', str(part_no))
            # The tail is text following the note, not a part of it.
            annotation.tail = ''
            annotations.append(annotation)
            # Replace the note in the text with a bare number, keeping the
            # text that follows it.
            tail = child.tail
            child.clear()
            child.tail = tail
            child.text = number
        if child.tag not in ('extra', 'podtytul'):
            find_annotations(annotations, child, part_no)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    A '/' directly followed by a newline separates verses. Inside every
    stanza the separated fragments are wrapped in <wers_normalny>, unless a
    fragment already starts with a '<wers...' (or, at stanza level,
    '<extra...') tag. 'slowo_obce' and 'wyroznienie' children spanning
    several verses are processed first. The tree is modified in place.
    """
    # NOTE(review): stanzas are looked up in the WLNS namespace while the
    # child tags are compared without a namespace -- confirm this is intended.
    for stanza in tree.findall('.//' + WLNS('strofa')):
        for child_node in stanza:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                foreign_verses = inner_xml(child_node).split('/\n')
                if len(foreign_verses) > 1:
                    set_inner_xml(child_node,
                                  _wrap_loose_verses(foreign_verses, ('<wers',)))
        verses = inner_xml(stanza).split('/\n')
        if len(verses) > 1:
            set_inner_xml(stanza,
                          _wrap_loose_verses(verses, ('<wers', '<extra')))


def _wrap_loose_verses(fragments, passthrough):
    """ Join fragments, wrapping in <wers_normalny> those which do not start with one of the `passthrough` prefixes. """
    wrapped = []
    for fragment in fragments:
        if fragment.startswith(passthrough):
            wrapped.append(fragment)
        else:
            wrapped.append(''.join(('<wers_normalny>', fragment, '</wers_normalny>')))
    return ''.join(wrapped)
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    The item describes the chunk file 'part<partno>.html'; its id is
    'part<partno>', the value add_to_spine() uses as idref.
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    The itemref points at the item with id 'part<partno>'.
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    spine.append(e)
class TOC(object):
    """ A node of the table of contents tree.

    Every node has a name, the number of the chunk file it points to
    (part_number), an optional anchor number inside that file (sub_number)
    and a list of child nodes.
    """

    def __init__(self, name=None, part_number=None):
        self.children = []
        self.name = name
        self.part_number = part_number
        self.sub_number = None

    def add(self, name, part_number, level=0, is_part=True):
        """ Add an entry `level` levels below this node.

        The entry goes under the most recently added child at each level;
        when there is no child to descend into, it is added right here.
        For is_part=False the entry gets a sub_number, which is returned;
        otherwise the result is None.
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level - 1, is_part)

        t = TOC(name)
        t.part_number = part_number
        self.children.append(t)
        if not is_part:
            # Computed after the append, so numbering of subsections starts at 2.
            t.sub_number = len(self.children) + 1
            return t.sub_number

    def append(self, toc):
        """ Add `toc` itself as the last child. """
        self.children.append(toc)

    def extend(self, toc):
        """ Add all children of `toc` as children of this node. """
        self.children.extend(toc.children)

    def depth(self):
        """ Return the number of levels below this node (0 for a leaf). """
        # max() of an empty sequence raises ValueError, hence the guard.
        if not self.children:
            return 0
        return max(c.depth() for c in self.children) + 1

    def write_to_xml(self, nav_map, counter):
        """ Append a navPoint for every descendant to `nav_map`.

        `counter` numbers the navPoints (id and playOrder) in document
        order; the first unused number is returned.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            src = 'part%d.html' % child.part_number
            if child.sub_number is not None:
                src += '#sub%d' % child.sub_number
            content.set('src', src)
            nav_point.append(content)
            nav_map.append(nav_point)
            # Children are nested inside their parent's navPoint.
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set of the characters found in text and tail of the element
    and of all its descendants.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # Update in place instead of building a new set for every child.
        chars.update(used_chars(child))
    return chars
def chop(main_text):
    """ divide main content of the XML file into chunks

    Generator. A new chunk starts at every 'naglowek_czesc', and at every
    'naglowek_rozdzial', 'naglowek_akt' or 'srodtytul' which does not
    directly follow a 'naglowek_czesc'.

    The same <utwor><master/></utwor> container is yielded every time and
    refilled afterwards, so a chunk has to be consumed before asking for the
    next one. A chunk may have no content when the text begins with a
    header.
    """

    # prepare a container for each chunk
    part_xml = etree.Element('utwor')
    etree.SubElement(part_xml, 'master')
    main_xml_part = part_xml[0] # master

    # True while the previous element was a part header, so that a chapter
    # header right after it stays in the same chunk.
    last_node_part = False
    for one_part in main_text:
        name = one_part.tag
        if name == 'naglowek_czesc':
            yield part_xml
            last_node_part = True
            main_xml_part[:] = [deepcopy(one_part)]
        elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            yield part_xml
            main_xml_part[:] = [deepcopy(one_part)]
        else:
            main_xml_part.append(deepcopy(one_part))
            last_node_part = False
    yield part_xml
def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: chunk container; its first child holds the chunk's content
    chunk_no: number of the chunk, used for TOC entries and annotations
    annotations: element collecting footnotes found in the chunk
    empty: if true, the content is replaced with the emptyChunk.html
        placeholder (TOC entries are still created)
    _empty_html_static: not meant to be passed; the mutable default is a
        deliberate cache, so the placeholder file is read only once
    """
    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
            toc.add(node_name(element), chunk_no)
        elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
            subnumber = toc.add(node_name(element), chunk_no, level=1, is_part=False)
            # Stored on the element; the TOC links to it as '#sub<number>'.
            element.set('sub', str(subnumber))

    if empty:
        if not _empty_html_static:
            # Close the file right away instead of leaking the handle.
            with open(res('emptyChunk.html')) as empty_file:
                _empty_html_static.append(empty_file.read())
        chars = set()
        output_html = _empty_html_static[0]
    else:
        find_annotations(annotations, chunk_xml, chunk_no)
        replace_by_verse(chunk_xml)
        html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
        chars = used_chars(html_tree.getroot())
        output_html = etree.tostring(html_tree, method="html", pretty_print=True)
    return output_html, toc, chars
def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False, sample=None):
    """ produces a EPUB file

    provider: a DocProvider
    slug: slug of file to process, available by provider
    file_path: path of a file to process; give either this or slug, not both
    output_file: file-like object or path to output file
    output_dir: path to directory to save output file to; either this or output_file must be present
    make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
    verbose: if true, the font optimizer's output is not captured
    sample=n: generate sample e-book (with at least n paragraphs)

    Raises ValueError for a wrong combination of arguments and NoDublinCore
    when the document has no RDF Description.
    """

    def transform_file(input_xml, chunk_counter=1, first=True, sample=None):
        """ processes one input file and proceeds to its children

        Returns the file's TOC, the next free chunk number, the set of
        characters used and the remaining sample budget.
        """

        replace_characters(input_xml.getroot())

        children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]

        # every input file will have a TOC entry,
        # pointing to starting chunk
        toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
        chars = set()
        if first:
            # write book title page
            html_tree = xslt(input_xml, res('xsltTitle.xsl'))
            chars = used_chars(html_tree.getroot())
            zip.writestr('OPS/title.html',
                 etree.tostring(html_tree, method="html", pretty_print=True))
        elif children:
            # write title page for every parent
            if sample is not None and sample <= 0:
                chars = set()
                with open(res('emptyChunk.html')) as empty_file:
                    html_string = empty_file.read()
            else:
                html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
                chars = used_chars(html_tree.getroot())
                html_string = etree.tostring(html_tree, method="html", pretty_print=True)
            zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
            add_to_manifest(manifest, chunk_counter)
            add_to_spine(spine, chunk_counter)
            chunk_counter += 1

        if len(input_xml.getroot()) > 1:
            # rdf before style master
            main_text = input_xml.getroot()[1]
        else:
            # rdf in style master
            main_text = input_xml.getroot()[0]
            if main_text.tag == RDFNS('RDF'):
                # only metadata in this file, no text to process
                main_text = None

        if main_text is not None:
            for chunk_xml in chop(main_text):
                empty = False
                if sample is not None:
                    # once the sample budget is used up, chunks come out empty
                    if sample <= 0:
                        empty = True
                    else:
                        sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
                chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)

                toc.extend(chunk_toc)
                chars = chars.union(chunk_chars)
                zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
                add_to_manifest(manifest, chunk_counter)
                add_to_spine(spine, chunk_counter)
                chunk_counter += 1

        for child in children:
            child_xml = etree.parse(provider.by_uri(child))
            child_toc, chunk_counter, chunk_chars, sample = transform_file(child_xml, chunk_counter, first=False, sample=sample)
            toc.append(child_toc)
            chars = chars.union(chunk_chars)

        return toc, chunk_counter, chars, sample

    # read metadata from the first file
    if file_path:
        if slug:
            raise ValueError('slug or file_path should be specified, not both')
        with open(file_path, 'r') as f:
            input_xml = etree.parse(f)
    else:
        if not slug:
            raise ValueError('either slug or file_path should be specified')
        input_xml = etree.parse(provider[slug])

    metadata = input_xml.find('.//'+RDFNS('Description'))
    if metadata is None:
        raise NoDublinCore('Document has no DublinCore - which is required.')
    book_info = BookInfo.from_element(input_xml)
    metadata = etree.ElementTree(metadata)

    # if output to dir, create the file
    opened_here = None
    if output_dir is not None:
        if make_dir:
            author = unicode(book_info.author)
            output_dir = os.path.join(output_dir, author)
            try:
                os.makedirs(output_dir)
            except OSError:
                # an already existing directory is fine, anything else is not
                if not os.path.isdir(output_dir):
                    raise
        if slug:
            output_path = os.path.join(output_dir, '%s.epub' % slug)
        else:
            output_path = os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub')
        # binary mode: this is a ZIP archive, not text
        output_file = opened_here = open(output_path, 'wb')
    elif output_file is None:
        raise ValueError('either output_file or output_dir should be specified')

    # NOTE: the name shadows the builtin zip(); kept, as transform_file()
    # refers to it.
    zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
    try:
        # write static elements
        # the mimetype entry goes first and is stored uncompressed
        mime = zipfile.ZipInfo()
        mime.filename = 'mimetype'
        mime.compress_type = zipfile.ZIP_STORED
        zip.writestr(mime, 'application/epub+zip')
        zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
                           'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
                           '<rootfiles><rootfile full-path="OPS/content.opf" ' \
                           'media-type="application/oebps-package+xml" />' \
                           '</rootfiles></container>')
        for fname in 'style.css', 'logo_wolnelektury.png':
            zip.write(res(fname), os.path.join('OPS', fname))

        opf = xslt(metadata, res('xsltContent.xsl'))
        manifest = opf.find('.//' + OPFNS('manifest'))
        spine = opf.find('.//' + OPFNS('spine'))

        annotations = etree.Element('annotations')

        toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
                                   '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
                                   '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
                                   'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
                                   '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
                                   '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
                                   '</navPoint></navMap></ncx>')
        nav_map = toc_file[-1]

        toc, chunk_counter, chars, sample = transform_file(input_xml, sample=sample)

        # a text without any headers still gets one entry, for the first chunk
        if not toc.children:
            toc.add(u"Początek utworu", 1)
        # NavPoint-1 is the title page, so numbering continues from 2
        toc_counter = toc.write_to_xml(nav_map, 2)

        # Last modifications in container files and EPUB creation
        if len(annotations) > 0:
            nav_map.append(etree.fromstring(
                '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
                '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
            manifest.append(etree.fromstring(
                '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
            spine.append(etree.fromstring(
                '<itemref idref="annotations" />'))
            replace_by_verse(annotations)
            html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
            chars = chars.union(used_chars(html_tree.getroot()))
            zip.writestr('OPS/annotations.html', etree.tostring(
                                html_tree, method="html", pretty_print=True))

        # strip fonts: embed only the characters which are really used
        tmpdir = mkdtemp('-librarian-epub')
        cwd = os.getcwd()
        try:
            # subset.pl is run from the font-optimizer directory
            os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
            chars_arg = ''.join(chars).encode('utf-8')
            for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
                optimizer_call = ['perl', 'subset.pl', '--chars', chars_arg, res('../fonts/' + fname), os.path.join(tmpdir, fname)]
                if verbose:
                    print("Running font-optimizer")
                    subprocess.check_call(optimizer_call)
                else:
                    # NOTE(review): the pipes are never read; a very chatty
                    # child process could block on a full pipe.
                    subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
                zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
        finally:
            # restore the working directory and clean up, also on failure
            os.chdir(cwd)
            rmtree(tmpdir)

        zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))

        title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
        attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
        for st in attributes:
            meta = toc_file.makeelement(NCXNS('meta'))
            meta.set('name', st)
            meta.set('content', '0')
            toc_file[0].append(meta)
        # the first two metas are dtb:uid and dtb:depth
        toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
        toc_file[0][1].set('content', str(toc.depth()))
        set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
        zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
    finally:
        # finish the archive; closing an already closed ZipFile is harmless
        zip.close()
        if opened_here is not None:
            opened_here.close()