1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
11 from copy import deepcopy
12 from lxml import etree
14 from tempfile import mkdtemp
15 from shutil import rmtree
19 from librarian import XMLNamespace, RDFNS, DCNS, WLNS, NCXNS, OPFNS, NoDublinCore
20 from librarian.dcparser import BookInfo
22 from librarian import functions
24 functions.reg_person_name()
28 """ returns node's text and children as a string
30 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
34 nt = node.text if node.text is not None else ''
35 return ''.join([nt] + [etree.tostring(child) for child in node])
def set_inner_xml(node, text):
    """ sets node's text and children from a string

    The string is parsed as the content of a temporary wrapper element,
    so it has to be a well-formed XML fragment. The leading text and the
    child elements of ``node`` are replaced by the parsed ones; its tag,
    attributes and tail are not touched.

    >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
    >>> set_inner_xml(e, 'x<b>y</b>z')
    >>> print etree.tostring(e)
    <a>x<b>y</b>z</a>
    """

    # Wrapping lets a fragment with leading text and several sibling
    # elements be parsed as a single document.
    p = etree.fromstring('<x>%s</x>' % (text,))
    node.text = p.text
    node[:] = p[:]
52 """ Find out a node's name
54 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
58 tempnode = deepcopy(node)
60 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
61 for e in tempnode.findall('.//%s' % p):
65 etree.strip_tags(tempnode, '*')
70 if isinstance(xml, etree._Element):
71 xml = etree.ElementTree(xml)
72 with open(sheet) as xsltf:
73 return xml.xslt(etree.parse(xsltf))
76 _resdir = os.path.join(os.path.dirname(os.path.realpath(__file__)), 'epub')
78 return os.path.join(_resdir, fname)
81 def replace_characters(node):
82 def replace_chars(text):
85 return text.replace("---", u"\u2014")\
86 .replace("--", u"\u2013")\
87 .replace(",,", u"\u201E")\
88 .replace('"', u"\u201D")\
89 .replace("'", u"\u2019")
90 if node.tag == 'extra':
93 node.text = replace_chars(node.text)
94 node.tail = replace_chars(node.tail)
96 replace_characters(child)
99 def find_annotations(annotations, source, part_no):
101 if child.tag in ('pe', 'pa', 'pt', 'pr'):
102 annotation = deepcopy(child)
103 number = str(len(annotations)+1)
104 annotation.set('number', number)
105 annotation.set('part', str(part_no))
107 annotations.append(annotation)
112 if child.tag not in ('extra', 'podtytul'):
113 find_annotations(annotations, child, part_no)
def replace_by_verse(tree):
    """ Find stanzas and create new verses in place of a '/' character

    Every 'strofa' element found below ``tree`` is rewritten in place.
    Its serialized content is split on '/' followed by a newline and each
    piece is wrapped in <wers_normalny>, unless the piece already starts
    with a '<wers' or '<extra' tag. 'slowo_obce' and 'wyroznienie'
    children which span several verses are split first, the same way
    (there only pieces starting with '<wers' stay unwrapped).
    """

    separator = '/\n'

    def versify(pieces, verbatim_prefixes):
        """ joins pieces, wrapping the ones which are not elements yet """
        return ''.join(
            piece if piece.startswith(verbatim_prefixes)
            else ''.join(('<wers_normalny>', piece, '</wers_normalny>'))
            for piece in pieces)

    stanzas = tree.findall('.//' + WLNS('strofa'))
    for node in stanzas:
        for child_node in node:
            if child_node.tag in ('slowo_obce', 'wyroznienie'):
                foreign_verses = inner_xml(child_node).split(separator)
                if len(foreign_verses) > 1:
                    set_inner_xml(child_node,
                                  versify(foreign_verses, ('<wers',)))
        verses = inner_xml(node).split(separator)
        # NOTE(review): this guard mirrors the one used for the inline
        # elements above; the listing this was restored from had lost the
        # line, so confirm stanzas without a separator should stay as is.
        if len(verses) > 1:
            set_inner_xml(node, versify(verses, ('<wers', '<extra')))
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    manifest: the manifest element the new item is appended to
    partno: number of the chunk; the item gets the id 'part<partno>'
        and points to 'part<partno>.html'
    """

    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        # the same string is used as idref by add_to_spine()
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    spine: the spine element the new itemref is appended to
    partno: number of the chunk; the itemref refers to 'part<partno>'
    """

    e = spine.makeelement(OPFNS('itemref'),
                          attrib={'idref': 'part%d' % partno})
    spine.append(e)
class TOC(object):
    """ Table of contents kept as a tree of entries

    name: label of the entry
    part_number: number N of the partN.html file the entry points to
    sub_number: number N of the '#subN' anchor inside that file, or None
        if the entry points to the file itself
    children: nested entries, in the order they were added
    """

    def __init__(self, name=None, part_number=None):
        self.children = []
        self.name = name
        self.part_number = part_number
        self.sub_number = None

    def add(self, name, part_number, level=0, is_part=True):
        """ Adds a new entry and returns its sub_number

        level: how many levels below this entry the new one belongs; the
            request is handed down to the last child as long as there is
            one, otherwise the entry is added right here
        is_part: False for entries which get an anchor inside a file;
            only those receive a sub_number, for others None is returned
        """
        if level > 0 and self.children:
            return self.children[-1].add(name, part_number, level - 1,
                                         is_part)
        t = TOC(name, part_number)
        self.children.append(t)
        if not is_part:
            # counted after the append above, so numbering starts at 2
            t.sub_number = len(self.children) + 1
        return t.sub_number

    def append(self, toc):
        """ Adds a ready TOC object as the last child """
        self.children.append(toc)

    def extend(self, toc):
        """ Adds the children of another TOC, skipping its own entry """
        self.children.extend(toc.children)

    def depth(self):
        """ Returns the number of nesting levels below this entry """
        # max() raises ValueError for an empty sequence
        if not self.children:
            return 0
        return max(c.depth() for c in self.children) + 1

    def write_to_xml(self, nav_map, counter):
        """ Writes the children as navPoint elements

        nav_map: element the navPoints are appended to
        counter: number used for the id and playOrder of the first
            navPoint written; descendants continue the numbering

        Returns the first number which has not been used.
        """
        for child in self.children:
            nav_point = nav_map.makeelement(NCXNS('navPoint'))
            nav_point.set('id', 'NavPoint-%d' % counter)
            nav_point.set('playOrder', str(counter))

            nav_label = nav_map.makeelement(NCXNS('navLabel'))
            text = nav_map.makeelement(NCXNS('text'))
            text.text = child.name
            nav_label.append(text)
            nav_point.append(nav_label)

            content = nav_map.makeelement(NCXNS('content'))
            src = 'part%d.html' % child.part_number
            if child.sub_number is not None:
                src += '#sub%d' % child.sub_number
            content.set('src', src)
            nav_point.append(content)
            nav_map.append(nav_point)
            # nested entries go inside the navPoint just written
            counter = child.write_to_xml(nav_point, counter + 1)
        return counter
def used_chars(element):
    """ Lists characters used in an ETree Element

    Returns a set with every character which occurs in the text or tail
    of the element itself or of any element below it.
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # grow the set in place instead of copying it for every child
        chars.update(used_chars(child))
    return chars
224 """ divide main content of the XML file into chunks """
226 # prepare a container for each chunk
227 part_xml = etree.Element('utwor')
228 etree.SubElement(part_xml, 'master')
229 main_xml_part = part_xml[0] # master
231 last_node_part = False
232 for one_part in main_text:
234 if name == 'naglowek_czesc':
236 last_node_part = True
237 main_xml_part[:] = [deepcopy(one_part)]
238 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
240 main_xml_part[:] = [deepcopy(one_part)]
242 main_xml_part.append(deepcopy(one_part))
243 last_node_part = False
def transform_chunk(chunk_xml, chunk_no, annotations):
    """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters

    chunk_xml: root element of the chunk; the headings are looked for
        among the children of its first child
    chunk_no: number of the chunk, stored in the TOC entries and in the
        annotations collected from the chunk
    annotations: element collecting annotations, passed on to
        find_annotations()
    """

    part_headings = ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt",
                     "srodtytul")
    sub_headings = ('naglowek_podrozdzial', 'naglowek_scena')

    toc = TOC()
    for element in chunk_xml[0]:
        if element.tag in part_headings:
            toc.add(node_name(element), chunk_no)
        elif element.tag in sub_headings:
            subnumber = toc.add(node_name(element), chunk_no, level=1,
                                is_part=False)
            # stored on the element so the number is available to the
            # stylesheet applied below
            element.set('sub', str(subnumber))
    # both calls below modify chunk_xml before it is transformed
    find_annotations(annotations, chunk_xml, chunk_no)
    replace_by_verse(chunk_xml)
    html_tree = xslt(chunk_xml, res('xsltScheme.xsl'))
    chars = used_chars(html_tree.getroot())
    output_html = etree.tostring(html_tree, pretty_print=True)
    return output_html, toc, chars
265 def transform(provider, slug=None, file_path=None, output_file=None, output_dir=None, make_dir=False, verbose=False):
266 """ produces a EPUB file
268 provider: a DocProvider
269 slug: slug of file to process, available by provider
270 output_file: file-like object or path to output file
271 output_dir: path to directory to save output file to; either this or output_file must be present
272 make_dir: writes output to <output_dir>/<author>/<slug>.epub instead of <output_dir>/<slug>.epub
275 def transform_file(input_xml, chunk_counter=1, first=True):
276 """ processes one input file and proceeds to its children """
278 children = [child.text for child in input_xml.findall('.//'+DCNS('relation.hasPart'))]
280 # every input file will have a TOC entry,
281 # pointing to starting chunk
282 toc = TOC(node_name(input_xml.find('.//'+DCNS('title'))), chunk_counter)
285 # write book title page
286 html_tree = xslt(input_xml, res('xsltTitle.xsl'))
287 chars = used_chars(html_tree.getroot())
288 zip.writestr('OPS/title.html',
289 etree.tostring(html_tree, pretty_print=True))
291 # write title page for every parent
292 html_tree = xslt(input_xml, res('xsltChunkTitle.xsl'))
293 chars = used_chars(html_tree.getroot())
294 zip.writestr('OPS/part%d.html' % chunk_counter,
295 etree.tostring(html_tree, pretty_print=True))
296 add_to_manifest(manifest, chunk_counter)
297 add_to_spine(spine, chunk_counter)
300 if len(input_xml.getroot()) > 1:
301 # rdf before style master
302 main_text = input_xml.getroot()[1]
304 # rdf in style master
305 main_text = input_xml.getroot()[0]
306 if main_text.tag == RDFNS('RDF'):
309 if main_text is not None:
310 replace_characters(main_text)
312 for chunk_xml in chop(main_text):
313 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations)
314 toc.extend(chunk_toc)
315 chars = chars.union(chunk_chars)
316 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
317 add_to_manifest(manifest, chunk_counter)
318 add_to_spine(spine, chunk_counter)
322 for child in children:
323 child_xml = etree.parse(provider.by_uri(child))
324 child_toc, chunk_counter, chunk_chars = transform_file(child_xml, chunk_counter, first=False)
325 toc.append(child_toc)
326 chars = chars.union(chunk_chars)
328 return toc, chunk_counter, chars
330 # read metadata from the first file
333 raise ValueError('slug or file_path should be specified, not both')
334 f = open(file_path, 'r')
335 input_xml = etree.parse(f)
339 raise ValueError('either slug or file_path should be specified')
340 input_xml = etree.parse(provider[slug])
342 metadata = input_xml.find('.//'+RDFNS('Description'))
344 raise NoDublinCore('Document has no DublinCore - which is required.')
345 book_info = BookInfo.from_element(input_xml)
346 metadata = etree.ElementTree(metadata)
348 # if output to dir, create the file
349 if output_dir is not None:
351 author = unicode(book_info.author)
352 output_dir = os.path.join(output_dir, author)
354 os.makedirs(output_dir)
358 output_file = open(os.path.join(output_dir, '%s.epub' % slug), 'w')
360 output_file = open(os.path.join(output_dir, os.path.splitext(os.path.basename(file_path))[0] + '.epub'), 'w')
362 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
364 # write static elements
365 mime = zipfile.ZipInfo()
366 mime.filename = 'mimetype'
367 mime.compress_type = zipfile.ZIP_STORED
369 zip.writestr(mime, 'application/epub+zip')
370 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
371 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
372 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
373 'media-type="application/oebps-package+xml" />' \
374 '</rootfiles></container>')
375 for fname in 'style.css', 'logo_wolnelektury.png':
376 zip.write(res(fname), os.path.join('OPS', fname))
378 opf = xslt(metadata, res('xsltContent.xsl'))
379 manifest = opf.find('.//' + OPFNS('manifest'))
380 spine = opf.find('.//' + OPFNS('spine'))
382 annotations = etree.Element('annotations')
384 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
385 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
386 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
387 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
388 '<navPoint id="NavPoint-1" playOrder="1"><navLabel>' \
389 '<text>Strona tytułowa</text></navLabel><content src="title.html" />' \
390 '</navPoint></navMap></ncx>')
391 nav_map = toc_file[-1]
393 toc, chunk_counter, chars = transform_file(input_xml)
396 toc.add(u"Początek utworu", 1)
397 toc_counter = toc.write_to_xml(nav_map, 2)
399 # Last modifications in container files and EPUB creation
400 if len(annotations) > 0:
401 nav_map.append(etree.fromstring(
402 '<navPoint id="NavPoint-%(i)d" playOrder="%(i)d" ><navLabel><text>Przypisy</text>'\
403 '</navLabel><content src="annotations.html" /></navPoint>' % {'i': toc_counter}))
404 manifest.append(etree.fromstring(
405 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
406 spine.append(etree.fromstring(
407 '<itemref idref="annotations" />'))
408 replace_by_verse(annotations)
409 html_tree = xslt(annotations, res("xsltAnnotations.xsl"))
410 chars = chars.union(used_chars(html_tree.getroot()))
411 zip.writestr('OPS/annotations.html', etree.tostring(
412 html_tree, pretty_print=True))
415 tmpdir = mkdtemp('-librarian-epub')
418 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
419 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
420 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'), res('../fonts/' + fname), os.path.join(tmpdir, fname)]
422 print "Running font-optimizer"
423 subprocess.check_call(optimizer_call)
425 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
426 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
430 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
432 title = node_name(etree.ETXPath('.//'+DCNS('title'))(input_xml)[0])
433 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
434 for st in attributes:
435 meta = toc_file.makeelement(NCXNS('meta'))
437 meta.set('content', '0')
438 toc_file[0].append(meta)
439 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
440 toc_file[0][1].set('content', str(toc.depth()))
441 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
442 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))