1 # -*- coding: utf-8 -*-
3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
6 from __future__ import with_statement
12 from StringIO import StringIO
13 from copy import deepcopy
14 from lxml import etree
16 from tempfile import mkdtemp, NamedTemporaryFile
17 from shutil import rmtree
19 from librarian import RDFNS, WLNS, NCXNS, OPFNS, XHTMLNS, OutputFile
20 from librarian.cover import WLCover
22 from librarian import functions, get_resource
24 functions.reg_person_name()
28 """ returns node's text and children as a string
30 >>> print inner_xml(etree.fromstring('<a>x<b>y</b>z</a>'))
34 nt = node.text if node.text is not None else ''
35 return ''.join([nt] + [etree.tostring(child) for child in node])
37 def set_inner_xml(node, text):
38 """ sets node's text and children from a string
40 >>> e = etree.fromstring('<a>b<b>x</b>x</a>')
41 >>> set_inner_xml(e, 'x<b>y</b>z')
42 >>> print etree.tostring(e)
46 p = etree.fromstring('<x>%s</x>' % text)
52 """ Find out a node's name
54 >>> print node_name(etree.fromstring('<a>X<b>Y</b>Z</a>'))
58 tempnode = deepcopy(node)
60 for p in ('pe', 'pa', 'pt', 'pr', 'motyw'):
61 for e in tempnode.findall('.//%s' % p):
65 etree.strip_tags(tempnode, '*')
70 if isinstance(xml, etree._Element):
71 xml = etree.ElementTree(xml)
72 with open(sheet) as xsltf:
73 return xml.xslt(etree.parse(xsltf))
76 def replace_characters(node):
77 def replace_chars(text):
80 return text.replace(u"\ufeff", u"")\
81 .replace("---", u"\u2014")\
82 .replace("--", u"\u2013")\
83 .replace(",,", u"\u201E")\
84 .replace('"', u"\u201D")\
85 .replace("'", u"\u2019")
86 if node.tag in ('uwaga', 'extra'):
90 node.text = replace_chars(node.text)
91 node.tail = replace_chars(node.tail)
93 replace_characters(child)
96 def find_annotations(annotations, source, part_no):
98 if child.tag in ('pe', 'pa', 'pt', 'pr'):
99 annotation = deepcopy(child)
100 number = str(len(annotations)+1)
101 annotation.set('number', number)
102 annotation.set('part', str(part_no))
104 annotations.append(annotation)
109 if child.tag not in ('extra', 'uwaga'):
110 find_annotations(annotations, child, part_no)
113 class Stanza(object):
115 Converts / verse endings into verse elements in a stanza.
117 Slashes may only occur directly in the stanza. Any slashes in subelements
118 will be ignored, and the subelements will be put inside verse elements.
120 >>> s = etree.fromstring("<strofa>a/\\nb<x>x/\\ny</x>c/ \\nd</strofa>")
121 >>> Stanza(s).versify()
122 >>> print etree.tostring(s)
123 <strofa><wers_normalny>a</wers_normalny><wers_normalny>b<x>x/
124 y</x>c</wers_normalny><wers_normalny>d</wers_normalny></strofa>
127 def __init__(self, stanza_elem):
128 self.stanza = stanza_elem
130 self.open_verse = None
133 self.push_text(self.stanza.text)
134 for elem in self.stanza:
136 self.push_text(elem.tail)
137 tail = self.stanza.tail
139 self.stanza.tail = tail
140 self.stanza.extend(self.verses)
def open_normal_verse(self):
    """ Start a new ``wers_normalny`` verse and make it the open one.

    The element is created through the stanza's ``makeelement``, stored as
    ``self.open_verse`` and added at the end of ``self.verses``.
    """
    fresh_verse = self.stanza.makeelement("wers_normalny")
    self.open_verse = fresh_verse
    self.verses.append(fresh_verse)
def get_open_verse(self):
    """ Return the verse currently being filled, opening one when needed.

    When ``self.open_verse`` is ``None``, ``open_normal_verse()`` is called
    first; whatever ``self.open_verse`` holds afterwards is returned.
    """
    current = self.open_verse
    if current is not None:
        return current
    self.open_normal_verse()
    return self.open_verse
151 def push_text(self, text):
152 if not text or not text.strip():
154 for i, verse_text in enumerate(re.split(r"/\s*\n", text)):
156 self.open_normal_verse()
157 verse = self.get_open_verse()
159 verse[-1].tail = (verse[-1].tail or "") + verse_text.strip()
161 verse.text = (verse.text or "") + verse_text.strip()
163 def push_elem(self, elem):
164 if elem.tag.startswith("wers"):
165 verse = deepcopy(elem)
167 self.verses.append(verse)
168 self.open_verse = verse
170 appended = deepcopy(elem)
172 self.get_open_verse().append(appended)
def replace_by_verse(tree):
    """ Turn '/' verse endings into verse elements in every stanza of `tree`.

    Every descendant matching ``WLNS('strofa')`` is handed to
    ``Stanza(...).versify()``. Nothing is returned.
    """
    stanza_path = './/' + WLNS('strofa')
    for found in tree.findall(stanza_path):
        versifier = Stanza(found)
        versifier.versify()
def add_to_manifest(manifest, partno):
    """ Adds a node to the manifest section in content.opf file

    :param manifest: the manifest element; the new ``item`` is appended to it
    :param partno: part number, used to build the ``part<partno>`` id and
        the ``part<partno>.html`` href
    """
    partstr = 'part%d' % partno
    e = manifest.makeelement(OPFNS('item'), attrib={
        # the id must match the idref that add_to_spine() writes for this part
        'id': partstr,
        'href': partstr + '.html',
        'media-type': 'application/xhtml+xml',
    })
    manifest.append(e)
def add_to_spine(spine, partno):
    """ Adds a node to the spine section in content.opf file

    :param spine: the spine element; the new ``itemref`` is appended to it
    :param partno: part number; the reference points at ``part<partno>``
    """
    e = spine.makeelement(OPFNS('itemref'), attrib={'idref': 'part%d' % partno})
    # creating the element is not enough: it has to be attached to the spine
    spine.append(e)
203 def __init__(self, name=None, part_href=None):
206 self.part_href = part_href
207 self.sub_number = None
209 def add(self, name, part_href, level=0, is_part=True, index=None):
210 assert level == 0 or index is None
211 if level > 0 and self.children:
212 return self.children[-1].add(name, part_href, level-1, is_part)
215 t.part_href = part_href
216 if index is not None:
217 self.children.insert(index, t)
219 self.children.append(t)
221 t.sub_number = len(self.children) + 1
def append(self, toc):
    """ Add `toc` itself as one new entry at the end of ``self.children``. """
    siblings = self.children
    siblings.append(toc)
def extend(self, toc):
    """ Add every child of `toc` to ``self.children``; `toc` itself is not added. """
    incoming = toc.children
    self.children.extend(incoming)
232 return max((c.depth() for c in self.children)) + 1
238 if self.sub_number is not None:
239 src += '#sub%d' % self.sub_number
242 def write_to_xml(self, nav_map, counter=1):
243 for child in self.children:
244 nav_point = nav_map.makeelement(NCXNS('navPoint'))
245 nav_point.set('id', 'NavPoint-%d' % counter)
246 nav_point.set('playOrder', str(counter))
248 nav_label = nav_map.makeelement(NCXNS('navLabel'))
249 text = nav_map.makeelement(NCXNS('text'))
250 text.text = child.name
251 nav_label.append(text)
252 nav_point.append(nav_label)
254 content = nav_map.makeelement(NCXNS('content'))
255 content.set('src', child.href())
256 nav_point.append(content)
257 nav_map.append(nav_point)
258 counter = child.write_to_xml(nav_point, counter + 1)
261 def html_part(self, depth=0):
263 for child in self.children:
265 "<div style='margin-left:%dem;'><a href='%s'>%s</a></div>" %
266 (depth, child.href(), child.name))
267 texts.append(child.html_part(depth+1))
268 return "\n".join(texts)
271 with open(get_resource('epub/toc.html')) as f:
272 t = unicode(f.read(), 'utf-8')
273 return t % self.html_part()
def used_chars(element):
    """ Lists characters used in an ETree Element

    Collects the characters of the element's ``text`` and ``tail`` and,
    recursively, those of all of its descendants.

    :param element: an ETree element
    :returns: a set of the characters found
    """
    chars = set((element.text or '') + (element.tail or ''))
    for child in element:
        # update() grows the set in place instead of building a new set per child
        chars.update(used_chars(child))
    # callers take the union of the returned sets, so the result must be handed back
    return chars
285 """ divide main content of the XML file into chunks """
287 # prepare a container for each chunk
288 part_xml = etree.Element('utwor')
289 etree.SubElement(part_xml, 'master')
290 main_xml_part = part_xml[0] # master
292 last_node_part = False
293 for one_part in main_text:
295 if name == 'naglowek_czesc':
297 last_node_part = True
298 main_xml_part[:] = [deepcopy(one_part)]
299 elif not last_node_part and name in ("naglowek_rozdzial", "naglowek_akt", "srodtytul"):
301 main_xml_part[:] = [deepcopy(one_part)]
303 main_xml_part.append(deepcopy(one_part))
304 last_node_part = False
308 def transform_chunk(chunk_xml, chunk_no, annotations, empty=False, _empty_html_static=[]):
309 """ transforms one chunk, returns a HTML string, a TOC object and a set of used characters """
312 for element in chunk_xml[0]:
313 if element.tag in ("naglowek_czesc", "naglowek_rozdzial", "naglowek_akt", "srodtytul"):
314 toc.add(node_name(element), "part%d.html" % chunk_no)
315 elif element.tag in ('naglowek_podrozdzial', 'naglowek_scena'):
316 subnumber = toc.add(node_name(element), "part%d.html" % chunk_no, level=1, is_part=False)
317 element.set('sub', str(subnumber))
319 if not _empty_html_static:
320 _empty_html_static.append(open(get_resource('epub/emptyChunk.html')).read())
322 output_html = _empty_html_static[0]
324 find_annotations(annotations, chunk_xml, chunk_no)
325 replace_by_verse(chunk_xml)
326 html_tree = xslt(chunk_xml, get_resource('epub/xsltScheme.xsl'))
327 chars = used_chars(html_tree.getroot())
328 output_html = etree.tostring(html_tree, method="html", pretty_print=True)
329 return output_html, toc, chars
332 def transform(wldoc, verbose=False,
333 style=None, html_toc=False,
334 sample=None, cover=None, flags=None):
335 """ produces a EPUB file
337 sample=n: generate sample e-book (with at least n paragraphs)
338 cover: a cover.Cover factory or True for default
339 flags: less-advertising, without-fonts, working-copy
342 def transform_file(wldoc, chunk_counter=1, first=True, sample=None):
343 """ processes one input file and proceeds to its children """
345 replace_characters(wldoc.edoc.getroot())
347 # every input file will have a TOC entry,
348 # pointing to starting chunk
349 toc = TOC(wldoc.book_info.title, "part%d.html" % chunk_counter)
352 # write book title page
353 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltTitle.xsl'))
354 chars = used_chars(html_tree.getroot())
355 zip.writestr('OPS/title.html',
356 etree.tostring(html_tree, method="html", pretty_print=True))
357 # add a title page TOC entry
358 toc.add(u"Strona tytułowa", "title.html")
359 elif wldoc.book_info.parts:
360 # write title page for every parent
361 if sample is not None and sample <= 0:
363 html_string = open(get_resource('epub/emptyChunk.html')).read()
365 html_tree = xslt(wldoc.edoc, get_resource('epub/xsltChunkTitle.xsl'))
366 chars = used_chars(html_tree.getroot())
367 html_string = etree.tostring(html_tree, method="html", pretty_print=True)
368 zip.writestr('OPS/part%d.html' % chunk_counter, html_string)
369 add_to_manifest(manifest, chunk_counter)
370 add_to_spine(spine, chunk_counter)
373 if len(wldoc.edoc.getroot()) > 1:
374 # rdf before style master
375 main_text = wldoc.edoc.getroot()[1]
377 # rdf in style master
378 main_text = wldoc.edoc.getroot()[0]
379 if main_text.tag == RDFNS('RDF'):
382 if main_text is not None:
383 for chunk_xml in chop(main_text):
385 if sample is not None:
389 sample -= len(chunk_xml.xpath('//strofa|//akap|//akap_cd|//akap_dialog'))
390 chunk_html, chunk_toc, chunk_chars = transform_chunk(chunk_xml, chunk_counter, annotations, empty)
392 toc.extend(chunk_toc)
393 chars = chars.union(chunk_chars)
394 zip.writestr('OPS/part%d.html' % chunk_counter, chunk_html)
395 add_to_manifest(manifest, chunk_counter)
396 add_to_spine(spine, chunk_counter)
399 for child in wldoc.parts():
400 child_toc, chunk_counter, chunk_chars, sample = transform_file(
401 child, chunk_counter, first=False, sample=sample)
402 toc.append(child_toc)
403 chars = chars.union(chunk_chars)
405 return toc, chunk_counter, chars, sample
408 document = deepcopy(wldoc)
413 document.edoc.getroot().set(flag, 'yes')
416 document.edoc.getroot().set('editors', u', '.join(sorted(
417 editor.readable() for editor in document.editors())))
419 opf = xslt(document.book_info.to_etree(), get_resource('epub/xsltContent.xsl'))
420 manifest = opf.find('.//' + OPFNS('manifest'))
421 guide = opf.find('.//' + OPFNS('guide'))
422 spine = opf.find('.//' + OPFNS('spine'))
424 output_file = NamedTemporaryFile(prefix='librarian', suffix='.epub', delete=False)
425 zip = zipfile.ZipFile(output_file, 'w', zipfile.ZIP_DEFLATED)
427 # write static elements
428 mime = zipfile.ZipInfo()
429 mime.filename = 'mimetype'
430 mime.compress_type = zipfile.ZIP_STORED
432 zip.writestr(mime, 'application/epub+zip')
433 zip.writestr('META-INF/container.xml', '<?xml version="1.0" ?><container version="1.0" ' \
434 'xmlns="urn:oasis:names:tc:opendocument:xmlns:container">' \
435 '<rootfiles><rootfile full-path="OPS/content.opf" ' \
436 'media-type="application/oebps-package+xml" />' \
437 '</rootfiles></container>')
438 #zip.write(get_resource('res/wl-logo-small.png'), os.path.join('OPS', 'logo_wolnelektury.png'))
439 zip.write(get_resource('res/koedlogo.png'), os.path.join('OPS', 'logo_koed.png'))
440 #zip.write(get_resource('res/jedenprocent.png'), os.path.join('OPS', 'jedenprocent.png'))
442 style = get_resource('epub/style.css')
443 zip.write(style, os.path.join('OPS', 'style.css'))
449 cover_file = StringIO()
450 bound_cover = cover(document.book_info)
451 bound_cover.save(cover_file)
452 cover_name = 'cover.%s' % bound_cover.ext()
453 zip.writestr(os.path.join('OPS', cover_name), cover_file.getvalue())
456 cover_tree = etree.parse(get_resource('epub/cover.html'))
457 cover_tree.find('//' + XHTMLNS('img')).set('src', cover_name)
458 zip.writestr('OPS/cover.html', etree.tostring(
459 cover_tree, method="html", pretty_print=True))
461 if bound_cover.uses_dc_cover:
462 if document.book_info.cover_by:
463 document.edoc.getroot().set('data-cover-by', document.book_info.cover_by)
464 if document.book_info.cover_source:
465 document.edoc.getroot().set('data-cover-source', document.book_info.cover_source)
467 manifest.append(etree.fromstring(
468 '<item id="cover" href="cover.html" media-type="application/xhtml+xml" />'))
469 manifest.append(etree.fromstring(
470 '<item id="cover-image" href="%s" media-type="%s" />' % (cover_name, bound_cover.mime_type())))
471 spine.insert(0, etree.fromstring('<itemref idref="cover" linear="no" />'))
472 opf.getroot()[0].append(etree.fromstring('<meta name="cover" content="cover-image"/>'))
473 guide.append(etree.fromstring('<reference href="cover.html" type="cover" title="Okładka"/>'))
476 annotations = etree.Element('annotations')
478 toc_file = etree.fromstring('<?xml version="1.0" encoding="utf-8"?><!DOCTYPE ncx PUBLIC ' \
479 '"-//NISO//DTD ncx 2005-1//EN" "http://www.daisy.org/z3986/2005/ncx-2005-1.dtd">' \
480 '<ncx xmlns="http://www.daisy.org/z3986/2005/ncx/" xml:lang="pl" ' \
481 'version="2005-1"><head></head><docTitle></docTitle><navMap>' \
483 nav_map = toc_file[-1]
486 manifest.append(etree.fromstring(
487 '<item id="html_toc" href="toc.html" media-type="application/xhtml+xml" />'))
488 spine.append(etree.fromstring(
489 '<itemref idref="html_toc" />'))
490 guide.append(etree.fromstring('<reference href="toc.html" type="toc" title="Spis treści"/>'))
492 toc, chunk_counter, chars, sample = transform_file(document, sample=sample)
494 if len(toc.children) < 2:
495 toc.add(u"Początek utworu", "part1.html")
497 # Last modifications in container files and EPUB creation
498 if len(annotations) > 0:
499 toc.add("Przypisy", "annotations.html")
500 manifest.append(etree.fromstring(
501 '<item id="annotations" href="annotations.html" media-type="application/xhtml+xml" />'))
502 spine.append(etree.fromstring(
503 '<itemref idref="annotations" />'))
504 replace_by_verse(annotations)
505 html_tree = xslt(annotations, get_resource('epub/xsltAnnotations.xsl'))
506 chars = chars.union(used_chars(html_tree.getroot()))
507 zip.writestr('OPS/annotations.html', etree.tostring(
508 html_tree, method="html", pretty_print=True))
510 toc.add("Strona redakcyjna", "last.html")
511 manifest.append(etree.fromstring(
512 '<item id="last" href="last.html" media-type="application/xhtml+xml" />'))
513 spine.append(etree.fromstring(
514 '<itemref idref="last" />'))
515 html_tree = xslt(document.edoc, get_resource('epub/xsltLast.xsl'))
516 chars.update(used_chars(html_tree.getroot()))
517 zip.writestr('OPS/last.html', etree.tostring(
518 html_tree, method="html", pretty_print=True))
520 if not flags or not 'without-fonts' in flags:
522 tmpdir = mkdtemp('-librarian-epub')
528 os.chdir(os.path.join(os.path.dirname(os.path.realpath(__file__)), 'font-optimizer'))
529 for fname in 'DejaVuSerif.ttf', 'DejaVuSerif-Bold.ttf', 'DejaVuSerif-Italic.ttf', 'DejaVuSerif-BoldItalic.ttf':
530 optimizer_call = ['perl', 'subset.pl', '--chars', ''.join(chars).encode('utf-8'),
531 get_resource('fonts/' + fname), os.path.join(tmpdir, fname)]
533 print "Running font-optimizer"
534 subprocess.check_call(optimizer_call)
536 subprocess.check_call(optimizer_call, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
537 zip.write(os.path.join(tmpdir, fname), os.path.join('OPS', fname))
538 manifest.append(etree.fromstring(
539 '<item id="%s" href="%s" media-type="font/ttf" />' % (fname, fname)))
544 zip.writestr('OPS/content.opf', etree.tostring(opf, pretty_print=True))
545 title = document.book_info.title
546 attributes = "dtb:uid", "dtb:depth", "dtb:totalPageCount", "dtb:maxPageNumber"
547 for st in attributes:
548 meta = toc_file.makeelement(NCXNS('meta'))
550 meta.set('content', '0')
551 toc_file[0].append(meta)
552 toc_file[0][0].set('content', ''.join((title, 'WolneLektury.pl')))
553 toc_file[0][1].set('content', str(toc.depth()))
554 set_inner_xml(toc_file[1], ''.join(('<text>', title, '</text>')))
558 toc.add(u"Spis treści", "toc.html", index=1)
559 zip.writestr('OPS/toc.html', toc.html().encode('utf-8'))
560 toc.write_to_xml(nav_map)
561 zip.writestr('OPS/toc.ncx', etree.tostring(toc_file, pretty_print=True))
564 return OutputFile.from_filename(output_file.name)