X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/e18e5408fa57b7c680e7b834a964501898623117..23d025c8875cca1404f274aca7170c9db5e980e7:/src/librarian/builders/daisy.py diff --git a/src/librarian/builders/daisy.py b/src/librarian/builders/daisy.py index b96226f..4949879 100644 --- a/src/librarian/builders/daisy.py +++ b/src/librarian/builders/daisy.py @@ -1,11 +1,14 @@ +# This file is part of Librarian, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Wolne Lektury. See NOTICE for more information. +# +from copy import deepcopy import subprocess import tempfile import zipfile -from aeneas.executetask import ExecuteTask -from aeneas.task import Task from lxml import etree import mutagen from librarian import OutputFile, get_resource +from librarian.html import raw_printable_text from .html import DaisyHtmlBuilder @@ -54,7 +57,7 @@ def populate(element, context): class DaisyBuilder: file_extension = 'daisy.zip' - def build(self, document, mp3, **kwargs): + def build(self, document, mp3, split_on=None, **kwargs): if not mp3: raise ValueError("Need MP3 files") @@ -63,130 +66,117 @@ class DaisyBuilder: directory = document.meta.url.slug + '/' - html = DaisyHtmlBuilder().build(document) - zipf.write( - html.get_filename(), - directory + 'book.html', - ) + if split_on: + documents = [] + headers = [] + present = True + n = 0 + while present: + present = False + n += 1 + newdoc = deepcopy(document) + newdoc.tree.getroot().document = newdoc + + master = newdoc.tree.getroot()[-1] + i = 0 + for item in list(master): + if item.tag == split_on: + # TODO: clear + i += 1 + if i == n: + headers.append(raw_printable_text(item)) + if i != n and not (n == 1 and not i): + master.remove(item) + else: + present = True + if present: + documents.append(newdoc) + else: + documents = [document] + headers = [document.meta.title] + + assert len(documents) == len(mp3) + + narrator = mutagen.File(mp3[0]).get('TPE1') + narrator = narrator.text[0] if narrator else '' durations = [] - for i, mp3_file in enumerate(mp3): - durations.append(get_duration(mp3_file)) + for i, part in enumerate(documents): + print('part', i) + html = DaisyHtmlBuilder().build(part) zipf.write( - mp3_file, + html.get_filename(), + directory + 'book%d.html' % i, + ) + + durations.append(get_duration(mp3[i])) + zipf.write( + mp3[i], directory + "book%d.mp3" % i, ) - duration = sum(durations) - config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab" - task = Task(config_string=config_string) - - # TODO: concatenate all the - with tempfile.TemporaryDirectory() as temp: - with open(temp + "/book.mp3", "wb") as m: - for minput in mp3: - with open(minput, "rb") as minputf: - m.write(minputf.read()) - - - syncfile = temp + "/sync" - task.audio_file_path_absolute = temp + "/book.mp3" - task.text_file_path_absolute = html.get_filename() - task.sync_map_file_path_absolute = syncfile - - ExecuteTask(task).execute() - task.output_sync_map_file() - sync = [] - with open(syncfile) as f: - for line in f: - start, end, sec = line.strip().split('\t') - start = float(start) - end = float(end) - sync.append([start, end, sec]) + populate(tree.getroot(), context) - hms = format_hms(duration) + zipf.write( + syncfiles[i], + directory + 'content%d.smil' % i, + ) - narrator = mutagen.File(mp3[0]).get('TPE1') - narrator = narrator.text[0] if narrator else '' + for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'): + zipf.write( + get_resource('res/daisy/' + fname), + directory + fname) + duration = sum(durations) + hms = format_hms(duration) context = { "VERSION": "1.10", - "HHMMSSmmm": hms, "HHMMSS": hms.split('.')[0], "Sd": "%.1f" % duration, - "TITLE": document.meta.title, "PUBLISHER": document.meta.publisher[0], "YEAR": document.meta.created_at[:4], "MONTH": document.meta.created_at[5:7], "AUTHOR": document.meta.author.readable(), - "NARRATOR": narrator, } - for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'): - zipf.write( - get_resource('res/daisy/' + fname), - directory + fname) + tree = etree.parse(get_resource('res/daisy/er_book_info.xml')) + cont = tree.getroot()[0] + for i, dur in enumerate(durations): + etree.SubElement(cont, 'smil', nr=str(i+1), Name="content%i.smil" % i, dur="%.1f" % dur) + zipf.writestr( + directory + 'er_book_info.xml', + etree.tostring(tree, xml_declaration=True)) - for fname in ('er_book_info.xml', 'master.smil', 'ncc.html'): - with open(get_resource('res/daisy/' + fname)) as f: - tree = etree.parse(f) - populate(tree.getroot(), context) - zipf.writestr( - directory + fname, - etree.tostring( - tree, - xml_declaration=True - ), - ) + tree = etree.parse(get_resource('res/daisy/master.smil')) + populate(tree.getroot(), context) + cont = tree.getroot()[-1] + for i, header in enumerate(headers): + etree.SubElement(cont, 'ref', title=header, src="content%d.smil#seq000001" % i, id="smil_%04d" % i) + zipf.writestr( + directory + 'master.smil', + etree.tostring(tree, xml_declaration=True)) - with open(get_resource('res/daisy/content.smil')) as f: - tree = etree.parse(f) + tree = etree.parse(get_resource('res/daisy/ncc.html')) populate(tree.getroot(), context) + cont = tree.getroot()[-1] + for i, header in enumerate(headers): + if not i: + h1 = etree.SubElement( + cont, 'h1', id='content', **{"class": "title"}) + etree.SubElement( + h1, "a", href='content%d.smil#par000001' % i).text = document.meta.title + else: + h2 = etree.SubElement( + cont, 'h2', id='content', **{"class": "chapter"}) + etree.SubElement( + h2, "a", href='content%d.smil#par000001' % i).text = header - seq = tree.find('//seq') - for i, item in enumerate(sync): - par = etree.SubElement(seq, 'par', id="par%06d" % (i + 1), endsync="last") - etree.SubElement( - par, - "text", - src="book.html#%s" % item[2]) - - # If we have a split between mp3 parts, err on the larger side. - i = 0 - start, end = item[0], item[1] - while start >= durations[i]: - start -= durations[i] - end -= durations[i] - i += 1 - if 2 * (end - durations[i]) > end - start: - start = 0 - end -= durations[i] - i += 1 - - audio = etree.SubElement( - par, - "audio", - src="book%d.mp3" % i, - **{ - "clip-begin": "npt=%.3fs" % start, - "clip-end": "npt=%.3fs" % end, - }, - ) - zipf.writestr( - directory + 'content.smil', - etree.tostring( - tree, - xml_declaration=True, - pretty_print=True, - ), - ) - - -# WHERE IS MP3 - + directory + 'ncc.html', + etree.tostring(tree, xml_declaration=True)) + zipf.close() return OutputFile.from_filename(outfile.name)