X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/0289b545510700fe90f881ae52f4a70b3dd6916a..e870e40d5fb975ba9ec0ec327014b3d16eea51e4:/src/librarian/builders/daisy.py diff --git a/src/librarian/builders/daisy.py b/src/librarian/builders/daisy.py index b96226f..38e5338 100644 --- a/src/librarian/builders/daisy.py +++ b/src/librarian/builders/daisy.py @@ -1,3 +1,4 @@ +from copy import deepcopy import subprocess import tempfile import zipfile @@ -6,6 +7,7 @@ from aeneas.task import Task from lxml import etree import mutagen from librarian import OutputFile, get_resource +from librarian.html import raw_printable_text from .html import DaisyHtmlBuilder @@ -54,7 +56,7 @@ def populate(element, context): class DaisyBuilder: file_extension = 'daisy.zip' - def build(self, document, mp3, **kwargs): + def build(self, document, mp3, split_on=None, **kwargs): if not mp3: raise ValueError("Need MP3 files") @@ -63,130 +65,181 @@ class DaisyBuilder: directory = document.meta.url.slug + '/' - html = DaisyHtmlBuilder().build(document) - zipf.write( - html.get_filename(), - directory + 'book.html', - ) + if split_on: + documents = [] + headers = [] + present = True + n = 0 + while present: + present = False + n += 1 + newdoc = deepcopy(document) + newdoc.tree.getroot().document = newdoc + + master = newdoc.tree.getroot()[-1] + i = 0 + for item in list(master): + if item.tag == split_on: + # TODO: clear + i += 1 + if i == n: + headers.append(raw_printable_text(item)) + if i != n and not (n == 1 and not i): + master.remove(item) + else: + present = True + if present: + documents.append(newdoc) + else: + documents = [document] + headers = [document.meta.title] + + assert len(documents) == len(mp3) + + narrator = mutagen.File(mp3[0]).get('TPE1') + narrator = narrator.text[0] if narrator else '' durations = [] - for i, mp3_file in enumerate(mp3): - durations.append(get_duration(mp3_file)) + for i, part in enumerate(documents): + print('part', i) + html = DaisyHtmlBuilder().build(part) zipf.write( - mp3_file, + html.get_filename(), + directory + 'book%d.html' % i, + ) + + durations.append(get_duration(mp3[i])) + zipf.write( + mp3[i], directory + "book%d.mp3" % i, ) - duration = sum(durations) - config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab" - task = Task(config_string=config_string) - - # TODO: concatenate all the - with tempfile.TemporaryDirectory() as temp: - with open(temp + "/book.mp3", "wb") as m: - for minput in mp3: - with open(minput, "rb") as minputf: - m.write(minputf.read()) - - - syncfile = temp + "/sync" - task.audio_file_path_absolute = temp + "/book.mp3" - task.text_file_path_absolute = html.get_filename() - task.sync_map_file_path_absolute = syncfile - - ExecuteTask(task).execute() - task.output_sync_map_file() - sync = [] - with open(syncfile) as f: - for line in f: - start, end, sec = line.strip().split('\t') - start = float(start) - end = float(end) - sync.append([start, end, sec]) + config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab" + task = Task(config_string=config_string) - hms = format_hms(duration) + with tempfile.TemporaryDirectory() as temp: + syncfile = temp + "/sync" + task.audio_file_path_absolute = mp3[i] + task.text_file_path_absolute = html.get_filename() + task.sync_map_file_path_absolute = syncfile - narrator = mutagen.File(mp3[0]).get('TPE1') - narrator = narrator.text[0] if narrator else '' + ExecuteTask(task).execute() + task.output_sync_map_file() + + sync = [] + with open(syncfile) as f: + for line in f: + start, end, sec = line.strip().split('\t') + start = float(start) + end = float(end) + sync.append([start, end, sec]) + + hms = format_hms(durations[i]) + elapsed_hms = format_hms(sum(durations[:i])) + + context = { + "VERSION": "1.10", + + "HHMMSSmmm": hms, + "HHMMSS": hms.split('.')[0], + "Sd": "%.1f" % durations[i], + "ELAPSED": elapsed_hms, + "TITLE": document.meta.title, + "PUBLISHER": document.meta.publisher[0], + "YEAR": document.meta.created_at[:4], + "MONTH": document.meta.created_at[5:7], + "AUTHOR": document.meta.author.readable(), + + "NARRATOR": narrator, + } + + with open(get_resource('res/daisy/content.smil')) as f: + tree = etree.parse(f) + populate(tree.getroot(), context) + + seq = tree.find('//seq') + for si, item in enumerate(sync): + par = etree.SubElement(seq, 'par', id="par%06d" % (si + 1), endsync="last") + etree.SubElement( + par, + "text", + src="book%d.html#%s" % (i, item[2])) + + audio = etree.SubElement( + par, + "audio", + src="book%d.mp3" % i, + **{ + "clip-begin": "npt=%.3fs" % item[0], + "clip-end": "npt=%.3fs" % item[1], + }, + ) + + zipf.writestr( + directory + 'content%d.smil' % i, + etree.tostring( + tree, + xml_declaration=True, + pretty_print=True, + ), + ) + + for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'): + zipf.write( + get_resource('res/daisy/' + fname), + directory + fname) + + duration = sum(durations) + hms = format_hms(duration) context = { "VERSION": "1.10", - "HHMMSSmmm": hms, "HHMMSS": hms.split('.')[0], "Sd": "%.1f" % duration, - "TITLE": document.meta.title, "PUBLISHER": document.meta.publisher[0], "YEAR": document.meta.created_at[:4], "MONTH": document.meta.created_at[5:7], "AUTHOR": document.meta.author.readable(), - "NARRATOR": narrator, } - for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'): - zipf.write( - get_resource('res/daisy/' + fname), - directory + fname) + tree = etree.parse(get_resource('res/daisy/er_book_info.xml')) + cont = tree.getroot()[0] + for i, dur in enumerate(durations): + etree.SubElement(cont, 'smil', nr=str(i+1), Name="content%i.smil" % i, dur="%.1f" % dur) + zipf.writestr( + directory + 'er_book_info.xml', + etree.tostring(tree, xml_declaration=True)) - for fname in ('er_book_info.xml', 'master.smil', 'ncc.html'): - with open(get_resource('res/daisy/' + fname)) as f: - tree = etree.parse(f) - populate(tree.getroot(), context) - zipf.writestr( - directory + fname, - etree.tostring( - tree, - xml_declaration=True - ), - ) + tree = etree.parse(get_resource('res/daisy/master.smil')) + populate(tree.getroot(), context) + cont = tree.getroot()[-1] + for i, header in enumerate(headers): + etree.SubElement(cont, 'ref', title=header, src="content%d.smil#seq000001" % i, id="smil_%04d" % i) + zipf.writestr( + directory + 'master.smil', + etree.tostring(tree, xml_declaration=True)) - with open(get_resource('res/daisy/content.smil')) as f: - tree = etree.parse(f) + tree = etree.parse(get_resource('res/daisy/ncc.html')) populate(tree.getroot(), context) + cont = tree.getroot()[-1] + for i, header in enumerate(headers): + if not i: + h1 = etree.SubElement( + cont, 'h1', id='content', **{"class": "title"}) + etree.SubElement( + h1, "a", href='content%d.smil#par000001' % i).text = document.meta.title + else: + h2 = etree.SubElement( + cont, 'h2', id='content', **{"class": "chapter"}) + etree.SubElement( + h2, "a", href='content%d.smil#par000001' % i).text = header - seq = tree.find('//seq') - for i, item in enumerate(sync): - par = etree.SubElement(seq, 'par', id="par%06d" % (i + 1), endsync="last") - etree.SubElement( - par, - "text", - src="book.html#%s" % item[2]) - - # If we have a split between mp3 parts, err on the larger side. - i = 0 - start, end = item[0], item[1] - while start >= durations[i]: - start -= durations[i] - end -= durations[i] - i += 1 - if 2 * (end - durations[i]) > end - start: - start = 0 - end -= durations[i] - i += 1 - - audio = etree.SubElement( - par, - "audio", - src="book%d.mp3" % i, - **{ - "clip-begin": "npt=%.3fs" % start, - "clip-end": "npt=%.3fs" % end, - }, - ) - zipf.writestr( - directory + 'content.smil', - etree.tostring( - tree, - xml_declaration=True, - pretty_print=True, - ), - ) - - -# WHERE IS MP3 - + directory + 'ncc.html', + etree.tostring(tree, xml_declaration=True)) + zipf.close() return OutputFile.from_filename(outfile.name)