From e870e40d5fb975ba9ec0ec327014b3d16eea51e4 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Fri, 11 Dec 2020 13:13:44 +0100 Subject: [PATCH 1/1] Better handling of multipart DAISY. --- src/librarian/builders/daisy.py | 251 ++++++++++++++--------- src/librarian/builders/html.py | 21 +- src/librarian/builders/sanitize.py | 2 +- src/librarian/builders/txt.py | 2 +- src/librarian/document.py | 6 +- src/librarian/elements/root/__init__.py | 2 + src/librarian/res/daisy/content.smil | 2 +- src/librarian/res/daisy/er_book_info.xml | 4 +- src/librarian/res/daisy/master.smil | 4 +- src/librarian/res/daisy/ncc.html | 4 +- src/librarian/util.py | 2 +- 11 files changed, 172 insertions(+), 128 deletions(-) diff --git a/src/librarian/builders/daisy.py b/src/librarian/builders/daisy.py index b96226f..38e5338 100644 --- a/src/librarian/builders/daisy.py +++ b/src/librarian/builders/daisy.py @@ -1,3 +1,4 @@ +from copy import deepcopy import subprocess import tempfile import zipfile @@ -6,6 +7,7 @@ from aeneas.task import Task from lxml import etree import mutagen from librarian import OutputFile, get_resource +from librarian.html import raw_printable_text from .html import DaisyHtmlBuilder @@ -54,7 +56,7 @@ def populate(element, context): class DaisyBuilder: file_extension = 'daisy.zip' - def build(self, document, mp3, **kwargs): + def build(self, document, mp3, split_on=None, **kwargs): if not mp3: raise ValueError("Need MP3 files") @@ -63,130 +65,181 @@ class DaisyBuilder: directory = document.meta.url.slug + '/' - html = DaisyHtmlBuilder().build(document) - zipf.write( - html.get_filename(), - directory + 'book.html', - ) + if split_on: + documents = [] + headers = [] + present = True + n = 0 + while present: + present = False + n += 1 + newdoc = deepcopy(document) + newdoc.tree.getroot().document = newdoc + + master = newdoc.tree.getroot()[-1] + i = 0 + for item in list(master): + if item.tag == split_on: + # TODO: clear + i += 1 + if i == n: + headers.append(raw_printable_text(item)) + if i != n and not (n == 1 and not i): + master.remove(item) + else: + present = True + if present: + documents.append(newdoc) + else: + documents = [document] + headers = [document.meta.title] + + assert len(documents) == len(mp3) + + narrator = mutagen.File(mp3[0]).get('TPE1') + narrator = narrator.text[0] if narrator else '' durations = [] - for i, mp3_file in enumerate(mp3): - durations.append(get_duration(mp3_file)) + for i, part in enumerate(documents): + print('part', i) + html = DaisyHtmlBuilder().build(part) zipf.write( - mp3_file, + html.get_filename(), + directory + 'book%d.html' % i, + ) + + durations.append(get_duration(mp3[i])) + zipf.write( + mp3[i], directory + "book%d.mp3" % i, ) - duration = sum(durations) - config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab" - task = Task(config_string=config_string) - - # TODO: concatenate all the - with tempfile.TemporaryDirectory() as temp: - with open(temp + "/book.mp3", "wb") as m: - for minput in mp3: - with open(minput, "rb") as minputf: - m.write(minputf.read()) - - - syncfile = temp + "/sync" - task.audio_file_path_absolute = temp + "/book.mp3" - task.text_file_path_absolute = html.get_filename() - task.sync_map_file_path_absolute = syncfile - - ExecuteTask(task).execute() - task.output_sync_map_file() - sync = [] - with open(syncfile) as f: - for line in f: - start, end, sec = line.strip().split('\t') - start = float(start) - end = float(end) - sync.append([start, end, sec]) + config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab" + task = Task(config_string=config_string) - hms = format_hms(duration) + with tempfile.TemporaryDirectory() as temp: + syncfile = temp + "/sync" + task.audio_file_path_absolute = mp3[i] + task.text_file_path_absolute = html.get_filename() + task.sync_map_file_path_absolute = syncfile - narrator = mutagen.File(mp3[0]).get('TPE1') - narrator = narrator.text[0] if narrator else '' + ExecuteTask(task).execute() + task.output_sync_map_file() + + sync = [] + with open(syncfile) as f: + for line in f: + start, end, sec = line.strip().split('\t') + start = float(start) + end = float(end) + sync.append([start, end, sec]) + + hms = format_hms(durations[i]) + elapsed_hms = format_hms(sum(durations[:i])) + + context = { + "VERSION": "1.10", + + "HHMMSSmmm": hms, + "HHMMSS": hms.split('.')[0], + "Sd": "%.1f" % durations[i], + "ELAPSED": elapsed_hms, + "TITLE": document.meta.title, + "PUBLISHER": document.meta.publisher[0], + "YEAR": document.meta.created_at[:4], + "MONTH": document.meta.created_at[5:7], + "AUTHOR": document.meta.author.readable(), + + "NARRATOR": narrator, + } + + with open(get_resource('res/daisy/content.smil')) as f: + tree = etree.parse(f) + populate(tree.getroot(), context) + + seq = tree.find('//seq') + for si, item in enumerate(sync): + par = etree.SubElement(seq, 'par', id="par%06d" % (si + 1), endsync="last") + etree.SubElement( + par, + "text", + src="book%d.html#%s" % (i, item[2])) + + audio = etree.SubElement( + par, + "audio", + src="book%d.mp3" % i, + **{ + "clip-begin": "npt=%.3fs" % item[0], + "clip-end": "npt=%.3fs" % item[1], + }, + ) + + zipf.writestr( + directory + 'content%d.smil' % i, + etree.tostring( + tree, + xml_declaration=True, + pretty_print=True, + ), + ) + + for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'): + zipf.write( + get_resource('res/daisy/' + fname), + directory + fname) + + duration = sum(durations) + hms = format_hms(duration) context = { "VERSION": "1.10", - "HHMMSSmmm": hms, "HHMMSS": hms.split('.')[0], "Sd": "%.1f" % duration, - "TITLE": document.meta.title, "PUBLISHER": document.meta.publisher[0], "YEAR": document.meta.created_at[:4], "MONTH": document.meta.created_at[5:7], "AUTHOR": document.meta.author.readable(), - "NARRATOR": narrator, } - for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'): - zipf.write( - get_resource('res/daisy/' + fname), - directory + fname) + tree = etree.parse(get_resource('res/daisy/er_book_info.xml')) + cont = tree.getroot()[0] + for i, dur in enumerate(durations): + etree.SubElement(cont, 'smil', nr=str(i+1), Name="content%i.smil" % i, dur="%.1f" % dur) + zipf.writestr( + directory + 'er_book_info.xml', + etree.tostring(tree, xml_declaration=True)) - for fname in ('er_book_info.xml', 'master.smil', 'ncc.html'): - with open(get_resource('res/daisy/' + fname)) as f: - tree = etree.parse(f) - populate(tree.getroot(), context) - zipf.writestr( - directory + fname, - etree.tostring( - tree, - xml_declaration=True - ), - ) + tree = etree.parse(get_resource('res/daisy/master.smil')) + populate(tree.getroot(), context) + cont = tree.getroot()[-1] + for i, header in enumerate(headers): + etree.SubElement(cont, 'ref', title=header, src="content%d.smil#seq000001" % i, id="smil_%04d" % i) + zipf.writestr( + directory + 'master.smil', + etree.tostring(tree, xml_declaration=True)) - with open(get_resource('res/daisy/content.smil')) as f: - tree = etree.parse(f) + tree = etree.parse(get_resource('res/daisy/ncc.html')) populate(tree.getroot(), context) + cont = tree.getroot()[-1] + for i, header in enumerate(headers): + if not i: + h1 = etree.SubElement( + cont, 'h1', id='content', **{"class": "title"}) + etree.SubElement( + h1, "a", href='content%d.smil#par000001' % i).text = document.meta.title + else: + h2 = etree.SubElement( + cont, 'h2', id='content', **{"class": "chapter"}) + etree.SubElement( + h2, "a", href='content%d.smil#par000001' % i).text = header - seq = tree.find('//seq') - for i, item in enumerate(sync): - par = etree.SubElement(seq, 'par', id="par%06d" % (i + 1), endsync="last") - etree.SubElement( - par, - "text", - src="book.html#%s" % item[2]) - - # If we have a split between mp3 parts, err on the larger side. - i = 0 - start, end = item[0], item[1] - while start >= durations[i]: - start -= durations[i] - end -= durations[i] - i += 1 - if 2 * (end - durations[i]) > end - start: - start = 0 - end -= durations[i] - i += 1 - - audio = etree.SubElement( - par, - "audio", - src="book%d.mp3" % i, - **{ - "clip-begin": "npt=%.3fs" % start, - "clip-end": "npt=%.3fs" % end, - }, - ) - zipf.writestr( - directory + 'content.smil', - etree.tostring( - tree, - xml_declaration=True, - pretty_print=True, - ), - ) - - -# WHERE IS MP3 - + directory + 'ncc.html', + etree.tostring(tree, xml_declaration=True)) + zipf.close() return OutputFile.from_filename(outfile.name) diff --git a/src/librarian/builders/html.py b/src/librarian/builders/html.py index 165a265..ee50cb8 100644 --- a/src/librarian/builders/html.py +++ b/src/librarian/builders/html.py @@ -36,18 +36,14 @@ class HtmlBuilder: 'footnotes': self.footnotes, 'nota_red': self.nota_red, } - self.current_cursors = [None] + self.current_cursors = [text] @property def cursor(self): - return self.cursors[self.current_cursors[-1]] - - @cursor.setter - def cursor(self, value): - self.cursors[self.current_cursors[-1]] = value + return self.current_cursors[-1] def enter_fragment(self, fragment): - self.current_cursors.append(fragment) + self.current_cursors.append(self.cursors[fragment]) def exit_fragment(self): self.current_cursors.pop() @@ -63,7 +59,7 @@ class HtmlBuilder: document._compat_assign_ordered_ids() document._compat_assign_section_ids() - def build(self, document): + def build(self, document, **kwargs): self.preprocess(document) document.tree.getroot().html_build(self) self.postprocess(document) @@ -110,19 +106,16 @@ class HtmlBuilder: self.tree.append(self.footnotes) def start_element(self, tag, attrib=None): - self.cursor = etree.SubElement( + self.current_cursors.append(etree.SubElement( self.cursor, tag, **(attrib or {}) - ) + )) def end_element(self): - self.cursor = self.cursor.getparent() + self.current_cursors.pop() def push_text(self, text): - if text == 'Między nami nic nie było': - print(self.cursors) - print(self.current_cursors) cursor = self.cursor if len(cursor): cursor[-1].tail = (cursor[-1].tail or '') + text diff --git a/src/librarian/builders/sanitize.py b/src/librarian/builders/sanitize.py index 4d7f7f9..9897482 100644 --- a/src/librarian/builders/sanitize.py +++ b/src/librarian/builders/sanitize.py @@ -6,7 +6,7 @@ class Sanitizer: identifier = 'sanitize' file_extension = 'xml2' - def build(self, document): + def build(self, document, **kwargs): doc = document.tree.getroot() # TODO: copy doc.sanitize() return OutputFile.from_bytes( diff --git a/src/librarian/builders/txt.py b/src/librarian/builders/txt.py index 2830e89..3f19346 100644 --- a/src/librarian/builders/txt.py +++ b/src/librarian/builders/txt.py @@ -88,7 +88,7 @@ class TxtBuilder: def push_legacy_margin(self, margin, where=None): self.current_fragments[-1].push_legacy_margin(margin) - def build(self, document, raw_text=False): + def build(self, document, raw_text=False, **kwargs): document.tree.getroot().txt_build(self) meta = document.meta diff --git a/src/librarian/document.py b/src/librarian/document.py index 8876294..1c8f223 100644 --- a/src/librarian/document.py +++ b/src/librarian/document.py @@ -4,7 +4,7 @@ import re from urllib.request import urlopen from lxml import etree from .parser import parser -from . import dcparser +from . import dcparser, DCNS from .functions import lang_code_3to2 @@ -14,7 +14,9 @@ class WLDocument: tree = etree.parse(source, parser=parser) self.tree = tree tree.getroot().document = self - self.base_meta = dcparser.BookInfo({}, {}, validate_required=False) + self.base_meta = dcparser.BookInfo({}, { + DCNS('language'): ["pol"], + }, validate_required=False) @property def meta(self): diff --git a/src/librarian/elements/root/__init__.py b/src/librarian/elements/root/__init__.py index 8e624bd..541895a 100644 --- a/src/librarian/elements/root/__init__.py +++ b/src/librarian/elements/root/__init__.py @@ -17,6 +17,8 @@ class Utwor(WLElement): # This should not generally happen. if self.getparent() is not None: return self.getparent().meta + # Fallback + return self.document.base_meta @property def master(self): diff --git a/src/librarian/res/daisy/content.smil b/src/librarian/res/daisy/content.smil index ea9ff99..9f9350c 100644 --- a/src/librarian/res/daisy/content.smil +++ b/src/librarian/res/daisy/content.smil @@ -3,7 +3,7 @@ - + diff --git a/src/librarian/res/daisy/er_book_info.xml b/src/librarian/res/daisy/er_book_info.xml index 0ea3ffd..35ca837 100644 --- a/src/librarian/res/daisy/er_book_info.xml +++ b/src/librarian/res/daisy/er_book_info.xml @@ -1,6 +1,4 @@ - - - + diff --git a/src/librarian/res/daisy/master.smil b/src/librarian/res/daisy/master.smil index 666fed9..0a66301 100644 --- a/src/librarian/res/daisy/master.smil +++ b/src/librarian/res/daisy/master.smil @@ -11,7 +11,5 @@ - - - + diff --git a/src/librarian/res/daisy/ncc.html b/src/librarian/res/daisy/ncc.html index a9b20ec..1941d35 100644 --- a/src/librarian/res/daisy/ncc.html +++ b/src/librarian/res/daisy/ncc.html @@ -28,7 +28,5 @@ - -

{TITLE}

- + diff --git a/src/librarian/util.py b/src/librarian/util.py index 2c6b773..63e7996 100644 --- a/src/librarian/util.py +++ b/src/librarian/util.py @@ -137,5 +137,5 @@ def get_translation(language): return gettext.translation( 'messages', localedir=os.path.join(os.path.dirname(__file__), 'locale'), - languages=[lang_code_3to2(language)], + languages=[lang_code_3to2(language), 'pl'], ) -- 2.20.1