From e18e5408fa57b7c680e7b834a964501898623117 Mon Sep 17 00:00:00 2001 From: Radek Czajka Date: Wed, 9 Dec 2020 09:40:53 +0100 Subject: [PATCH] Experimental DAISY builder. --- CHANGELOG.md | 7 ++ setup.py | 4 +- src/librarian/builders/__init__.py | 2 + src/librarian/builders/daisy.py | 192 +++++++++++++++++++++++++++++ src/librarian/document.py | 12 +- 5 files changed, 210 insertions(+), 7 deletions(-) create mode 100644 src/librarian/builders/daisy.py diff --git a/CHANGELOG.md b/CHANGELOG.md index 4149659..521ef1c 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -3,6 +3,13 @@ This document records all notable changes to Librarian. +## 1.11 (2020-12-09) + +### Added + +- Experimental DAISY builder. + + ## 1.10 (2020-11-09) ### Added diff --git a/setup.py b/setup.py index 79d4f84..3a50c6c 100755 --- a/setup.py +++ b/setup.py @@ -22,7 +22,7 @@ def whole_tree(prefix, path): setup( name='librarian', - version='1.10', + version='1.11', description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats', author="Marek Stępniowski", author_email='marek@stepniowski.com', @@ -42,6 +42,8 @@ setup( 'six', 'texml', 'ebooklib', + 'aeneas', + 'mutagen', ], entry_points = { "console_scripts": [ diff --git a/src/librarian/builders/__init__.py b/src/librarian/builders/__init__.py index b1afe94..dc5bdee 100644 --- a/src/librarian/builders/__init__.py +++ b/src/librarian/builders/__init__.py @@ -2,6 +2,7 @@ from collections import OrderedDict from .txt import TxtBuilder from .html import HtmlBuilder, StandaloneHtmlBuilder, DaisyHtmlBuilder from .sanitize import Sanitizer +from .daisy import DaisyBuilder builders = OrderedDict([ @@ -9,5 +10,6 @@ builders = OrderedDict([ ("html", HtmlBuilder), ("html-standalone", StandaloneHtmlBuilder), ("html-daisy", DaisyHtmlBuilder), + ("daisy", DaisyBuilder), ("sanitizer", Sanitizer), ]) diff --git a/src/librarian/builders/daisy.py b/src/librarian/builders/daisy.py new file mode 100644 index 0000000..b96226f --- /dev/null +++ b/src/librarian/builders/daisy.py @@ -0,0 +1,192 @@ +import subprocess +import tempfile +import zipfile +from aeneas.executetask import ExecuteTask +from aeneas.task import Task +from lxml import etree +import mutagen +from librarian import OutputFile, get_resource +from .html import DaisyHtmlBuilder + + +def get_duration(path): + return float( + subprocess.run( + [ + "ffprobe", + "-i", + path, + "-show_entries", + "format=duration", + "-v", + "quiet", + "-of", + "csv=p=0", + ], + capture_output=True, + text=True, + check=True, + ).stdout + ) + + +def format_hms(t): + seconds = t % 60 + t -= seconds + t /= 60 + minutes = t % 60 + t -= minutes + hours = t / 60 + return "%02d:%02d:%02.3f" % (hours, minutes, seconds) + + +def populate(element, context): + if element.text: + element.text = element.text.format(**context) + if element.tail: + element.tail = element.tail.format(**context) + for k, v in element.attrib.items(): + element.attrib[k] = v.format(**context) + for child in element: + populate(child, context) + + +class DaisyBuilder: + file_extension = 'daisy.zip' + + def build(self, document, mp3, **kwargs): + if not mp3: + raise ValueError("Need MP3 files") + + outfile = tempfile.NamedTemporaryFile(delete=False) + zipf = zipfile.ZipFile(outfile, 'w') + + directory = document.meta.url.slug + '/' + + html = DaisyHtmlBuilder().build(document) + zipf.write( + html.get_filename(), + directory + 'book.html', + ) + + durations = [] + for i, mp3_file in enumerate(mp3): + durations.append(get_duration(mp3_file)) + zipf.write( + mp3_file, + directory + "book%d.mp3" % i, + ) + duration = sum(durations) + + config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab" + task = Task(config_string=config_string) + + # TODO: concatenate all the + with tempfile.TemporaryDirectory() as temp: + with open(temp + "/book.mp3", "wb") as m: + for minput in mp3: + with open(minput, "rb") as minputf: + m.write(minputf.read()) + + + syncfile = temp + "/sync" + task.audio_file_path_absolute = temp + "/book.mp3" + task.text_file_path_absolute = html.get_filename() + task.sync_map_file_path_absolute = syncfile + + ExecuteTask(task).execute() + task.output_sync_map_file() + sync = [] + with open(syncfile) as f: + for line in f: + start, end, sec = line.strip().split('\t') + start = float(start) + end = float(end) + sync.append([start, end, sec]) + + hms = format_hms(duration) + + narrator = mutagen.File(mp3[0]).get('TPE1') + narrator = narrator.text[0] if narrator else '' + + context = { + "VERSION": "1.10", + + "HHMMSSmmm": hms, + "HHMMSS": hms.split('.')[0], + "Sd": "%.1f" % duration, + + "TITLE": document.meta.title, + "PUBLISHER": document.meta.publisher[0], + "YEAR": document.meta.created_at[:4], + "MONTH": document.meta.created_at[5:7], + "AUTHOR": document.meta.author.readable(), + + "NARRATOR": narrator, + } + + for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'): + zipf.write( + get_resource('res/daisy/' + fname), + directory + fname) + + for fname in ('er_book_info.xml', 'master.smil', 'ncc.html'): + with open(get_resource('res/daisy/' + fname)) as f: + tree = etree.parse(f) + populate(tree.getroot(), context) + zipf.writestr( + directory + fname, + etree.tostring( + tree, + xml_declaration=True + ), + ) + + with open(get_resource('res/daisy/content.smil')) as f: + tree = etree.parse(f) + populate(tree.getroot(), context) + + seq = tree.find('//seq') + for i, item in enumerate(sync): + par = etree.SubElement(seq, 'par', id="par%06d" % (i + 1), endsync="last") + etree.SubElement( + par, + "text", + src="book.html#%s" % item[2]) + + # If we have a split between mp3 parts, err on the larger side. + i = 0 + start, end = item[0], item[1] + while start >= durations[i]: + start -= durations[i] + end -= durations[i] + i += 1 + if 2 * (end - durations[i]) > end - start: + start = 0 + end -= durations[i] + i += 1 + + audio = etree.SubElement( + par, + "audio", + src="book%d.mp3" % i, + **{ + "clip-begin": "npt=%.3fs" % start, + "clip-end": "npt=%.3fs" % end, + }, + ) + + zipf.writestr( + directory + 'content.smil', + etree.tostring( + tree, + xml_declaration=True, + pretty_print=True, + ), + ) + + +# WHERE IS MP3 + + zipf.close() + return OutputFile.from_filename(outfile.name) diff --git a/src/librarian/document.py b/src/librarian/document.py index c0efd3a..8876294 100644 --- a/src/librarian/document.py +++ b/src/librarian/document.py @@ -1,17 +1,17 @@ import gettext import os import re +from urllib.request import urlopen from lxml import etree -from .builders import builders from .parser import parser from . import dcparser from .functions import lang_code_3to2 class WLDocument: - def __init__(self, tree=None, filename=None): - if filename is not None: - tree = etree.parse(filename, parser=parser) + def __init__(self, filename=None, url=None): + source = filename or urlopen(url) + tree = etree.parse(source, parser=parser) self.tree = tree tree.getroot().document = self self.base_meta = dcparser.BookInfo({}, {}, validate_required=False) @@ -23,8 +23,8 @@ class WLDocument: return self.tree.getroot().meta return master.meta - def build(self, builder_id, **kwargs): - return builders[builder_id]().build(self, **kwargs) + def build(self, builder, **kwargs): + return builder().build(self, **kwargs) def _compat_assign_ordered_ids(self): """ -- 2.20.1