From: Radek Czajka <rczajka@rczajka.pl>
Date: Wed, 9 Dec 2020 08:40:53 +0000 (+0100)
Subject: Experimental DAISY builder.
X-Git-Tag: 1.11~2
X-Git-Url: https://git.mdrn.pl/librarian.git/commitdiff_plain/e18e5408fa57b7c680e7b834a964501898623117

Experimental DAISY builder.
---

diff --git a/CHANGELOG.md b/CHANGELOG.md
index 4149659..521ef1c 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,13 @@
 This document records all notable changes to Librarian.
 
 
+## 1.11 (2020-12-09)
+
+### Added
+
+- Experimental DAISY builder.
+
+
 ## 1.10 (2020-11-09)
 
 ### Added
diff --git a/setup.py b/setup.py
index 79d4f84..3a50c6c 100755
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ def whole_tree(prefix, path):
 
 setup(
     name='librarian',
-    version='1.10',
+    version='1.11',
     description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
     author="Marek Stępniowski",
     author_email='marek@stepniowski.com',
@@ -42,6 +42,8 @@ setup(
         'six',
         'texml',
         'ebooklib',
+        'aeneas',
+        'mutagen',
     ],
     entry_points = {
         "console_scripts": [
diff --git a/src/librarian/builders/__init__.py b/src/librarian/builders/__init__.py
index b1afe94..dc5bdee 100644
--- a/src/librarian/builders/__init__.py
+++ b/src/librarian/builders/__init__.py
@@ -2,6 +2,7 @@ from collections import OrderedDict
 from .txt import TxtBuilder
 from .html import HtmlBuilder, StandaloneHtmlBuilder, DaisyHtmlBuilder
 from .sanitize import Sanitizer
+from .daisy import DaisyBuilder
 
 
 builders = OrderedDict([
@@ -9,5 +10,6 @@ builders = OrderedDict([
     ("html", HtmlBuilder),
     ("html-standalone", StandaloneHtmlBuilder),
     ("html-daisy", DaisyHtmlBuilder),
+    ("daisy", DaisyBuilder),
     ("sanitizer", Sanitizer),
 ])
diff --git a/src/librarian/builders/daisy.py b/src/librarian/builders/daisy.py
new file mode 100644
index 0000000..b96226f
--- /dev/null
+++ b/src/librarian/builders/daisy.py
@@ -0,0 +1,192 @@
+import subprocess
+import tempfile
+import zipfile
+from aeneas.executetask import ExecuteTask
+from aeneas.task import Task
+from lxml import etree
+import mutagen
+from librarian import OutputFile, get_resource
+from .html import DaisyHtmlBuilder
+
+
+def get_duration(path):
+    """
+    Return the duration of the media file at `path`, in seconds.
+
+    Runs ``ffprobe`` and parses the number it prints; a non-zero exit
+    status raises ``subprocess.CalledProcessError``.
+    """
+    command = [
+        "ffprobe",
+        "-i", path,
+        "-show_entries", "format=duration",
+        "-v", "quiet",
+        "-of", "csv=p=0",
+    ]
+    probe = subprocess.run(
+        command, capture_output=True, text=True, check=True
+    )
+    # float() ignores surrounding whitespace in the captured output.
+    return float(probe.stdout)
+
+
+def format_hms(t):
+    """Format a duration in seconds as zero-padded ``HH:MM:SS.mmm``."""
+    # Round to whole milliseconds first so 59.9996 carries into the minutes.
+    millis = int(round(t * 1000))
+    seconds, millis = divmod(millis, 1000)
+    minutes, seconds = divmod(seconds, 60)
+    hours, minutes = divmod(minutes, 60)
+    return "%02d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)
+
+
+def populate(element, context):
+    """Recursively fill ``{NAME}`` placeholders in text, tail and attributes."""
+    for slot in ("text", "tail"):
+        if getattr(element, slot):
+            setattr(element, slot, getattr(element, slot).format(**context))
+    for name, template in element.attrib.items():
+        element.attrib[name] = template.format(**context)
+    for child in element:
+        populate(child, context)
+
+
+class DaisyBuilder:
+    """
+    Experimental builder packing a document and its MP3 audio as a DAISY zip.
+
+    The archive holds the book HTML, the MP3 parts, static DTD/entity files
+    and the XML files filled in from templates found in ``res/daisy/``.
+    """
+    file_extension = 'daisy.zip'
+
+    # Configuration of the alignment task; raw strings, as it holds a regex.
+    ALIGNER_CONFIG = (
+        r"task_language=pol|is_text_type=unparsed"
+        r"|is_text_unparsed_id_regex=sec\d+$"
+        r"|is_text_unparsed_id_sort=numeric|os_task_file_format=tab"
+    )
+    # Resources copied into the archive unchanged.
+    STATIC_FILES = (
+        'smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent',
+        'xhtml-special.ent', 'xhtml-symbol.ent',
+    )
+    # XML templates whose ``{NAME}`` placeholders are filled by populate().
+    TEMPLATE_FILES = ('er_book_info.xml', 'master.smil', 'ncc.html')
+
+    def build(self, document, mp3, **kwargs):
+        """
+        Build the archive for `document`, narrated by the files in `mp3`.
+
+        `mp3` is a sequence of MP3 file paths in reading order; other
+        keyword arguments are accepted and ignored.  Returns an OutputFile
+        made from a temporary zip file.  Raises ValueError without MP3s.
+        """
+        if not mp3:
+            raise ValueError("Need MP3 files")
+
+        directory = document.meta.url.slug + '/'
+        html = DaisyHtmlBuilder().build(document)
+        durations = [get_duration(mp3_file) for mp3_file in mp3]
+        duration = sum(durations)
+        # Align and collect metadata before creating the output file, so a
+        # failure in either does not leave a stray temporary archive behind.
+        sync = self._align(mp3, html.get_filename())
+
+        hms = format_hms(duration)
+        context = {
+            "VERSION": "1.10",
+            "HHMMSSmmm": hms,
+            "HHMMSS": hms.split('.')[0],
+            "Sd": "%.1f" % duration,
+            "TITLE": document.meta.title,
+            "PUBLISHER": document.meta.publisher[0],
+            "YEAR": document.meta.created_at[:4],
+            "MONTH": document.meta.created_at[5:7],
+            "AUTHOR": document.meta.author.readable(),
+            "NARRATOR": self._get_narrator(mp3[0]),
+        }
+
+        outfile = tempfile.NamedTemporaryFile(delete=False)
+        # On exit the zip directory is written first, then the file closed.
+        with outfile, zipfile.ZipFile(outfile, 'w') as zipf:
+            zipf.write(html.get_filename(), directory + 'book.html')
+            for i, mp3_file in enumerate(mp3):
+                zipf.write(mp3_file, directory + "book%d.mp3" % i)
+            for fname in self.STATIC_FILES:
+                zipf.write(
+                    get_resource('res/daisy/' + fname), directory + fname)
+            for fname in self.TEMPLATE_FILES:
+                tree = self._load_template(fname, context)
+                zipf.writestr(
+                    directory + fname,
+                    etree.tostring(tree, xml_declaration=True))
+
+            tree = self._load_template('content.smil', context)
+            seq = tree.find('.//seq')
+            for n, (start, end, sec) in enumerate(sync, 1):
+                par = etree.SubElement(
+                    seq, 'par', id="par%06d" % n, endsync="last")
+                etree.SubElement(par, "text", src="book.html#%s" % sec)
+                part, start, end = self._locate(start, end, durations)
+                etree.SubElement(par, "audio", src="book%d.mp3" % part, **{
+                    "clip-begin": "npt=%.3fs" % start,
+                    "clip-end": "npt=%.3fs" % end,
+                })
+            zipf.writestr(directory + 'content.smil', etree.tostring(
+                tree, xml_declaration=True, pretty_print=True))
+        return OutputFile.from_filename(outfile.name)
+
+    @staticmethod
+    def _load_template(fname, context):
+        """Parse ``res/daisy/<fname>`` and fill its placeholders in place."""
+        with open(get_resource('res/daisy/' + fname)) as f:
+            tree = etree.parse(f)
+        populate(tree.getroot(), context)
+        return tree
+
+    @staticmethod
+    def _get_narrator(path):
+        """Return the first TPE1 tag value of the file at `path`, or ''."""
+        audio = mutagen.File(path)
+        # mutagen.File() returns None for a file it cannot identify.
+        frame = audio.get('TPE1') if audio is not None else None
+        return frame.text[0] if frame else ''
+
+    def _align(self, mp3, text_path):
+        """Align the text with the audio; return [start, end, id] rows."""
+        task = Task(config_string=self.ALIGNER_CONFIG)
+        with tempfile.TemporaryDirectory() as temp:
+            audio_path = temp + "/book.mp3"
+            syncfile = temp + "/sync"
+            # The task takes one audio path, so join the parts first,
+            # streaming in chunks rather than reading whole recordings.
+            with open(audio_path, "wb") as joined:
+                for part in mp3:
+                    with open(part, "rb") as partf:
+                        for chunk in iter(lambda: partf.read(1 << 20), b""):
+                            joined.write(chunk)
+            task.audio_file_path_absolute = audio_path
+            task.text_file_path_absolute = text_path
+            task.sync_map_file_path_absolute = syncfile
+            ExecuteTask(task).execute()
+            task.output_sync_map_file()
+            with open(syncfile) as f:
+                rows = [line.strip().split('\t') for line in f]
+        return [[float(start), float(end), sec] for start, end, sec in rows]
+
+    @staticmethod
+    def _locate(start, end, durations):
+        """Map a joined-timeline span to (part index, start, end) in one MP3."""
+        i, last = 0, len(durations) - 1
+        # Skip the parts that end before the span starts; stay within range.
+        while i < last and start >= durations[i]:
+            start -= durations[i]
+            end -= durations[i]
+            i += 1
+        # If we have a split between mp3 parts, err on the larger side.
+        if i < last and 2 * (end - durations[i]) > end - start:
+            start = 0
+            end -= durations[i]
+            i += 1
+        return i, start, end
diff --git a/src/librarian/document.py b/src/librarian/document.py
index c0efd3a..8876294 100644
--- a/src/librarian/document.py
+++ b/src/librarian/document.py
@@ -1,17 +1,17 @@
 import gettext
 import os
 import re
+from urllib.request import urlopen
 from lxml import etree
-from .builders import builders
 from .parser import parser
 from . import dcparser
 from .functions import lang_code_3to2
 
 
 class WLDocument:
-    def __init__(self, tree=None, filename=None):
-        if filename is not None:
-            tree = etree.parse(filename, parser=parser)
+    def __init__(self, filename=None, url=None):
+        source = filename or urlopen(url)
+        tree = etree.parse(source, parser=parser)
         self.tree = tree
         tree.getroot().document = self
         self.base_meta = dcparser.BookInfo({}, {}, validate_required=False)
@@ -23,8 +23,8 @@ class WLDocument:
         return self.tree.getroot().meta
         return master.meta
 
-    def build(self, builder_id, **kwargs):
-        return builders[builder_id]().build(self, **kwargs)
+    def build(self, builder, **kwargs):
+        return builder().build(self, **kwargs)
 
     def _compat_assign_ordered_ids(self):
         """