Experimental DAISY builder.
author Radek Czajka <rczajka@rczajka.pl>
Wed, 9 Dec 2020 08:40:53 +0000 (09:40 +0100)
committer Radek Czajka <rczajka@rczajka.pl>
Wed, 9 Dec 2020 08:40:53 +0000 (09:40 +0100)
CHANGELOG.md
setup.py
src/librarian/builders/__init__.py
src/librarian/builders/daisy.py [new file with mode: 0644]
src/librarian/document.py

index 4149659..521ef1c 100644 (file)
@@ -3,6 +3,13 @@
 This document records all notable changes to Librarian.
 
 
 This document records all notable changes to Librarian.
 
 
+## 1.11 (2020-12-09)
+
+### Added
+
+- Experimental DAISY builder.
+
+
 ## 1.10 (2020-11-09)
 
 ### Added
 ## 1.10 (2020-11-09)
 
 ### Added
index 79d4f84..3a50c6c 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ def whole_tree(prefix, path):
 
 setup(
     name='librarian',
 
 setup(
     name='librarian',
-    version='1.10',
+    version='1.11',
     description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
     author="Marek StÄ™pniowski",
     author_email='marek@stepniowski.com',
     description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
     author="Marek StÄ™pniowski",
     author_email='marek@stepniowski.com',
@@ -42,6 +42,8 @@ setup(
         'six',
         'texml',
         'ebooklib',
         'six',
         'texml',
         'ebooklib',
+        'aeneas',
+        'mutagen',
     ],
     entry_points = {
         "console_scripts": [
     ],
     entry_points = {
         "console_scripts": [
index b1afe94..dc5bdee 100644 (file)
@@ -2,6 +2,7 @@ from collections import OrderedDict
 from .txt import TxtBuilder
 from .html import HtmlBuilder, StandaloneHtmlBuilder, DaisyHtmlBuilder
 from .sanitize import Sanitizer
 from .txt import TxtBuilder
 from .html import HtmlBuilder, StandaloneHtmlBuilder, DaisyHtmlBuilder
 from .sanitize import Sanitizer
+from .daisy import DaisyBuilder
 
 
 builders = OrderedDict([
 
 
 builders = OrderedDict([
@@ -9,5 +10,6 @@ builders = OrderedDict([
     ("html", HtmlBuilder),
     ("html-standalone", StandaloneHtmlBuilder),
     ("html-daisy", DaisyHtmlBuilder),
     ("html", HtmlBuilder),
     ("html-standalone", StandaloneHtmlBuilder),
     ("html-daisy", DaisyHtmlBuilder),
+    ("daisy", DaisyBuilder),
     ("sanitizer", Sanitizer),
 ])
     ("sanitizer", Sanitizer),
 ])
diff --git a/src/librarian/builders/daisy.py b/src/librarian/builders/daisy.py
new file mode 100644 (file)
index 0000000..b96226f
--- /dev/null
@@ -0,0 +1,192 @@
+import os
+import subprocess
+import tempfile
+import zipfile
+
+from aeneas.executetask import ExecuteTask
+from aeneas.task import Task
+from lxml import etree
+import mutagen
+
+from librarian import OutputFile, get_resource
+from .html import DaisyHtmlBuilder
+
+
+def get_duration(path):
+    """Return the duration of the media file at *path*, in seconds.
+
+    The value is whatever ``ffprobe`` prints for ``format=duration``,
+    converted to ``float``.
+
+    Raises:
+        subprocess.CalledProcessError: if ``ffprobe`` exits with a non-zero
+            status (the call uses ``check=True``).
+        ValueError: if the printed value cannot be parsed as a float.
+    """
+    command = ["ffprobe", "-i", path]
+    # Print only the container duration, as a bare CSV value.
+    command += ["-show_entries", "format=duration"]
+    command += ["-v", "quiet"]
+    command += ["-of", "csv=p=0"]
+
+    result = subprocess.run(
+        command, capture_output=True, text=True, check=True
+    )
+    return float(result.stdout)
+
+
+def format_hms(t):
+    """Format a duration given in seconds as ``HH:MM:SS.mmm``.
+
+    The value is rounded to whole milliseconds *before* it is split into
+    fields, so the seconds field can never be rendered as ``60.000``, and
+    every field is zero-padded to a fixed width (``5.5`` becomes
+    ``00:00:05.500``).
+
+    Args:
+        t: duration in seconds (int or float).
+
+    Returns:
+        The formatted string; hours take more than two digits if needed.
+    """
+    total_millis = int(round(t * 1000))
+    total_seconds, millis = divmod(total_millis, 1000)
+    total_minutes, seconds = divmod(total_seconds, 60)
+    hours, minutes = divmod(total_minutes, 60)
+    return "%02d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)
+
+
+def populate(element, context):
+    """Fill ``str.format`` placeholders throughout an element tree, in place.
+
+    For *element* and, recursively, each of its children, the text, the
+    tail and every attribute value are passed through
+    ``str.format(**context)``.
+
+    Args:
+        element: the root of the (sub)tree to update.
+        context: mapping of placeholder names to replacement values.
+    """
+    for slot in ("text", "tail"):
+        template = getattr(element, slot)
+        if template:
+            setattr(element, slot, template.format(**context))
+
+    for name, template in element.attrib.items():
+        element.attrib[name] = template.format(**context)
+
+    for node in element:
+        populate(node, context)
+
+
+class DaisyBuilder:
+    """Experimental builder packing a document and its MP3 narration
+    into a DAISY zip archive.
+    """
+    file_extension = 'daisy.zip'
+
+    def build(self, document, mp3, **kwargs):
+        """Build the archive for *document* narrated by the *mp3* files.
+
+        Args:
+            document: the source document; its ``meta`` supplies the slug,
+                title, publisher, creation date and author.
+            mp3: sequence of paths to the MP3 parts, in playback order.
+            **kwargs: accepted and ignored.
+
+        Returns:
+            An ``OutputFile`` created from the temporary zip file.
+
+        Raises:
+            ValueError: if no MP3 files were given.
+        """
+        if not mp3:
+            raise ValueError("Need MP3 files")
+
+        outfile = tempfile.NamedTemporaryFile(delete=False)
+        try:
+            # On exit the zip is finalised first, then the underlying
+            # temporary file is closed; both happen on errors as well.
+            with outfile, zipfile.ZipFile(outfile, 'w') as zipf:
+                self._fill_archive(zipf, document, mp3)
+        except BaseException:
+            # Do not leave a half-written archive behind.
+            os.unlink(outfile.name)
+            raise
+        return OutputFile.from_filename(outfile.name)
+
+    def _fill_archive(self, zipf, document, mp3):
+        """Write the book, the audio and all DAISY control files to *zipf*."""
+        directory = document.meta.url.slug + '/'
+
+        html = DaisyHtmlBuilder().build(document)
+        zipf.write(
+            html.get_filename(),
+            directory + 'book.html',
+        )
+
+        durations = []
+        for i, mp3_file in enumerate(mp3):
+            durations.append(get_duration(mp3_file))
+            zipf.write(
+                mp3_file,
+                directory + "book%d.mp3" % i,
+            )
+        duration = sum(durations)
+
+        sync = self._align(html.get_filename(), mp3)
+
+        hms = format_hms(duration)
+
+        # mutagen.File() returns None when it cannot recognise the file.
+        audio_file = mutagen.File(mp3[0])
+        narrator = audio_file.get('TPE1') if audio_file is not None else None
+        narrator = narrator.text[0] if narrator else ''
+
+        context = {
+            # NOTE(review): hard-coded version string; confirm whether it
+            # should track the package version.
+            "VERSION": "1.10",
+
+            "HHMMSSmmm": hms,
+            "HHMMSS": hms.split('.')[0],
+            "Sd": "%.1f" % duration,
+
+            "TITLE": document.meta.title,
+            "PUBLISHER": document.meta.publisher[0],
+            "YEAR": document.meta.created_at[:4],
+            "MONTH": document.meta.created_at[5:7],
+            "AUTHOR": document.meta.author.readable(),
+
+            "NARRATOR": narrator,
+        }
+
+        # Static resources are copied verbatim.
+        for fname in (
+                'smil10.dtd',
+                'xhtml1-transitional.dtd',
+                'xhtml-lat1.ent',
+                'xhtml-special.ent',
+                'xhtml-symbol.ent',
+        ):
+            zipf.write(
+                get_resource('res/daisy/' + fname),
+                directory + fname)
+
+        # Templates get their placeholders filled in from the context.
+        for fname in ('er_book_info.xml', 'master.smil', 'ncc.html'):
+            with open(get_resource('res/daisy/' + fname)) as f:
+                tree = etree.parse(f)
+            populate(tree.getroot(), context)
+            zipf.writestr(
+                directory + fname,
+                etree.tostring(
+                    tree,
+                    xml_declaration=True
+                ),
+            )
+
+        with open(get_resource('res/daisy/content.smil')) as f:
+            tree = etree.parse(f)
+        populate(tree.getroot(), context)
+
+        # A relative path avoids lxml's FutureWarning for absolute paths
+        # passed to ElementTree.find(); the element found is the same.
+        seq = tree.find('.//seq')
+        for number, (start, end, fragment_id) in enumerate(sync, 1):
+            par = etree.SubElement(
+                seq, 'par', id="par%06d" % number, endsync="last")
+            etree.SubElement(
+                par,
+                "text",
+                src="book.html#%s" % fragment_id)
+
+            part, clip_begin, clip_end = self._locate_clip(
+                start, end, durations)
+            etree.SubElement(
+                par,
+                "audio",
+                src="book%d.mp3" % part,
+                **{
+                    "clip-begin": "npt=%.3fs" % clip_begin,
+                    "clip-end": "npt=%.3fs" % clip_end,
+                },
+            )
+
+        zipf.writestr(
+            directory + 'content.smil',
+            etree.tostring(
+                tree,
+                xml_declaration=True,
+                pretty_print=True,
+            ),
+        )
+
+    @staticmethod
+    def _align(html_path, mp3):
+        """Align the book text with the audio using aeneas.
+
+        Returns a list of ``[start, end, fragment_id]`` rows, with times in
+        seconds measured on the concatenation of all *mp3* parts.
+        """
+        config_string = "|".join([
+            "task_language=pol",
+            "is_text_type=unparsed",
+            # Raw string: ``\d`` must reach aeneas as a regex escape.
+            r"is_text_unparsed_id_regex=sec\d+$",
+            "is_text_unparsed_id_sort=numeric",
+            "os_task_file_format=tab",
+        ])
+        task = Task(config_string=config_string)
+
+        with tempfile.TemporaryDirectory() as temp:
+            # The task takes a single audio path, so the parts are joined
+            # into one file. NOTE(review): this is a plain byte-level
+            # concatenation of the MP3 files, not a re-encode.
+            audio_path = temp + "/book.mp3"
+            with open(audio_path, "wb") as m:
+                for minput in mp3:
+                    with open(minput, "rb") as minputf:
+                        m.write(minputf.read())
+
+            syncfile = temp + "/sync"
+            task.audio_file_path_absolute = audio_path
+            task.text_file_path_absolute = html_path
+            task.sync_map_file_path_absolute = syncfile
+
+            ExecuteTask(task).execute()
+            task.output_sync_map_file()
+
+            sync = []
+            with open(syncfile) as f:
+                for line in f:
+                    start, end, sec = line.strip().split('\t')
+                    sync.append([float(start), float(end), sec])
+        return sync
+
+    @staticmethod
+    def _locate_clip(start, end, durations):
+        """Map a span on the concatenated timeline onto a single MP3 part.
+
+        Args:
+            start, end: span boundaries in seconds on the joined timeline.
+            durations: duration of each MP3 part, in playback order.
+
+        Returns:
+            ``(index, start, end)`` with the times relative to the beginning
+            of part ``index`` and never beyond that part's duration.
+        """
+        last = len(durations) - 1
+        index = 0
+        # Never step past the last part, even if the aligner reports a
+        # time at or beyond the measured total duration.
+        while index < last and start >= durations[index]:
+            start -= durations[index]
+            end -= durations[index]
+            index += 1
+
+        overflow = end - durations[index]
+        if overflow > 0:
+            # The span crosses a boundary between parts:
+            # err on the larger side.
+            if index < last and 2 * overflow > end - start:
+                index += 1
+                start = 0
+                end = min(overflow, durations[index])
+            else:
+                end = durations[index]
+                start = min(start, end)
+        return index, start, end
index c0efd3a..8876294 100644 (file)
@@ -1,17 +1,17 @@
 import gettext
 import os
 import re
 import gettext
 import os
 import re
+from urllib.request import urlopen
 from lxml import etree
 from lxml import etree
-from .builders import builders
 from .parser import parser
 from . import dcparser
 from .functions import lang_code_3to2
 
 
 class WLDocument:
 from .parser import parser
 from . import dcparser
 from .functions import lang_code_3to2
 
 
 class WLDocument:
-    def __init__(self, tree=None, filename=None):
-        if filename is not None:
-            tree = etree.parse(filename, parser=parser)
+    def __init__(self, filename=None, url=None):
+        source = filename or urlopen(url)
+        tree = etree.parse(source, parser=parser)
         self.tree = tree
         tree.getroot().document = self
         self.base_meta = dcparser.BookInfo({}, {}, validate_required=False)
         self.tree = tree
         tree.getroot().document = self
         self.base_meta = dcparser.BookInfo({}, {}, validate_required=False)
@@ -23,8 +23,8 @@ class WLDocument:
         return self.tree.getroot().meta
         return master.meta
 
         return self.tree.getroot().meta
         return master.meta
 
-    def build(self, builder_id, **kwargs):
-        return builders[builder_id]().build(self, **kwargs)
+    def build(self, builder, **kwargs):
+        return builder().build(self, **kwargs)
 
     def _compat_assign_ordered_ids(self):
         """
 
     def _compat_assign_ordered_ids(self):
         """