Experimental DAISY builder.

author Radek Czajka <rczajka@rczajka.pl>

Wed, 9 Dec 2020 08:40:53 +0000 (09:40 +0100)

committer Radek Czajka <rczajka@rczajka.pl>

Wed, 9 Dec 2020 08:40:53 +0000 (09:40 +0100)
author Radek Czajka <rczajka@rczajka.pl>
Wed, 9 Dec 2020 08:40:53 +0000 (09:40 +0100)
committer Radek Czajka <rczajka@rczajka.pl>
Wed, 9 Dec 2020 08:40:53 +0000 (09:40 +0100)
diff --git a/CHANGELOG.md b/CHANGELOG.md

index 4149659..521ef1c 100644 (file)
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -3,6 +3,13 @@
  This document records all notable changes to Librarian.
  
  
  This document records all notable changes to Librarian.
  
  
+## 1.11 (2020-12-09)
+
+### Added
+
+- Experimental DAISY builder.
+
+
  ## 1.10 (2020-11-09)
  
  ### Added
  ## 1.10 (2020-11-09)
  
  ### Added
diff --git a/setup.py b/setup.py

index 79d4f84..3a50c6c 100755 (executable)
--- a/setup.py
+++ b/setup.py
@@ -22,7 +22,7 @@ def whole_tree(prefix, path):
  
  setup(
      name='librarian',
  
  setup(
      name='librarian',
-    version='1.10',
+    version='1.11',
      description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
      author="Marek Stępniowski",
      author_email='marek@stepniowski.com',
      description='Converter from WolneLektury.pl XML-based language to XHTML, TXT and other formats',
      author="Marek Stępniowski",
      author_email='marek@stepniowski.com',
@@ -42,6 +42,8 @@ setup(
          'six',
          'texml',
          'ebooklib',
          'six',
          'texml',
          'ebooklib',
+        'aeneas',
+        'mutagen',
      ],
      entry_points = {
          "console_scripts": [
      ],
      entry_points = {
          "console_scripts": [
diff --git a/src/librarian/builders/__init__.py b/src/librarian/builders/__init__.py

index b1afe94..dc5bdee 100644 (file)
--- a/src/librarian/builders/__init__.py
+++ b/src/librarian/builders/__init__.py
@@ -2,6 +2,7 @@ from collections import OrderedDict
  from .txt import TxtBuilder
  from .html import HtmlBuilder, StandaloneHtmlBuilder, DaisyHtmlBuilder
  from .sanitize import Sanitizer
  from .txt import TxtBuilder
  from .html import HtmlBuilder, StandaloneHtmlBuilder, DaisyHtmlBuilder
  from .sanitize import Sanitizer
+from .daisy import DaisyBuilder
  
  
  builders = OrderedDict([
  
  
  builders = OrderedDict([
@@ -9,5 +10,6 @@ builders = OrderedDict([
      ("html", HtmlBuilder),
      ("html-standalone", StandaloneHtmlBuilder),
      ("html-daisy", DaisyHtmlBuilder),
      ("html", HtmlBuilder),
      ("html-standalone", StandaloneHtmlBuilder),
      ("html-daisy", DaisyHtmlBuilder),
+    ("daisy", DaisyBuilder),
      ("sanitizer", Sanitizer),
  ])
      ("sanitizer", Sanitizer),
  ])
diff --git a/src/librarian/builders/daisy.py b/src/librarian/builders/daisy.py

new file mode 100644 (file)

index 0000000..b96226f
--- /dev/null
+++ b/src/librarian/builders/daisy.py
@@ -0,0 +1,192 @@
+import subprocess
+import tempfile
+import zipfile
+from aeneas.executetask import ExecuteTask
+from aeneas.task import Task
+from lxml import etree
+import mutagen
+from librarian import OutputFile, get_resource
+from .html import DaisyHtmlBuilder
+
+
+def get_duration(path):
+    """Return the duration of the media file at *path* as a float.
+
+    The value is whatever ``ffprobe`` prints for ``format=duration`` when
+    asked for bare CSV output (``csv=p=0``), converted with ``float()``.
+
+    Raises ``subprocess.CalledProcessError`` when ffprobe exits with a
+    non-zero status (``check=True``), and ``ValueError`` when its output
+    cannot be parsed as a number.
+    """
+    command = [
+        "ffprobe",
+        "-i", path,
+        "-show_entries", "format=duration",
+        "-v", "quiet",
+        "-of", "csv=p=0",
+    ]
+    probe = subprocess.run(command, capture_output=True, text=True, check=True)
+    return float(probe.stdout)
+
+
+def format_hms(t):
+    """Format a duration of *t* seconds as ``HH:MM:SS.mmm``.
+
+    Every field is zero-padded, so the part before the dot is always a
+    well-formed ``HH:MM:SS`` value.  The duration is rounded to whole
+    milliseconds before it is split into fields, so rounding can never
+    yield a ``60.000`` seconds field.
+    """
+    total_ms = int(round(t * 1000))
+    hours, rest = divmod(total_ms, 3600 * 1000)
+    minutes, rest = divmod(rest, 60 * 1000)
+    seconds, millis = divmod(rest, 1000)
+    return "%02d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)
+
+
+def populate(element, context):
+    """Fill ``str.format`` placeholders throughout the tree under *element*.
+
+    For *element* and every node below it, the text, the tail and each
+    attribute value are replaced in place by the result of
+    ``value.format(**context)``.  Empty or missing text and tails are
+    left alone.  Nothing is returned.
+    """
+    # iter() yields the element itself and then its descendants in
+    # document order, the same order a recursive walk over children visits.
+    for node in element.iter():
+        if node.text:
+            node.text = node.text.format(**context)
+        if node.tail:
+            node.tail = node.tail.format(**context)
+        for name, value in node.attrib.items():
+            node.attrib[name] = value.format(**context)
+
+
+class DaisyBuilder:
+    """Experimental builder packing a document and its audio into a DAISY zip.
+
+    The archive holds, under a directory named after the document slug:
+    the HTML produced by ``DaisyHtmlBuilder`` (``book.html``), the MP3
+    parts (``book0.mp3``, ``book1.mp3``, ...), the static DTD/entity
+    resources, the populated metadata templates, and a ``content.smil``
+    mapping text fragments to audio clips.
+    """
+    file_extension = 'daisy.zip'
+
+    def build(self, document, mp3, **kwargs):
+        """Build the DAISY archive and return it as an ``OutputFile``.
+
+        *document* is the document to convert; its ``meta`` supplies the
+        slug, title, publisher, creation date and author.  *mp3* is an
+        indexable sequence of paths to the MP3 parts, in reading order.
+        Extra keyword arguments are accepted and ignored.
+
+        Raises ``ValueError`` when *mp3* is empty.
+        """
+        if not mp3:
+            raise ValueError("Need MP3 files")
+
+        outfile = tempfile.NamedTemporaryFile(delete=False)
+        # Leaving the block closes the archive first and then the temp file
+        # handle, also when one of the steps raises.  The file itself is
+        # kept on disk (delete=False) so it can be handed over by name.
+        with outfile, zipfile.ZipFile(outfile, 'w') as zipf:
+            self._write_archive(zipf, document, mp3)
+        return OutputFile.from_filename(outfile.name)
+
+    def _write_archive(self, zipf, document, mp3):
+        """Write every member of the DAISY book into the open *zipf*."""
+        directory = document.meta.url.slug + '/'
+
+        html = DaisyHtmlBuilder().build(document)
+        zipf.write(
+            html.get_filename(),
+            directory + 'book.html',
+        )
+
+        durations = []
+        for i, mp3_file in enumerate(mp3):
+            durations.append(get_duration(mp3_file))
+            zipf.write(
+                mp3_file,
+                directory + "book%d.mp3" % i,
+            )
+        duration = sum(durations)
+
+        sync = self._synchronize(html.get_filename(), mp3)
+
+        hms = format_hms(duration)
+
+        # The narrator is read from the TPE1 tag of the first part only.
+        narrator = mutagen.File(mp3[0]).get('TPE1')
+        narrator = narrator.text[0] if narrator else ''
+
+        context = {
+            # NOTE(review): hard-coded; setup.py is bumped to 1.11 in this
+            # same change.  Confirm which version the templates expect.
+            "VERSION": "1.10",
+
+            "HHMMSSmmm": hms,
+            "HHMMSS": hms.split('.')[0],
+            "Sd": "%.1f" % duration,
+
+            "TITLE": document.meta.title,
+            "PUBLISHER": document.meta.publisher[0],
+            "YEAR": document.meta.created_at[:4],
+            "MONTH": document.meta.created_at[5:7],
+            "AUTHOR": document.meta.author.readable(),
+
+            "NARRATOR": narrator,
+        }
+
+        # Static resources are copied into the archive unchanged.
+        for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'):
+            zipf.write(
+                get_resource('res/daisy/' + fname),
+                directory + fname)
+
+        # Templates get their {PLACEHOLDERS} filled from the context.
+        for fname in ('er_book_info.xml', 'master.smil', 'ncc.html'):
+            with open(get_resource('res/daisy/' + fname)) as f:
+                tree = etree.parse(f)
+            populate(tree.getroot(), context)
+            zipf.writestr(
+                directory + fname,
+                etree.tostring(
+                    tree,
+                    xml_declaration=True
+                ),
+            )
+
+        with open(get_resource('res/daisy/content.smil')) as f:
+            tree = etree.parse(f)
+        populate(tree.getroot(), context)
+
+        # One <par> per synchronized fragment: the text anchor plus the
+        # audio clip that reads it.
+        seq = tree.find('//seq')
+        for number, (start, end, sec) in enumerate(sync, 1):
+            par = etree.SubElement(seq, 'par', id="par%06d" % number, endsync="last")
+            etree.SubElement(
+                par,
+                "text",
+                src="book.html#%s" % sec)
+
+            part, start, end = self._locate_clip(start, end, durations)
+            etree.SubElement(
+                par,
+                "audio",
+                src="book%d.mp3" % part,
+                **{
+                    "clip-begin": "npt=%.3fs" % start,
+                    "clip-end": "npt=%.3fs" % end,
+                },
+            )
+
+        zipf.writestr(
+            directory + 'content.smil',
+            etree.tostring(
+                tree,
+                xml_declaration=True,
+                pretty_print=True,
+            ),
+        )
+
+    @staticmethod
+    def _synchronize(text_path, mp3):
+        """Align the HTML at *text_path* with the audio using aeneas.
+
+        Returns a list of ``[start, end, fragment_id]`` entries read from
+        the tab-separated sync map; start and end are floats measured on
+        the concatenation of all parts.
+        """
+        # Raw literal for the regex part, so that "\d" is not treated as
+        # a string escape; the resulting value is unchanged.
+        config_string = (
+            "task_language=pol"
+            "|is_text_type=unparsed"
+            r"|is_text_unparsed_id_regex=sec\d+$"
+            "|is_text_unparsed_id_sort=numeric"
+            "|os_task_file_format=tab"
+        )
+        task = Task(config_string=config_string)
+
+        with tempfile.TemporaryDirectory() as temp:
+            # TODO: the parts are joined by plain byte concatenation just
+            # to give the aligner a single audio file; use a proper audio
+            # concatenation instead.
+            audio_path = temp + "/book.mp3"
+            with open(audio_path, "wb") as m:
+                for minput in mp3:
+                    with open(minput, "rb") as minputf:
+                        m.write(minputf.read())
+
+            syncfile = temp + "/sync"
+            task.audio_file_path_absolute = audio_path
+            task.text_file_path_absolute = text_path
+            task.sync_map_file_path_absolute = syncfile
+
+            ExecuteTask(task).execute()
+            task.output_sync_map_file()
+
+            sync = []
+            with open(syncfile) as f:
+                for line in f:
+                    line = line.strip()
+                    if not line:
+                        # Tolerate blank lines in the sync map.
+                        continue
+                    start, end, sec = line.split('\t')
+                    sync.append([float(start), float(end), sec])
+        return sync
+
+    @staticmethod
+    def _locate_clip(start, end, durations):
+        """Map a clip on the joined timeline to ``(part, start, end)``.
+
+        *start* and *end* are positions on the concatenation of all parts;
+        the result gives the index of the part holding the clip and the
+        clip bounds relative to that part.  The index never exceeds the
+        last part, even when the aligner reports times at or beyond the
+        summed durations.
+        """
+        part = 0
+        last = len(durations) - 1
+        while part < last and start >= durations[part]:
+            start -= durations[part]
+            end -= durations[part]
+            part += 1
+        # If we have a split between mp3 parts, err on the larger side:
+        # when more than half of the clip lies past the end of this part,
+        # attribute it to the beginning of the next one.
+        if part < last and 2 * (end - durations[part]) > end - start:
+            start = 0
+            end -= durations[part]
+            part += 1
+        return part, start, end
diff --git a/src/librarian/document.py b/src/librarian/document.py

index c0efd3a..8876294 100644 (file)
--- a/src/librarian/document.py
+++ b/src/librarian/document.py
@@ -1,17 +1,17 @@
  import gettext
  import os
  import re
  import gettext
  import os
  import re
+from urllib.request import urlopen
  from lxml import etree
  from lxml import etree
-from .builders import builders
  from .parser import parser
  from . import dcparser
  from .functions import lang_code_3to2
  
  
  class WLDocument:
  from .parser import parser
  from . import dcparser
  from .functions import lang_code_3to2
  
  
  class WLDocument:
-    def __init__(self, tree=None, filename=None):
-        if filename is not None:
-            tree = etree.parse(filename, parser=parser)
+    def __init__(self, filename=None, url=None):
+        source = filename or urlopen(url)
+        tree = etree.parse(source, parser=parser)
          self.tree = tree
          tree.getroot().document = self
          self.base_meta = dcparser.BookInfo({}, {}, validate_required=False)
          self.tree = tree
          tree.getroot().document = self
          self.base_meta = dcparser.BookInfo({}, {}, validate_required=False)
@@ -23,8 +23,8 @@ class WLDocument:
          return self.tree.getroot().meta
          return master.meta
  
          return self.tree.getroot().meta
          return master.meta
  
-    def build(self, builder_id, **kwargs):
-        return builders[builder_id]().build(self, **kwargs)
+    def build(self, builder, **kwargs):
+        return builder().build(self, **kwargs)
  
      def _compat_assign_ordered_ids(self):
          """
  
      def _compat_assign_ordered_ids(self):
          """
author	Radek Czajka <rczajka@rczajka.pl>
	Wed, 9 Dec 2020 08:40:53 +0000 (09:40 +0100)
committer	Radek Czajka <rczajka@rczajka.pl>
	Wed, 9 Dec 2020 08:40:53 +0000 (09:40 +0100)
CHANGELOG.md		patch \| blob \| history
setup.py		patch \| blob \| history
src/librarian/builders/__init__.py		patch \| blob \| history
src/librarian/builders/daisy.py	[new file with mode: 0644]	patch \| blob
src/librarian/document.py		patch \| blob \| history