Better handling of multipart DAISY.

author Radek Czajka <rczajka@rczajka.pl>
Fri, 11 Dec 2020 12:13:44 +0000 (13:13 +0100)

committer Radek Czajka <rczajka@rczajka.pl>
Fri, 11 Dec 2020 12:13:44 +0000 (13:13 +0100)
diff --git a/src/librarian/builders/daisy.py b/src/librarian/builders/daisy.py

index b96226f..38e5338 100644 (file)
--- a/src/librarian/builders/daisy.py
+++ b/src/librarian/builders/daisy.py
@@ -1,3 +1,4 @@
+from copy import deepcopy
  import subprocess
  import tempfile
  import zipfile
@@ -6,6 +7,7 @@ from aeneas.task import Task
  from lxml import etree
  import mutagen
  from librarian import OutputFile, get_resource
+from librarian.html import raw_printable_text
  from .html import DaisyHtmlBuilder
  
  
@@ -54,7 +56,7 @@ def populate(element, context):
  class DaisyBuilder:
      file_extension = 'daisy.zip'
  
-    def build(self, document, mp3, **kwargs):
+    def build(self, document, mp3, split_on=None, **kwargs):
          if not mp3:
              raise ValueError("Need MP3 files")
          
@@ -63,130 +65,181 @@ class DaisyBuilder:
  
          directory = document.meta.url.slug + '/'
  
-        html = DaisyHtmlBuilder().build(document)
-        zipf.write(
-            html.get_filename(),
-            directory + 'book.html',
-        )
+        if split_on:
+            documents = []
+            headers = []
+            present = True
+            n = 0
+            while present:
+                present = False
+                n += 1
+                newdoc = deepcopy(document)
+                newdoc.tree.getroot().document = newdoc
+
+                master = newdoc.tree.getroot()[-1]
+                i = 0
+                for item in list(master):
+                    if item.tag == split_on:
+                        # TODO: clear
+                        i += 1
+                        if i == n:
+                            headers.append(raw_printable_text(item))
+                    if i != n and not (n == 1 and not i):
+                        master.remove(item)
+                    else:
+                        present = True
+                if present:
+                    documents.append(newdoc)
+        else:
+            documents = [document]
+            headers = [document.meta.title]
+
+        assert len(documents) == len(mp3)
+
+        narrator = mutagen.File(mp3[0]).get('TPE1')
+        narrator = narrator.text[0] if narrator else ''
  
          durations = []
-        for i, mp3_file in enumerate(mp3):
-            durations.append(get_duration(mp3_file))
+        for i, part in enumerate(documents):
+            print('part', i)
+            html = DaisyHtmlBuilder().build(part)
              zipf.write(
-                mp3_file,
+                html.get_filename(),
+                directory + 'book%d.html' % i,
+            )
+
+            durations.append(get_duration(mp3[i]))
+            zipf.write(
+                mp3[i],
                  directory + "book%d.mp3" % i,
              )
-        duration = sum(durations)
  
-        config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab"
-        task = Task(config_string=config_string)
-
-        # TODO: concatenate all the
-        with tempfile.TemporaryDirectory() as temp:
-            with open(temp + "/book.mp3", "wb") as m:
-                for minput in mp3:
-                    with open(minput, "rb") as minputf:
-                        m.write(minputf.read())
-                
-            
-            syncfile = temp + "/sync"
-            task.audio_file_path_absolute = temp + "/book.mp3"
-            task.text_file_path_absolute = html.get_filename()
-            task.sync_map_file_path_absolute = syncfile
-
-            ExecuteTask(task).execute()
-            task.output_sync_map_file()
-            sync = []
-            with open(syncfile) as f:
-                for line in f:
-                    start, end, sec = line.strip().split('\t')
-                    start = float(start)
-                    end = float(end)
-                    sync.append([start, end, sec])
+            config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab"
+            task = Task(config_string=config_string)
  
-        hms = format_hms(duration)
+            with tempfile.TemporaryDirectory() as temp:
+                syncfile = temp + "/sync"
+                task.audio_file_path_absolute = mp3[i]
+                task.text_file_path_absolute = html.get_filename()
+                task.sync_map_file_path_absolute = syncfile
  
-        narrator = mutagen.File(mp3[0]).get('TPE1')
-        narrator = narrator.text[0] if narrator else ''
+                ExecuteTask(task).execute()
+                task.output_sync_map_file()
+
+                sync = []
+                with open(syncfile) as f:
+                    for line in f:
+                        start, end, sec = line.strip().split('\t')
+                        start = float(start)
+                        end = float(end)
+                        sync.append([start, end, sec])
+
+            hms = format_hms(durations[i])
+            elapsed_hms = format_hms(sum(durations[:i]))
+
+            context = {
+                "VERSION": "1.10",
+
+                "HHMMSSmmm": hms,
+                "HHMMSS": hms.split('.')[0],
+                "Sd": "%.1f" % durations[i],
+                "ELAPSED": elapsed_hms,
  
+                "TITLE": document.meta.title,
+                "PUBLISHER": document.meta.publisher[0],
+                "YEAR": document.meta.created_at[:4],
+                "MONTH": document.meta.created_at[5:7],
+                "AUTHOR": document.meta.author.readable(),
+
+                "NARRATOR": narrator,
+            }
+
+            with open(get_resource('res/daisy/content.smil')) as f:
+                tree = etree.parse(f)
+            populate(tree.getroot(), context)
+
+            seq = tree.find('//seq')
+            for si, item in enumerate(sync):
+                par = etree.SubElement(seq, 'par', id="par%06d" % (si + 1), endsync="last")
+                etree.SubElement(
+                    par,
+                    "text",
+                    src="book%d.html#%s" % (i, item[2]))
+
+                audio = etree.SubElement(
+                    par,
+                    "audio",
+                    src="book%d.mp3" % i,
+                    **{
+                        "clip-begin": "npt=%.3fs" % item[0],
+                        "clip-end": "npt=%.3fs" % item[1],
+                    },
+                )
+
+            zipf.writestr(
+                directory + 'content%d.smil' % i,
+                etree.tostring(
+                    tree,
+                    xml_declaration=True,
+                    pretty_print=True,
+                ),
+            )
+
+        for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'):
+            zipf.write(
+                get_resource('res/daisy/' + fname),
+                directory + fname)
+
+        duration = sum(durations)
+        hms = format_hms(duration)
          context = {
              "VERSION": "1.10",
-
              "HHMMSSmmm": hms,
              "HHMMSS": hms.split('.')[0],
              "Sd": "%.1f" % duration,
-
              "TITLE": document.meta.title,
              "PUBLISHER": document.meta.publisher[0],
              "YEAR": document.meta.created_at[:4],
              "MONTH": document.meta.created_at[5:7],
              "AUTHOR": document.meta.author.readable(),
-
              "NARRATOR": narrator,
          }
  
-        for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'):
-            zipf.write(
-                get_resource('res/daisy/' + fname),
-                directory + fname)
+        tree = etree.parse(get_resource('res/daisy/er_book_info.xml'))
+        cont = tree.getroot()[0]
+        for i, dur in enumerate(durations):
+            etree.SubElement(cont, 'smil', nr=str(i+1), Name="content%i.smil" % i, dur="%.1f" % dur)
+        zipf.writestr(
+            directory + 'er_book_info.xml',
+            etree.tostring(tree, xml_declaration=True))
  
-        for fname in ('er_book_info.xml', 'master.smil', 'ncc.html'):
-            with open(get_resource('res/daisy/' + fname)) as f:
-                tree = etree.parse(f)
-            populate(tree.getroot(), context)
-            zipf.writestr(
-                directory + fname,
-                etree.tostring(
-                    tree,
-                    xml_declaration=True
-                ),
-            )
+        tree = etree.parse(get_resource('res/daisy/master.smil'))
+        populate(tree.getroot(), context)
+        cont = tree.getroot()[-1]
+        for i, header in enumerate(headers):
+            etree.SubElement(cont, 'ref', title=header, src="content%d.smil#seq000001" % i, id="smil_%04d" % i)
+        zipf.writestr(
+            directory + 'master.smil',
+            etree.tostring(tree, xml_declaration=True))
  
-        with open(get_resource('res/daisy/content.smil')) as f:
-            tree = etree.parse(f)
+        tree = etree.parse(get_resource('res/daisy/ncc.html'))
          populate(tree.getroot(), context)
+        cont = tree.getroot()[-1]
+        for i, header in enumerate(headers):
+            if not i:
+                h1 = etree.SubElement(
+                    cont, 'h1', id='content', **{"class": "title"})
+                etree.SubElement(
+                    h1, "a", href='content%d.smil#par000001' % i).text = document.meta.title
+            else:
+                h2 = etree.SubElement(
+                    cont, 'h2', id='content', **{"class": "chapter"})
+                etree.SubElement(
+                    h2, "a", href='content%d.smil#par000001' % i).text = header
  
-        seq = tree.find('//seq')
-        for i, item in enumerate(sync):
-            par = etree.SubElement(seq, 'par', id="par%06d" % (i + 1), endsync="last")
-            etree.SubElement(
-                par,
-                "text",
-                src="book.html#%s" % item[2])
-
-            # If we have a split between mp3 parts, err on the larger side.
-            i = 0
-            start, end = item[0], item[1]
-            while start >= durations[i]:
-                start -= durations[i]
-                end -= durations[i]
-                i += 1
-            if 2 * (end - durations[i]) > end - start:
-                start = 0
-                end -= durations[i]
-                i += 1
-
-            audio = etree.SubElement(
-                par,
-                "audio",
-                src="book%d.mp3" % i,
-                **{
-                    "clip-begin": "npt=%.3fs" % start,
-                    "clip-end": "npt=%.3fs" % end,
-                },
-            )
-            
          zipf.writestr(
-            directory + 'content.smil',
-            etree.tostring(
-                tree,
-                xml_declaration=True,
-                pretty_print=True,
-            ),
-        )
-
-            
-# WHERE IS MP3
-        
+            directory + 'ncc.html',
+            etree.tostring(tree, xml_declaration=True))
+
          zipf.close()
          return OutputFile.from_filename(outfile.name)
diff --git a/src/librarian/builders/html.py b/src/librarian/builders/html.py

index 165a265..ee50cb8 100644 (file)
--- a/src/librarian/builders/html.py
+++ b/src/librarian/builders/html.py
@@ -36,18 +36,14 @@ class HtmlBuilder:
              'footnotes': self.footnotes,
              'nota_red': self.nota_red,
          }
-        self.current_cursors = [None]
+        self.current_cursors = [text]
  
      @property
      def cursor(self):
-        return self.cursors[self.current_cursors[-1]]
-
-    @cursor.setter
-    def cursor(self, value):
-        self.cursors[self.current_cursors[-1]] = value
+        return self.current_cursors[-1]
  
      def enter_fragment(self, fragment):
-        self.current_cursors.append(fragment)
+        self.current_cursors.append(self.cursors[fragment])
  
      def exit_fragment(self):
          self.current_cursors.pop()
@@ -63,7 +59,7 @@ class HtmlBuilder:
          document._compat_assign_ordered_ids()
          document._compat_assign_section_ids()
  
-    def build(self, document):
+    def build(self, document, **kwargs):
          self.preprocess(document)
          document.tree.getroot().html_build(self)
          self.postprocess(document)
@@ -110,19 +106,16 @@ class HtmlBuilder:
              self.tree.append(self.footnotes)
  
      def start_element(self, tag, attrib=None):
-        self.cursor = etree.SubElement(
+        self.current_cursors.append(etree.SubElement(
              self.cursor,
              tag,
              **(attrib or {})
-        )
+        ))
  
      def end_element(self):
-        self.cursor = self.cursor.getparent()
+        self.current_cursors.pop()
  
      def push_text(self, text):
-        if text == 'Między nami nic nie było':
-            print(self.cursors)
-            print(self.current_cursors)
          cursor = self.cursor
          if len(cursor):
              cursor[-1].tail = (cursor[-1].tail or '') + text
diff --git a/src/librarian/builders/sanitize.py b/src/librarian/builders/sanitize.py

index 4d7f7f9..9897482 100644 (file)
--- a/src/librarian/builders/sanitize.py
+++ b/src/librarian/builders/sanitize.py
@@ -6,7 +6,7 @@ class Sanitizer:
      identifier = 'sanitize'
      file_extension = 'xml2'
  
-    def build(self, document):
+    def build(self, document, **kwargs):
          doc = document.tree.getroot() # TODO: copy
          doc.sanitize()
          return OutputFile.from_bytes(
diff --git a/src/librarian/builders/txt.py b/src/librarian/builders/txt.py

index 2830e89..3f19346 100644 (file)
--- a/src/librarian/builders/txt.py
+++ b/src/librarian/builders/txt.py
@@ -88,7 +88,7 @@ class TxtBuilder:
      def push_legacy_margin(self, margin, where=None):
          self.current_fragments[-1].push_legacy_margin(margin)
          
-    def build(self, document, raw_text=False):
+    def build(self, document, raw_text=False, **kwargs):
          document.tree.getroot().txt_build(self)
          meta = document.meta
  
diff --git a/src/librarian/document.py b/src/librarian/document.py

index 8876294..1c8f223 100644 (file)
--- a/src/librarian/document.py
+++ b/src/librarian/document.py
@@ -4,7 +4,7 @@ import re
  from urllib.request import urlopen
  from lxml import etree
  from .parser import parser
-from . import dcparser
+from . import dcparser, DCNS
  from .functions import lang_code_3to2
  
  
@@ -14,7 +14,9 @@ class WLDocument:
          tree = etree.parse(source, parser=parser)
          self.tree = tree
          tree.getroot().document = self
-        self.base_meta = dcparser.BookInfo({}, {}, validate_required=False)
+        self.base_meta = dcparser.BookInfo({}, {
+            DCNS('language'): ["pol"],
+        }, validate_required=False)
  
      @property
      def meta(self):
diff --git a/src/librarian/elements/root/__init__.py b/src/librarian/elements/root/__init__.py

index 8e624bd..541895a 100644 (file)
--- a/src/librarian/elements/root/__init__.py
+++ b/src/librarian/elements/root/__init__.py
@@ -17,6 +17,8 @@ class Utwor(WLElement):
              # This should not generally happen.
              if self.getparent() is not None:
                  return self.getparent().meta
+        # Fallback
+        return self.document.base_meta
  
      @property
      def master(self):
diff --git a/src/librarian/res/daisy/content.smil b/src/librarian/res/daisy/content.smil

index ea9ff99..9f9350c 100644 (file)
--- a/src/librarian/res/daisy/content.smil
+++ b/src/librarian/res/daisy/content.smil
@@ -3,7 +3,7 @@
  <smil>
    <head>
      <meta name="dc:format" content="Daisy 2.02"/>
-    <meta name="ncc:totalElapsedTime" content="00:00:00.000"/>
+    <meta name="ncc:totalElapsedTime" content="{ELAPSED}"/>
      <layout>
        <region id="txt-view"/>
      </layout>
diff --git a/src/librarian/res/daisy/er_book_info.xml b/src/librarian/res/daisy/er_book_info.xml

index 0ea3ffd..35ca837 100644 (file)
--- a/src/librarian/res/daisy/er_book_info.xml
+++ b/src/librarian/res/daisy/er_book_info.xml
@@ -1,6 +1,4 @@
  <?xml version="1.0" encoding="utf-8"?>\r
  <book_info>\r
-  <smil_info>\r
-    <smil nr="1" Name="content.smil" dur="{Sd}"/>\r
-  </smil_info>\r
+  <smil_info></smil_info>\r
  </book_info>\r
diff --git a/src/librarian/res/daisy/master.smil b/src/librarian/res/daisy/master.smil

index 666fed9..0a66301 100644 (file)
--- a/src/librarian/res/daisy/master.smil
+++ b/src/librarian/res/daisy/master.smil
@@ -11,7 +11,5 @@
        <region id="txtView"/>\r
      </layout>\r
    </head>\r
-  <body>\r
-    <ref title="{TITLE}" src="content.smil#seq000001" id="smil_0001"/>\r
-  </body>\r
+  <body></body>\r
  </smil>\r
diff --git a/src/librarian/res/daisy/ncc.html b/src/librarian/res/daisy/ncc.html

index a9b20ec..1941d35 100644 (file)
--- a/src/librarian/res/daisy/ncc.html
+++ b/src/librarian/res/daisy/ncc.html
@@ -28,7 +28,5 @@
      <meta name="prod:last_used_id" content=""/>\r
      <meta http-equiv="Content-type" content="text/html; charset=utf-8"/>\r
    </head>\r
-  <body>\r
-    <h1 class="title" id="content"><a href="content.smil#par000001">{TITLE}</a></h1>\r
-  </body>\r
+  <body></body>\r
  </html>\r
diff --git a/src/librarian/util.py b/src/librarian/util.py

index 2c6b773..63e7996 100644 (file)
--- a/src/librarian/util.py
+++ b/src/librarian/util.py
@@ -137,5 +137,5 @@ def get_translation(language):
      return gettext.translation(
          'messages',
          localedir=os.path.join(os.path.dirname(__file__), 'locale'),
-        languages=[lang_code_3to2(language)],
+        languages=[lang_code_3to2(language), 'pl'],
      )
author	Radek Czajka <rczajka@rczajka.pl>
	Fri, 11 Dec 2020 12:13:44 +0000 (13:13 +0100)
committer	Radek Czajka <rczajka@rczajka.pl>
	Fri, 11 Dec 2020 12:13:44 +0000 (13:13 +0100)
src/librarian/builders/daisy.py		patch | blob | history
src/librarian/builders/html.py		patch | blob | history
src/librarian/builders/sanitize.py		patch | blob | history
src/librarian/builders/txt.py		patch | blob | history
src/librarian/document.py		patch | blob | history
src/librarian/elements/root/__init__.py		patch | blob | history
src/librarian/res/daisy/content.smil		patch | blob | history
src/librarian/res/daisy/er_book_info.xml		patch | blob | history
src/librarian/res/daisy/master.smil		patch | blob | history
src/librarian/res/daisy/ncc.html		patch | blob | history
src/librarian/util.py		patch | blob | history