Better handling of multipart DAISY.
author Radek Czajka <rczajka@rczajka.pl>
Fri, 11 Dec 2020 12:13:44 +0000 (13:13 +0100)
committer Radek Czajka <rczajka@rczajka.pl>
Fri, 11 Dec 2020 12:13:44 +0000 (13:13 +0100)
src/librarian/builders/daisy.py
src/librarian/builders/html.py
src/librarian/builders/sanitize.py
src/librarian/builders/txt.py
src/librarian/document.py
src/librarian/elements/root/__init__.py
src/librarian/res/daisy/content.smil
src/librarian/res/daisy/er_book_info.xml
src/librarian/res/daisy/master.smil
src/librarian/res/daisy/ncc.html
src/librarian/util.py

index b96226f..38e5338 100644 (file)
@@ -1,3 +1,4 @@
+from copy import deepcopy
 import subprocess
 import tempfile
 import zipfile
@@ -6,6 +7,7 @@ from aeneas.task import Task
 from lxml import etree
 import mutagen
 from librarian import OutputFile, get_resource
+from librarian.html import raw_printable_text
 from .html import DaisyHtmlBuilder
 
 
@@ -54,7 +56,7 @@ def populate(element, context):
 class DaisyBuilder:
     file_extension = 'daisy.zip'
 
-    def build(self, document, mp3, **kwargs):
+    def build(self, document, mp3, split_on=None, **kwargs):
         if not mp3:
             raise ValueError("Need MP3 files")
         
@@ -63,130 +65,181 @@ class DaisyBuilder:
 
         directory = document.meta.url.slug + '/'
 
-        html = DaisyHtmlBuilder().build(document)
-        zipf.write(
-            html.get_filename(),
-            directory + 'book.html',
-        )
+        if split_on:
+            documents = []
+            headers = []
+            present = True
+            n = 0
+            while present:
+                present = False
+                n += 1
+                newdoc = deepcopy(document)
+                newdoc.tree.getroot().document = newdoc
+
+                master = newdoc.tree.getroot()[-1]
+                i = 0
+                for item in list(master):
+                    if item.tag == split_on:
+                        # TODO: clear
+                        i += 1
+                        if i == n:
+                            headers.append(raw_printable_text(item))
+                    if i != n and not (n == 1 and not i):
+                        master.remove(item)
+                    else:
+                        present = True
+                if present:
+                    documents.append(newdoc)
+        else:
+            documents = [document]
+            headers = [document.meta.title]
+
+        assert len(documents) == len(mp3)
+
+        narrator = mutagen.File(mp3[0]).get('TPE1')
+        narrator = narrator.text[0] if narrator else ''
 
         durations = []
-        for i, mp3_file in enumerate(mp3):
-            durations.append(get_duration(mp3_file))
+        for i, part in enumerate(documents):
+            print('part', i)
+            html = DaisyHtmlBuilder().build(part)
             zipf.write(
-                mp3_file,
+                html.get_filename(),
+                directory + 'book%d.html' % i,
+            )
+
+            durations.append(get_duration(mp3[i]))
+            zipf.write(
+                mp3[i],
                 directory + "book%d.mp3" % i,
             )
-        duration = sum(durations)
 
-        config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab"
-        task = Task(config_string=config_string)
-
-        # TODO: concatenate all the
-        with tempfile.TemporaryDirectory() as temp:
-            with open(temp + "/book.mp3", "wb") as m:
-                for minput in mp3:
-                    with open(minput, "rb") as minputf:
-                        m.write(minputf.read())
-                
-            
-            syncfile = temp + "/sync"
-            task.audio_file_path_absolute = temp + "/book.mp3"
-            task.text_file_path_absolute = html.get_filename()
-            task.sync_map_file_path_absolute = syncfile
-
-            ExecuteTask(task).execute()
-            task.output_sync_map_file()
-            sync = []
-            with open(syncfile) as f:
-                for line in f:
-                    start, end, sec = line.strip().split('\t')
-                    start = float(start)
-                    end = float(end)
-                    sync.append([start, end, sec])
+            config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab"
+            task = Task(config_string=config_string)
 
-        hms = format_hms(duration)
+            with tempfile.TemporaryDirectory() as temp:
+                syncfile = temp + "/sync"
+                task.audio_file_path_absolute = mp3[i]
+                task.text_file_path_absolute = html.get_filename()
+                task.sync_map_file_path_absolute = syncfile
 
-        narrator = mutagen.File(mp3[0]).get('TPE1')
-        narrator = narrator.text[0] if narrator else ''
+                ExecuteTask(task).execute()
+                task.output_sync_map_file()
+
+                sync = []
+                with open(syncfile) as f:
+                    for line in f:
+                        start, end, sec = line.strip().split('\t')
+                        start = float(start)
+                        end = float(end)
+                        sync.append([start, end, sec])
+
+            hms = format_hms(durations[i])
+            elapsed_hms = format_hms(sum(durations[:i]))
+
+            context = {
+                "VERSION": "1.10",
+
+                "HHMMSSmmm": hms,
+                "HHMMSS": hms.split('.')[0],
+                "Sd": "%.1f" % durations[i],
+                "ELAPSED": elapsed_hms,
 
+                "TITLE": document.meta.title,
+                "PUBLISHER": document.meta.publisher[0],
+                "YEAR": document.meta.created_at[:4],
+                "MONTH": document.meta.created_at[5:7],
+                "AUTHOR": document.meta.author.readable(),
+
+                "NARRATOR": narrator,
+            }
+
+            with open(get_resource('res/daisy/content.smil')) as f:
+                tree = etree.parse(f)
+            populate(tree.getroot(), context)
+
+            seq = tree.find('//seq')
+            for si, item in enumerate(sync):
+                par = etree.SubElement(seq, 'par', id="par%06d" % (si + 1), endsync="last")
+                etree.SubElement(
+                    par,
+                    "text",
+                    src="book%d.html#%s" % (i, item[2]))
+
+                audio = etree.SubElement(
+                    par,
+                    "audio",
+                    src="book%d.mp3" % i,
+                    **{
+                        "clip-begin": "npt=%.3fs" % item[0],
+                        "clip-end": "npt=%.3fs" % item[1],
+                    },
+                )
+
+            zipf.writestr(
+                directory + 'content%d.smil' % i,
+                etree.tostring(
+                    tree,
+                    xml_declaration=True,
+                    pretty_print=True,
+                ),
+            )
+
+        for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'):
+            zipf.write(
+                get_resource('res/daisy/' + fname),
+                directory + fname)
+
+        duration = sum(durations)
+        hms = format_hms(duration)
         context = {
             "VERSION": "1.10",
-
             "HHMMSSmmm": hms,
             "HHMMSS": hms.split('.')[0],
             "Sd": "%.1f" % duration,
-
             "TITLE": document.meta.title,
             "PUBLISHER": document.meta.publisher[0],
             "YEAR": document.meta.created_at[:4],
             "MONTH": document.meta.created_at[5:7],
             "AUTHOR": document.meta.author.readable(),
-
             "NARRATOR": narrator,
         }
 
-        for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'):
-            zipf.write(
-                get_resource('res/daisy/' + fname),
-                directory + fname)
+        tree = etree.parse(get_resource('res/daisy/er_book_info.xml'))
+        cont = tree.getroot()[0]
+        for i, dur in enumerate(durations):
+            etree.SubElement(cont, 'smil', nr=str(i+1), Name="content%i.smil" % i, dur="%.1f" % dur)
+        zipf.writestr(
+            directory + 'er_book_info.xml',
+            etree.tostring(tree, xml_declaration=True))
 
-        for fname in ('er_book_info.xml', 'master.smil', 'ncc.html'):
-            with open(get_resource('res/daisy/' + fname)) as f:
-                tree = etree.parse(f)
-            populate(tree.getroot(), context)
-            zipf.writestr(
-                directory + fname,
-                etree.tostring(
-                    tree,
-                    xml_declaration=True
-                ),
-            )
+        tree = etree.parse(get_resource('res/daisy/master.smil'))
+        populate(tree.getroot(), context)
+        cont = tree.getroot()[-1]
+        for i, header in enumerate(headers):
+            etree.SubElement(cont, 'ref', title=header, src="content%d.smil#seq000001" % i, id="smil_%04d" % i)
+        zipf.writestr(
+            directory + 'master.smil',
+            etree.tostring(tree, xml_declaration=True))
 
-        with open(get_resource('res/daisy/content.smil')) as f:
-            tree = etree.parse(f)
+        tree = etree.parse(get_resource('res/daisy/ncc.html'))
         populate(tree.getroot(), context)
+        cont = tree.getroot()[-1]
+        for i, header in enumerate(headers):
+            if not i:
+                h1 = etree.SubElement(
+                    cont, 'h1', id='content', **{"class": "title"})
+                etree.SubElement(
+                    h1, "a", href='content%d.smil#par000001' % i).text = document.meta.title
+            else:
+                h2 = etree.SubElement(
+                    cont, 'h2', id='content', **{"class": "chapter"})
+                etree.SubElement(
+                    h2, "a", href='content%d.smil#par000001' % i).text = header
 
-        seq = tree.find('//seq')
-        for i, item in enumerate(sync):
-            par = etree.SubElement(seq, 'par', id="par%06d" % (i + 1), endsync="last")
-            etree.SubElement(
-                par,
-                "text",
-                src="book.html#%s" % item[2])
-
-            # If we have a split between mp3 parts, err on the larger side.
-            i = 0
-            start, end = item[0], item[1]
-            while start >= durations[i]:
-                start -= durations[i]
-                end -= durations[i]
-                i += 1
-            if 2 * (end - durations[i]) > end - start:
-                start = 0
-                end -= durations[i]
-                i += 1
-
-            audio = etree.SubElement(
-                par,
-                "audio",
-                src="book%d.mp3" % i,
-                **{
-                    "clip-begin": "npt=%.3fs" % start,
-                    "clip-end": "npt=%.3fs" % end,
-                },
-            )
-            
         zipf.writestr(
-            directory + 'content.smil',
-            etree.tostring(
-                tree,
-                xml_declaration=True,
-                pretty_print=True,
-            ),
-        )
-
-            
-# WHERE IS MP3
-        
+            directory + 'ncc.html',
+            etree.tostring(tree, xml_declaration=True))
+
         zipf.close()
         return OutputFile.from_filename(outfile.name)
index 165a265..ee50cb8 100644 (file)
@@ -36,18 +36,14 @@ class HtmlBuilder:
             'footnotes': self.footnotes,
             'nota_red': self.nota_red,
         }
-        self.current_cursors = [None]
+        self.current_cursors = [text]
 
     @property
     def cursor(self):
-        return self.cursors[self.current_cursors[-1]]
-
-    @cursor.setter
-    def cursor(self, value):
-        self.cursors[self.current_cursors[-1]] = value
+        return self.current_cursors[-1]
 
     def enter_fragment(self, fragment):
-        self.current_cursors.append(fragment)
+        self.current_cursors.append(self.cursors[fragment])
 
     def exit_fragment(self):
         self.current_cursors.pop()
@@ -63,7 +59,7 @@ class HtmlBuilder:
         document._compat_assign_ordered_ids()
         document._compat_assign_section_ids()
 
-    def build(self, document):
+    def build(self, document, **kwargs):
         self.preprocess(document)
         document.tree.getroot().html_build(self)
         self.postprocess(document)
@@ -110,19 +106,16 @@ class HtmlBuilder:
             self.tree.append(self.footnotes)
 
     def start_element(self, tag, attrib=None):
-        self.cursor = etree.SubElement(
+        self.current_cursors.append(etree.SubElement(
             self.cursor,
             tag,
             **(attrib or {})
-        )
+        ))
 
     def end_element(self):
-        self.cursor = self.cursor.getparent()
+        self.current_cursors.pop()
 
     def push_text(self, text):
-        if text == 'Między nami nic nie było':
-            print(self.cursors)
-            print(self.current_cursors)
         cursor = self.cursor
         if len(cursor):
             cursor[-1].tail = (cursor[-1].tail or '') + text
index 4d7f7f9..9897482 100644 (file)
@@ -6,7 +6,7 @@ class Sanitizer:
     identifier = 'sanitize'
     file_extension = 'xml2'
 
-    def build(self, document):
+    def build(self, document, **kwargs):
         doc = document.tree.getroot() # TODO: copy
         doc.sanitize()
         return OutputFile.from_bytes(
index 2830e89..3f19346 100644 (file)
@@ -88,7 +88,7 @@ class TxtBuilder:
     def push_legacy_margin(self, margin, where=None):
         self.current_fragments[-1].push_legacy_margin(margin)
         
-    def build(self, document, raw_text=False):
+    def build(self, document, raw_text=False, **kwargs):
         document.tree.getroot().txt_build(self)
         meta = document.meta
 
index 8876294..1c8f223 100644 (file)
@@ -4,7 +4,7 @@ import re
 from urllib.request import urlopen
 from lxml import etree
 from .parser import parser
-from . import dcparser
+from . import dcparser, DCNS
 from .functions import lang_code_3to2
 
 
@@ -14,7 +14,9 @@ class WLDocument:
         tree = etree.parse(source, parser=parser)
         self.tree = tree
         tree.getroot().document = self
-        self.base_meta = dcparser.BookInfo({}, {}, validate_required=False)
+        self.base_meta = dcparser.BookInfo({}, {
+            DCNS('language'): ["pol"],
+        }, validate_required=False)
 
     @property
     def meta(self):
index 8e624bd..541895a 100644 (file)
@@ -17,6 +17,8 @@ class Utwor(WLElement):
             # This should not generally happen.
             if self.getparent() is not None:
                 return self.getparent().meta
+        # Fallback
+        return self.document.base_meta
 
     @property
     def master(self):
index ea9ff99..9f9350c 100644 (file)
@@ -3,7 +3,7 @@
 <smil>
   <head>
     <meta name="dc:format" content="Daisy 2.02"/>
-    <meta name="ncc:totalElapsedTime" content="00:00:00.000"/>
+    <meta name="ncc:totalElapsedTime" content="{ELAPSED}"/>
     <layout>
       <region id="txt-view"/>
     </layout>
index 0ea3ffd..35ca837 100644 (file)
@@ -1,6 +1,4 @@
 <?xml version="1.0" encoding="utf-8"?>\r
 <book_info>\r
-  <smil_info>\r
-    <smil nr="1" Name="content.smil" dur="{Sd}"/>\r
-  </smil_info>\r
+  <smil_info></smil_info>\r
 </book_info>\r
index 666fed9..0a66301 100644 (file)
@@ -11,7 +11,5 @@
       <region id="txtView"/>\r
     </layout>\r
   </head>\r
-  <body>\r
-    <ref title="{TITLE}" src="content.smil#seq000001" id="smil_0001"/>\r
-  </body>\r
+  <body></body>\r
 </smil>\r
index a9b20ec..1941d35 100644 (file)
@@ -28,7 +28,5 @@
     <meta name="prod:last_used_id" content=""/>\r
     <meta http-equiv="Content-type" content="text/html; charset=utf-8"/>\r
   </head>\r
-  <body>\r
-    <h1 class="title" id="content"><a href="content.smil#par000001">{TITLE}</a></h1>\r
-  </body>\r
+  <body></body>\r
 </html>\r
index 2c6b773..63e7996 100644 (file)
@@ -137,5 +137,5 @@ def get_translation(language):
     return gettext.translation(
         'messages',
         localedir=os.path.join(os.path.dirname(__file__), 'locale'),
-        languages=[lang_code_3to2(language)],
+        languages=[lang_code_3to2(language), 'pl'],
     )