Better handling of multipart DAISY.
[librarian.git] / src / librarian / builders / daisy.py
index b96226f..38e5338 100644 (file)
@@ -1,3 +1,4 @@
+from copy import deepcopy
 import subprocess
 import tempfile
 import zipfile
@@ -6,6 +7,7 @@ from aeneas.task import Task
 from lxml import etree
 import mutagen
 from librarian import OutputFile, get_resource
+from librarian.html import raw_printable_text
 from .html import DaisyHtmlBuilder
 
 
@@ -54,7 +56,7 @@ def populate(element, context):
 class DaisyBuilder:
     file_extension = 'daisy.zip'
 
-    def build(self, document, mp3, **kwargs):
+    def build(self, document, mp3, split_on=None, **kwargs):
         if not mp3:
             raise ValueError("Need MP3 files")
         
@@ -63,130 +65,181 @@ class DaisyBuilder:
 
         directory = document.meta.url.slug + '/'
 
-        html = DaisyHtmlBuilder().build(document)
-        zipf.write(
-            html.get_filename(),
-            directory + 'book.html',
-        )
+        if split_on:
+            documents = []
+            headers = []
+            present = True
+            n = 0
+            while present:
+                present = False
+                n += 1
+                newdoc = deepcopy(document)
+                newdoc.tree.getroot().document = newdoc
+
+                master = newdoc.tree.getroot()[-1]
+                i = 0
+                for item in list(master):
+                    if item.tag == split_on:
+                        # TODO: clear
+                        i += 1
+                        if i == n:
+                            headers.append(raw_printable_text(item))
+                    if i != n and not (n == 1 and not i):
+                        master.remove(item)
+                    else:
+                        present = True
+                if present:
+                    documents.append(newdoc)
+        else:
+            documents = [document]
+            headers = [document.meta.title]
+
+        assert len(documents) == len(mp3)
+
+        narrator = mutagen.File(mp3[0]).get('TPE1')
+        narrator = narrator.text[0] if narrator else ''
 
         durations = []
-        for i, mp3_file in enumerate(mp3):
-            durations.append(get_duration(mp3_file))
+        for i, part in enumerate(documents):
+            print('part', i)
+            html = DaisyHtmlBuilder().build(part)
             zipf.write(
-                mp3_file,
+                html.get_filename(),
+                directory + 'book%d.html' % i,
+            )
+
+            durations.append(get_duration(mp3[i]))
+            zipf.write(
+                mp3[i],
                 directory + "book%d.mp3" % i,
             )
-        duration = sum(durations)
 
-        config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab"
-        task = Task(config_string=config_string)
-
-        # TODO: concatenate all the
-        with tempfile.TemporaryDirectory() as temp:
-            with open(temp + "/book.mp3", "wb") as m:
-                for minput in mp3:
-                    with open(minput, "rb") as minputf:
-                        m.write(minputf.read())
-                
-            
-            syncfile = temp + "/sync"
-            task.audio_file_path_absolute = temp + "/book.mp3"
-            task.text_file_path_absolute = html.get_filename()
-            task.sync_map_file_path_absolute = syncfile
-
-            ExecuteTask(task).execute()
-            task.output_sync_map_file()
-            sync = []
-            with open(syncfile) as f:
-                for line in f:
-                    start, end, sec = line.strip().split('\t')
-                    start = float(start)
-                    end = float(end)
-                    sync.append([start, end, sec])
+            config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab"
+            task = Task(config_string=config_string)
 
-        hms = format_hms(duration)
+            with tempfile.TemporaryDirectory() as temp:
+                syncfile = temp + "/sync"
+                task.audio_file_path_absolute = mp3[i]
+                task.text_file_path_absolute = html.get_filename()
+                task.sync_map_file_path_absolute = syncfile
 
-        narrator = mutagen.File(mp3[0]).get('TPE1')
-        narrator = narrator.text[0] if narrator else ''
+                ExecuteTask(task).execute()
+                task.output_sync_map_file()
+
+                sync = []
+                with open(syncfile) as f:
+                    for line in f:
+                        start, end, sec = line.strip().split('\t')
+                        start = float(start)
+                        end = float(end)
+                        sync.append([start, end, sec])
+
+            hms = format_hms(durations[i])
+            elapsed_hms = format_hms(sum(durations[:i]))
+
+            context = {
+                "VERSION": "1.10",
+
+                "HHMMSSmmm": hms,
+                "HHMMSS": hms.split('.')[0],
+                "Sd": "%.1f" % durations[i],
+                "ELAPSED": elapsed_hms,
 
+                "TITLE": document.meta.title,
+                "PUBLISHER": document.meta.publisher[0],
+                "YEAR": document.meta.created_at[:4],
+                "MONTH": document.meta.created_at[5:7],
+                "AUTHOR": document.meta.author.readable(),
+
+                "NARRATOR": narrator,
+            }
+
+            with open(get_resource('res/daisy/content.smil')) as f:
+                tree = etree.parse(f)
+            populate(tree.getroot(), context)
+
+            seq = tree.find('//seq')
+            for si, item in enumerate(sync):
+                par = etree.SubElement(seq, 'par', id="par%06d" % (si + 1), endsync="last")
+                etree.SubElement(
+                    par,
+                    "text",
+                    src="book%d.html#%s" % (i, item[2]))
+
+                audio = etree.SubElement(
+                    par,
+                    "audio",
+                    src="book%d.mp3" % i,
+                    **{
+                        "clip-begin": "npt=%.3fs" % item[0],
+                        "clip-end": "npt=%.3fs" % item[1],
+                    },
+                )
+
+            zipf.writestr(
+                directory + 'content%d.smil' % i,
+                etree.tostring(
+                    tree,
+                    xml_declaration=True,
+                    pretty_print=True,
+                ),
+            )
+
+        for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'):
+            zipf.write(
+                get_resource('res/daisy/' + fname),
+                directory + fname)
+
+        duration = sum(durations)
+        hms = format_hms(duration)
         context = {
             "VERSION": "1.10",
-
             "HHMMSSmmm": hms,
             "HHMMSS": hms.split('.')[0],
             "Sd": "%.1f" % duration,
-
             "TITLE": document.meta.title,
             "PUBLISHER": document.meta.publisher[0],
             "YEAR": document.meta.created_at[:4],
             "MONTH": document.meta.created_at[5:7],
             "AUTHOR": document.meta.author.readable(),
-
             "NARRATOR": narrator,
         }
 
-        for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'):
-            zipf.write(
-                get_resource('res/daisy/' + fname),
-                directory + fname)
+        tree = etree.parse(get_resource('res/daisy/er_book_info.xml'))
+        cont = tree.getroot()[0]
+        for i, dur in enumerate(durations):
+            etree.SubElement(cont, 'smil', nr=str(i+1), Name="content%i.smil" % i, dur="%.1f" % dur)
+        zipf.writestr(
+            directory + 'er_book_info.xml',
+            etree.tostring(tree, xml_declaration=True))
 
-        for fname in ('er_book_info.xml', 'master.smil', 'ncc.html'):
-            with open(get_resource('res/daisy/' + fname)) as f:
-                tree = etree.parse(f)
-            populate(tree.getroot(), context)
-            zipf.writestr(
-                directory + fname,
-                etree.tostring(
-                    tree,
-                    xml_declaration=True
-                ),
-            )
+        tree = etree.parse(get_resource('res/daisy/master.smil'))
+        populate(tree.getroot(), context)
+        cont = tree.getroot()[-1]
+        for i, header in enumerate(headers):
+            etree.SubElement(cont, 'ref', title=header, src="content%d.smil#seq000001" % i, id="smil_%04d" % i)
+        zipf.writestr(
+            directory + 'master.smil',
+            etree.tostring(tree, xml_declaration=True))
 
-        with open(get_resource('res/daisy/content.smil')) as f:
-            tree = etree.parse(f)
+        tree = etree.parse(get_resource('res/daisy/ncc.html'))
         populate(tree.getroot(), context)
+        cont = tree.getroot()[-1]
+        for i, header in enumerate(headers):
+            if not i:
+                h1 = etree.SubElement(
+                    cont, 'h1', id='content', **{"class": "title"})
+                etree.SubElement(
+                    h1, "a", href='content%d.smil#par000001' % i).text = document.meta.title
+            else:
+                h2 = etree.SubElement(
+                    cont, 'h2', id='content', **{"class": "chapter"})
+                etree.SubElement(
+                    h2, "a", href='content%d.smil#par000001' % i).text = header
 
-        seq = tree.find('//seq')
-        for i, item in enumerate(sync):
-            par = etree.SubElement(seq, 'par', id="par%06d" % (i + 1), endsync="last")
-            etree.SubElement(
-                par,
-                "text",
-                src="book.html#%s" % item[2])
-
-            # If we have a split between mp3 parts, err on the larger side.
-            i = 0
-            start, end = item[0], item[1]
-            while start >= durations[i]:
-                start -= durations[i]
-                end -= durations[i]
-                i += 1
-            if 2 * (end - durations[i]) > end - start:
-                start = 0
-                end -= durations[i]
-                i += 1
-
-            audio = etree.SubElement(
-                par,
-                "audio",
-                src="book%d.mp3" % i,
-                **{
-                    "clip-begin": "npt=%.3fs" % start,
-                    "clip-end": "npt=%.3fs" % end,
-                },
-            )
-            
         zipf.writestr(
-            directory + 'content.smil',
-            etree.tostring(
-                tree,
-                xml_declaration=True,
-                pretty_print=True,
-            ),
-        )
-
-            
-# WHERE IS MP3
-        
+            directory + 'ncc.html',
+            etree.tostring(tree, xml_declaration=True))
+
         zipf.close()
         return OutputFile.from_filename(outfile.name)