src/librarian/builders/daisy.py

   1 import subprocess
   2 import tempfile
   3 import zipfile
   4 from aeneas.executetask import ExecuteTask
   5 from aeneas.task import Task
   6 from lxml import etree
   7 import mutagen
   8 from librarian import OutputFile, get_resource
   9 from .html import DaisyHtmlBuilder
  10
  11
  12 def get_duration(path):
  13     return float(
  14         subprocess.run(
  15             [
  16                 "ffprobe",
  17                 "-i",
  18                 path,
  19                 "-show_entries",
  20                 "format=duration",
  21                 "-v",
  22                 "quiet",
  23                 "-of",
  24                 "csv=p=0",
  25             ],
  26             capture_output=True,
  27             text=True,
  28             check=True,
  29         ).stdout
  30     )
  31
  32
  33 def format_hms(t):
  34     seconds = t % 60
  35     t -= seconds
  36     t /= 60
  37     minutes = t % 60
  38     t -= minutes
  39     hours = t / 60
  40     return "%02d:%02d:%02.3f" % (hours, minutes, seconds)
  41
  42
  43 def populate(element, context):
  44     if element.text:
  45         element.text = element.text.format(**context)
  46     if element.tail:
  47         element.tail = element.tail.format(**context)
  48     for k, v in element.attrib.items():
  49         element.attrib[k] = v.format(**context)
  50     for child in element:
  51         populate(child, context)
  52
  53
  54 class DaisyBuilder:
  55     file_extension = 'daisy.zip'
  56
  57     def build(self, document, mp3, **kwargs):
  58         if not mp3:
  59             raise ValueError("Need MP3 files")
  60
  61         outfile = tempfile.NamedTemporaryFile(delete=False)
  62         zipf = zipfile.ZipFile(outfile, 'w')
  63
  64         directory = document.meta.url.slug + '/'
  65
  66         html = DaisyHtmlBuilder().build(document)
  67         zipf.write(
  68             html.get_filename(),
  69             directory + 'book.html',
  70         )
  71
  72         durations = []
  73         for i, mp3_file in enumerate(mp3):
  74             durations.append(get_duration(mp3_file))
  75             zipf.write(
  76                 mp3_file,
  77                 directory + "book%d.mp3" % i,
  78             )
  79         duration = sum(durations)
  80
  81         config_string = "task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab"
  82         task = Task(config_string=config_string)
  83
  84         # TODO: concatenate all the
  85         with tempfile.TemporaryDirectory() as temp:
  86             with open(temp + "/book.mp3", "wb") as m:
  87                 for minput in mp3:
  88                     with open(minput, "rb") as minputf:
  89                         m.write(minputf.read())
  90
  91
  92             syncfile = temp + "/sync"
  93             task.audio_file_path_absolute = temp + "/book.mp3"
  94             task.text_file_path_absolute = html.get_filename()
  95             task.sync_map_file_path_absolute = syncfile
  96
  97             ExecuteTask(task).execute()
  98             task.output_sync_map_file()
  99             sync = []
 100             with open(syncfile) as f:
 101                 for line in f:
 102                     start, end, sec = line.strip().split('\t')
 103                     start = float(start)
 104                     end = float(end)
 105                     sync.append([start, end, sec])
 106
 107         hms = format_hms(duration)
 108
 109         narrator = mutagen.File(mp3[0]).get('TPE1')
 110         narrator = narrator.text[0] if narrator else ''
 111
 112         context = {
 113             "VERSION": "1.10",
 114
 115             "HHMMSSmmm": hms,
 116             "HHMMSS": hms.split('.')[0],
 117             "Sd": "%.1f" % duration,
 118
 119             "TITLE": document.meta.title,
 120             "PUBLISHER": document.meta.publisher[0],
 121             "YEAR": document.meta.created_at[:4],
 122             "MONTH": document.meta.created_at[5:7],
 123             "AUTHOR": document.meta.author.readable(),
 124
 125             "NARRATOR": narrator,
 126         }
 127
 128         for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'):
 129             zipf.write(
 130                 get_resource('res/daisy/' + fname),
 131                 directory + fname)
 132
 133         for fname in ('er_book_info.xml', 'master.smil', 'ncc.html'):
 134             with open(get_resource('res/daisy/' + fname)) as f:
 135                 tree = etree.parse(f)
 136             populate(tree.getroot(), context)
 137             zipf.writestr(
 138                 directory + fname,
 139                 etree.tostring(
 140                     tree,
 141                     xml_declaration=True
 142                 ),
 143             )
 144
 145         with open(get_resource('res/daisy/content.smil')) as f:
 146             tree = etree.parse(f)
 147         populate(tree.getroot(), context)
 148
 149         seq = tree.find('//seq')
 150         for i, item in enumerate(sync):
 151             par = etree.SubElement(seq, 'par', id="par%06d" % (i + 1), endsync="last")
 152             etree.SubElement(
 153                 par,
 154                 "text",
 155                 src="book.html#%s" % item[2])
 156
 157             # If we have a split between mp3 parts, err on the larger side.
 158             i = 0
 159             start, end = item[0], item[1]
 160             while start >= durations[i]:
 161                 start -= durations[i]
 162                 end -= durations[i]
 163                 i += 1
 164             if 2 * (end - durations[i]) > end - start:
 165                 start = 0
 166                 end -= durations[i]
 167                 i += 1
 168
 169             audio = etree.SubElement(
 170                 par,
 171                 "audio",
 172                 src="book%d.mp3" % i,
 173                 **{
 174                     "clip-begin": "npt=%.3fs" % start,
 175                     "clip-end": "npt=%.3fs" % end,
 176                 },
 177             )
 178
 179         zipf.writestr(
 180             directory + 'content.smil',
 181             etree.tostring(
 182                 tree,
 183                 xml_declaration=True,
 184                 pretty_print=True,
 185             ),
 186         )
 187
 188
 189 # WHERE IS MP3
 190
 191         zipf.close()
 192         return OutputFile.from_filename(outfile.name)