b96226f16c3b2d3215795b063cc0f85e5d9b79fa
[librarian.git] / src / librarian / builders / daisy.py
1 import subprocess
2 import tempfile
3 import zipfile
4 from aeneas.executetask import ExecuteTask
5 from aeneas.task import Task
6 from lxml import etree
7 import mutagen
8 from librarian import OutputFile, get_resource
9 from .html import DaisyHtmlBuilder
10
11
def get_duration(path):
    """Return the duration of the media file at ``path`` as a float.

    The value is whatever ``ffprobe`` prints for ``format=duration`` when
    asked for bare CSV output (``csv=p=0``) with logging silenced.

    Raises:
        subprocess.CalledProcessError: if ``ffprobe`` exits with a non-zero
            status (the call uses ``check=True``).
        ValueError: if the captured output cannot be parsed as a float.
    """
    command = [
        "ffprobe",
        "-i",
        path,
        "-show_entries",
        "format=duration",
        "-v",
        "quiet",
        "-of",
        "csv=p=0",
    ]
    probe = subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=True,
    )
    # float() tolerates the trailing newline ffprobe leaves on stdout.
    return float(probe.stdout)
31
32
def format_hms(t):
    """Format a non-negative duration in seconds as ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds first and then split with
    integer arithmetic, so every field is zero-padded and a value such as
    59.9996 carries over to ``00:01:00.000`` instead of printing 60 seconds.

    Args:
        t: duration in seconds (int or float), expected to be >= 0.

    Returns:
        The formatted string; hours grow beyond two digits if needed.
    """
    total_ms = int(round(t * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)
41
42
def populate(element, context):
    """Expand ``str.format`` placeholders across an element subtree, in place.

    The element's text, its tail and each of its attribute values are
    passed through ``str.format(**context)``; the same is then done for
    every descendant. Empty or missing text/tail is left untouched.

    Args:
        element: tree node exposing ``text``, ``tail``, ``attrib`` and
            iteration over its children.
        context: mapping of placeholder names to replacement values.
    """
    def fill(template):
        return template.format(**context)

    for slot in ("text", "tail"):
        template = getattr(element, slot)
        if template:
            setattr(element, slot, fill(template))

    attributes = element.attrib
    for name, template in attributes.items():
        attributes[name] = fill(template)

    for descendant in element:
        populate(descendant, context)
52
53
class DaisyBuilder:
    """Package a document and its MP3 recording as a zipped DAISY book.

    Inside the archive, under a directory named after the document's URL
    slug, the builder stores the HTML text (``book.html``), the audio parts
    (``book<N>.mp3``), the DTD/entity resources, the templated metadata
    files and a ``content.smil`` that maps every aligned text fragment to
    an audio clip.
    """

    file_extension = 'daisy.zip'

    # Resources copied into the package unchanged.
    STATIC_RESOURCES = (
        'smil10.dtd',
        'xhtml1-transitional.dtd',
        'xhtml-lat1.ent',
        'xhtml-special.ent',
        'xhtml-symbol.ent',
    )

    # XML templates whose ``{PLACEHOLDER}`` fields are filled per book.
    TEMPLATED_RESOURCES = ('er_book_info.xml', 'master.smil', 'ncc.html')

    # aeneas task configuration; raw string so the regex ``\d`` is not
    # treated as a (invalid) string escape.
    ALIGNER_CONFIG = r"task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab"

    def build(self, document, mp3, **kwargs):
        """Build the DAISY archive and return it as an ``OutputFile``.

        Args:
            document: the parsed document; its ``meta`` supplies the slug,
                title, publisher, creation date and author.
            mp3: non-empty sequence of paths to the MP3 parts, in reading
                order.
            **kwargs: accepted and ignored.

        Returns:
            ``OutputFile`` wrapping the temporary ZIP file.

        Raises:
            ValueError: if ``mp3`` is empty.
        """
        if not mp3:
            raise ValueError("Need MP3 files")

        import os

        # delete=False: the archive must outlive this call, the returned
        # OutputFile refers to it by name.
        outfile = tempfile.NamedTemporaryFile(delete=False)
        try:
            # Leaving the block finalizes the zip first, then closes the
            # underlying temporary file handle.
            with outfile, zipfile.ZipFile(outfile, 'w') as zipf:
                self._write_package(zipf, document, mp3)
        except BaseException:
            # Do not leave a half-written archive behind on failure.
            os.unlink(outfile.name)
            raise
        return OutputFile.from_filename(outfile.name)

    def _write_package(self, zipf, document, mp3):
        """Write every file of the DAISY package into the open ``zipf``."""
        directory = document.meta.url.slug + '/'

        html = DaisyHtmlBuilder().build(document)
        zipf.write(html.get_filename(), directory + 'book.html')

        durations = []
        for index, mp3_file in enumerate(mp3):
            durations.append(get_duration(mp3_file))
            zipf.write(mp3_file, directory + "book%d.mp3" % index)

        sync = self._align(html.get_filename(), mp3)
        context = self._template_context(document, mp3, sum(durations))

        for fname in self.STATIC_RESOURCES:
            zipf.write(
                get_resource('res/daisy/' + fname),
                directory + fname)

        for fname in self.TEMPLATED_RESOURCES:
            tree = self._load_template(fname, context)
            zipf.writestr(
                directory + fname,
                etree.tostring(tree, xml_declaration=True),
            )

        tree = self._load_template('content.smil', context)
        # Relative form of the original '//seq' query: lxml rewrites the
        # absolute form to exactly this and emits a FutureWarning.
        seq = tree.find('.//seq')
        for number, (start, end, sec) in enumerate(sync, 1):
            par = etree.SubElement(
                seq, 'par', id="par%06d" % number, endsync="last")
            etree.SubElement(par, "text", src="book.html#%s" % sec)

            part, clip_begin, clip_end = self._locate_clip(
                start, end, durations)
            etree.SubElement(
                par,
                "audio",
                src="book%d.mp3" % part,
                **{
                    "clip-begin": "npt=%.3fs" % clip_begin,
                    "clip-end": "npt=%.3fs" % clip_end,
                }
            )

        zipf.writestr(
            directory + 'content.smil',
            etree.tostring(
                tree,
                xml_declaration=True,
                pretty_print=True,
            ),
        )

    def _align(self, text_path, mp3):
        """Run aeneas forced alignment; return ``(start, end, id)`` tuples.

        ``start`` and ``end`` are floats on the timeline of all MP3 parts
        joined together; ``id`` is the fragment identifier from the text.
        """
        import shutil

        task = Task(config_string=self.ALIGNER_CONFIG)

        with tempfile.TemporaryDirectory() as temp:
            audio_path = temp + "/book.mp3"
            # TODO: this is a raw byte-level concatenation of the MP3
            # parts, not a proper audio join.
            with open(audio_path, "wb") as joined:
                for part in mp3:
                    with open(part, "rb") as source:
                        # Stream instead of loading each part into memory.
                        shutil.copyfileobj(source, joined)

            syncfile = temp + "/sync"
            task.audio_file_path_absolute = audio_path
            task.text_file_path_absolute = text_path
            task.sync_map_file_path_absolute = syncfile

            ExecuteTask(task).execute()
            task.output_sync_map_file()

            sync = []
            with open(syncfile) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines in the sync map.
                        continue
                    start, end, sec = line.split('\t')
                    sync.append((float(start), float(end), sec))
        return sync

    @staticmethod
    def _template_context(document, mp3, duration):
        """Return the placeholder values used to fill the XML templates."""
        hms = format_hms(duration)

        # mutagen.File returns None when it cannot recognize the file.
        audio_meta = mutagen.File(mp3[0])
        narrator = audio_meta.get('TPE1') if audio_meta is not None else None
        narrator = narrator.text[0] if narrator else ''

        return {
            "VERSION": "1.10",

            "HHMMSSmmm": hms,
            "HHMMSS": hms.split('.')[0],
            "Sd": "%.1f" % duration,

            "TITLE": document.meta.title,
            "PUBLISHER": document.meta.publisher[0],
            "YEAR": document.meta.created_at[:4],
            "MONTH": document.meta.created_at[5:7],
            "AUTHOR": document.meta.author.readable(),

            "NARRATOR": narrator,
        }

    @staticmethod
    def _load_template(fname, context):
        """Parse a ``res/daisy`` XML template and fill its placeholders."""
        # Binary mode lets the XML parser pick the encoding itself rather
        # than depending on the process locale.
        with open(get_resource('res/daisy/' + fname), 'rb') as f:
            tree = etree.parse(f)
        populate(tree.getroot(), context)
        return tree

    @staticmethod
    def _locate_clip(start, end, durations):
        """Map a span on the joined timeline onto a single MP3 part.

        Args:
            start, end: span boundaries on the timeline of all parts
                joined together.
            durations: duration of each part, in order (non-empty).

        Returns:
            ``(part_index, start, end)`` with the times relative to the
            beginning of the chosen part. A span straddling two parts goes
            to the part holding the larger share of it. The index never
            exceeds the last part, even when the span lies at or beyond
            the summed durations.
        """
        last = len(durations) - 1
        part = 0
        while part < last and start >= durations[part]:
            start -= durations[part]
            end -= durations[part]
            part += 1
        # If we have a split between mp3 parts, err on the larger side.
        if part < last and 2 * (end - durations[part]) > end - start:
            start = 0
            end -= durations[part]
            part += 1
        return part, start, end