Better handling of multipart DAISY.
[librarian.git] / src / librarian / builders / daisy.py
1 from copy import deepcopy
2 import subprocess
3 import tempfile
4 import zipfile
5 from aeneas.executetask import ExecuteTask
6 from aeneas.task import Task
7 from lxml import etree
8 import mutagen
9 from librarian import OutputFile, get_resource
10 from librarian.html import raw_printable_text
11 from .html import DaisyHtmlBuilder
12
13
def get_duration(path):
    """Return the duration of the media file at *path*, in seconds.

    Runs ``ffprobe`` and converts what it prints for ``format=duration``
    (bare CSV, no section prefix) to a float.

    Raises:
        subprocess.CalledProcessError: ``ffprobe`` exited with a non-zero
            status (``check=True``).
        ValueError: the captured output is not parseable as a float.
    """
    command = [
        "ffprobe",
        "-i", path,
        "-show_entries", "format=duration",
        "-v", "quiet",
        "-of", "csv=p=0",
    ]
    probe = subprocess.run(command, capture_output=True, text=True, check=True)
    return float(probe.stdout)
33
34
def format_hms(t):
    """Format a duration given in seconds as ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds first and then split with
    integer arithmetic, so every field is zero-padded to a fixed width and
    a value such as 59.9996 carries over to ``00:01:00.000`` instead of
    being rendered with ``60.000`` in the seconds field.

    Args:
        t: non-negative duration in seconds (int or float).

    Returns:
        The formatted string; the part before the ``.`` is ``HH:MM:SS``.
    """
    total_ms = int(round(t * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)
43
44
def populate(element, context):
    """Fill ``str.format`` placeholders throughout an element subtree.

    For *element* and every node below it, the text, the tail and each
    attribute value are replaced in place by the result of
    ``value.format(**context)``. Empty or missing text/tail is left alone.

    Args:
        element: root of the subtree to rewrite (modified in place).
        context: mapping of placeholder names to substitution values.
    """
    for node in element.iter():
        for field in ("text", "tail"):
            template = getattr(node, field)
            if template:
                setattr(node, field, template.format(**context))
        for name, template in node.attrib.items():
            node.attrib[name] = template.format(**context)
54
55
class DaisyBuilder:
    """Build a DAISY package (a ZIP archive) from a document and MP3 files.

    For every document part the archive receives ``book<i>.html``,
    ``book<i>.mp3`` and ``content<i>.smil``; on top of that come the DTD and
    entity resources, ``er_book_info.xml``, ``master.smil`` and ``ncc.html``.
    Everything is stored under a directory named after the document's URL
    slug.
    """
    file_extension = 'daisy.zip'

    # aeneas task configuration. Raw string: ``\d`` belongs to the regex
    # handed to aeneas and must not be read as a Python string escape.
    _ALIGNER_CONFIG = (
        r"task_language=pol|is_text_type=unparsed"
        r"|is_text_unparsed_id_regex=sec\d+$"
        r"|is_text_unparsed_id_sort=numeric|os_task_file_format=tab"
    )

    def build(self, document, mp3, split_on=None, **kwargs):
        """Create the DAISY archive and return it as an ``OutputFile``.

        Args:
            document: source document; ``document.meta`` supplies the title,
                publisher, creation date, author and URL slug.
            mp3: sequence of MP3 file paths, one per document part, in order.
            split_on: tag name of the top-level elements (children of the
                last child of the document root) that start a new part.
                When falsy, the whole document is a single part.
            **kwargs: accepted and ignored.

        Returns:
            ``OutputFile.from_filename(...)`` for the temporary ZIP file.

        Raises:
            ValueError: if ``mp3`` is empty, or if the number of MP3 files
                differs from the number of document parts.
        """
        if not mp3:
            raise ValueError("Need MP3 files")

        if split_on:
            documents, headers = self._split_document(document, split_on)
        else:
            documents = [document]
            headers = [document.meta.title]

        # Validate before any file is created. A real exception is used
        # because ``assert`` statements are removed under ``python -O``.
        if len(documents) != len(mp3):
            raise ValueError(
                "Expected %d MP3 file(s), one per document part, got %d"
                % (len(documents), len(mp3))
            )

        directory = document.meta.url.slug + '/'

        # mutagen.File() returns None when it cannot identify the file type.
        tags = mutagen.File(mp3[0])
        narrator = tags.get('TPE1') if tags is not None else None
        narrator = narrator.text[0] if narrator else ''

        durations = []
        # Context managers make sure the temporary file handle and the
        # archive are closed even if building fails half-way.
        with tempfile.NamedTemporaryFile(delete=False) as outfile, \
                zipfile.ZipFile(outfile, 'w') as zipf:
            for i, (part, audio_path) in enumerate(zip(documents, mp3)):
                print('part', i)
                html = DaisyHtmlBuilder().build(part)
                zipf.write(
                    html.get_filename(),
                    directory + 'book%d.html' % i,
                )

                # Total play time of all the parts that precede this one.
                elapsed = sum(durations)
                duration = get_duration(audio_path)
                durations.append(duration)
                zipf.write(
                    audio_path,
                    directory + "book%d.mp3" % i,
                )

                sync = self._align(audio_path, html.get_filename())

                context = self._base_context(document, narrator, duration)
                context["ELAPSED"] = format_hms(elapsed)
                zipf.writestr(
                    directory + 'content%d.smil' % i,
                    self._content_smil(i, sync, context),
                )

            for fname in ('smil10.dtd', 'xhtml1-transitional.dtd',
                          'xhtml-lat1.ent', 'xhtml-special.ent',
                          'xhtml-symbol.ent'):
                zipf.write(
                    get_resource('res/daisy/' + fname),
                    directory + fname)

            context = self._base_context(document, narrator, sum(durations))

            # er_book_info.xml: one <smil> entry per part, numbered from 1.
            tree = etree.parse(get_resource('res/daisy/er_book_info.xml'))
            cont = tree.getroot()[0]
            for i, dur in enumerate(durations):
                etree.SubElement(
                    cont, 'smil',
                    nr=str(i + 1),
                    Name="content%i.smil" % i,
                    dur="%.1f" % dur)
            zipf.writestr(
                directory + 'er_book_info.xml',
                etree.tostring(tree, xml_declaration=True))

            # master.smil: one <ref> per part, appended to the last child of
            # the template root.
            tree = etree.parse(get_resource('res/daisy/master.smil'))
            populate(tree.getroot(), context)
            cont = tree.getroot()[-1]
            for i, header in enumerate(headers):
                etree.SubElement(
                    cont, 'ref',
                    title=header,
                    src="content%d.smil#seq000001" % i,
                    id="smil_%04d" % i)
            zipf.writestr(
                directory + 'master.smil',
                etree.tostring(tree, xml_declaration=True))

            # ncc.html: the first part is listed as <h1> carrying the
            # document title, every further part as <h2> with its header.
            # NOTE(review): all headings get id='content', so ids repeat
            # when there is more than one part - confirm whether unique ids
            # are required here.
            tree = etree.parse(get_resource('res/daisy/ncc.html'))
            populate(tree.getroot(), context)
            cont = tree.getroot()[-1]
            for i, header in enumerate(headers):
                if not i:
                    h1 = etree.SubElement(
                        cont, 'h1', id='content', **{"class": "title"})
                    etree.SubElement(
                        h1, "a", href='content%d.smil#par000001' % i
                    ).text = document.meta.title
                else:
                    h2 = etree.SubElement(
                        cont, 'h2', id='content', **{"class": "chapter"})
                    etree.SubElement(
                        h2, "a", href='content%d.smil#par000001' % i
                    ).text = header

            zipf.writestr(
                directory + 'ncc.html',
                etree.tostring(tree, xml_declaration=True))

        return OutputFile.from_filename(outfile.name)

    @staticmethod
    def _split_document(document, split_on):
        """Split *document* into parts, each starting at a *split_on* element.

        Pass ``n`` works on a deep copy of the document and removes every
        child of the last root child that does not belong to part ``n``.
        Anything that precedes the first *split_on* element stays in part 1.
        Passes continue until one of them keeps nothing.

        Returns:
            ``(documents, headers)`` of equal length. A header is the
            ``raw_printable_text`` of the part's *split_on* element; when the
            document has no such element at all, the single part gets the
            document title instead.
        """
        documents = []
        headers = []
        present = True
        n = 0
        while present:
            present = False
            header = None
            n += 1
            newdoc = deepcopy(document)
            newdoc.tree.getroot().document = newdoc

            master = newdoc.tree.getroot()[-1]
            i = 0
            # Iterate over a snapshot: children are removed inside the loop.
            for item in list(master):
                if item.tag == split_on:
                    # TODO: clear
                    i += 1
                    if i == n:
                        header = raw_printable_text(item)
                if i != n and not (n == 1 and not i):
                    master.remove(item)
                else:
                    present = True
            if present:
                documents.append(newdoc)
                # Keep headers aligned with documents even when no
                # split_on element was found.
                headers.append(
                    header if header is not None else document.meta.title)
        return documents, headers

    def _align(self, audio_path, text_path):
        """Run an aeneas task on one part and return its sync map.

        Returns:
            A list of ``[start, end, fragment_id]`` entries (start and end
            as floats) read from the tab-separated sync map file.
        """
        task = Task(config_string=self._ALIGNER_CONFIG)

        with tempfile.TemporaryDirectory() as temp:
            syncfile = temp + "/sync"
            task.audio_file_path_absolute = audio_path
            task.text_file_path_absolute = text_path
            task.sync_map_file_path_absolute = syncfile

            ExecuteTask(task).execute()
            task.output_sync_map_file()

            sync = []
            with open(syncfile) as f:
                for line in f:
                    start, end, sec = line.strip().split('\t')
                    sync.append([float(start), float(end), sec])
        return sync

    @staticmethod
    def _base_context(document, narrator, duration):
        """Return the template placeholders shared by all DAISY files.

        *duration* is in seconds: a single part's length for its
        ``content<i>.smil``, the total length for the package-level files.
        """
        hms = format_hms(duration)
        return {
            "VERSION": "1.10",

            "HHMMSSmmm": hms,
            "HHMMSS": hms.split('.')[0],
            "Sd": "%.1f" % duration,

            "TITLE": document.meta.title,
            "PUBLISHER": document.meta.publisher[0],
            "YEAR": document.meta.created_at[:4],
            "MONTH": document.meta.created_at[5:7],
            "AUTHOR": document.meta.author.readable(),

            "NARRATOR": narrator,
        }

    @staticmethod
    def _content_smil(i, sync, context):
        """Serialize ``content<i>.smil`` for part *i*.

        The template is filled from *context* and one ``<par>`` (a text
        reference plus an audio clip) is appended to its ``<seq>`` for every
        sync map entry.
        """
        # Parsed by path, like the other templates.
        tree = etree.parse(get_resource('res/daisy/content.smil'))
        populate(tree.getroot(), context)

        # './/seq' is what lxml turns the former '//seq' into, minus the
        # FutureWarning it emits for absolute paths.
        seq = tree.find('.//seq')
        for si, (start, end, sec) in enumerate(sync):
            par = etree.SubElement(
                seq, 'par', id="par%06d" % (si + 1), endsync="last")
            etree.SubElement(
                par,
                "text",
                src="book%d.html#%s" % (i, sec))
            etree.SubElement(
                par,
                "audio",
                src="book%d.mp3" % i,
                **{
                    "clip-begin": "npt=%.3fs" % start,
                    "clip-end": "npt=%.3fs" % end,
                },
            )

        return etree.tostring(
            tree,
            xml_declaration=True,
            pretty_print=True,
        )