Fixes.
[librarian.git] / src / librarian / builders / daisy.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 from copy import deepcopy
5 import subprocess
6 import tempfile
7 import zipfile
8 from aeneas.executetask import ExecuteTask
9 from aeneas.task import Task
10 from lxml import etree
11 import mutagen
12 from librarian import OutputFile, get_resource
13 from librarian.html import raw_printable_text
14 from .html import DaisyHtmlBuilder
15
16
def get_duration(path):
    """Return the duration of the media file at *path* as a float.

    The value is whatever ``ffprobe`` prints for ``format=duration``
    (seconds, per ffprobe's output format).  Raises
    ``subprocess.CalledProcessError`` when ffprobe exits with a non-zero
    status and ``ValueError`` when its output is not a number.
    """
    command = [
        "ffprobe",
        "-i", path,
        "-show_entries", "format=duration",
        # Suppress the banner and log output so stdout holds only the value.
        "-v", "quiet",
        # Bare CSV value without the section-name prefix.
        "-of", "csv=p=0",
    ]
    completed = subprocess.run(
        command, capture_output=True, text=True, check=True
    )
    return float(completed.stdout)
36
37
def format_hms(t):
    """Format a duration of *t* seconds as ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds first and then split with
    integer arithmetic, so every field is zero-padded to a fixed width and a
    fraction that rounds up carries into the next field (59.9996 becomes
    ``00:01:00.000``, never ``00:00:60.000``).  The hours field grows beyond
    two digits when needed.
    """
    total_ms = int(round(t * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)
46
47
def populate(element, context):
    """Fill ``str.format`` placeholders throughout the subtree of *element*.

    Every non-empty ``text`` and ``tail`` and every attribute value of
    *element* and all of its descendants is replaced, in place, by the result
    of ``value.format(**context)``.  Nodes are visited in document order.
    """
    for node in element.iter():
        for field in ("text", "tail"):
            template = getattr(node, field)
            if template:
                setattr(node, field, template.format(**context))
        for name, template in node.attrib.items():
            node.attrib[name] = template.format(**context)
57
58
class DaisyBuilder:
    """Build a DAISY ZIP package from a document and matching MP3 files.

    The archive holds, under a directory named after the document's URL
    slug: one ``bookN.html`` / ``bookN.mp3`` / ``contentN.smil`` triple per
    part, the bundled DTD and entity files, and the ``er_book_info.xml``,
    ``master.smil`` and ``ncc.html`` index files.
    """

    file_extension = 'daisy.zip'

    # aeneas task configuration.  Raw string: ``\d`` belongs to the regular
    # expression handed to aeneas and must not be read as a Python escape.
    _AENEAS_CONFIG = r"task_language=pol|is_text_type=unparsed|is_text_unparsed_id_regex=sec\d+$|is_text_unparsed_id_sort=numeric|os_task_file_format=tab"

    # Files copied verbatim from res/daisy/ into the package.
    _STATIC_RESOURCES = (
        'smil10.dtd',
        'xhtml1-transitional.dtd',
        'xhtml-lat1.ent',
        'xhtml-special.ent',
        'xhtml-symbol.ent',
    )

    def build(self, document, mp3, split_on=None, **kwargs):
        """Build the package and return it as an ``OutputFile``.

        :param document: source document; its ``meta`` and ``tree`` are read.
        :param mp3: sequence of MP3 file paths, one per document part, in
            part order.
        :param split_on: optional tag name; each direct child of the last
            element under the root that has this tag starts a new part.
            When omitted the whole document is a single part.
        :param kwargs: accepted and ignored.
        :raises ValueError: if *mp3* is empty or its length differs from the
            number of document parts.
        """
        if not mp3:
            raise ValueError("Need MP3 files")

        # Validate before creating the output file so that a mismatch does
        # not leave an orphaned temporary file behind.  An explicit raise is
        # used because ``assert`` is stripped under ``python -O``.
        documents, headers = self._split_document(document, split_on)
        if len(documents) != len(mp3):
            raise ValueError(
                "Got %d MP3 files for %d document parts"
                % (len(mp3), len(documents))
            )

        # mutagen.File() returns None for files whose type it cannot detect.
        audio_info = mutagen.File(mp3[0])
        narrator = audio_info.get('TPE1') if audio_info is not None else None
        narrator = narrator.text[0] if narrator else ''

        directory = document.meta.url.slug + '/'
        durations = []

        # Both handles are closed even if a step fails.  delete=False keeps
        # the file on disk so it can be handed over by name afterwards.
        with tempfile.NamedTemporaryFile(delete=False) as outfile:
            with zipfile.ZipFile(outfile, 'w') as zipf:
                for i, part in enumerate(documents):
                    durations.append(
                        self._write_part(
                            zipf, directory, document, i, part, mp3[i],
                            narrator, elapsed=sum(durations),
                        )
                    )

                for fname in self._STATIC_RESOURCES:
                    zipf.write(
                        get_resource('res/daisy/' + fname),
                        directory + fname)

                self._write_indexes(
                    zipf, directory, document, headers, durations, narrator)

        return OutputFile.from_filename(outfile.name)

    @staticmethod
    def _split_document(document, split_on):
        """Return ``(documents, headers)``: the parts and their titles."""
        if not split_on:
            return [document], [document.meta.title]

        documents = []
        headers = []
        present = True
        n = 0
        # Pass n builds part n: deep-copy the document and drop every item
        # that does not belong to the n-th section.  The loop stops at the
        # first pass in which nothing is kept.
        while present:
            present = False
            n += 1
            newdoc = deepcopy(document)
            newdoc.tree.getroot().document = newdoc

            master = newdoc.tree.getroot()[-1]
            i = 0
            # Iterate over a snapshot because items are removed on the way.
            for item in list(master):
                if item.tag == split_on:
                    # TODO: clear
                    i += 1
                    if i == n:
                        headers.append(raw_printable_text(item))
                # Keep items of section n; the first part additionally keeps
                # whatever precedes the first split element (i == 0).
                if i != n and not (n == 1 and not i):
                    master.remove(item)
                else:
                    present = True
            if present:
                documents.append(newdoc)
        return documents, headers

    @staticmethod
    def _context(document, narrator, duration):
        """Return the placeholder values shared by all package templates."""
        hms = format_hms(duration)
        return {
            "VERSION": "1.10",

            "HHMMSSmmm": hms,
            "HHMMSS": hms.split('.')[0],
            "Sd": "%.1f" % duration,

            "TITLE": document.meta.title,
            "PUBLISHER": document.meta.publisher[0],
            "YEAR": document.meta.created_at[:4],
            "MONTH": document.meta.created_at[5:7],
            "AUTHOR": document.meta.author.readable(),

            "NARRATOR": narrator,
        }

    def _align(self, audio_path, text_path):
        """Run aeneas and return its sync map as ``[start, end, id]`` rows."""
        task = Task(config_string=self._AENEAS_CONFIG)
        sync = []
        with tempfile.TemporaryDirectory() as temp:
            syncfile = temp + "/sync"
            task.audio_file_path_absolute = audio_path
            task.text_file_path_absolute = text_path
            task.sync_map_file_path_absolute = syncfile

            ExecuteTask(task).execute()
            task.output_sync_map_file()

            # The sync map must be read before the directory is removed.
            with open(syncfile, encoding='utf-8') as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines instead of failing to unpack.
                        continue
                    start, end, sec = line.split('\t')
                    sync.append([float(start), float(end), sec])
        return sync

    def _write_part(self, zipf, directory, document, index, part, mp3_path,
                    narrator, elapsed):
        """Write the HTML, MP3 and SMIL of one part; return its duration.

        *elapsed* is the total duration of all preceding parts.
        """
        print('part', index)
        html = DaisyHtmlBuilder().build(part)
        zipf.write(
            html.get_filename(),
            directory + 'book%d.html' % index,
        )

        duration = get_duration(mp3_path)
        zipf.write(
            mp3_path,
            directory + "book%d.mp3" % index,
        )

        sync = self._align(mp3_path, html.get_filename())

        context = self._context(document, narrator, duration)
        context["ELAPSED"] = format_hms(elapsed)

        with open(get_resource('res/daisy/content.smil')) as f:
            tree = etree.parse(f)
        populate(tree.getroot(), context)

        # './/seq' is the spelling lxml itself substitutes for '//seq'
        # (with a FutureWarning), so the lookup result is unchanged.
        seq = tree.find('.//seq')
        for si, (start, end, fragment_id) in enumerate(sync):
            par = etree.SubElement(
                seq, 'par', id="par%06d" % (si + 1), endsync="last")
            etree.SubElement(
                par,
                "text",
                src="book%d.html#%s" % (index, fragment_id))
            etree.SubElement(
                par,
                "audio",
                src="book%d.mp3" % index,
                # Hyphenated attribute names cannot be keyword arguments.
                **{
                    "clip-begin": "npt=%.3fs" % start,
                    "clip-end": "npt=%.3fs" % end,
                }
            )

        zipf.writestr(
            directory + 'content%d.smil' % index,
            etree.tostring(
                tree,
                xml_declaration=True,
                pretty_print=True,
            ),
        )
        return duration

    def _write_indexes(self, zipf, directory, document, headers, durations,
                       narrator):
        """Write er_book_info.xml, master.smil and ncc.html for the book."""
        context = self._context(document, narrator, sum(durations))

        tree = etree.parse(get_resource('res/daisy/er_book_info.xml'))
        cont = tree.getroot()[0]
        for i, dur in enumerate(durations):
            etree.SubElement(
                cont, 'smil',
                nr=str(i + 1), Name="content%i.smil" % i, dur="%.1f" % dur)
        zipf.writestr(
            directory + 'er_book_info.xml',
            etree.tostring(tree, xml_declaration=True))

        tree = etree.parse(get_resource('res/daisy/master.smil'))
        populate(tree.getroot(), context)
        cont = tree.getroot()[-1]
        for i, header in enumerate(headers):
            etree.SubElement(
                cont, 'ref',
                title=header,
                src="content%d.smil#seq000001" % i,
                id="smil_%04d" % i)
        zipf.writestr(
            directory + 'master.smil',
            etree.tostring(tree, xml_declaration=True))

        tree = etree.parse(get_resource('res/daisy/ncc.html'))
        populate(tree.getroot(), context)
        cont = tree.getroot()[-1]
        # NOTE(review): every heading gets id='content', so ids repeat when
        # there is more than one part -- confirm whether that is intended.
        for i, header in enumerate(headers):
            if not i:
                # The first entry is the book title, not the part header.
                h1 = etree.SubElement(
                    cont, 'h1', id='content', **{"class": "title"})
                etree.SubElement(
                    h1, "a", href='content%d.smil#par000001' % i
                ).text = document.meta.title
            else:
                h2 = etree.SubElement(
                    cont, 'h2', id='content', **{"class": "chapter"})
                etree.SubElement(
                    h2, "a", href='content%d.smil#par000001' % i
                ).text = header

        zipf.writestr(
            directory + 'ncc.html',
            etree.tostring(tree, xml_declaration=True))