fix tests
[librarian.git] / src / librarian / builders / daisy.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 from copy import deepcopy
5 import subprocess
6 import tempfile
7 import zipfile
8 from lxml import etree
9 import mutagen
10 from librarian import OutputFile, get_resource
11 from librarian.html import raw_printable_text
12 from .html import DaisyHtmlBuilder
13
14
def get_duration(path):
    """Return the duration of the media file at *path* as a float.

    The value is whatever ``ffprobe`` prints for ``format=duration``
    (CSV output without the section prefix), converted with ``float``.

    Raises:
        subprocess.CalledProcessError: if ``ffprobe`` exits with a
            non-zero status (``check=True``).
        ValueError: if the captured output cannot be parsed as a float.
    """
    command = [
        "ffprobe",
        "-i", path,
        "-show_entries", "format=duration",
        # Silence the banner/log output so stdout holds only the number.
        "-v", "quiet",
        "-of", "csv=p=0",
    ]
    probe = subprocess.run(
        command, capture_output=True, text=True, check=True
    )
    return float(probe.stdout)
34
35
def format_hms(t):
    """Format a duration given in seconds as ``HH:MM:SS.mmm``.

    The value is rounded to whole milliseconds once, up front, and then
    split with integer arithmetic.  This keeps every field zero-padded
    (``5`` -> ``00:00:05.000``) and makes rounding carry into the next
    field instead of producing an out-of-range ``60.000`` seconds
    (``59.9996`` -> ``00:01:00.000``).

    Args:
        t: duration in seconds (int or float), expected to be >= 0.

    Returns:
        The formatted string; hours are not capped and may exceed two
        digits for very long durations.
    """
    total_ms = int(round(t * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)
44
45
def populate(element, context):
    """Fill ``str.format`` placeholders throughout an element subtree.

    The element's text, its tail and every attribute value are treated as
    format templates and replaced, in place, by the result of formatting
    them with *context*; the same is then done recursively for each child.

    Args:
        element: the root element of the subtree to update (mutated).
        context: mapping of placeholder names to replacement values.

    Raises:
        KeyError: if a template references a name missing from *context*.
    """
    text, tail = element.text, element.tail
    if text:
        element.text = text.format(**context)
    if tail:
        element.tail = tail.format(**context)

    attributes = element.attrib
    for name, template in attributes.items():
        attributes[name] = template.format(**context)

    for sub in element:
        populate(sub, context)
55
56
class DaisyBuilder:
    """Build a zipped DAISY talking book from a document and MP3 files.

    For every part the archive receives the HTML text (``bookN.html``),
    the audio (``bookN.mp3``) and a SMIL file synchronising the two
    (``contentN.smil``).  Package-level ``er_book_info.xml``,
    ``master.smil`` and ``ncc.html`` plus the bundled DTD/entity files
    are added afterwards.  All entries live under ``<slug>/``.
    """

    file_extension = 'daisy.zip'

    def build(self, document, mp3, split_on=None, **kwargs):
        """Create the DAISY archive and return it as an ``OutputFile``.

        Args:
            document: source document; ``document.meta`` and
                ``document.tree`` are read.
            mp3: sequence of MP3 file paths, one per part.
            split_on: optional tag name; when given, the document is cut
                into one part per element with that tag found among the
                children of the root's last child.
            **kwargs: accepted and ignored.

        Returns:
            ``OutputFile.from_filename`` of a temporary zip file (the
            file is created with ``delete=False`` and is not removed
            here).

        Raises:
            ValueError: if *mp3* is empty, or if the number of MP3 files
                does not match the number of document parts.
        """
        if not mp3:
            raise ValueError("Need MP3 files")

        directory = document.meta.url.slug + '/'

        if split_on:
            documents, headers = self._split_document(document, split_on)
        else:
            documents = [document]
            headers = [document.meta.title]

        # Validate before any file is created; a plain ``assert`` would be
        # stripped under ``python -O`` and let a mismatch through.
        if len(documents) != len(mp3):
            raise ValueError(
                "Expected %d MP3 files, got %d" % (len(documents), len(mp3))
            )

        narrator = mutagen.File(mp3[0]).get('TPE1')
        narrator = narrator.text[0] if narrator else ''

        durations = []
        # Both handles are closed on exit, also when an error occurs; the
        # zip is closed first so its central directory reaches the file.
        with tempfile.NamedTemporaryFile(delete=False) as outfile, \
                zipfile.ZipFile(outfile, 'w') as zipf:
            for i, part in enumerate(documents):
                print('part', i)
                html = DaisyHtmlBuilder().build(part)
                zipf.write(
                    html.get_filename(),
                    directory + 'book%d.html' % i,
                )

                durations.append(get_duration(mp3[i]))
                zipf.write(
                    mp3[i],
                    directory + "book%d.mp3" % i,
                )

                sync = self._align(mp3[i], html.get_filename())

                context = self._base_context(
                    document, narrator, durations[i])
                # Total running time of all parts preceding this one.
                context["ELAPSED"] = format_hms(sum(durations[:i]))

                with open(get_resource('res/daisy/content.smil')) as f:
                    tree = etree.parse(f)
                populate(tree.getroot(), context)

                # './/seq' is the spelling lxml itself substitutes for
                # '//seq' (while emitting a FutureWarning).
                seq = tree.find('.//seq')
                for si, (start, end, sec) in enumerate(sync):
                    par = etree.SubElement(
                        seq, 'par', id="par%06d" % (si + 1), endsync="last")
                    etree.SubElement(
                        par,
                        "text",
                        src="book%d.html#%s" % (i, sec))
                    etree.SubElement(
                        par,
                        "audio",
                        src="book%d.mp3" % i,
                        # Hyphenated attribute names cannot be keywords.
                        **{
                            "clip-begin": "npt=%.3fs" % start,
                            "clip-end": "npt=%.3fs" % end,
                        }
                    )

                zipf.writestr(
                    directory + 'content%d.smil' % i,
                    etree.tostring(
                        tree,
                        xml_declaration=True,
                        pretty_print=True,
                    ),
                )

            for fname in ('smil10.dtd', 'xhtml1-transitional.dtd',
                          'xhtml-lat1.ent', 'xhtml-special.ent',
                          'xhtml-symbol.ent'):
                zipf.write(
                    get_resource('res/daisy/' + fname),
                    directory + fname)

            # Package-level templates use the total duration and, as in
            # the per-part context above minus "ELAPSED", the book metadata.
            context = self._base_context(document, narrator, sum(durations))

            tree = etree.parse(get_resource('res/daisy/er_book_info.xml'))
            cont = tree.getroot()[0]
            for i, dur in enumerate(durations):
                etree.SubElement(
                    cont, 'smil',
                    nr=str(i + 1),
                    Name="content%i.smil" % i,
                    dur="%.1f" % dur)
            zipf.writestr(
                directory + 'er_book_info.xml',
                etree.tostring(tree, xml_declaration=True))

            tree = etree.parse(get_resource('res/daisy/master.smil'))
            populate(tree.getroot(), context)
            cont = tree.getroot()[-1]
            for i, header in enumerate(headers):
                etree.SubElement(
                    cont, 'ref',
                    title=header,
                    src="content%d.smil#seq000001" % i,
                    id="smil_%04d" % i)
            zipf.writestr(
                directory + 'master.smil',
                etree.tostring(tree, xml_declaration=True))

            tree = etree.parse(get_resource('res/daisy/ncc.html'))
            populate(tree.getroot(), context)
            cont = tree.getroot()[-1]
            # NOTE(review): every heading gets id='content', so ids repeat
            # when there is more than one part -- confirm this is intended.
            for i, header in enumerate(headers):
                if not i:
                    # The first entry is the book title, not the header.
                    h1 = etree.SubElement(
                        cont, 'h1', id='content', **{"class": "title"})
                    etree.SubElement(
                        h1, "a", href='content%d.smil#par000001' % i
                    ).text = document.meta.title
                else:
                    h2 = etree.SubElement(
                        cont, 'h2', id='content', **{"class": "chapter"})
                    etree.SubElement(
                        h2, "a", href='content%d.smil#par000001' % i
                    ).text = header

            zipf.writestr(
                directory + 'ncc.html',
                etree.tostring(tree, xml_declaration=True))

        return OutputFile.from_filename(outfile.name)

    @staticmethod
    def _split_document(document, split_on):
        """Cut *document* into parts at each ``split_on`` element.

        Part ``n`` is a deep copy of the document in which the last child
        of the root keeps only the n-th ``split_on`` element and what
        follows it up to the next one; the first part additionally keeps
        anything preceding the first ``split_on`` element.

        Returns:
            ``(documents, headers)`` where each header is the printable
            text of the ``split_on`` element opening the part.
        """
        # NOTE(review): if no ``split_on`` element exists but there is
        # other content, one document is returned with no header.
        documents = []
        headers = []
        present = True
        n = 0
        while present:
            present = False
            n += 1
            newdoc = deepcopy(document)
            newdoc.tree.getroot().document = newdoc

            master = newdoc.tree.getroot()[-1]
            i = 0
            # Iterate over a snapshot: items are removed while looping.
            for item in list(master):
                if item.tag == split_on:
                    # TODO: clear
                    i += 1
                    if i == n:
                        headers.append(raw_printable_text(item))
                if i != n and not (n == 1 and not i):
                    master.remove(item)
                else:
                    present = True
            if present:
                documents.append(newdoc)
        return documents, headers

    @staticmethod
    def _base_context(document, narrator, duration):
        """Return the template context shared by all DAISY templates."""
        hms = format_hms(duration)
        return {
            "VERSION": "1.10",

            "HHMMSSmmm": hms,
            "HHMMSS": hms.split('.')[0],
            "Sd": "%.1f" % duration,

            "TITLE": document.meta.title,
            "PUBLISHER": document.meta.publisher[0],
            "YEAR": document.meta.created_at[:4],
            "MONTH": document.meta.created_at[5:7],
            "AUTHOR": document.meta.author.readable(),

            "NARRATOR": narrator,
        }

    @staticmethod
    def _align(audio_path, text_path):
        """Align the audio with the text and return the sync map.

        Returns:
            A list of ``[start, end, fragment_id]`` entries read from the
            tab-separated sync map file, with start/end as floats.
        """
        # ``Task`` and ``ExecuteTask`` were referenced without any import.
        # They are imported here rather than at module level so that the
        # module stays importable when aeneas is not installed.
        from aeneas.executetask import ExecuteTask
        from aeneas.task import Task

        config_string = (
            "task_language=pol"
            "|is_text_type=unparsed"
            "|is_text_unparsed_id_regex=sec\\d+$"
            "|is_text_unparsed_id_sort=numeric"
            "|os_task_file_format=tab"
        )
        task = Task(config_string=config_string)

        with tempfile.TemporaryDirectory() as temp:
            syncfile = temp + "/sync"
            task.audio_file_path_absolute = audio_path
            task.text_file_path_absolute = text_path
            task.sync_map_file_path_absolute = syncfile

            ExecuteTask(task).execute()
            task.output_sync_map_file()

            sync = []
            with open(syncfile) as f:
                for line in f:
                    line = line.strip()
                    if not line:
                        # Tolerate blank lines instead of failing the
                        # three-field unpacking below.
                        continue
                    start, end, sec = line.split('\t')
                    sync.append([float(start), float(end), sec])
        return sync
245         zipf.close()
246         return OutputFile.from_filename(outfile.name)