Some prelim work on builder api.
[librarian.git] / src / librarian / builders / daisy.py
1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
3 #
4 from copy import deepcopy
5 import subprocess
6 import tempfile
7 import zipfile
8 from lxml import etree
9 import mutagen
10 from librarian import OutputFile, get_resource
11 from librarian.html import raw_printable_text
12 from .html import DaisyHtmlBuilder
13
14
def get_duration(path):
    """Return the duration of the media file at *path* as a float.

    The value is whatever ``ffprobe`` prints for the ``format=duration``
    entry (seconds, per ffprobe), requested in bare CSV form so that
    stdout holds just the number.

    Raises:
        subprocess.CalledProcessError: if ``ffprobe`` exits with a
            non-zero status (``check=True``).
        ValueError: if the captured output cannot be parsed as a float.
    """
    command = [
        "ffprobe",
        "-i",
        path,
        # Only ask for the container-level duration field.
        "-show_entries",
        "format=duration",
        # Silence ffprobe's own logging.
        "-v",
        "quiet",
        # Print the bare value, without the section-name prefix.
        "-of",
        "csv=p=0",
    ]
    completed = subprocess.run(
        command,
        capture_output=True,
        text=True,
        check=True,
    )
    return float(completed.stdout)
34
35
def format_hms(t):
    """Format a duration given in seconds as ``HH:MM:SS.mmm``.

    Every field is zero-padded (hours and minutes to two digits, seconds
    to two digits plus three decimals), e.g. ``5.5`` -> ``"00:00:05.500"``.
    Hours are not wrapped, so durations of 100 hours or more simply use
    more digits in the first field.

    Args:
        t: non-negative duration in seconds (int or float).

    Returns:
        The formatted string.
    """
    # Work in whole milliseconds: rounding happens once, up front, so a
    # value such as 59.9996 carries into the minutes field instead of
    # being printed as "60.000" seconds.
    total_ms = int(round(t * 1000))
    total_seconds, millis = divmod(total_ms, 1000)
    total_minutes, seconds = divmod(total_seconds, 60)
    hours, minutes = divmod(total_minutes, 60)
    return "%02d:%02d:%02d.%03d" % (hours, minutes, seconds, millis)
44
45
def populate(element, context):
    """Fill ``str.format`` placeholders throughout an element tree, in place.

    For *element* and, recursively, each of its children, the ``text``,
    the ``tail`` and every attribute value are passed through
    ``str.format(**context)`` and written back.

    Args:
        element: tree node exposing ``text``, ``tail``, ``attrib`` and
            iteration over its children.
        context: mapping of placeholder names to replacement values.

    Raises:
        KeyError: if a placeholder is missing from *context* (raised by
            ``str.format``).
    """
    def fill(template):
        return template.format(**context)

    # Text first, then tail; empty/None values are left untouched.
    for field in ("text", "tail"):
        current = getattr(element, field)
        if current:
            setattr(element, field, fill(current))

    attributes = element.attrib
    for name, raw in attributes.items():
        attributes[name] = fill(raw)

    for node in element:
        populate(node, context)
55
56
class DaisyBuilder:
    """Builder that packs a document and its MP3 recordings into a zip.

    The archive gets, under a directory named after the document slug,
    one HTML file, one MP3 and one SMIL file per part, the DTD/entity
    resources, and the ``er_book_info.xml``, ``master.smil`` and
    ``ncc.html`` index files.

    NOTE(review): ``build`` uses names that are not defined at the point
    of use (see the notes inside the per-part loop), so as written it
    cannot run to completion.
    """
    # Extension of the produced output file.
    file_extension = 'daisy.zip'

    def build(self, document, mp3, split_on=None, **kwargs):
        """Build the zip archive for *document*.

        Args:
            document: source document; this method reads ``document.meta``
                (``url.slug``, ``title``, ``publisher``, ``created_at``,
                ``author``) and, when splitting, ``document.tree``.
            mp3: sequence of MP3 file paths, one per resulting part.
            split_on: optional tag name. When given, the children of the
                last child of the tree root are partitioned into parts,
                a new part starting at each child with this tag.
            **kwargs: accepted and ignored.

        Returns:
            ``OutputFile.from_filename(...)`` for the temporary zip file.

        Raises:
            ValueError: if *mp3* is empty.
            AssertionError: if the number of parts differs from the
                number of MP3 files.
        """
        if not mp3:
            raise ValueError("Need MP3 files")

        # delete=False: the file has to outlive this method, since only
        # its name is handed to OutputFile at the end.
        outfile = tempfile.NamedTemporaryFile(delete=False)
        zipf = zipfile.ZipFile(outfile, 'w')

        # All archive members live under "<slug>/".
        directory = document.meta.url.slug + '/'

        if split_on:
            documents = []
            headers = []
            present = True
            n = 0
            # Pass number n produces part n: take a fresh copy of the
            # whole document and strip everything not belonging to that
            # part. Stop after the first pass that keeps nothing.
            while present:
                present = False
                n += 1
                newdoc = deepcopy(document)
                # Point the copied root at the copied document
                # (presumably a back-reference used elsewhere -- confirm).
                newdoc.tree.getroot().document = newdoc

                master = newdoc.tree.getroot()[-1]
                # i counts the split_on elements seen so far, i.e. the
                # 1-based number of the section the current item is in
                # (0 = content before the first split_on element).
                i = 0
                for item in list(master):
                    if item.tag == split_on:
                        # TODO: clear
                        i += 1
                        if i == n:
                            # The split element's text is the part header.
                            headers.append(raw_printable_text(item))
                    # Keep items of section n; content preceding the
                    # first split element (i == 0) is kept in part 1.
                    if i != n and not (n == 1 and not i):
                        master.remove(item)
                    else:
                        present = True
                if present:
                    documents.append(newdoc)
        else:
            documents = [document]
            headers = [document.meta.title]

        assert len(documents) == len(mp3)

        # Narrator name comes from the 'TPE1' tag of the first MP3,
        # or is empty when the tag is missing.
        narrator = mutagen.File(mp3[0]).get('TPE1')
        narrator = narrator.text[0] if narrator else ''

        durations = []
        for i, part in enumerate(documents):
            # Progress/debug output on stdout.
            print('part', i)
            html = DaisyHtmlBuilder().build(part)
            zipf.write(
                html.get_filename(),
                directory + 'book%d.html' % i,
            )

            durations.append(get_duration(mp3[i]))
            zipf.write(
                mp3[i],
                directory + "book%d.mp3" % i,
            )

            # NOTE(review): `tree` and `context` are first assigned only
            # after this loop, so this line fails on the first iteration.
            populate(tree.getroot(), context)

            # NOTE(review): `syncfiles` is not defined anywhere in the
            # visible source; the per-part SMIL generation looks
            # unfinished.
            zipf.write(
                syncfiles[i],
                directory + 'content%d.smil' % i,
            )

        # Static DTD / entity files copied verbatim from package resources.
        for fname in ('smil10.dtd', 'xhtml1-transitional.dtd', 'xhtml-lat1.ent', 'xhtml-special.ent', 'xhtml-symbol.ent'):
            zipf.write(
                get_resource('res/daisy/' + fname),
                directory + fname)

        # Values substituted into the XML/HTML templates by populate().
        duration = sum(durations)
        hms = format_hms(duration)
        context = {
            "VERSION": "1.10",
            "HHMMSSmmm": hms,
            # Same time with the fractional part cut off.
            "HHMMSS": hms.split('.')[0],
            "Sd": "%.1f" % duration,
            "TITLE": document.meta.title,
            "PUBLISHER": document.meta.publisher[0],
            # created_at is sliced as a string: chars 0-3 and 5-6
            # (presumably "YYYY-MM-..." -- confirm).
            "YEAR": document.meta.created_at[:4],
            "MONTH": document.meta.created_at[5:7],
            "AUTHOR": document.meta.author.readable(),
            "NARRATOR": narrator,
        }

        # er_book_info.xml: one <smil> entry per part, appended to the
        # first child of the root. `nr` is 1-based while the file name
        # uses the 0-based part index. This template is not passed
        # through populate().
        tree = etree.parse(get_resource('res/daisy/er_book_info.xml'))
        cont = tree.getroot()[0]
        for i, dur in enumerate(durations):
            etree.SubElement(cont, 'smil', nr=str(i+1), Name="content%i.smil" % i, dur="%.1f" % dur)
        zipf.writestr(
            directory + 'er_book_info.xml',
            etree.tostring(tree, xml_declaration=True))

        # master.smil: fill the template, then append one <ref> per part
        # to the last child of the root.
        tree = etree.parse(get_resource('res/daisy/master.smil'))
        populate(tree.getroot(), context)
        cont = tree.getroot()[-1]
        for i, header in enumerate(headers):
            etree.SubElement(cont, 'ref', title=header, src="content%d.smil#seq000001" % i, id="smil_%04d" % i)
        zipf.writestr(
            directory + 'master.smil',
            etree.tostring(tree, xml_declaration=True))

        # ncc.html: fill the template, then append one heading per part
        # to the last child of the root. The first heading is an <h1>
        # showing the document title; the others are <h2> showing the
        # part header.
        # NOTE(review): every heading gets id='content' -- confirm that
        # repeated ids are intended.
        tree = etree.parse(get_resource('res/daisy/ncc.html'))
        populate(tree.getroot(), context)
        cont = tree.getroot()[-1]
        for i, header in enumerate(headers):
            if not i:
                h1 = etree.SubElement(
                    cont, 'h1', id='content', **{"class": "title"})
                etree.SubElement(
                    h1, "a", href='content%d.smil#par000001' % i).text = document.meta.title
            else:
                h2 = etree.SubElement(
                    cont, 'h2', id='content', **{"class": "chapter"})
                etree.SubElement(
                    h2, "a", href='content%d.smil#par000001' % i).text = header

        zipf.writestr(
            directory + 'ncc.html',
            etree.tostring(tree, xml_declaration=True))

        zipf.close()
        return OutputFile.from_filename(outfile.name)