1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
11 from lxml import etree
12 from librarian import XHTMLNS, ParseError, OutputFile
13 from librarian import functions
16 from lxml.etree import XMLSyntaxError, XSLTApplyError
# Executed at import time: call the two registration hooks exposed by
# librarian.functions.
# NOTE(review): presumably these register custom XPath/XSLT extension
# functions needed by the stylesheets -- confirm in librarian.functions.
19 functions.reg_substitute_entities()
20 functions.reg_person_name()
# Stylesheet key -> XSLT path; get_stylesheet() joins the path with this
# module's directory.
# NOTE(review): presumably an entry of the STYLESHEETS mapping -- confirm.
23 'legacy': 'xslt/book2html.xslt',
def get_stylesheet(name):
    """Return the filesystem path of the stylesheet registered as *name*.

    The path stored in STYLESHEETS under *name* is joined onto the directory
    that contains this module. A name that is missing from STYLESHEETS fails
    at the lookup itself; no fallback is applied here.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
# Content check used by transform(): the XPath below selects every <p> and
# <h1> element, both un-namespaced and in the XHTML namespace (XHTMLNS).
# NOTE(review): presumably the expression is evaluated against `text` and the
# matches are returned, so an empty result means "no content" -- confirm.
31 def html_has_content(text):
# The '{namespace}tag' steps address the namespaced variant of each tag.
33 '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
def transform_abstrakt(abstrakt_element):
    """Render an ``abstrakt`` element to an HTML string.

    The element is serialized, its ``abstrakt`` tags are renamed to
    ``dlugi_cytat`` by plain text substitution, and the result is re-parsed
    and run through the 'legacy' stylesheet. From the transformed markup,
    empty ``<a name="secN"/>`` anchors and all opening/closing ``blockquote``
    tags are removed before the string is returned.
    """
    stylesheet = etree.parse(get_stylesheet('legacy'))

    # Rename the element at the text level so the stylesheet processes it
    # as a 'dlugi_cytat' element.
    source = etree.tostring(abstrakt_element, encoding='unicode')
    source = source.replace('<abstrakt', '<dlugi_cytat')
    source = source.replace('</abstrakt', '</dlugi_cytat')

    document = etree.parse(io.StringIO(source))
    rendered = etree.tostring(document.xslt(stylesheet), encoding='unicode')

    # Strip the generated section anchors, then the blockquote wrapper tags
    # (their content is kept).
    without_anchors = re.sub('<a name="sec[0-9]*"/>', '', rendered)
    return re.sub('</?blockquote[^>]*>', '', without_anchors)
# Produce resized copies of the image behind every 'ilustr' element and point
# the element at them through its `srcset` and `src` attributes.
#   tree         -- parsed document searched for 'ilustr' elements
#   gallery_path -- output location; file names are appended to it by plain
#                   string concatenation, so it has to end with a separator
#   gallery_url  -- prefix under which the generated files are referenced
#   base_url     -- base against which each image's `src` is resolved
50 def add_image_sizes(tree, gallery_path, gallery_url, base_url):
# Predefined target widths.
51 widths = [360, 600, 1200, 1800, 2400]
53 os.makedirs(gallery_path)
# `i`, the position of the illustration in the document, becomes part of each
# generated file name.
57 for i, ilustr in enumerate(tree.findall('//ilustr')):
58 rel_path = ilustr.attrib['src']
59 img_url = urllib.parse.urljoin(base_url, rel_path)
# NOTE(review): the handle returned by urlopen() needs to be closed once the
# image has been processed -- confirm that it is.
61 f = urllib.request.urlopen(img_url)
# The file extension follows the detected image format; anything other than
# GIF or PNG is stored with the 'jpg' extension.
63 ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
66 # Needed widths: predefined and original, limited by
67 # whichever is smaller.
71 set(widths + [img.size[0]])
73 if w <= min(widths[-1], img.size[0])
# File name pattern: <illustration index>.W<width>.<extension>
77 fname = '%d.W%d.%s' % (i, w, ext)
78 fpath = gallery_path + fname
# An already existing file is reused instead of being resized again.
79 if not os.path.exists(fpath):
# Scale the height by the same factor as the width to keep the aspect ratio.
80 height = round(img.size[1] * w / img.size[0])
81 th = img.resize((w, height))
83 th_url = gallery_url + fname
84 srcset.append(" ".join((
# The element ends up with a comma-separated `srcset`, and its `src` is
# replaced by `largest_url`.
89 ilustr.attrib['srcset'] = ", ".join(srcset)
90 ilustr.attrib['src'] = largest_url
# Convert a WL document to HTML; when the result has content it is returned
# as an OutputFile built from the serialized bytes.
#   stylesheet -- key passed to get_stylesheet() to pick the XSLT file
#   options    -- extra keyword arguments forwarded to document.transform()
#   css        -- stylesheet URL forwarded to document.transform()
#   gallery_path, gallery_url, base_url -- forwarded to add_image_sizes()
# NOTE(review): the docstring below talks about `output_filename`, which is
# not a parameter of this function, and about True/False results, while the
# visible return value is an OutputFile -- the docstring looks stale.
95 def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'):
96 """Transforms the WL document to XHTML.
98 If output_filename is None, returns an XML,
99 otherwise returns True if file has been written, False if it hasn't.
100 File won't be written if it has no content.
104 style_filename = get_stylesheet(stylesheet)
105 style = etree.parse(style_filename)
# All clean-up below runs on a deep copy, so the caller's `wldoc` is left
# unmodified.
107 document = copy.deepcopy(wldoc)
109 document.swap_endlines()
# A flag is stored as an attribute with the value 'yes' on the root element.
# NOTE(review): presumably done once for every entry of `flags` -- confirm.
113 document.edoc.getroot().set(flag, 'yes')
115 document.clean_ed_note()
116 document.clean_ed_note('abstrakt')
117 document.fix_pa_akap()
118 document.hebr_protect()
124 os.makedirs(gallery_path)
128 add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)
# NOTE(review): presumably the fallback used when no `css` is given -- confirm.
132 or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
135 result = document.transform(style, css=css, **options)
136 del document # no longer needed large object :)
# Anchors and the two generated tables are only added when the transformed
# document has any content.
138 if html_has_content(result):
139 add_anchors(result.getroot())
140 add_table_of_themes(result.getroot())
141 add_table_of_contents(result.getroot())
# Serialized as pretty-printed UTF-8 HTML without an XML declaration.
143 return OutputFile.from_bytes(etree.tostring(
144 result, method='html', xml_declaration=False,
145 pretty_print=True, encoding='utf-8'
# NOTE(review): the message contains a '%s' placeholder but nothing is
# interpolated into it, so the literal "'%s'" is what gets reported.
150 raise ValueError("'%s' is not a valid stylesheet.")
151 except (XMLSyntaxError, XSLTApplyError) as e:
# Create a fragment identified by `id` and labelled with `themes`.
# NOTE(review): presumably stores both values and initialises the `events`
# list that append() writes to -- confirm.
156 def __init__(self, id, themes):
157 super(Fragment, self).__init__()
# Record one (event, element) pair at the end of this fragment's event list.
162 def append(self, event, element):
163 self.events.append((event, element))
# Return the recorded events followed by whatever is left on `stack`.
# NOTE(review): presumably `stack` ends up holding the 'end' events of
# elements that were started but never closed -- confirm.
165 def closed_events(self):
167 for event, element in self.events:
# Remember the 'end' event that corresponds to this element.
169 stack.append(('end', element))
# Reported on stdout: a closing event for a tag that was not open.
174 print('CLOSED NON-OPEN TAG:', element)
177 return self.events + stack
# Serialise the events, as completed by closed_events(), into one markup
# string.
181 for event, element in self.closed_events():
# Opening tag; its attributes are taken from element.attrib.
183 result.append('<%s %s>' % (
187 for k, v in element.attrib.items()
191 result.append(element.text)
# Closing tag.
193 result.append('</%s>' % element.tag)
195 result.append(element.tail)
# NOTE(review): `element` is appended as-is here, so it has to be a string
# for the join below -- presumably the payload of a 'text' event; confirm.
197 result.append(element)
199 return ''.join(result)
# NOTE(review): presumably the body of a separate method (such as __str__)
# that delegates to to_string() -- confirm.
202 return self.to_string()
# Returns the pair (closed_fragments, open_fragments), both keyed by fragment
# id: fragments whose 'theme-end' marker was seen, and fragments that were
# still open when the input ended.
205 def extract_fragments(input_filename):
206 """Extracts theme fragments from input_filename."""
208 closed_fragments = {}
210 # iterparse would die on an HTML document
211 parser = etree.HTMLParser(encoding='utf-8')
# The file is parsed with the HTML parser and the first child of the root's
# first child is re-serialized into `buf`, which iterparse reads below.
213 buf.write(etree.tostring(
214 etree.parse(input_filename, parser).getroot()[0][0],
219 for event, element in etree.iterparse(buf, events=('start', 'end')):
220 # Process begin and end elements
221 if element.get('class', '') in ('theme-begin', 'theme-end'):
222 if not event == 'end':
223 continue # Process elements only once, on end event
# Opening marker: the fragment id comes from the `fid` attribute and the
# themes from the marker's text.
226 if element.get('class', '') == 'theme-begin':
227 fragment = Fragment(id=element.get('fid'), themes=element.text)
# Collect deep copies of the marker's ancestors, walking upwards until the
# element with id 'book-text' is reached; the copies lose their `id`.
# NOTE(review): if no ancestor has id 'book-text', `parent` eventually becomes
# None and parent.get() raises AttributeError -- confirm the input guarantees it.
230 parent = element.getparent()
232 while parent.get('id', None) != 'book-text':
233 cparent = copy.deepcopy(parent)
235 if 'id' in cparent.attrib:
236 del cparent.attrib['id']
237 parents.append(cparent)
238 parent = parent.getparent()
# Every collected ancestor is recorded as a 'start' event of the new fragment.
241 for parent in parents:
242 fragment.append('start', parent)
# A fragment id that is already open is not registered again.
244 if fragment.id not in open_fragments:
245 open_fragments[fragment.id] = fragment
247 # Close existing fragment
250 fragment = open_fragments[element.get('fid')]
# Reported on stdout: an end marker whose fragment is not open.
252 print('%s:closed not open fragment #%s' % (
253 input_filename, element.get('fid')
# Move the fragment from the open set to the closed set.
256 closed_fragments[fragment.id] = fragment
257 del open_fragments[fragment.id]
259 # Append element tail to lost_text
260 # (we don't want to lose any text)
# The marker's tail is added as a 'text' event to every open fragment.
262 for fragment_id in open_fragments:
263 open_fragments[fragment_id].append('text', element.tail)
265 # Process all elements except begin and end
267 # Omit annotation tags
# Elements with a non-empty `name` attribute or with class 'annotation' or
# 'anchor' are left out; only their tail text is passed on, on the 'end' event.
268 if (len(element.get('name', '')) or
269 element.get('class', '') in ('annotation', 'anchor')):
270 if event == 'end' and element.tail:
271 for fragment_id in open_fragments:
272 open_fragments[fragment_id].append(
# Any other element reaches every open fragment as a shallow copy without
# its `id` attribute.
276 for fragment_id in open_fragments:
277 celem = copy.copy(element)
278 if 'id' in celem.attrib:
279 del celem.attrib['id']
280 open_fragments[fragment_id].append(
284 return closed_fragments, open_fragments
# Insert anchor <a> elements into the parent of `element`, at the index
# `element` currently has, i.e. directly in front of it.
#   prefix    -- anchor name: used as the '#<prefix>' href of the link and as
#                the `name` of the target
#   link_text -- text of the link (see call sites passing link_text=...)
# NOTE(review): presumably `with_link` / `with_target` switch the two
# insertions below on and off -- confirm.
287 def add_anchor(element, prefix, with_link=True, with_target=True,
289 parent = element.getparent()
290 index = parent.index(element)
293 if link_text is None:
# Visible link: <a class="anchor" href="#prefix"> carrying the link text.
295 anchor = etree.Element('a', href='#%s' % prefix)
296 anchor.set('class', 'anchor')
297 anchor.text = str(link_text)
298 parent.insert(index, anchor)
# Link target: <a class="target" name="prefix"> containing a single space.
# It is inserted at the same index, so it lands in front of a link inserted
# above.
301 anchor_target = etree.Element('a', name='%s' % prefix)
302 anchor_target.set('class', 'target')
303 anchor_target.text = ' '
304 parent.insert(index, anchor_target)
# Walks over the ancestors of `element`.
# NOTE(review): presumably reports whether `test(ancestor)` holds for at
# least one of them (callers use the result as a condition) -- confirm.
307 def any_ancestor(element, test):
308 for ancestor in element.iterancestors():
# Number the verses and paragraphs below `root` by placing anchors in front
# of them (see add_anchor()).
314 def add_anchors(root):
318 for element in root.iterdescendants():
# Ancestor predicate covering notes, mottos and their signatures, dedications,
# frames, the element with id 'nota_red', blockquotes and the element with
# id 'footnotes'.
322 'note', 'motto', 'motto_podpis', 'dedication', 'frame'
324 or e.get('id') == 'nota_red'
325 or e.tag == 'blockquote'
326 or e.get('id') == 'footnotes'
# An element with class 'numeracja' controls the numbering: `data-start` sets
# the displayed number, and an optional `data-link` selects the anchor-name
# prefix and starts that prefix's counter at 1.
329 if element.get('class') == 'numeracja':
331 visible_counter = int(element.get('data-start'))
334 if element.get("data-link"):
335 link_prefix = element.get("data-link")
336 counter[link_prefix] = 1
# NOTE(review): presumably elements that have an ancestor matching the
# predicate above are skipped -- confirm.
338 if any_ancestor(element, f):
# Verses (div whose class contains 'verse'): an anchor is added when the
# displayed number is 1 or a multiple of 5.
341 if element.tag == 'div' and 'verse' in element.get('class', ''):
342 if visible_counter == 1 or visible_counter % 5 == 0:
343 add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]), link_text=visible_counter)
344 counter[link_prefix] += 1
# Elements whose class contains 'paragraph' get an anchor named
# <link_prefix><counter>, labelled with the displayed number.
346 elif 'paragraph' in element.get('class', ''):
347 add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]), link_text=visible_counter)
348 counter[link_prefix] += 1
# Return the text content of `element` as a stripped string. The work is done
# on a deep copy, so the original element is not modified.
352 def raw_printable_text(element):
353 working = copy.deepcopy(element)
# Direct <a> children with class 'annotation' or 'theme-begin' are handled
# specially before serialization.
# NOTE(review): presumably their text is blanked so that it does not show up
# in the result -- confirm.
354 for e in working.findall('a'):
355 if e.get('class') in ('annotation', 'theme-begin'):
357 return etree.tostring(working, method='text', encoding='unicode').strip()
# Build a table of contents from the h2 and h3 headings found below `root`.
360 def add_table_of_contents(root):
363 for element in root.iterdescendants():
364 if element.tag in ('h2', 'h3'):
# Ancestor predicate: the elements with id 'footnotes' or 'nota_red', and
# elements with class 'person-list'.
# NOTE(review): presumably headings located inside such elements are left
# out of the table -- confirm.
367 lambda e: e.get('id') in (
368 'footnotes', 'nota_red'
369 ) or e.get('class') in ('person-list',)):
372 element_text = raw_printable_text(element)
# An h3 that follows an h2 entry is stored in that entry's subsection list.
373 if (element.tag == 'h3' and len(sections)
374 and sections[-1][1] == 'h2'):
375 sections[-1][3].append(
376 (counter, element.tag, element_text, [])
379 sections.append((counter, element.tag, element_text, []))
# The heading itself only receives a link target named 's<counter>'.
380 add_anchor(element, "s%d" % counter, with_link=False)
# Table markup: a div holding an h2 header and an ordered list.
383 toc = etree.Element('div')
385 toc_header = etree.SubElement(toc, 'h2')
386 toc_header.text = 'Spis treści'
387 toc_list = etree.SubElement(toc, 'ol')
# One list item per section, with a link (no target) to anchor 's<n>'.
389 for n, section, text, subsections in sections:
390 section_element = etree.SubElement(toc_list, 'li')
391 add_anchor(section_element, "s%d" % n, with_target=False,
# Subsections go into a nested ordered list inside the section's item.
395 subsection_list = etree.SubElement(section_element, 'ol')
396 for n1, subsection, subtext, _ in subsections:
397 subsection_element = etree.SubElement(subsection_list, 'li')
398 add_anchor(subsection_element, "s%d" % n1, with_target=False,
# Insert, as the first child of `root`, a <div id="themes"> that lists every
# theme together with numbered links to the fragments marked with it.
404 def add_table_of_themes(root):
# NOTE(review): `sortify` is imported inside the function -- presumably so a
# missing package can be handled; confirm.
406 from sortify import sortify
# Collect theme name -> list of `name` attributes of the 'theme-begin'
# anchors; an anchor's text is a comma-separated list of theme names.
412 for fragment in root.findall('.//a[@class="theme-begin"]'):
413 if not fragment.text:
415 theme_names = [s.strip() for s in fragment.text.split(',')]
416 for theme_name in theme_names:
417 book_themes.setdefault(theme_name, []).append(fragment.get('name'))
# Themes are ordered by the sortify() key of their name.
418 book_themes = list(book_themes.items())
419 book_themes.sort(key=lambda s: sortify(s[0]))
420 themes_div = etree.Element('div', id="themes")
421 themes_ol = etree.SubElement(themes_div, 'ol')
422 for theme_name, fragments in book_themes:
423 themes_li = etree.SubElement(themes_ol, 'li')
424 themes_li.text = "%s: " % theme_name
# Links are labelled 1, 2, 3, ... in the order the fragments were collected.
425 for i, fragment in enumerate(fragments):
426 item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
427 item.text = str(i + 1)
429 root.insert(0, themes_div)
432 def extract_annotations(html_path):
433 """Extracts annotations from HTML for annotations dictionary.
435 For each annotation, yields a tuple of:
436 anchor, footnote type, valid qualifiers, text, html.
439 from .fn_qualifiers import FN_QUALIFIERS
# The file is read with the HTML parser; footnotes are the direct <div>
# children of the element with id 'footnotes'.
441 parser = etree.HTMLParser(encoding='utf-8')
442 tree = etree.parse(html_path, parser)
443 footnotes = tree.find('//*[@id="footnotes"]')
# Matches, from the start of the footnote text, a run without an em dash
# (U+2014) followed by a parenthesised group and then an em dash; the group
# captures the content of the parentheses.
444 re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
445 if footnotes is not None:
446 for footnote in footnotes.findall('div'):
# The footnote type is the second '-'-separated part of the div's class; the
# anchor is the href of the 'annotation' link without its first character.
447 fn_type = footnote.get('class').split('-')[1]
448 anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
# A tail consisting of a single newline on the last child is dropped before
# the footnote is serialized.
451 if len(footnote) and footnote[-1].tail == '\n':
452 footnote[-1].tail = None
453 text_str = etree.tostring(footnote, method='text',
454 encoding='unicode').strip()
455 html_str = etree.tostring(footnote, method='html',
456 encoding='unicode').strip()
458 match = re_qualifier.match(text_str)
460 qualifier_str = match.group(1)
# Candidates are separated by ';' or ','; only those present in
# FN_QUALIFIERS are kept.
462 for candidate in re.split('[;,]', qualifier_str):
463 candidate = candidate.strip()
464 if candidate in FN_QUALIFIERS:
465 qualifiers.append(candidate)
# For a candidate starting with "z ", the second word is checked instead.
466 elif candidate.startswith('z '):
467 subcandidate = candidate.split()[1]
468 if subcandidate in FN_QUALIFIERS:
469 qualifiers.append(subcandidate)
473 yield anchor, fn_type, qualifiers, text_str, html_str