# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
#
import copy
import io
import os
import re
import urllib.parse
import urllib.request
from lxml import etree
from librarian import XHTMLNS, ParseError, OutputFile
from librarian import functions

from PIL import Image

from lxml.etree import XMLSyntaxError, XSLTApplyError


functions.reg_substitute_entities()
functions.reg_person_name()

STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
}


def get_stylesheet(name):
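    """Return the filesystem path of the named bundled XSLT stylesheet."""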
    return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])


def html_has_content(text):
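    """Check whether the rendered HTML contains any paragraphs or headers."""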
    return etree.ETXPath(
        '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    )(text)


def transform_abstrakt(abstrakt_element):
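    """Render a single <abstrakt> element to an HTML string.

    The element is pushed through the legacy stylesheet disguised as
    dlugi_cytat; section anchors and blockquote wrappers are then
    stripped from the result.
    """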
    style_filename = get_stylesheet('legacy')
    style = etree.parse(style_filename)
    xml = etree.tostring(abstrakt_element, encoding='unicode')
    document = etree.parse(io.StringIO(
        xml.replace('abstrakt', 'dlugi_cytat')
    ))  # HACK: reuse the long-quote markup for rendering.
    result = document.xslt(style)
    html = re.sub('<a name="sec[0-9]*"/>', '',
                  etree.tostring(result, encoding='unicode'))
    return re.sub('</?blockquote[^>]*>', '', html)


def add_image_sizes(tree, gallery_path, gallery_url, base_url):
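    """Fetch each <ilustr> image and write scaled copies to the gallery.

    The generated variants are listed in the element's srcset attribute
    and src is pointed at the largest generated file.
    """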
    widths = [360, 600, 1200, 1800, 2400]

    for i, ilustr in enumerate(tree.findall('//ilustr')):
        rel_path = ilustr.attrib['src']
        img_url = urllib.parse.urljoin(base_url, rel_path)

        f = urllib.request.urlopen(img_url)
        img = Image.open(f)
        ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')

        # Needed widths: predefined and original, limited by
        # whichever is smaller.
        img_widths = [
            w for w in
            sorted(
                set(widths + [img.size[0]])
            )
            if w <= min(widths[-1], img.size[0])
        ]

        srcset = []
        for w in img_widths:
            fname = '%d.W%d.%s' % (i, w, ext)
            fpath = gallery_path + fname
            if not os.path.exists(fpath):
                height = round(img.size[1] * w / img.size[0])
                th = img.resize((w, height))
                th.save(fpath)
            th_url = gallery_url + fname
            srcset.append(" ".join((
                th_url,
                '%dw' % w
            )))
            largest_url = th_url

        ilustr.attrib['srcset'] = ", ".join(srcset)
        ilustr.attrib['src'] = largest_url


def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None,
              gallery_path='img/', gallery_url='img/', base_url='file://./'):
    """Transforms the WL document to XHTML.

    Returns an OutputFile with the rendered HTML, or None if the
    document has no content (in which case no file should be written).
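
    A minimal usage sketch (assuming `wldoc` is a parsed WL document,
    e.g. a librarian.parser.WLDocument):

        output = transform(wldoc, gallery_path='out/img/',
                           gallery_url='img/')
        # `output` is an OutputFile, or None for an empty document.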
    """
    # Parse XSLT
    try:
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')
        document.fix_pa_akap()

        if not options:
            options = {}

        try:
            os.makedirs(gallery_path)
        except OSError:
            pass

        add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        css = "'%s'" % css
        result = document.transform(style, css=css, **options)
        del document  # no longer needed large object :)

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_themes(result.getroot())
            add_table_of_contents(result.getroot())

            return OutputFile.from_bytes(etree.tostring(
                result, method='html', xml_declaration=False,
                pretty_print=True, encoding='utf-8'
            ))
        else:
            return None
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)


class Fragment:
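    """A themed fragment of the rendered text, stored as a list of events.

    Each event is a ('start', element), ('end', element) or ('text', str)
    tuple recorded while walking the HTML tree; to_string() replays the
    events into an HTML snippet.
    """
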
    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        self.events.append((event, element))

    def closed_events(self):
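        """Return recorded events, closing any still-open tags at the end."""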
        stack = []
        for event, element in self.events:
            if event == 'start':
                stack.append(('end', element))
            elif event == 'end':
                try:
                    stack.pop()
                except IndexError:
                    print('CLOSED NON-OPEN TAG:', element)

        stack.reverse()
        return self.events + stack

    def to_string(self):
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                result.append('<%s %s>' % (
                    element.tag,
                    ' '.join(
                        '%s="%s"' % (k, v)
                        for k, v in element.attrib.items()
                    )
                ))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append('</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            else:
                result.append(element)

        return ''.join(result)

    def __str__(self):
        return self.to_string()


def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename."""
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on an HTML document, so the parsed content
    # is re-serialized into a clean buffer first.
    parser = etree.HTMLParser(encoding='utf-8')
    buf = io.BytesIO()
    buf.write(etree.tostring(
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process theme begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if event != 'end':
                continue  # Process elements only once, on the end event

            # Open a new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Record the fragment's ancestors (up to book-text),
                # with ids stripped, so it can be serialized later.
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    if 'id' in cparent.attrib:
                        del cparent.attrib['id']
                    parents.append(cparent)
                    parent = parent.getparent()

                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                if fragment.id not in open_fragments:
                    open_fragments[fragment.id] = fragment

            # Close the existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the element's tail to all open fragments
            # (we don't want to lose any text).
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except theme begin and end
        else:
            # Omit annotation tags, but keep their tail text
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail
                        )
            else:
                for fragment_id in open_fragments:
                    celem = copy.copy(element)
                    if 'id' in celem.attrib:
                        del celem.attrib['id']
                    open_fragments[fragment_id].append(
                        event, celem
                    )

    return closed_fragments, open_fragments


def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
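    """Insert anchor <a> elements before `element`.

    with_link adds a visible link pointing at #prefix (labelled with
    link_text, or the prefix itself); with_target adds a named target
    so the element can be linked to.
    """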
    parent = element.getparent()
    index = parent.index(element)

    if with_link:
        if link_text is None:
            link_text = prefix
        anchor = etree.Element('a', href='#%s' % prefix)
        anchor.set('class', 'anchor')
        anchor.text = str(link_text)
        parent.insert(index, anchor)

    if with_target:
        anchor_target = etree.Element('a', name='%s' % prefix)
        anchor_target.set('class', 'target')
        anchor_target.text = ' '
        parent.insert(index, anchor_target)


def any_ancestor(element, test):
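    """Return True if any ancestor of `element` satisfies `test`."""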
    for ancestor in element.iterancestors():
        if test(ancestor):
            return True
    return False


def add_anchors(root):
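    """Add numbering anchors to verses and paragraphs of the book text."""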
    counter = 1
    visible_counter = 1
    for element in root.iterdescendants():
        def f(e):
            return (
                e.get('class') in (
                    'note', 'motto', 'motto_podpis', 'dedication', 'frame'
                )
                or e.get('id') == 'nota_red'
                or e.tag == 'blockquote'
                or e.get('id') == 'footnotes'
            )

        if element.get('class') == 'numeracja':
            try:
                visible_counter = int(element.get('data-start'))
            except (TypeError, ValueError):
                visible_counter = 1

        if any_ancestor(element, f):
            continue

        if element.tag == 'div' and 'verse' in element.get('class', ''):
            if visible_counter == 1 or visible_counter % 5 == 0:
                add_anchor(element, "f%d" % counter, link_text=visible_counter)
            counter += 1
            visible_counter += 1
        elif 'paragraph' in element.get('class', ''):
            add_anchor(element, "f%d" % counter, link_text=visible_counter)
            counter += 1
            visible_counter += 1


def raw_printable_text(element):
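    """Return plain text of the element, without annotation or theme links."""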
    working = copy.deepcopy(element)
    for e in working.findall('a'):
        if e.get('class') in ('annotation', 'theme-begin'):
            e.text = ''
    return etree.tostring(working, method='text', encoding='unicode').strip()


def add_table_of_contents(root):
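    """Build a table of contents from h2/h3 headers and prepend it to root."""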
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag in ('h2', 'h3'):
            if any_ancestor(
                    element,
                    lambda e: e.get('id') in (
                        'footnotes', 'nota_red'
                    ) or e.get('class') in ('person-list',)):
                continue

            element_text = raw_printable_text(element)
            if (element.tag == 'h3' and len(sections)
                    and sections[-1][1] == 'h2'):
                sections[-1][3].append(
                    (counter, element.tag, element_text, [])
                )
            else:
                sections.append((counter, element.tag, element_text, []))
            add_anchor(element, "s%d" % counter, with_link=False)
            counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = 'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for n, section, text, subsections in sections:
        section_element = etree.SubElement(toc_list, 'li')
        add_anchor(section_element, "s%d" % n, with_target=False,
                   link_text=text)

        if len(subsections):
            subsection_list = etree.SubElement(section_element, 'ol')
            for n1, subsection, subtext, _ in subsections:
                subsection_element = etree.SubElement(subsection_list, 'li')
                add_anchor(subsection_element, "s%d" % n1, with_target=False,
                           link_text=subtext)

    root.insert(0, toc)


def add_table_of_themes(root):
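    """Build an alphabetical index of themes linking to their fragments."""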
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    book_themes = {}
    for fragment in root.findall('.//a[@class="theme-begin"]'):
        if not fragment.text:
            continue
        theme_names = [s.strip() for s in fragment.text.split(',')]
        for theme_name in theme_names:
            book_themes.setdefault(theme_name, []).append(fragment.get('name'))
    book_themes = list(book_themes.items())
    book_themes.sort(key=lambda s: sortify(s[0]))
    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, fragments in book_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for i, fragment in enumerate(fragments):
            item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
            item.text = str(i + 1)
            item.tail = ' '
    root.insert(0, themes_div)


def extract_annotations(html_path):
    """Extracts annotations from HTML for annotations dictionary.

    For each annotation, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is not None:
        for footnote in footnotes.findall('div'):
            fn_type = footnote.get('class').split('-')[1]
            anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
            # Drop the leading anchor elements and whitespace before
            # serializing the footnote body.
            del footnote[:2]
            footnote.text = None
            if len(footnote) and footnote[-1].tail == '\n':
                footnote[-1].tail = None
            text_str = etree.tostring(footnote, method='text',
                                      encoding='unicode').strip()
            html_str = etree.tostring(footnote, method='html',
                                      encoding='unicode').strip()

            match = re_qualifier.match(text_str)
            if match:
                qualifier_str = match.group(1)
                qualifiers = []
                for candidate in re.split('[;,]', qualifier_str):
                    candidate = candidate.strip()
                    if candidate in FN_QUALIFIERS:
                        qualifiers.append(candidate)
                    elif candidate.startswith('z '):
                        subcandidate = candidate.split()[1]
                        if subcandidate in FN_QUALIFIERS:
                            qualifiers.append(subcandidate)
            else:
                qualifiers = []

            yield anchor, fn_type, qualifiers, text_str, html_str