1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
11 from lxml import etree
12 from librarian import XHTMLNS, DCNS, ParseError, OutputFile
13 from librarian import functions
16 from lxml.etree import XMLSyntaxError, XSLTApplyError
19 functions.reg_substitute_entities()
20 functions.reg_person_name()
23 'legacy': 'xslt/book2html.xslt',
def get_stylesheet(name):
    """Return the filesystem path of the stylesheet registered as *name*.

    The relative path is looked up in ``STYLESHEETS`` and joined onto the
    directory that contains this module.  An unknown *name* propagates the
    ``KeyError`` raised by that lookup.
    """
    module_dir = os.path.dirname(__file__)
    relative_path = STYLESHEETS[name]
    return os.path.join(module_dir, relative_path)
def html_has_content(text):
    """Return the ``p`` and ``h1`` elements found in the document *text*.

    Both un-namespaced elements and elements in the XHTML namespace are
    matched.  The result is meant to be used as a truth value: it is empty
    when the document contains none of these elements.
    """
    namespace = str(XHTMLNS)
    expression = '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': namespace}
    # ETXPath is the lxml evaluator that accepts the {namespace}tag
    # notation used in the expression above.
    # NOTE(review): confirm the evaluator against the upstream file.
    find_content = etree.ETXPath(expression)
    return find_content(text)
def transform_abstrakt(abstrakt_element):
    """Render *abstrakt_element* to an HTML string.

    The element is serialized, its ``abstrakt`` tags are renamed to
    ``dlugi_cytat``, and the resulting document is run through the
    'legacy' stylesheet.  Empty ``<a name="secN"/>`` anchors and any
    ``blockquote`` opening/closing tags are then removed from the output.
    """
    stylesheet = etree.parse(get_stylesheet('legacy'))

    source = etree.tostring(abstrakt_element, encoding='unicode')
    # Rename the tags textually so the stylesheet sees a ``dlugi_cytat``
    # element instead of ``abstrakt``.
    source = source.replace('<abstrakt', '<dlugi_cytat')
    source = source.replace('</abstrakt', '</dlugi_cytat')
    document = etree.parse(io.StringIO(source))

    rendered = etree.tostring(document.xslt(stylesheet), encoding='unicode')
    without_anchors = re.sub('<a name="sec[0-9]*"/>', '', rendered)
    return re.sub('</?blockquote[^>]*>', '', without_anchors)
50 def add_image_sizes(tree, gallery_path, gallery_url, base_url):
51 widths = [360, 600, 1200, 1800, 2400]
53 os.makedirs(gallery_path)
57 for i, ilustr in enumerate(tree.findall('//ilustr')):
58 rel_path = ilustr.attrib['src']
59 img_url = urllib.parse.urljoin(base_url, rel_path)
61 f = urllib.request.urlopen(img_url)
63 ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
66 # Needed widths: predefined and original, limited by
67 # whichever is smaller.
71 set(widths + [img.size[0]])
73 if w <= min(widths[-1], img.size[0])
77 fname = '%d.W%d.%s' % (i, w, ext)
78 fpath = gallery_path + fname
79 if not os.path.exists(fpath):
80 height = round(img.size[1] * w / img.size[0])
81 th = img.resize((w, height))
83 th_url = gallery_url + fname
84 srcset.append(" ".join((
89 ilustr.attrib['srcset'] = ", ".join(srcset)
90 ilustr.attrib['src'] = largest_url
95 def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'):
96 """Transforms the WL document to XHTML.
98 If output_filename is None, returns an XML,
99 otherwise returns True if file has been written,False if it hasn't.
100 File won't be written if it has no content.
104 style_filename = get_stylesheet(stylesheet)
105 style = etree.parse(style_filename)
107 document = copy.deepcopy(wldoc)
109 document.swap_endlines()
113 document.edoc.getroot().set(flag, 'yes')
115 ltag = document.edoc.find('//' + DCNS('language'))
117 lang = functions.lang_code_3to2(ltag.text)
120 document.edoc.getroot().set('lang', lang)
122 document.clean_ed_note()
123 document.clean_ed_note('abstrakt')
124 document.fix_pa_akap()
125 document.hebr_protect()
131 os.makedirs(gallery_path)
135 add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)
139 or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
142 result = document.transform(style, css=css, **options)
143 del document # no longer needed large object :)
145 if html_has_content(result):
146 add_anchors(result.getroot())
147 add_table_of_themes(result.getroot())
148 add_table_of_contents(result.getroot())
150 return OutputFile.from_bytes(etree.tostring(
151 result, method='html', xml_declaration=False,
152 pretty_print=True, encoding='utf-8'
157 raise ValueError("'%s' is not a valid stylesheet.")
158 except (XMLSyntaxError, XSLTApplyError) as e:
163 def __init__(self, id, themes):
164 super(Fragment, self).__init__()
def append(self, event, element):
    """Record one ``(event, element)`` pair at the end of ``self.events``."""
    entry = (event, element)
    self.events.append(entry)
172 def closed_events(self):
174 for event, element in self.events:
176 stack.append(('end', element))
181 print('CLOSED NON-OPEN TAG:', element)
184 return self.events + stack
188 for event, element in self.closed_events():
190 result.append('<%s %s>' % (
194 for k, v in element.attrib.items()
198 result.append(element.text)
200 result.append('</%s>' % element.tag)
202 result.append(element.tail)
204 result.append(element)
206 return ''.join(result)
209 return self.to_string()
212 def extract_fragments(input_filename):
213 """Extracts theme fragments from input_filename."""
215 closed_fragments = {}
217 # iterparse would die on a HTML document
218 parser = etree.HTMLParser(encoding='utf-8')
220 buf.write(etree.tostring(
221 etree.parse(input_filename, parser).getroot()[0][0],
226 for event, element in etree.iterparse(buf, events=('start', 'end')):
227 # Process begin and end elements
228 if element.get('class', '') in ('theme-begin', 'theme-end'):
229 if not event == 'end':
230 continue # Process elements only once, on end event
233 if element.get('class', '') == 'theme-begin':
234 fragment = Fragment(id=element.get('fid'), themes=element.text)
237 parent = element.getparent()
239 while parent.get('id', None) != 'book-text':
240 cparent = copy.deepcopy(parent)
242 if 'id' in cparent.attrib:
243 del cparent.attrib['id']
244 parents.append(cparent)
245 parent = parent.getparent()
248 for parent in parents:
249 fragment.append('start', parent)
251 if fragment.id not in open_fragments:
252 open_fragments[fragment.id] = fragment
254 # Close existing fragment
257 fragment = open_fragments[element.get('fid')]
259 print('%s:closed not open fragment #%s' % (
260 input_filename, element.get('fid')
263 closed_fragments[fragment.id] = fragment
264 del open_fragments[fragment.id]
266 # Append element tail to lost_text
267 # (we don't want to lose any text)
269 for fragment_id in open_fragments:
270 open_fragments[fragment_id].append('text', element.tail)
272 # Process all elements except begin and end
274 # Omit annotation tags
275 if (len(element.get('name', '')) or
276 element.get('class', '') in ('annotation', 'anchor')):
277 if event == 'end' and element.tail:
278 for fragment_id in open_fragments:
279 open_fragments[fragment_id].append(
283 for fragment_id in open_fragments:
284 celem = copy.copy(element)
285 if 'id' in celem.attrib:
286 del celem.attrib['id']
287 open_fragments[fragment_id].append(
291 return closed_fragments, open_fragments
294 def add_anchor(element, prefix, with_link=True, with_target=True,
296 parent = element.getparent()
297 index = parent.index(element)
300 if link_text is None:
302 anchor = etree.Element('a', href='#%s' % prefix)
303 anchor.set('class', 'anchor')
304 anchor.text = str(link_text)
305 parent.insert(index, anchor)
308 anchor_target = etree.Element('a', name='%s' % prefix)
309 anchor_target.set('class', 'target')
310 anchor_target.text = ' '
311 parent.insert(index, anchor_target)
def any_ancestor(element, test):
    """Tell whether any ancestor of *element* satisfies *test*.

    *test* is called with each ancestor yielded by
    ``element.iterancestors()``; evaluation stops at the first ancestor
    for which it returns a true value.  The element itself is not tested.
    """
    return any(test(ancestor) for ancestor in element.iterancestors())
321 def add_anchors(root):
325 for element in root.iterdescendants():
329 'note', 'motto', 'motto_podpis', 'dedication', 'frame'
331 or e.get('id') == 'nota_red'
332 or e.tag == 'blockquote'
333 or e.get('id') == 'footnotes'
336 if element.get('class') == 'numeracja':
338 visible_counter = int(element.get('data-start'))
341 if element.get("data-link"):
342 link_prefix = element.get("data-link")
343 counter[link_prefix] = 1
345 if any_ancestor(element, f):
348 if element.tag == 'div' and 'verse' in element.get('class', ''):
349 if visible_counter == 1 or visible_counter % 5 == 0:
350 add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]), link_text=visible_counter)
351 counter[link_prefix] += 1
353 elif 'paragraph' in element.get('class', ''):
354 add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]), link_text=visible_counter)
355 counter[link_prefix] += 1
359 def raw_printable_text(element):
360 working = copy.deepcopy(element)
361 for e in working.findall('a'):
362 if e.get('class') in ('annotation', 'theme-begin'):
364 return etree.tostring(working, method='text', encoding='unicode').strip()
367 def add_table_of_contents(root):
370 for element in root.iterdescendants():
371 if element.tag in ('h2', 'h3'):
374 lambda e: e.get('id') in (
375 'footnotes', 'nota_red'
376 ) or e.get('class') in ('person-list',)):
379 element_text = raw_printable_text(element)
380 if (element.tag == 'h3' and len(sections)
381 and sections[-1][1] == 'h2'):
382 sections[-1][3].append(
383 (counter, element.tag, element_text, [])
386 sections.append((counter, element.tag, element_text, []))
387 add_anchor(element, "s%d" % counter, with_link=False)
390 toc = etree.Element('div')
392 toc_header = etree.SubElement(toc, 'h2')
393 toc_header.text = 'Spis treści'
394 toc_list = etree.SubElement(toc, 'ol')
396 for n, section, text, subsections in sections:
397 section_element = etree.SubElement(toc_list, 'li')
398 add_anchor(section_element, "s%d" % n, with_target=False,
402 subsection_list = etree.SubElement(section_element, 'ol')
403 for n1, subsection, subtext, _ in subsections:
404 subsection_element = etree.SubElement(subsection_list, 'li')
405 add_anchor(subsection_element, "s%d" % n1, with_target=False,
411 def add_table_of_themes(root):
413 from sortify import sortify
419 for fragment in root.findall('.//a[@class="theme-begin"]'):
420 if not fragment.text:
422 theme_names = [s.strip() for s in fragment.text.split(',')]
423 for theme_name in theme_names:
424 book_themes.setdefault(theme_name, []).append(fragment.get('name'))
425 book_themes = list(book_themes.items())
426 book_themes.sort(key=lambda s: sortify(s[0]))
427 themes_div = etree.Element('div', id="themes")
428 themes_ol = etree.SubElement(themes_div, 'ol')
429 for theme_name, fragments in book_themes:
430 themes_li = etree.SubElement(themes_ol, 'li')
431 themes_li.text = "%s: " % theme_name
432 for i, fragment in enumerate(fragments):
433 item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
434 item.text = str(i + 1)
436 root.insert(0, themes_div)
439 def extract_annotations(html_path):
440 """Extracts annotations from HTML for annotations dictionary.
442 For each annotation, yields a tuple of:
443 anchor, footnote type, valid qualifiers, text, html.
446 from .fn_qualifiers import FN_QUALIFIERS
448 parser = etree.HTMLParser(encoding='utf-8')
449 tree = etree.parse(html_path, parser)
450 footnotes = tree.find('//*[@id="footnotes"]')
451 re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
452 if footnotes is not None:
453 for footnote in footnotes.findall('div'):
454 fn_type = footnote.get('class').split('-')[1]
455 anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
458 if len(footnote) and footnote[-1].tail == '\n':
459 footnote[-1].tail = None
460 text_str = etree.tostring(footnote, method='text',
461 encoding='unicode').strip()
462 html_str = etree.tostring(footnote, method='html',
463 encoding='unicode').strip()
465 match = re_qualifier.match(text_str)
467 qualifier_str = match.group(1)
469 for candidate in re.split('[;,]', qualifier_str):
470 candidate = candidate.strip()
471 if candidate in FN_QUALIFIERS:
472 qualifiers.append(candidate)
473 elif candidate.startswith('z '):
474 subcandidate = candidate.split()[1]
475 if subcandidate in FN_QUALIFIERS:
476 qualifiers.append(subcandidate)
480 yield anchor, fn_type, qualifiers, text_str, html_str