# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Wolne Lektury. See NOTICE for more information.

import copy
import io
import os
import re
import urllib.parse
import urllib.request

from PIL import Image
from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError

from librarian import XHTMLNS, DCNS, ParseError, OutputFile
from librarian import functions
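

# Register librarian's custom XSLT extension functions (entity substitution
# and person-name formatting) so the stylesheets below can call them during
# the transformation.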
functions.reg_substitute_entities()
functions.reg_person_name()

STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
}
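

# Resolve a stylesheet name from STYLESHEETS to an absolute path inside this
# package (e.g. get_stylesheet('legacy') -> .../xslt/book2html.xslt).
# An unknown name raises KeyError.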
def get_stylesheet(name):
    return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])


def html_has_content(text):
    # Any paragraph or <h1>, in either the WL or the XHTML namespace,
    # counts as content.
    return etree.ETXPath(
        '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    )(text)
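

# Render a single <abstrakt> element to HTML using the legacy stylesheet.
# The element is recast as <dlugi_cytat> (long quote) so the stylesheet has
# a template for it; the section anchors and the <blockquote> wrapper that
# the template produces are then stripped from the output.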
def transform_abstrakt(abstrakt_element):
    style_filename = get_stylesheet('legacy')
    style = etree.parse(style_filename)
    xml = etree.tostring(abstrakt_element, encoding='unicode')
    document = etree.parse(io.StringIO(
        xml.replace('<abstrakt', '<dlugi_cytat').replace('</abstrakt', '</dlugi_cytat')
    ))
    result = document.xslt(style)
    html = re.sub('<a name="sec[0-9]*"/>', '',
                  etree.tostring(result, encoding='unicode'))
    return re.sub('</?blockquote[^>]*>', '', html)
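

# For every <ilustr> element: download the source image, write resized
# copies (up to the predefined widths, never upscaling) into gallery_path,
# and rewrite the element's src/srcset attributes to point at gallery_url.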
def add_image_sizes(tree, gallery_path, gallery_url, base_url):
    widths = [360, 600, 1200, 1800, 2400]

    if not os.path.isdir(gallery_path):
        os.makedirs(gallery_path)

    for i, ilustr in enumerate(tree.findall('//ilustr')):
        rel_path = ilustr.attrib['src']
        img_url = urllib.parse.urljoin(base_url, rel_path)

        f = urllib.request.urlopen(img_url)
        img = Image.open(f)
        ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')

        # Needed widths: predefined and original, limited by
        # whichever is smaller.
        img_widths = [
            w for w in sorted(
                set(widths + [img.size[0]])
            )
            if w <= min(widths[-1], img.size[0])
        ]
        largest_url = None
        srcset = []
        for w in img_widths:
            fname = '%d.W%d.%s' % (i, w, ext)
            fpath = gallery_path + fname
            if not os.path.exists(fpath):
                height = round(img.size[1] * w / img.size[0])
                th = img.resize((w, height))
                th.save(fpath)
            th_url = gallery_url + fname
            srcset.append(" ".join((
                th_url,
                '%dw' % w
            )))
            largest_url = th_url
        ilustr.attrib['srcset'] = ", ".join(srcset)
        ilustr.attrib['src'] = largest_url
        f.close()
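

# transform() is the module's entry point. A minimal usage sketch
# (WLDocument.from_file and OutputFile.get_bytes are assumed from the wider
# librarian package, not defined in this module):
#
#     from librarian.parser import WLDocument
#     doc = WLDocument.from_file('book.xml')
#     out = transform(doc, gallery_path='img/', gallery_url='img/')
#     if out is not None:
#         with open('book.html', 'wb') as f:
#             f.write(out.get_bytes())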
def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None,
              gallery_path='img/', gallery_url='img/', base_url='file://./'):
    """Transforms the WL document to XHTML.

    Returns an OutputFile with the rendered HTML,
    or None if the document has no content.
    """
    try:
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        ltag = document.edoc.find('//' + DCNS('language'))
        lang = functions.lang_code_3to2(ltag.text) or 'pl'
        document.edoc.getroot().set('lang', lang)

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')
        document.fix_pa_akap()
        document.hebr_protect()

        if options is None:
            options = {}

        if not os.path.isdir(gallery_path):
            os.makedirs(gallery_path)
        add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        result = document.transform(style, css=css, **options)
        del document  # the large source tree is no longer needed

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_themes(result.getroot())
            add_table_of_contents(result.getroot())

            return OutputFile.from_bytes(etree.tostring(
                result, method='html', xml_declaration=False,
                pretty_print=True, encoding='utf-8'
            ))
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
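

# A theme fragment collected from the rendered HTML: a flat list of
# ('start'|'end'|'text', element_or_text) events recorded between a
# theme-begin and a theme-end marker, which can be re-serialized to an
# HTML string with the surrounding markup balanced.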
class Fragment:
    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        self.events.append((event, element))

    def closed_events(self):
        # Balance the event list: every 'start' without a matching 'end'
        # gets a closing event appended at the end.
        stack = []
        for event, element in self.events:
            if event == 'start':
                stack.append(('end', element))
            elif event == 'end':
                try:
                    stack.pop()
                except IndexError:
                    print('CLOSED NON-OPEN TAG:', element)

        stack.reverse()
        return self.events + stack

    def to_string(self):
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                result.append('<%s %s>' % (element.tag, ' '.join(
                    '%s="%s"' % (k, v)
                    for k, v in element.attrib.items()
                )))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append('</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            else:
                result.append(element)

        return ''.join(result)

    def __str__(self):
        return self.to_string()
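

# Scan rendered HTML for theme markers (<a class="theme-begin"> /
# <a class="theme-end">) and collect everything between each pair into a
# Fragment, together with copies of its open ancestors, so the fragment can
# later be rendered as standalone, well-formed markup.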
def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename."""
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on an HTML document
    parser = etree.HTMLParser(encoding='utf-8')
    buf = io.BytesIO()
    buf.write(etree.tostring(
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on end event

            # Open new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'), themes=element.text)

                # Collect the open ancestors up to book-text, so the
                # fragment can be serialized on its own.
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    if 'id' in cparent.attrib:
                        del cparent.attrib['id']
                    parents.append(cparent)
                    parent = parent.getparent()

                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                if fragment.id not in open_fragments:
                    open_fragments[fragment.id] = fragment

            # Close existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s:closed not open fragment #%s' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the element's tail to all open fragments
            # (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail)
            else:
                for fragment_id in open_fragments:
                    celem = copy.copy(element)
                    if 'id' in celem.attrib:
                        del celem.attrib['id']
                    open_fragments[fragment_id].append(
                        event, celem)

    return closed_fragments, open_fragments
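

# Insert, directly before `element`, an <a class="anchor"> link pointing at
# #prefix and/or an <a class="target" name="prefix"> target, depending on
# with_link / with_target.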
def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    parent = element.getparent()
    index = parent.index(element)

    if with_link:
        if link_text is None:
            link_text = prefix
        anchor = etree.Element('a', href='#%s' % prefix)
        anchor.set('class', 'anchor')
        anchor.text = str(link_text)
        parent.insert(index, anchor)

    if with_target:
        anchor_target = etree.Element('a', name='%s' % prefix)
        anchor_target.set('class', 'target')
        anchor_target.text = ' '
        parent.insert(index, anchor_target)


def any_ancestor(element, test):
    for ancestor in element.iterancestors():
        if test(ancestor):
            return True
    return False
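

# Number verses and paragraphs in the rendered text: every paragraph gets a
# linked anchor, verses get one on the first and then every fifth visible
# line. Elements with class 'numeracja' reset the visible numbering and may
# switch the anchor prefix (data-link). Elements inside notes, mottos,
# dedications, frames, footnotes and editorial notes are skipped.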
def add_anchors(root):
    counter = {}
    visible_counter = 1
    link_prefix = 'f'
    counter[link_prefix] = 1

    for element in root.iterdescendants():
        def f(e):
            return (
                e.get('class') in (
                    'note', 'motto', 'motto_podpis', 'dedication', 'frame'
                )
                or e.get('id') == 'nota_red'
                or e.tag == 'blockquote'
                or e.get('id') == 'footnotes'
            )

        if element.get('class') == 'numeracja':
            if element.get('data-start'):
                visible_counter = int(element.get('data-start'))
            if element.get("data-link"):
                link_prefix = element.get("data-link")
                counter[link_prefix] = 1

        if any_ancestor(element, f):
            continue

        if element.tag == 'div' and 'verse' in element.get('class', ''):
            if visible_counter == 1 or visible_counter % 5 == 0:
                add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]),
                           link_text=visible_counter)
            counter[link_prefix] += 1
            visible_counter += 1
        elif 'paragraph' in element.get('class', ''):
            add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]),
                       link_text=visible_counter)
            counter[link_prefix] += 1
            visible_counter += 1


def raw_printable_text(element):
    working = copy.deepcopy(element)
    for e in working.findall('a'):
        if e.get('class') in ('annotation', 'theme-begin'):
            e.text = ''
    return etree.tostring(working, method='text', encoding='unicode').strip()
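

# Collect <h2>/<h3> headings (skipping footnotes, editorial notes and person
# lists), give each a numbered "s<N>" target anchor, and prepend a
# "Spis treści" (table of contents) <div> with links; <h3> headings are
# nested under the preceding <h2>.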
def add_table_of_contents(root):
    sections = []
    counter = 1

    for element in root.iterdescendants():
        if element.tag in ('h2', 'h3'):
            if any_ancestor(
                    element,
                    lambda e: e.get('id') in (
                        'footnotes', 'nota_red'
                    ) or e.get('class') in ('person-list',)):
                continue

            element_text = raw_printable_text(element)
            if (element.tag == 'h3' and len(sections)
                    and sections[-1][1] == 'h2'):
                sections[-1][3].append(
                    (counter, element.tag, element_text, [])
                )
            else:
                sections.append((counter, element.tag, element_text, []))
            add_anchor(element, "s%d" % counter, with_link=False)
            counter += 1

    toc = etree.Element('div')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = 'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for n, section, text, subsections in sections:
        section_element = etree.SubElement(toc_list, 'li')
        add_anchor(section_element, "s%d" % n, with_target=False,
                   link_text=text)

        if len(subsections):
            subsection_list = etree.SubElement(section_element, 'ol')
            for n1, subsection, subtext, _ in subsections:
                subsection_element = etree.SubElement(subsection_list, 'li')
                add_anchor(subsection_element, "s%d" % n1, with_target=False,
                           link_text=subtext)

    root.insert(0, toc)
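

# Group theme-begin anchors by theme name and prepend a <div id="themes">
# index: one <li> per theme, with numbered links to each of its fragments.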
def add_table_of_themes(root):
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    book_themes = {}
    for fragment in root.findall('.//a[@class="theme-begin"]'):
        if not fragment.text:
            continue
        theme_names = [s.strip() for s in fragment.text.split(',')]
        for theme_name in theme_names:
            book_themes.setdefault(theme_name, []).append(fragment.get('name'))
    book_themes = list(book_themes.items())
    book_themes.sort(key=lambda s: sortify(s[0]))
    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, fragments in book_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for i, fragment in enumerate(fragments):
            item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
            item.text = str(i + 1)
            item.tail = ' '
    root.insert(0, themes_div)
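

# Footnote qualifiers are taken from the parenthesized part before the first
# em dash of the footnote text and checked against the FN_QUALIFIERS
# dictionary; a "z ..." phrase is reduced to its second word before the
# lookup.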
def extract_annotations(html_path):
    """Extracts annotations from HTML for the annotations dictionary.

    For each annotation, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.
    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is not None:
        for footnote in footnotes.findall('div'):
            fn_type = footnote.get('class').split('-')[1]
            anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
            if len(footnote) and footnote[-1].tail == '\n':
                footnote[-1].tail = None
            text_str = etree.tostring(footnote, method='text',
                                      encoding='unicode').strip()
            html_str = etree.tostring(footnote, method='html',
                                      encoding='unicode').strip()

            match = re_qualifier.match(text_str)
            qualifiers = []
            if match:
                qualifier_str = match.group(1)
                for candidate in re.split('[;,]', qualifier_str):
                    candidate = candidate.strip()
                    if candidate in FN_QUALIFIERS:
                        qualifiers.append(candidate)
                    elif candidate.startswith('z '):
                        subcandidate = candidate.split()[1]
                        if subcandidate in FN_QUALIFIERS:
                            qualifiers.append(subcandidate)

            yield anchor, fn_type, qualifiers, text_str, html_str