# This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
#
import copy
import io
import os
import re
import urllib.parse
import urllib.request

from lxml import etree
from lxml.etree import XMLSyntaxError, XSLTApplyError
from PIL import Image

from librarian import XHTMLNS, ParseError, OutputFile
from librarian import functions

functions.reg_substitute_entities()
functions.reg_person_name()

STYLESHEETS = {
    'legacy': 'xslt/book2html.xslt',
}


def get_stylesheet(name):
    return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])


def html_has_content(text):
    return etree.ETXPath(
        '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
    )(text)


def transform_abstrakt(abstrakt_element):
    style_filename = get_stylesheet('legacy')
    style = etree.parse(style_filename)
    xml = etree.tostring(abstrakt_element, encoding='unicode')
    # Reuse the long-quote markup for the abstract by renaming the tag.
    document = etree.parse(io.StringIO(
        xml.replace('<abstrakt', '<dlugi_cytat').replace(
            '</abstrakt', '</dlugi_cytat')
    ))
    result = document.xslt(style)
    html = re.sub('<a name="sec[0-9]*"/>', '',
                  etree.tostring(result, encoding='unicode'))
    return re.sub('</?blockquote[^>]*>', '', html)


def add_image_sizes(tree, gallery_path, gallery_url, base_url):
    widths = [360, 600, 1200, 1800, 2400]

    try:
        os.makedirs(gallery_path)
    except OSError:
        pass

    for i, ilustr in enumerate(tree.findall('//ilustr')):
        rel_path = ilustr.attrib['src']
        img_url = urllib.parse.urljoin(base_url, rel_path)

        f = urllib.request.urlopen(img_url)
        img = Image.open(f)
        ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')

        srcset = []
        # Needed widths: predefined and original, limited by
        # whichever is smaller.
        img_widths = [
            w for w in sorted(set(widths + [img.size[0]]))
            if w <= min(widths[-1], img.size[0])
        ]
        largest_url = None
        for w in img_widths:
            fname = '%d.W%d.%s' % (i, w, ext)
            fpath = gallery_path + fname
            if not os.path.exists(fpath):
                height = round(img.size[1] * w / img.size[0])
                th = img.resize((w, height))
                th.save(fpath)
            th_url = gallery_url + fname
            srcset.append(" ".join((
                th_url,
                '%dw' % w
            )))
            largest_url = th_url
        ilustr.attrib['srcset'] = ", ".join(srcset)
        ilustr.attrib['src'] = largest_url
        f.close()
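
# Sketch of the intended result (hypothetical numbers, not taken from a real
# book): for the first <ilustr> (i == 0) pointing at a 1000px-wide PNG, the
# element ends up with src="img/0.W1000.png" and
# srcset="img/0.W360.png 360w, img/0.W600.png 600w, img/0.W1000.png 1000w",
# i.e. `src` points at the largest generated size and `srcset` lists them all.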


def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None,
              gallery_path='img/', gallery_url='img/', base_url='file://./'):
    """Transforms the WL document to XHTML.

    Returns an OutputFile with the rendered HTML,
    or None if the document has no content.
    """
    # Parse XSLT
    try:
        style_filename = get_stylesheet(stylesheet)
        style = etree.parse(style_filename)

        document = copy.deepcopy(wldoc)
        del wldoc
        document.swap_endlines()

        if flags:
            for flag in flags:
                document.edoc.getroot().set(flag, 'yes')

        document.clean_ed_note()
        document.clean_ed_note('abstrakt')
        document.fix_pa_akap()

        if not options:
            options = {}

        try:
            os.makedirs(gallery_path)
        except OSError:
            pass

        add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)

        css = (
            css
            or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
        )
        result = document.transform(style, css=css, **options)
        del document  # the large document object is no longer needed

        if html_has_content(result):
            add_anchors(result.getroot())
            add_table_of_themes(result.getroot())
            add_table_of_contents(result.getroot())

            return OutputFile.from_bytes(etree.tostring(
                result, method='html', xml_declaration=False,
                pretty_print=True, encoding='utf-8'
            ))
        else:
            return None
    except KeyError:
        raise ValueError("'%s' is not a valid stylesheet." % stylesheet)
    except (XMLSyntaxError, XSLTApplyError) as e:
        raise ParseError(e)
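
# Rough usage sketch (hedged: assumes the document was parsed elsewhere with
# librarian.parser.WLDocument.from_file; 'book.xml' is a hypothetical path):
#
#     from librarian.parser import WLDocument
#     doc = WLDocument.from_file('book.xml')
#     output = transform(doc, gallery_path='gallery/', gallery_url='gallery/')
#     # `output` is an OutputFile with the rendered HTML, or None if the
#     # document has no content.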


class Fragment:
    def __init__(self, id, themes):
        super(Fragment, self).__init__()
        self.id = id
        self.themes = themes
        self.events = []

    def append(self, event, element):
        self.events.append((event, element))

    def closed_events(self):
        # Append a matching 'end' event for every 'start' that was never
        # closed, so the event stream is always balanced.
        stack = []
        for event, element in self.events:
            if event == 'start':
                stack.append(('end', element))
            elif event == 'end':
                try:
                    stack.pop()
                except IndexError:
                    print('CLOSED NON-OPEN TAG:', element)

        stack.reverse()
        return self.events + stack

    def to_string(self):
        result = []
        for event, element in self.closed_events():
            if event == 'start':
                result.append('<%s %s>' % (
                    element.tag,
                    ' '.join(
                        '%s="%s"' % (k, v)
                        for k, v in element.attrib.items()
                    )
                ))
                if element.text:
                    result.append(element.text)
            elif event == 'end':
                result.append('</%s>' % element.tag)
                if element.tail:
                    result.append(element.tail)
            else:
                result.append(element)

        return ''.join(result)

    def __str__(self):
        return self.to_string()
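
# Fragment records a linearised stream of ('start'/'end'/'text', element)
# events; closed_events() fills in missing 'end' events, so to_string()
# always yields balanced markup. A rough sketch (hedged; the elements would
# normally come from the iterparse loop in extract_fragments below):
#
#     frag = Fragment(id='f1', themes='Love')
#     frag.append('start', some_div)   # some_div: a hypothetical lxml element
#     frag.append('text', 'Ala')
#     frag.to_string()                 # -> '<div ...>Ala</div>'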


def extract_fragments(input_filename):
    """Extracts theme fragments from input_filename."""
    open_fragments = {}
    closed_fragments = {}

    # iterparse would die on a HTML document
    parser = etree.HTMLParser(encoding='utf-8')
    buf = io.BytesIO()
    buf.write(etree.tostring(
        etree.parse(input_filename, parser).getroot()[0][0],
        encoding='utf-8'
    ))
    buf.seek(0)

    for event, element in etree.iterparse(buf, events=('start', 'end')):
        # Process begin and end elements
        if element.get('class', '') in ('theme-begin', 'theme-end'):
            if not event == 'end':
                continue  # Process elements only once, on the 'end' event

            # Open a new fragment
            if element.get('class', '') == 'theme-begin':
                fragment = Fragment(id=element.get('fid'),
                                    themes=element.text)

                # Append parents
                parent = element.getparent()
                parents = []
                while parent.get('id', None) != 'book-text':
                    cparent = copy.deepcopy(parent)
                    cparent.text = None
                    if 'id' in cparent.attrib:
                        del cparent.attrib['id']
                    parents.append(cparent)
                    parent = parent.getparent()

                parents.reverse()
                for parent in parents:
                    fragment.append('start', parent)

                if fragment.id not in open_fragments:
                    open_fragments[fragment.id] = fragment

            # Close an existing fragment
            else:
                try:
                    fragment = open_fragments[element.get('fid')]
                except KeyError:
                    print('%s: closed fragment #%s which was never opened' % (
                        input_filename, element.get('fid')
                    ))
                else:
                    closed_fragments[fragment.id] = fragment
                    del open_fragments[fragment.id]

            # Append the element's tail to all open fragments
            # (we don't want to lose any text)
            if element.tail:
                for fragment_id in open_fragments:
                    open_fragments[fragment_id].append('text', element.tail)

        # Process all elements except begin and end
        else:
            # Omit annotation tags
            if (len(element.get('name', '')) or
                    element.get('class', '') in ('annotation', 'anchor')):
                if event == 'end' and element.tail:
                    for fragment_id in open_fragments:
                        open_fragments[fragment_id].append(
                            'text', element.tail)
            else:
                for fragment_id in open_fragments:
                    celem = copy.copy(element)
                    if 'id' in celem.attrib:
                        del celem.attrib['id']
                    open_fragments[fragment_id].append(
                        event, celem)

    return closed_fragments, open_fragments
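
# Rough usage sketch (hedged; 'book.html' is a hypothetical path to HTML
# produced by transform() above):
#
#     closed, still_open = extract_fragments('book.html')
#     for fid, fragment in closed.items():
#         print(fid, fragment.themes, fragment.to_string()[:60])
#
# Anything left in `still_open` had a theme-begin marker without a matching
# theme-end marker.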


def add_anchor(element, prefix, with_link=True, with_target=True,
               link_text=None):
    parent = element.getparent()
    index = parent.index(element)

    if with_link:
        if link_text is None:
            link_text = prefix
        anchor = etree.Element('a', href='#%s' % prefix)
        anchor.set('class', 'anchor')
        anchor.text = str(link_text)
        parent.insert(index, anchor)

    if with_target:
        anchor_target = etree.Element('a', name='%s' % prefix)
        anchor_target.set('class', 'target')
        anchor_target.text = ' '
        parent.insert(index, anchor_target)
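
# For example (a sketch): add_anchor(p, "f5", link_text=5) inserts both an
# anchor link (<a class="anchor" href="#f5">5</a>) and an anchor target
# (<a class="target" name="f5"> </a>) directly before `p` in its parent.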


def any_ancestor(element, test):
    for ancestor in element.iterancestors():
        if test(ancestor):
            return True
    return False


def add_anchors(root):
    counter = 1
    visible_counter = 1
    for element in root.iterdescendants():
        def f(e):
            return (
                e.get('class') in (
                    'note', 'motto', 'motto_podpis', 'dedication', 'frame'
                )
                or e.get('id') == 'nota_red'
                or e.tag == 'blockquote'
                or e.get('id') == 'footnotes'
            )

        if element.get('class') == 'numeracja':
            visible_counter = int(element.get('data-start'))

        if any_ancestor(element, f):
            continue

        if element.tag == 'div' and 'verse' in element.get('class', ''):
            # Number the first verse and then every fifth one.
            if visible_counter == 1 or visible_counter % 5 == 0:
                add_anchor(element, "f%d" % counter,
                           link_text=visible_counter)
            counter += 1
            visible_counter += 1
        elif 'paragraph' in element.get('class', ''):
            add_anchor(element, "f%d" % counter, link_text=visible_counter)
            counter += 1
            visible_counter += 1


def raw_printable_text(element):
    working = copy.deepcopy(element)
    for e in working.findall('a'):
        if e.get('class') in ('annotation', 'theme-begin'):
            e.text = ''
    return etree.tostring(working, method='text', encoding='unicode').strip()


def add_table_of_contents(root):
    sections = []
    counter = 1
    for element in root.iterdescendants():
        if element.tag in ('h2', 'h3'):
            if any_ancestor(
                    element,
                    lambda e: e.get('id') in (
                        'footnotes', 'nota_red'
                    ) or e.get('class') in ('person-list',)):
                continue

            element_text = raw_printable_text(element)
            if (element.tag == 'h3' and len(sections)
                    and sections[-1][1] == 'h2'):
                sections[-1][3].append(
                    (counter, element.tag, element_text, []))
            else:
                sections.append((counter, element.tag, element_text, []))
            add_anchor(element, "s%d" % counter, with_link=False)
            counter += 1

    toc = etree.Element('div')
    toc.set('id', 'toc')
    toc_header = etree.SubElement(toc, 'h2')
    toc_header.text = 'Spis treści'
    toc_list = etree.SubElement(toc, 'ol')

    for n, section, text, subsections in sections:
        section_element = etree.SubElement(toc_list, 'li')
        add_anchor(section_element, "s%d" % n, with_target=False,
                   link_text=text)

        if len(subsections):
            subsection_list = etree.SubElement(section_element, 'ol')
            for n1, subsection, subtext, _ in subsections:
                subsection_element = etree.SubElement(subsection_list, 'li')
                add_anchor(subsection_element, "s%d" % n1, with_target=False,
                           link_text=subtext)

    root.insert(0, toc)


def add_table_of_themes(root):
    try:
        from sortify import sortify
    except ImportError:
        def sortify(x):
            return x

    book_themes = {}
    for fragment in root.findall('.//a[@class="theme-begin"]'):
        if not fragment.text:
            continue
        theme_names = [s.strip() for s in fragment.text.split(',')]
        for theme_name in theme_names:
            book_themes.setdefault(theme_name, []).append(fragment.get('name'))
    book_themes = list(book_themes.items())
    book_themes.sort(key=lambda s: sortify(s[0]))
    themes_div = etree.Element('div', id="themes")
    themes_ol = etree.SubElement(themes_div, 'ol')
    for theme_name, fragments in book_themes:
        themes_li = etree.SubElement(themes_ol, 'li')
        themes_li.text = "%s: " % theme_name
        for i, fragment in enumerate(fragments):
            item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
            item.text = str(i + 1)
            item.tail = ' '
    root.insert(0, themes_div)


def extract_annotations(html_path):
    """Extracts annotations from HTML for the annotations dictionary.

    For each annotation, yields a tuple of:
    anchor, footnote type, valid qualifiers, text, html.

    """
    from .fn_qualifiers import FN_QUALIFIERS

    parser = etree.HTMLParser(encoding='utf-8')
    tree = etree.parse(html_path, parser)
    footnotes = tree.find('//*[@id="footnotes"]')
    # A qualifier, if present, appears in parentheses before the first
    # em dash of the footnote text.
    re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
    if footnotes is not None:
        for footnote in footnotes.findall('div'):
            fn_type = footnote.get('class').split('-')[1]
            anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
            # Drop the leading link elements so only the footnote body
            # remains.
            del footnote[:2]
            footnote.text = None
            if len(footnote) and footnote[-1].tail == '\n':
                footnote[-1].tail = None
            text_str = etree.tostring(footnote, method='text',
                                      encoding='unicode').strip()
            html_str = etree.tostring(footnote, method='html',
                                      encoding='unicode').strip()

            match = re_qualifier.match(text_str)
            if match:
                qualifier_str = match.group(1)
                qualifiers = []
                for candidate in re.split('[;,]', qualifier_str):
                    candidate = candidate.strip()
                    if candidate in FN_QUALIFIERS:
                        qualifiers.append(candidate)
                    elif candidate.startswith('z '):
                        subcandidate = candidate.split()[1]
                        if subcandidate in FN_QUALIFIERS:
                            qualifiers.append(subcandidate)
            else:
                qualifiers = []

            yield anchor, fn_type, qualifiers, text_str, html_str
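
# Rough usage sketch (hedged; the path is hypothetical, FN_QUALIFIERS comes
# from librarian.fn_qualifiers):
#
#     for anchor, fn_type, quals, text, html in extract_annotations('book.html'):
#         print(anchor, fn_type, quals, text[:40])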