1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 
   2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
 
  11 from lxml import etree
 
  12 from librarian import XHTMLNS, ParseError, OutputFile
 
  13 from librarian import functions
 
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
  19 functions.reg_substitute_entities()
 
  20 functions.reg_person_name()
 
  23     'legacy': 'xslt/book2html.xslt',
 
  27 def get_stylesheet(name):
 
  28     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
 
  31 def html_has_content(text):
 
  33         '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
 
  37 def transform_abstrakt(abstrakt_element):
 
  38     style_filename = get_stylesheet('legacy')
 
  39     style = etree.parse(style_filename)
 
  40     xml = etree.tostring(abstrakt_element, encoding='unicode')
 
  41     document = etree.parse(io.StringIO(
 
  42         xml.replace('<abstrakt', '<dlugi_cytat').replace('</abstrakt', '</dlugi_cytat')
 
  44     result = document.xslt(style)
 
  45     html = re.sub('<a name="sec[0-9]*"/>', '',
 
  46                   etree.tostring(result, encoding='unicode'))
 
  47     return re.sub('</?blockquote[^>]*>', '', html)
 
  50 def add_image_sizes(tree, gallery_path, gallery_url, base_url):
 
  51     widths = [360, 600, 1200, 1800, 2400]
 
  53         os.makedirs(gallery_path)
 
  57     for i, ilustr in enumerate(tree.findall('//ilustr')):
 
  58         rel_path = ilustr.attrib['src']
 
  59         img_url = urllib.parse.urljoin(base_url, rel_path)
 
  61         f = urllib.request.urlopen(img_url)
 
  63         ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
 
  66         # Needed widths: predefined and original, limited by
 
  67         # whichever is smaller.
 
  71                 set(widths + [img.size[0]])
 
  73             if w <= min(widths[-1], img.size[0])
 
  77             fname = '%d.W%d.%s' % (i, w, ext)
 
  78             fpath = gallery_path + fname
 
  79             if not os.path.exists(fpath):
 
  80                 height = round(img.size[1] * w / img.size[0])
 
  81                 th = img.resize((w, height))
 
  83             th_url = gallery_url + fname
 
  84             srcset.append(" ".join((
 
  89         ilustr.attrib['srcset'] = ", ".join(srcset)
 
  90         ilustr.attrib['src'] = largest_url
 
  95 def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'):
 
  96     """Transforms the WL document to XHTML.
 
  98     If output_filename is None, returns an XML,
 
  99     otherwise returns True if file has been written, False if it hasn't.
 
 100     File won't be written if it has no content.
 
 104         style_filename = get_stylesheet(stylesheet)
 
 105         style = etree.parse(style_filename)
 
 107         document = copy.deepcopy(wldoc)
 
 109         document.swap_endlines()
 
 113                 document.edoc.getroot().set(flag, 'yes')
 
 115         document.clean_ed_note()
 
 116         document.clean_ed_note('abstrakt')
 
 117         document.fix_pa_akap()
 
 118         document.hebr_protect()
 
 124             os.makedirs(gallery_path)
 
 128         add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)
 
 132             or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
 
 135         result = document.transform(style, css=css, **options)
 
 136         del document  # no longer needed large object :)
 
 138         if html_has_content(result):
 
 139             add_anchors(result.getroot())
 
 140             add_table_of_themes(result.getroot())
 
 141             add_table_of_contents(result.getroot())
 
 143             return OutputFile.from_bytes(etree.tostring(
 
 144                 result, method='html', xml_declaration=False,
 
 145                 pretty_print=True, encoding='utf-8'
 
 150         raise ValueError("'%s' is not a valid stylesheet.")
 
 151     except (XMLSyntaxError, XSLTApplyError) as e:
 
 156     def __init__(self, id, themes):
 
 157         super(Fragment, self).__init__()
 
 162     def append(self, event, element):
 
 163         self.events.append((event, element))
 
 165     def closed_events(self):
 
 167         for event, element in self.events:
 
 169                 stack.append(('end', element))
 
 174                     print('CLOSED NON-OPEN TAG:', element)
 
 177         return self.events + stack
 
 181         for event, element in self.closed_events():
 
 183                 result.append('<%s %s>' % (
 
 187                         for k, v in element.attrib.items()
 
 191                     result.append(element.text)
 
 193                 result.append('</%s>' % element.tag)
 
 195                     result.append(element.tail)
 
 197                 result.append(element)
 
 199         return ''.join(result)
 
 202         return self.to_string()
 
 205 def extract_fragments(input_filename):
 
 206     """Extracts theme fragments from input_filename."""
 
 208     closed_fragments = {}
 
 210     # iterparse would die on a HTML document
 
 211     parser = etree.HTMLParser(encoding='utf-8')
 
 213     buf.write(etree.tostring(
 
 214         etree.parse(input_filename, parser).getroot()[0][0],
 
 219     for event, element in etree.iterparse(buf, events=('start', 'end')):
 
 220         # Process begin and end elements
 
 221         if element.get('class', '') in ('theme-begin', 'theme-end'):
 
 222             if not event == 'end':
 
 223                 continue  # Process elements only once, on end event
 
 226             if element.get('class', '') == 'theme-begin':
 
 227                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 
 230                 parent = element.getparent()
 
 232                 while parent.get('id', None) != 'book-text':
 
 233                     cparent = copy.deepcopy(parent)
 
 235                     if 'id' in cparent.attrib:
 
 236                         del cparent.attrib['id']
 
 237                     parents.append(cparent)
 
 238                     parent = parent.getparent()
 
 241                 for parent in parents:
 
 242                     fragment.append('start', parent)
 
 244                 if fragment.id not in open_fragments:
 
 245                     open_fragments[fragment.id] = fragment
 
 247             # Close existing fragment
 
 250                     fragment = open_fragments[element.get('fid')]
 
 252                     print('%s:closed not open fragment #%s' % (
 
 253                         input_filename, element.get('fid')
 
 256                     closed_fragments[fragment.id] = fragment
 
 257                     del open_fragments[fragment.id]
 
 259             # Append element tail to lost_text
 
 260             # (we don't want to lose any text)
 
 262                 for fragment_id in open_fragments:
 
 263                     open_fragments[fragment_id].append('text', element.tail)
 
 265         # Process all elements except begin and end
 
 267             # Omit annotation tags
 
 268             if (len(element.get('name', '')) or
 
 269                     element.get('class', '') in ('annotation', 'anchor')):
 
 270                 if event == 'end' and element.tail:
 
 271                     for fragment_id in open_fragments:
 
 272                         open_fragments[fragment_id].append(
 
 276                 for fragment_id in open_fragments:
 
 277                     celem = copy.copy(element)
 
 278                     if 'id' in celem.attrib:
 
 279                         del celem.attrib['id']
 
 280                     open_fragments[fragment_id].append(
 
 284     return closed_fragments, open_fragments
 
 287 def add_anchor(element, prefix, with_link=True, with_target=True,
 
 289     parent = element.getparent()
 
 290     index = parent.index(element)
 
 293         if link_text is None:
 
 295         anchor = etree.Element('a', href='#%s' % prefix)
 
 296         anchor.set('class', 'anchor')
 
 297         anchor.text = str(link_text)
 
 298         parent.insert(index, anchor)
 
 301         anchor_target = etree.Element('a', name='%s' % prefix)
 
 302         anchor_target.set('class', 'target')
 
 303         anchor_target.text = ' '
 
 304         parent.insert(index, anchor_target)
 
 307 def any_ancestor(element, test):
 
 308     for ancestor in element.iterancestors():
 
 314 def add_anchors(root):
 
 318     for element in root.iterdescendants():
 
 322                     'note', 'motto', 'motto_podpis', 'dedication', 'frame'
 
 324                 or e.get('id') == 'nota_red'
 
 325                 or e.tag == 'blockquote'
 
 326                 or e.get('id') == 'footnotes'
 
 329         if element.get('class') == 'numeracja':
 
 331                 visible_counter = int(element.get('data-start'))
 
 334             if element.get("data-link"):
 
 335                 link_prefix = element.get("data-link")
 
 336                 counter[link_prefix] = 1
 
 338         if any_ancestor(element, f):
 
 341         if element.tag == 'div' and 'verse' in element.get('class', ''):
 
 342             if visible_counter == 1 or visible_counter % 5 == 0:
 
 343                 add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]), link_text=visible_counter)
 
 344             counter[link_prefix] += 1
 
 346         elif 'paragraph' in element.get('class', ''):
 
 347             add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]), link_text=visible_counter)
 
 348             counter[link_prefix] += 1
 
 352 def raw_printable_text(element):
 
 353     working = copy.deepcopy(element)
 
 354     for e in working.findall('a'):
 
 355         if e.get('class') in ('annotation', 'theme-begin'):
 
 357     return etree.tostring(working, method='text', encoding='unicode').strip()
 
 360 def add_table_of_contents(root):
 
 363     for element in root.iterdescendants():
 
 364         if element.tag in ('h2', 'h3'):
 
 367                     lambda e: e.get('id') in (
 
 368                         'footnotes', 'nota_red'
 
 369                     ) or e.get('class') in ('person-list',)):
 
 372             element_text = raw_printable_text(element)
 
 373             if (element.tag == 'h3' and len(sections)
 
 374                     and sections[-1][1] == 'h2'):
 
 375                 sections[-1][3].append(
 
 376                     (counter, element.tag, element_text, [])
 
 379                 sections.append((counter, element.tag, element_text, []))
 
 380             add_anchor(element, "s%d" % counter, with_link=False)
 
 383     toc = etree.Element('div')
 
 385     toc_header = etree.SubElement(toc, 'h2')
 
 386     toc_header.text = 'Spis treści'
 
 387     toc_list = etree.SubElement(toc, 'ol')
 
 389     for n, section, text, subsections in sections:
 
 390         section_element = etree.SubElement(toc_list, 'li')
 
 391         add_anchor(section_element, "s%d" % n, with_target=False,
 
 395             subsection_list = etree.SubElement(section_element, 'ol')
 
 396             for n1, subsection, subtext, _ in subsections:
 
 397                 subsection_element = etree.SubElement(subsection_list, 'li')
 
 398                 add_anchor(subsection_element, "s%d" % n1, with_target=False,
 
 404 def add_table_of_themes(root):
 
 406         from sortify import sortify
 
 412     for fragment in root.findall('.//a[@class="theme-begin"]'):
 
 413         if not fragment.text:
 
 415         theme_names = [s.strip() for s in fragment.text.split(',')]
 
 416         for theme_name in theme_names:
 
 417             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 
 418     book_themes = list(book_themes.items())
 
 419     book_themes.sort(key=lambda s: sortify(s[0]))
 
 420     themes_div = etree.Element('div', id="themes")
 
 421     themes_ol = etree.SubElement(themes_div, 'ol')
 
 422     for theme_name, fragments in book_themes:
 
 423         themes_li = etree.SubElement(themes_ol, 'li')
 
 424         themes_li.text = "%s: " % theme_name
 
 425         for i, fragment in enumerate(fragments):
 
 426             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 
 427             item.text = str(i + 1)
 
 429     root.insert(0, themes_div)
 
 432 def extract_annotations(html_path):
 
 433     """Extracts annotations from HTML for annotations dictionary.
 
 435     For each annotation, yields a tuple of:
 
 436     anchor, footnote type, valid qualifiers, text, html.
 
 439     from .fn_qualifiers import FN_QUALIFIERS
 
 441     parser = etree.HTMLParser(encoding='utf-8')
 
 442     tree = etree.parse(html_path, parser)
 
 443     footnotes = tree.find('//*[@id="footnotes"]')
 
 444     re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
 
 445     if footnotes is not None:
 
 446         for footnote in footnotes.findall('div'):
 
 447             fn_type = footnote.get('class').split('-')[1]
 
 448             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 
 451             if len(footnote) and footnote[-1].tail == '\n':
 
 452                 footnote[-1].tail = None
 
 453             text_str = etree.tostring(footnote, method='text',
 
 454                                       encoding='unicode').strip()
 
 455             html_str = etree.tostring(footnote, method='html',
 
 456                                       encoding='unicode').strip()
 
 458             match = re_qualifier.match(text_str)
 
 460                 qualifier_str = match.group(1)
 
 462                 for candidate in re.split('[;,]', qualifier_str):
 
 463                     candidate = candidate.strip()
 
 464                     if candidate in FN_QUALIFIERS:
 
 465                         qualifiers.append(candidate)
 
 466                     elif candidate.startswith('z '):
 
 467                         subcandidate = candidate.split()[1]
 
 468                         if subcandidate in FN_QUALIFIERS:
 
 469                             qualifiers.append(subcandidate)
 
 473             yield anchor, fn_type, qualifiers, text_str, html_str