1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 
   2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
 
  11 from lxml import etree
 
  12 from librarian import XHTMLNS, DCNS, ParseError, OutputFile
 
  13 from librarian import functions
 
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
  19 functions.reg_substitute_entities()
 
  20 functions.reg_person_name()
 
  23     'legacy': 'xslt/book2html.xslt',
 
  27 def get_stylesheet(name):
 
  28     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
 
  31 def html_has_content(text):
 
         # The path expression below selects <p> and <h1> elements, each both
         # without a namespace and in the XHTML namespace (XHTMLNS).
         # NOTE(review): presumably the function reports whether `text`
         # contains any such element -- confirm against the complete body.
  33         '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
 
  37 def transform_abstrakt(abstrakt_element):
 
         # Renders `abstrakt_element` with the 'legacy' XSLT stylesheet and
         # returns the resulting markup as a string.
  38     style_filename = get_stylesheet('legacy')
 
  39     style = etree.parse(style_filename)
 
         # Serialise the element and textually rename its <abstrakt> start and
         # end tags to <dlugi_cytat> before re-parsing.
         # NOTE(review): presumably done so the stylesheet's <dlugi_cytat>
         # handling is reused for abstracts -- confirm against the XSLT.
  40     xml = etree.tostring(abstrakt_element, encoding='unicode')
 
  41     document = etree.parse(io.StringIO(
 
  42         xml.replace('<abstrakt', '<dlugi_cytat').replace('</abstrakt', '</dlugi_cytat')
 
  44     result = document.xslt(style)
 
         # Remove the empty <a name="secN"/> anchors from the serialised output.
  45     html = re.sub('<a name="sec[0-9]*"/>', '',
 
  46                   etree.tostring(result, encoding='unicode'))
 
         # Strip the opening and closing <blockquote> tags themselves; the
         # text between them is kept.
  47     return re.sub('</?blockquote[^>]*>', '', html)
 
  50 def add_image_sizes(tree, gallery_path, gallery_url, base_url):
 
         # For every <ilustr> element found in `tree`: open the image named by
         # its src attribute (resolved against `base_url`), work out resized
         # variants whose file paths start with `gallery_path`, and rewrite the
         # element's srcset and src attributes with URLs built from
         # `gallery_url`.
         # Predefined target widths; widths[-1] serves as the upper bound below.
  51     widths = [360, 600, 1200, 1800, 2400]
 
  53         os.makedirs(gallery_path)
 
         # `i` is the position of the illustration among the matched elements;
         # it becomes part of every generated file name.
  57     for i, ilustr in enumerate(tree.findall('//ilustr')):
 
  58         rel_path = ilustr.attrib['src']
 
         # Resolve the (possibly relative) src against base_url.
  59         img_url = urllib.parse.urljoin(base_url, rel_path)
 
         # NOTE(review): no close() for this response is visible here --
         # confirm the handle is released once the image has been read.
  61         f = urllib.request.urlopen(img_url)
 
         # GIF and PNG keep their own extension; any other format maps to 'jpg'.
  63         ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
 
  66         # Needed widths: predefined and original, limited by
 
  67         # whichever is smaller.
 
  71                 set(widths + [img.size[0]])
 
  73             if w <= min(widths[-1], img.size[0])
 
             # File name pattern: <illustration index>.W<width>.<extension>
  77             fname = '%d.W%d.%s' % (i, w, ext)
 
             # Plain string concatenation: gallery_path has to end with a path
             # separator (transform() passes 'img/' by default).
  78             fpath = gallery_path + fname
 
             # A target file that already exists is not resized again.
  79             if not os.path.exists(fpath):
 
                 # Keep the aspect ratio when scaling to width w.
  80                 height = round(img.size[1] * w / img.size[0])
 
  81                 th = img.resize((w, height))
 
             # Same concatenation rule as above applies to gallery_url.
  83             th_url = gallery_url + fname
 
  84             srcset.append(" ".join((
 
         # srcset entries are separated by ', '; src is pointed at largest_url.
  89         ilustr.attrib['srcset'] = ", ".join(srcset)
 
  90         ilustr.attrib['src'] = largest_url
 
  95 def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'):
 
         # NOTE(review): the docstring below mentions `output_filename`, which
         # is not a parameter of this function; the visible success path
         # returns OutputFile.from_bytes(...) holding the serialised HTML.
  96     """Transforms the WL document to XHTML.
 
  98     If output_filename is None, returns an XML,
 
  99     otherwise returns True if file has been written,False if it hasn't.
 
 100     File won't be written if it has no content.
 
 104         style_filename = get_stylesheet(stylesheet)
 
 105         style = etree.parse(style_filename)
 
             # All following modifications are applied to a deep copy of wldoc.
 107         document = copy.deepcopy(wldoc)
 
 109         document.swap_endlines()
 
                     # Each flag becomes a root attribute with the value 'yes'.
 113                 document.edoc.getroot().set(flag, 'yes')
 
             # Take the Dublin Core language element, convert its text with
             # functions.lang_code_3to2 and fall back to 'pl' when that yields
             # a falsy value; the result is stored as the root's lang attribute.
             # NOTE(review): find() returns None when nothing matches, in which
             # case ltag.text raises AttributeError -- confirm the language
             # element is always present.
 115         ltag = document.edoc.find('//' + DCNS('language'))
 
 116         lang = functions.lang_code_3to2(ltag.text) or 'pl'
 
 117         document.edoc.getroot().set('lang', lang)
 
 119         document.clean_ed_note()
 
 120         document.clean_ed_note('abstrakt')
 
 121         document.fix_pa_akap()
 
 122         document.hebr_protect()
 
 128             os.makedirs(gallery_path)
 
 132         add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)
 
 136             or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
 
             # `css` and every entry of `options` are passed on as keyword
             # arguments to document.transform.
 139         result = document.transform(style, css=css, **options)
 
 140         del document  # no longer needed large object :)
 
             # Anchors and both tables are added only when the result has content.
 142         if html_has_content(result):
 
 143             add_anchors(result.getroot())
 
 144             add_table_of_themes(result.getroot())
 
 145             add_table_of_contents(result.getroot())
 
             # Serialised with the HTML method as pretty-printed UTF-8 bytes,
             # without an XML declaration.
 147             return OutputFile.from_bytes(etree.tostring(
 
 148                 result, method='html', xml_declaration=False,
 
 149                 pretty_print=True, encoding='utf-8'
 
         # NOTE(review): the '%s' placeholder is never filled in (there is no
         # % operand), so the message contains a literal '%s' instead of the
         # stylesheet name.
 154         raise ValueError("'%s' is not a valid stylesheet.")
 
 155     except (XMLSyntaxError, XSLTApplyError) as e:
 
 160     def __init__(self, id, themes):
 
             # Delegates to the parent class initialiser.
             # NOTE(review): the `id` parameter shadows the builtin of the same
             # name inside this method; renaming it would break keyword callers
             # such as Fragment(id=..., themes=...) in extract_fragments.
 161         super(Fragment, self).__init__()
 
 166     def append(self, event, element):
 
 167         self.events.append((event, element))
 
 169     def closed_events(self):
 
             # Returns the recorded events followed by the entries collected in
             # `stack` (see the final return); self.events itself is not
             # modified by that concatenation.
 171         for event, element in self.events:
 
                     # An ('end', element) entry is queued on the stack.
                     # NOTE(review): presumably done for 'start' events so that
                     # elements left open get a closing event -- confirm.
 173                 stack.append(('end', element))
 
                         # Diagnostic written to stdout; per its text it reports
                         # an end event for an element that was not open.
 178                     print('CLOSED NON-OPEN TAG:', element)
 
 181         return self.events + stack
 
 185         for event, element in self.closed_events():
 
                 # Every event contributes pieces of markup or text to
                 # `result`; the pieces are concatenated by the final
                 # ''.join(result).
 187                 result.append('<%s %s>' % (
 
 191                         for k, v in element.attrib.items()
 
 195                     result.append(element.text)
 
                 # Closing tag for the element.
 197                 result.append('</%s>' % element.tag)
 
 199                     result.append(element.tail)
 
                 # Here `element` itself is appended, so for this kind of event
                 # it has to be a string already (''.join accepts only
                 # strings); extract_fragments records such events as
                 # ('text', element.tail).
 201                 result.append(element)
 
 203         return ''.join(result)
 
 206         return self.to_string()
 
 209 def extract_fragments(input_filename):
 
 210     """Extracts theme fragments from input_filename."""
 
 212     closed_fragments = {}
 
 214     # iterparse would die on a HTML document
 
         # The file is therefore read with the lenient HTML parser first, and
         # the first child of the root's first child (getroot()[0][0]) is
         # re-serialised into `buf`, which is what iterparse walks below.
 215     parser = etree.HTMLParser(encoding='utf-8')
 
 217     buf.write(etree.tostring(
 
 218         etree.parse(input_filename, parser).getroot()[0][0],
 
 223     for event, element in etree.iterparse(buf, events=('start', 'end')):
 
 224         # Process begin and end elements
 
 225         if element.get('class', '') in ('theme-begin', 'theme-end'):
 
 226             if not event == 'end':
 
 227                 continue  # Process elements only once, on end event
 
                 # A theme-begin marker opens a new Fragment: its fid attribute
                 # is the fragment id and its text holds the themes.
 230             if element.get('class', '') == 'theme-begin':
 
 231                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 
                     # Walk up the ancestors until the element whose id is
                     # 'book-text', collecting a deep copy of each one with its
                     # id attribute removed.
                     # NOTE(review): getparent() returns None at the root, so a
                     # marker outside 'book-text' would make parent.get fail --
                     # confirm markers always sit inside it.
 234                 parent = element.getparent()
 
 236                 while parent.get('id', None) != 'book-text':
 
 237                     cparent = copy.deepcopy(parent)
 
 239                     if 'id' in cparent.attrib:
 
 240                         del cparent.attrib['id']
 
 241                     parents.append(cparent)
 
 242                     parent = parent.getparent()
 
                     # Each collected ancestor is recorded as a 'start' event of
                     # the new fragment.
 245                 for parent in parents:
 
 246                     fragment.append('start', parent)
 
                     # A fragment id that is already open is not replaced.
 248                 if fragment.id not in open_fragments:
 
 249                     open_fragments[fragment.id] = fragment
 
 251             # Close existing fragment
 
 254                     fragment = open_fragments[element.get('fid')]
 
                         # Diagnostic for an end marker whose fid is not open.
 256                     print('%s:closed not open fragment #%s' % (
 
 257                         input_filename, element.get('fid')
 
                         # Move the fragment from the open to the closed mapping.
 260                     closed_fragments[fragment.id] = fragment
 
 261                     del open_fragments[fragment.id]
 
 263             # Append element tail to lost_text
 
 264             # (we don't want to lose any text)
 
 266                 for fragment_id in open_fragments:
 
 267                     open_fragments[fragment_id].append('text', element.tail)
 
 269         # Process all elements except begin and end
 
 271             # Omit annotation tags
 
                 # i.e. elements with a non-empty name attribute or with class
                 # 'annotation' / 'anchor'; on their end event a non-empty tail
                 # is still handed to every open fragment.
 272             if (len(element.get('name', '')) or
 
 273                     element.get('class', '') in ('annotation', 'anchor')):
 
 274                 if event == 'end' and element.tail:
 
 275                     for fragment_id in open_fragments:
 
 276                         open_fragments[fragment_id].append(
 
                     # Every open fragment gets its own shallow copy of the
                     # element, with the id attribute dropped.
 280                 for fragment_id in open_fragments:
 
 281                     celem = copy.copy(element)
 
 282                     if 'id' in celem.attrib:
 
 283                         del celem.attrib['id']
 
 284                     open_fragments[fragment_id].append(
 
         # Both mappings are returned: fragments that were closed, and those
         # still open when parsing ended.
 288     return closed_fragments, open_fragments
 
 291 def add_anchor(element, prefix, with_link=True, with_target=True,
 
         # Inserts <a> helper elements into element's parent at element's own
         # position, i.e. immediately before `element`.
         # NOTE(review): presumably with_link and with_target decide which of
         # the two inserts below is performed -- confirm against the guards.
 293     parent = element.getparent()
 
 294     index = parent.index(element)
 
 297         if link_text is None:
 
             # Visible link: href '#<prefix>', class 'anchor', text link_text.
 299         anchor = etree.Element('a', href='#%s' % prefix)
 
 300         anchor.set('class', 'anchor')
 
 301         anchor.text = str(link_text)
 
 302         parent.insert(index, anchor)
 
             # Jump target: name '<prefix>', class 'target', a single space as
             # text. It is inserted at the same index as the link, so when both
             # are added the target ends up in front of the link.
 305         anchor_target = etree.Element('a', name='%s' % prefix)
 
 306         anchor_target.set('class', 'target')
 
 307         anchor_target.text = ' '
 
 308         parent.insert(index, anchor_target)
 
 311 def any_ancestor(element, test):
 
         # Walks over the ancestors of `element`.
         # NOTE(review): presumably returns a truthy value as soon as
         # test(ancestor) holds for one of them -- confirm against the
         # complete body.
 312     for ancestor in element.iterancestors():
 
 318 def add_anchors(root):
 
         # Adds numbered anchors to the verse and paragraph elements below
         # `root`, in document order.
 322     for element in root.iterdescendants():
 
                     # Conditions describing containers: a set of class names,
                     # the ids 'nota_red' and 'footnotes', and <blockquote>.
                     # NOTE(review): these appear to form the predicate `f`
                     # passed to any_ancestor below -- confirm.
 326                     'note', 'motto', 'motto_podpis', 'dedication', 'frame'
 
 328                 or e.get('id') == 'nota_red'
 
 329                 or e.tag == 'blockquote'
 
 330                 or e.get('id') == 'footnotes'
 
             # A 'numeracja' element changes the numbering: data-start sets the
             # displayed counter, data-link switches the link prefix and starts
             # that prefix's own counter at 1.
 333         if element.get('class') == 'numeracja':
 
 335                 visible_counter = int(element.get('data-start'))
 
 338             if element.get("data-link"):
 
 339                 link_prefix = element.get("data-link")
 
 340                 counter[link_prefix] = 1
 
             # Elements that have an ancestor matching `f` get special handling.
             # NOTE(review): presumably they are skipped -- confirm.
 342         if any_ancestor(element, f):
 
             # Verses: an anchor is added only for displayed number 1 and for
             # multiples of 5, while the per-prefix counter advances for every
             # verse.
 345         if element.tag == 'div' and 'verse' in element.get('class', ''):
 
 346             if visible_counter == 1 or visible_counter % 5 == 0:
 
 347                 add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]), link_text=visible_counter)
 
 348             counter[link_prefix] += 1
 
             # Paragraphs: every one gets an anchor.
 350         elif 'paragraph' in element.get('class', ''):
 
 351             add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]), link_text=visible_counter)
 
 352             counter[link_prefix] += 1
 
 356 def raw_printable_text(element):
 
         # Returns the text content of `element` with surrounding whitespace
         # stripped. The work is done on a deep copy, so `element` itself is
         # left untouched.
 357     working = copy.deepcopy(element)
 
         # findall('a') matches only direct <a> children of the copy.
 358     for e in working.findall('a'):
 
             # NOTE(review): presumably the text of annotation and theme-begin
             # links is blanked here so it does not appear in the result --
             # confirm against the statement inside this branch.
 359         if e.get('class') in ('annotation', 'theme-begin'):
 
 361     return etree.tostring(working, method='text', encoding='unicode').strip()
 
 364 def add_table_of_contents(root):
 
         # Collects the <h2>/<h3> headings below `root`, gives each a named
         # target anchor and builds a nested list of links to them.
 367     for element in root.iterdescendants():
 
 368         if element.tag in ('h2', 'h3'):
 
                         # Predicate matching ancestors with id 'footnotes' or
                         # 'nota_red', or class 'person-list'.
                         # NOTE(review): presumably headings inside such
                         # containers are left out -- confirm.
 371                     lambda e: e.get('id') in (
 
 372                         'footnotes', 'nota_red'
 
 373                     ) or e.get('class') in ('person-list',)):
 
 376             element_text = raw_printable_text(element)
 
                 # Entries are (counter, tag, text, subsections) tuples. An h3
                 # whose most recent top-level entry is an h2 is filed under
                 # that entry; anything else becomes a top-level entry.
 377             if (element.tag == 'h3' and len(sections)
 
 378                     and sections[-1][1] == 'h2'):
 
 379                 sections[-1][3].append(
 
 380                     (counter, element.tag, element_text, [])
 
 383                 sections.append((counter, element.tag, element_text, []))
 
                 # Only the jump target 's<counter>' is added next to the
                 # heading (with_link=False).
 384             add_anchor(element, "s%d" % counter, with_link=False)
 
         # Container for the table: a <div> with an <h2> header and an <ol>.
 387     toc = etree.Element('div')
 
 389     toc_header = etree.SubElement(toc, 'h2')
 
 390     toc_header.text = 'Spis treści'
 
 391     toc_list = etree.SubElement(toc, 'ol')
 
 393     for n, section, text, subsections in sections:
 
 394         section_element = etree.SubElement(toc_list, 'li')
 
             # In the table only links are created (with_target=False).
 395         add_anchor(section_element, "s%d" % n, with_target=False,
 
 399             subsection_list = etree.SubElement(section_element, 'ol')
 
 400             for n1, subsection, subtext, _ in subsections:
 
 401                 subsection_element = etree.SubElement(subsection_list, 'li')
 
 402                 add_anchor(subsection_element, "s%d" % n1, with_target=False,
 
 408 def add_table_of_themes(root):
 
             # sortify is imported inside the function rather than at module
             # level; it provides the sort key for theme names below.
 410         from sortify import sortify
 
         # Every <a class="theme-begin"> anywhere below root marks a fragment.
 416     for fragment in root.findall('.//a[@class="theme-begin"]'):
 
             # Markers without text carry no theme names.
 417         if not fragment.text:
 
             # The marker text is a comma-separated list of theme names; each
             # name is mapped to the list of `name` attributes of its markers.
 419         theme_names = [s.strip() for s in fragment.text.split(',')]
 
 420         for theme_name in theme_names:
 
 421             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 
         # Turn the mapping into a list of (theme, fragments) pairs ordered by
         # sortify(theme name).
 422     book_themes = list(book_themes.items())
 
 423     book_themes.sort(key=lambda s: sortify(s[0]))
 
         # Build <div id="themes"><ol>: one <li> per theme, holding the theme
         # name and links numbered from 1 that point at '#<fragment name>'.
 424     themes_div = etree.Element('div', id="themes")
 
 425     themes_ol = etree.SubElement(themes_div, 'ol')
 
 426     for theme_name, fragments in book_themes:
 
 427         themes_li = etree.SubElement(themes_ol, 'li')
 
 428         themes_li.text = "%s: " % theme_name
 
 429         for i, fragment in enumerate(fragments):
 
 430             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 
 431             item.text = str(i + 1)
 
         # The table becomes the first child of root.
 433     root.insert(0, themes_div)
 
 436 def extract_annotations(html_path):
 
 437     """Extracts annotations from HTML for annotations dictionary.
 
 439     For each annotation, yields a tuple of:
 
 440     anchor, footnote type, valid qualifiers, text, html.
 
 443     from .fn_qualifiers import FN_QUALIFIERS
 
 445     parser = etree.HTMLParser(encoding='utf-8')
 
 446     tree = etree.parse(html_path, parser)
 
 447     footnotes = tree.find('//*[@id="footnotes"]')
 
         # Matches text that starts with characters other than an em dash
         # (U+2014), followed by a parenthesised group and then an em dash;
         # the group captures the text inside the parentheses.
 448     re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
 
 449     if footnotes is not None:
 
             # Only direct <div> children of the footnotes container are read.
 450         for footnote in footnotes.findall('div'):
 
                 # The footnote type is the second dash-separated part of the
                 # class attribute; the anchor is the href of the annotation
                 # link without its first character.
                 # NOTE(review): a div without a class attribute or without an
                 # annotation link would raise AttributeError here -- confirm
                 # the generated HTML always provides both.
 451             fn_type = footnote.get('class').split('-')[1]
 
 452             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 
                 # Drop a tail consisting of a single newline after the last
                 # child before serialising.
 455             if len(footnote) and footnote[-1].tail == '\n':
 
 456                 footnote[-1].tail = None
 
 457             text_str = etree.tostring(footnote, method='text',
 
 458                                       encoding='unicode').strip()
 
 459             html_str = etree.tostring(footnote, method='html',
 
 460                                       encoding='unicode').strip()
 
 462             match = re_qualifier.match(text_str)
 
 464                 qualifier_str = match.group(1)
 
                     # Candidates are separated by ';' or ','. A candidate is
                     # kept when it is a known qualifier; for a candidate that
                     # starts with 'z ' its second word is tried instead.
 466                 for candidate in re.split('[;,]', qualifier_str):
 
 467                     candidate = candidate.strip()
 
 468                     if candidate in FN_QUALIFIERS:
 
 469                         qualifiers.append(candidate)
 
 470                     elif candidate.startswith('z '):
 
 471                         subcandidate = candidate.split()[1]
 
 472                         if subcandidate in FN_QUALIFIERS:
 
 473                             qualifiers.append(subcandidate)
 
 477             yield anchor, fn_type, qualifiers, text_str, html_str