1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 
   2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
 
  11 from lxml import etree
 
  12 from librarian import XHTMLNS, DCNS, ParseError, OutputFile
 
  13 from librarian import functions
 
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
  19 functions.reg_substitute_entities()
     # The call above runs at import time, as a side effect of loading this
     # module.
     # NOTE(review): presumably registers an XSLT extension function used by
     # the stylesheets -- confirm in librarian.functions.

  20 functions.reg_person_name()
     # Second import-time registration call; same caveat as above.

  23     'legacy': 'xslt/book2html.xslt',
         # Maps the stylesheet name 'legacy' to a relative .xslt path.
         # NOTE(review): the opening line of the enclosing mapping is not
         # visible in this excerpt; get_stylesheet() indexes STYLESHEETS by
         # name, so this is presumably an entry of that dict -- confirm.
 
  27 def get_stylesheet(name):
 
  28     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
 
  31 def html_has_content(text):
         # Builds a path expression selecting <p> and <h1> elements, both
         # un-namespaced and in the XHTMLNS namespace.
         # NOTE(review): the lines that evaluate the expression against
         # `text` and return the outcome are not visible in this excerpt;
         # transform() uses the result as a truth value.

  33         '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
 
  37 def transform_abstrakt(abstrakt_element):
         # Renders one <abstrakt> element with the 'legacy' stylesheet and
         # returns the resulting HTML as a string.

  38     style_filename = get_stylesheet('legacy')

  39     style = etree.parse(style_filename)

  40     xml = etree.tostring(abstrakt_element, encoding='unicode')

         # Rename the <abstrakt> open/close tags to <dlugi_cytat> on the
         # serialized text and re-parse it, so the stylesheet is applied to a
         # <dlugi_cytat> element instead.
         # NOTE(review): this is a plain substring replace, so any tag whose
         # name merely starts with "abstrakt" would be renamed as well.
  41     document = etree.parse(io.StringIO(

  42         xml.replace('<abstrakt', '<dlugi_cytat').replace('</abstrakt', '</dlugi_cytat')

         # NOTE(review): the line closing the two calls above is not visible
         # in this excerpt.
  44     result = document.xslt(style)

         # Remove the empty <a name="secN"/> anchors from the serialized
         # output.
  45     html = re.sub('<a name="sec[0-9]*"/>', '',

  46                   etree.tostring(result, encoding='unicode'))

         # Strip <blockquote ...> and </blockquote> tags, keeping whatever is
         # between them.
  47     return re.sub('</?blockquote[^>]*>', '', html)
 
  50 def add_image_sizes(tree, gallery_path, gallery_url, base_url):
         # For every <ilustr> element found in `tree`: fetch the image its
         # `src` attribute points at (resolved against `base_url`), prepare
         # resized variants whose files are named under `gallery_path`, and
         # rewrite the element's `srcset` and `src` attributes with URLs
         # under `gallery_url`.
         # `gallery_path` and `gallery_url` are joined to file names by plain
         # string concatenation, so both are expected to end with a
         # separator.
         # NOTE(review): several lines of this function are not visible in
         # this excerpt (opening the image, saving the thumbnail, the
         # `largest_url` assignment, the srcset entry contents).

         # Predefined target widths; compared with img.size[0] below.
  51     widths = [360, 600, 1200, 1800, 2400]

             # NOTE(review): the condition guarding this call is not visible.
  53         os.makedirs(gallery_path)

  57     for i, ilustr in enumerate(tree.findall('//ilustr')):

  58         rel_path = ilustr.attrib['src']

  59         img_url = urllib.parse.urljoin(base_url, rel_path)

             # NOTE(review): no close() of this response is visible in the
             # excerpt -- confirm it is released after the image is read.
  61         f = urllib.request.urlopen(img_url)

             # File extension by source format: GIF and PNG keep their own,
             # every other format falls back to 'jpg'.
  63         ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')

  66         # Needed widths: predefined and original, limited by

  67         # whichever is smaller.

                 # Candidates: the predefined widths plus the original width,
                 # de-duplicated through a set.
  71                 set(widths + [img.size[0]])

                 # Keep only widths up to min(largest predefined, original).
  73             if w <= min(widths[-1], img.size[0])

                 # File name pattern: <illustration index>.W<width>.<ext>
  77             fname = '%d.W%d.%s' % (i, w, ext)

  78             fpath = gallery_path + fname

                 # Resize only when the target file is not already on disk.
  79             if not os.path.exists(fpath):

                     # Scale the height by the same factor as the width.
  80                 height = round(img.size[1] * w / img.size[0])

  81                 th = img.resize((w, height))

  83             th_url = gallery_url + fname

  84             srcset.append(" ".join((

             # The collected srcset entries become one ", "-separated
             # attribute value; `src` is pointed at `largest_url`.
  89         ilustr.attrib['srcset'] = ", ".join(srcset)

  90         ilustr.attrib['src'] = largest_url
 
  95 def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'):
         # NOTE(review): the docstring below still describes an
         # `output_filename` and a True/False result, but the signature has
         # no such parameter and the visible success path returns
         # OutputFile.from_bytes(...). Update the docstring once confirmed.

  96     """Transforms the WL document to XHTML.

  98     If output_filename is None, returns an XML,

  99     otherwise returns True if file has been written,False if it hasn't.

 100     File won't be written if it has no content.

             # Load the XSLT stylesheet registered under `stylesheet`.
 104         style_filename = get_stylesheet(stylesheet)

 105         style = etree.parse(style_filename)

             # Work on a deep copy: the clean-up calls below are made on the
             # copy, not on the caller's `wldoc`.
 107         document = copy.deepcopy(wldoc)

 109         document.swap_endlines()

                     # Each flag becomes a root attribute with the value
                     # 'yes' (the enclosing loop is not visible here).
 113                 document.edoc.getroot().set(flag, 'yes')

             # Find the DCNS 'language' element and derive the root `lang`
             # attribute from its text via functions.lang_code_3to2().
             # NOTE(review): the branch taken when the element is missing is
             # not visible in this excerpt.
 115         ltag = document.edoc.find('//' + DCNS('language'))

 117             lang = functions.lang_code_3to2(ltag.text)

 120         document.edoc.getroot().set('lang', lang)

             # Document clean-up helpers provided by the copied document.
 122         document.clean_ed_note()

 123         document.clean_ed_note('abstrakt')

 124         document.fix_pa_akap()

 125         document.hebr_protect()

                 # NOTE(review): the condition guarding this call is not
                 # visible in this excerpt.
 131             os.makedirs(gallery_path)

 135         add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)

                 # Fallback stylesheet URL, used when the preceding operand
                 # of this `or` is falsy.
 139             or 'https://static.wolnelektury.pl/css/compressed/book_text.css'

             # NOTE(review): `options` defaults to None in the signature and
             # `**None` raises TypeError; confirm that a line not visible
             # here replaces None with a dict first.
 142         result = document.transform(style, css=css, **options)

 143         del document  # no longer needed large object :)

             # Post-process and serialize only when html_has_content() is
             # truthy for the transformation result.
 145         if html_has_content(result):

 146             add_anchors(result.getroot())

 147             add_table_of_themes(result.getroot())

 148             add_table_of_contents(result.getroot())

                 # Serialized as pretty-printed UTF-8 HTML bytes without an
                 # XML declaration.
 150             return OutputFile.from_bytes(etree.tostring(

 151                 result, method='html', xml_declaration=False,

 152                 pretty_print=True, encoding='utf-8'

             # NOTE(review): the "%s" placeholder is never filled in, so the
             # message is emitted with a literal '%s'; the offending
             # stylesheet name should be interpolated. The clause this raise
             # belongs to is not visible in this excerpt.
 157         raise ValueError("'%s' is not a valid stylesheet.")

         # NOTE(review): the body of this handler is not visible here.
 158     except (XMLSyntaxError, XSLTApplyError) as e:
 
 163     def __init__(self, id, themes):
             # Takes the fragment `id` and its `themes`.
             # NOTE(review): the attribute assignments after
             # super().__init__() are not visible in this excerpt; other
             # methods read self.events and extract_fragments() reads
             # fragment.id.

 164         super(Fragment, self).__init__()

 169     def append(self, event, element):
             # Records one (event, element) pair at the end of self.events.

 170         self.events.append((event, element))

 172     def closed_events(self):
             # Returns self.events followed by the extra events collected in
             # `stack`.
             # NOTE(review): only part of the loop is visible. It pushes
             # ('end', element) entries onto `stack` and prints a diagnostic
             # for a close without a matching open -- presumably to balance
             # 'start' events that were never closed. Confirm against the
             # full source.

 174         for event, element in self.events:

 176                 stack.append(('end', element))

 181                     print('CLOSED NON-OPEN TAG:', element)

 184         return self.events + stack

             # What follows is the body of a method whose `def` line is not
             # visible in this excerpt (it is called as self.to_string()
             # further down). It turns the events returned by
             # closed_events() into pieces of markup/text and joins them
             # into a single string.
 188         for event, element in self.closed_events():

                     # Opening tag followed by the element's attributes.
 190                 result.append('<%s %s>' % (

 194                         for k, v in element.attrib.items()

 198                     result.append(element.text)

                     # Closing tag; the element's tail text is appended after
                     # it.
 200                 result.append('</%s>' % element.tag)

 202                     result.append(element.tail)

                     # Here `element` itself is appended and later joined, so
                     # it must be a string in this branch.
 204                 result.append(element)

 206         return ''.join(result)

             # Body of another method whose `def` line is not visible; it
             # simply delegates to to_string().
 209         return self.to_string()
 
 212 def extract_fragments(input_filename):

 213     """Extracts theme fragments from input_filename."""
         # Returns the pair (closed_fragments, open_fragments), both keyed by
         # fragment id: fragments whose end marker was seen, and fragments
         # still open when the input ran out.
         # NOTE(review): several lines (creation of `open_fragments` and
         # `buf`, some branch headers) are not visible in this excerpt.

 215     closed_fragments = {}

 217     # iterparse would die on a HTML document

         # So the file is first read with the lenient HTML parser and
         # re-serialized into `buf`, which iterparse then consumes.
 218     parser = etree.HTMLParser(encoding='utf-8')

 220     buf.write(etree.tostring(

             # NOTE(review): [0][0] takes the first child of the root's first
             # child; this assumes a fixed document layout -- confirm.
 221         etree.parse(input_filename, parser).getroot()[0][0],

 226     for event, element in etree.iterparse(buf, events=('start', 'end')):

 227         # Process begin and end elements

 228         if element.get('class', '') in ('theme-begin', 'theme-end'):

 229             if not event == 'end':

 230                 continue  # Process elements only once, on end event

 233             if element.get('class', '') == 'theme-begin':

                     # A begin marker opens a new fragment: its `fid`
                     # attribute is the id, its text holds the themes.
 234                 fragment = Fragment(id=element.get('fid'), themes=element.text)

                     # Walk up from the marker to the element with
                     # id="book-text", copying every ancestor on the way and
                     # dropping the copy's `id` attribute.
                     # NOTE(review): if no ancestor has id="book-text",
                     # `parent` becomes None and parent.get() raises
                     # AttributeError.
 237                 parent = element.getparent()

 239                 while parent.get('id', None) != 'book-text':

 240                     cparent = copy.deepcopy(parent)

 242                     if 'id' in cparent.attrib:

 243                         del cparent.attrib['id']

 244                     parents.append(cparent)

 245                     parent = parent.getparent()

                     # Replay the copied ancestors as 'start' events so the
                     # fragment begins inside the same element nesting.
 248                 for parent in parents:

 249                     fragment.append('start', parent)

                     # An already-open fragment with the same id is kept; the
                     # new one is not registered.
 251                 if fragment.id not in open_fragments:

 252                     open_fragments[fragment.id] = fragment

 254             # Close existing fragment

 257                     fragment = open_fragments[element.get('fid')]

                     # Diagnostic for an end marker without a matching open
                     # fragment.
 259                     print('%s:closed not open fragment #%s' % (

 260                         input_filename, element.get('fid')

                     # Move the fragment from the open set to the closed set.
 263                     closed_fragments[fragment.id] = fragment

 264                     del open_fragments[fragment.id]

 266             # Append element tail to lost_text

 267             # (we don't want to lose any text)

 269                 for fragment_id in open_fragments:

 270                     open_fragments[fragment_id].append('text', element.tail)

 272         # Process all elements except begin and end

 274             # Omit annotation tags

                 # "Annotation tags" here means: any element with a non-empty
                 # `name` attribute, or with class 'annotation' or 'anchor'.
 275             if (len(element.get('name', '')) or

 276                     element.get('class', '') in ('annotation', 'anchor')):

                     # The omitted element's tail text is still passed on to
                     # every open fragment.
 277                 if event == 'end' and element.tail:

 278                     for fragment_id in open_fragments:

 279                         open_fragments[fragment_id].append(

                     # Every open fragment receives its own copy.copy() of
                     # the element, with the `id` attribute removed from the
                     # copy.
 283                 for fragment_id in open_fragments:

 284                     celem = copy.copy(element)

 285                     if 'id' in celem.attrib:

 286                         del celem.attrib['id']

 287                     open_fragments[fragment_id].append(

 291     return closed_fragments, open_fragments
 
 294 def add_anchor(element, prefix, with_link=True, with_target=True,
         # NOTE(review): the remainder of the parameter list is on a line
         # that is not visible in this excerpt; `link_text` is used below
         # and is passed by callers as a keyword argument.
         #
         # Inserts up to two <a> elements into the parent of `element`, at
         # the position `element` currently occupies:
         #   * a link:   <a class="anchor" href="#<prefix>">link text</a>
         #   * a target: <a class="target" name="<prefix>"> </a>
         # NOTE(review): the conditions guarding the two halves (presumably
         # `with_link` and `with_target`) are not visible here.

 296     parent = element.getparent()

 297     index = parent.index(element)

 300         if link_text is None:

 302         anchor = etree.Element('a', href='#%s' % prefix)

 303         anchor.set('class', 'anchor')

 304         anchor.text = str(link_text)

 305         parent.insert(index, anchor)

         # The target is inserted at the same `index`, so when both elements
         # are created the target ends up before the link.
 308         anchor_target = etree.Element('a', name='%s' % prefix)

 309         anchor_target.set('class', 'target')

 310         anchor_target.text = ' '

 311         parent.insert(index, anchor_target)
 
 314 def any_ancestor(element, test):
         # Iterates over the ancestors of `element`.
         # NOTE(review): the loop body and the return statements are not
         # visible in this excerpt; presumably returns True as soon as
         # test(ancestor) holds and False otherwise -- callers pass a
         # predicate and use the result as a condition. Confirm.

 315     for ancestor in element.iterancestors():
 
 321 def add_anchors(root):
         # Walks every descendant of `root` and calls add_anchor() for verse
         # <div>s and for elements whose class contains 'paragraph'.
         # NOTE(review): the initialisation of `counter`, `visible_counter`,
         # `link_prefix` and the predicate `f`, as well as the place where
         # `visible_counter` is advanced, are not visible in this excerpt.

 325     for element in root.iterdescendants():

                 # Visible part of the predicate `f` that is handed to
                 # any_ancestor() below: a list of class names and further
                 # alternatives for the 'nota_red' / 'footnotes' ids and for
                 # <blockquote> elements.
 329                     'note', 'motto', 'motto_podpis', 'dedication', 'frame'

 331                 or e.get('id') == 'nota_red'

 332                 or e.tag == 'blockquote'

 333                 or e.get('id') == 'footnotes'

             # An element with class 'numeracja' changes the numbering state:
             # `data-start` sets the visible counter, `data-link` selects a
             # new link prefix whose own counter starts at 1.
 336         if element.get('class') == 'numeracja':

 338                 visible_counter = int(element.get('data-start'))

 341             if element.get("data-link"):

 342                 link_prefix = element.get("data-link")

 343                 counter[link_prefix] = 1

             # NOTE(review): the body of this `if` is not visible; presumably
             # elements with a matching ancestor are skipped -- confirm.
 345         if any_ancestor(element, f):

             # Verses: every verse advances the per-prefix counter, but an
             # anchor is added only for visible number 1 and multiples of 5.
 348         if element.tag == 'div' and 'verse' in element.get('class', ''):

 349             if visible_counter == 1 or visible_counter % 5 == 0:

 350                 add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]), link_text=visible_counter)

 351             counter[link_prefix] += 1

             # Paragraphs: every one gets an anchor.
 353         elif 'paragraph' in element.get('class', ''):

 354             add_anchor(element, "%s%d" % (link_prefix, counter[link_prefix]), link_text=visible_counter)

 355             counter[link_prefix] += 1
 
 359 def raw_printable_text(element):
         # Returns the text content of `element` as a stripped string,
         # computed on a deep copy so `element` itself is left untouched.

 360     working = copy.deepcopy(element)

         # Only direct <a> children are visited (findall('a') is not
         # recursive), and only those with class 'annotation' or
         # 'theme-begin' are treated specially.
         # NOTE(review): the statement applied to them is not visible in
         # this excerpt; presumably it blanks their text so it is left out
         # of the result -- confirm.
 361     for e in working.findall('a'):

 362         if e.get('class') in ('annotation', 'theme-begin'):

 364     return etree.tostring(working, method='text', encoding='unicode').strip()
 
 367 def add_table_of_contents(root):
         # Collects the <h2>/<h3> headings under `root`, gives each one a
         # target anchor named "s<counter>", and builds a <div> with a
         # heading and nested <ol> lists that link to those anchors.
         # NOTE(review): the initialisation of `sections` and `counter`, the
         # counter increment and the final insertion of `toc` are not
         # visible in this excerpt.

 370     for element in root.iterdescendants():

 371         if element.tag in ('h2', 'h3'):

                     # Predicate: an element with id 'footnotes' or
                     # 'nota_red', or with class 'person-list'.
                     # NOTE(review): the call this lambda is passed to is not
                     # visible; presumably any_ancestor(), used to skip
                     # headings inside such containers -- confirm.
 374                     lambda e: e.get('id') in (

 375                         'footnotes', 'nota_red'

 376                     ) or e.get('class') in ('person-list',)):

 379             element_text = raw_printable_text(element)

                 # An <h3> directly following an <h2> section is stored as
                 # that section's subsection; everything else becomes a
                 # top-level section. Entries are
                 # (counter, tag, text, subsections) tuples.
 380             if (element.tag == 'h3' and len(sections)

 381                     and sections[-1][1] == 'h2'):

 382                 sections[-1][3].append(

 383                     (counter, element.tag, element_text, [])

 386                 sections.append((counter, element.tag, element_text, []))

                 # Target only: no visible link is added next to the heading.
 387             add_anchor(element, "s%d" % counter, with_link=False)

 390     toc = etree.Element('div')

 392     toc_header = etree.SubElement(toc, 'h2')

         # User-facing heading (Polish for "Table of contents").
 393     toc_header.text = 'Spis treści'

 394     toc_list = etree.SubElement(toc, 'ol')

 396     for n, section, text, subsections in sections:

 397         section_element = etree.SubElement(toc_list, 'li')

             # Link only (with_target=False), pointing at "#s<n>".
             # NOTE(review): add_anchor() inserts into the *parent* of its
             # first argument, so as far as the visible code shows the link
             # becomes a sibling placed before the <li>, not its child --
             # confirm this is intended.
 398         add_anchor(section_element, "s%d" % n, with_target=False,

 402             subsection_list = etree.SubElement(section_element, 'ol')

 403             for n1, subsection, subtext, _ in subsections:

 404                 subsection_element = etree.SubElement(subsection_list, 'li')

 405                 add_anchor(subsection_element, "s%d" % n1, with_target=False,
 
 411 def add_table_of_themes(root):
         # Builds <div id="themes"> containing an ordered list of all themes
         # found in `root`, each followed by numbered links to the fragments
         # that carry it, and inserts that <div> as the first child of
         # `root`.
         # NOTE(review): the initialisation of `book_themes` and the
         # handling around the import below are not visible in this excerpt.

             # Imported inside the function; the surrounding lines are not
             # visible (presumably a try/except with a fallback -- confirm).
 413         from sortify import sortify

         # Collect, per theme name, the `name` attributes of the
         # <a class="theme-begin"> markers that mention it.
 419     for fragment in root.findall('.//a[@class="theme-begin"]'):

             # NOTE(review): the body of this `if` is not visible; presumably
             # markers without text are skipped.
 420         if not fragment.text:

             # The marker text is a comma-separated list of theme names.
 422         theme_names = [s.strip() for s in fragment.text.split(',')]

 423         for theme_name in theme_names:

 424             book_themes.setdefault(theme_name, []).append(fragment.get('name'))

         # From here on `book_themes` is a list of (theme, fragment names)
         # pairs, ordered by sortify() of the theme name.
 425     book_themes = list(book_themes.items())

 426     book_themes.sort(key=lambda s: sortify(s[0]))

 427     themes_div = etree.Element('div', id="themes")

 428     themes_ol = etree.SubElement(themes_div, 'ol')

 429     for theme_name, fragments in book_themes:

 430         themes_li = etree.SubElement(themes_ol, 'li')

 431         themes_li.text = "%s: " % theme_name

             # One link per fragment, labelled 1, 2, 3, ... and pointing at
             # "#<fragment name>".
 432         for i, fragment in enumerate(fragments):

 433             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)

 434             item.text = str(i + 1)

 436     root.insert(0, themes_div)
 
 439 def extract_annotations(html_path):
         # Generator: parses the HTML file at `html_path` and yields one
         # tuple per <div> found directly under the element with
         # id="footnotes". Yields nothing when that element is absent.
         # NOTE(review): some lines of this function (end of the docstring,
         # initialisation of `qualifiers`, the `if match:` header) are not
         # visible in this excerpt.

 440     """Extracts annotations from HTML for annotations dictionary.

 442     For each annotation, yields a tuple of:

 443     anchor, footnote type, valid qualifiers, text, html.

 446     from .fn_qualifiers import FN_QUALIFIERS

 448     parser = etree.HTMLParser(encoding='utf-8')

 449     tree = etree.parse(html_path, parser)

 450     footnotes = tree.find('//*[@id="footnotes"]')

         # Matches text of the shape "<term> (<qualifiers>) \u2014 ...", where
         # \u2014 is an em dash and the term itself contains no em dash;
         # group 1 is the text between the parentheses.
 451     re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')

 452     if footnotes is not None:

 453         for footnote in footnotes.findall('div'):

                 # The footnote type is the second '-'-separated part of the
                 # class attribute.
                 # NOTE(review): raises if the <div> has no class or the
                 # class contains no '-'.
 454             fn_type = footnote.get('class').split('-')[1]

                 # The anchor is the href of the annotation link without its
                 # leading character (the '#').
                 # NOTE(review): raises AttributeError if the <div> has no
                 # <a class="annotation"> child.
 455             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]

                 # Drop a bare newline after the last child before
                 # serializing.
 458             if len(footnote) and footnote[-1].tail == '\n':

 459                 footnote[-1].tail = None

 460             text_str = etree.tostring(footnote, method='text',

 461                                       encoding='unicode').strip()

 462             html_str = etree.tostring(footnote, method='html',

 463                                       encoding='unicode').strip()

 465             match = re_qualifier.match(text_str)

 467                 qualifier_str = match.group(1)

                     # Candidates are separated by ';' or ','. A candidate is
                     # accepted when it is a known qualifier, or when it
                     # starts with 'z ' and its second word is a known
                     # qualifier.
 469                 for candidate in re.split('[;,]', qualifier_str):

 470                     candidate = candidate.strip()

 471                     if candidate in FN_QUALIFIERS:

 472                         qualifiers.append(candidate)

 473                     elif candidate.startswith('z '):

 474                         subcandidate = candidate.split()[1]

 475                         if subcandidate in FN_QUALIFIERS:

 476                             qualifiers.append(subcandidate)

 480             yield anchor, fn_type, qualifiers, text_str, html_str