1 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 
   2 # Copyright © Fundacja Wolne Lektury. See NOTICE for more information.
 
  11 from lxml import etree
 
  12 from librarian import XHTMLNS, ParseError, OutputFile
 
  13 from librarian import functions
 
  16 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
  19 functions.reg_substitute_entities()
 
  20 functions.reg_person_name()
 
  23     'legacy': 'xslt/book2html.xslt',
 
  27 def get_stylesheet(name):
 
  28     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
 
  31 def html_has_content(text):
 
  33         '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
 
  37 def transform_abstrakt(abstrakt_element):
 
  38     style_filename = get_stylesheet('legacy')
 
  39     style = etree.parse(style_filename)
 
  40     xml = etree.tostring(abstrakt_element, encoding='unicode')
 
  41     document = etree.parse(io.StringIO(
 
  42         xml.replace('<abstrakt', '<dlugi_cytat').replace('</abstrakt', '</dlugi_cytat')
 
  44     result = document.xslt(style)
 
  45     html = re.sub('<a name="sec[0-9]*"/>', '',
 
  46                   etree.tostring(result, encoding='unicode'))
 
  47     return re.sub('</?blockquote[^>]*>', '', html)
 
  50 def add_image_sizes(tree, gallery_path, gallery_url, base_url):
 
  51     widths = [360, 600, 1200, 1800, 2400]
 
  53         os.makedirs(gallery_path)
 
  57     for i, ilustr in enumerate(tree.findall('//ilustr')):
 
  58         rel_path = ilustr.attrib['src']
 
  59         img_url = urllib.parse.urljoin(base_url, rel_path)
 
  61         f = urllib.request.urlopen(img_url)
 
  63         ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
 
  66         # Needed widths: predefined and original, limited by
 
  67         # whichever is smaller.
 
  71                 set(widths + [img.size[0]])
 
  73             if w <= min(widths[-1], img.size[0])
 
  77             fname = '%d.W%d.%s' % (i, w, ext)
 
  78             fpath = gallery_path + fname
 
  79             if not os.path.exists(fpath):
 
  80                 height = round(img.size[1] * w / img.size[0])
 
  81                 th = img.resize((w, height))
 
  83             th_url = gallery_url + fname
 
  84             srcset.append(" ".join((
 
  89         ilustr.attrib['srcset'] = ", ".join(srcset)
 
  90         ilustr.attrib['src'] = largest_url
 
  95 def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'):
 
  96     """Transforms the WL document to XHTML.
 
  98     If output_filename is None, returns an XML,
 
   99     otherwise returns True if file has been written, False if it hasn't.
 
 100     File won't be written if it has no content.
 
 104         style_filename = get_stylesheet(stylesheet)
 
 105         style = etree.parse(style_filename)
 
 107         document = copy.deepcopy(wldoc)
 
 109         document.swap_endlines()
 
 113                 document.edoc.getroot().set(flag, 'yes')
 
 115         document.clean_ed_note()
 
 116         document.clean_ed_note('abstrakt')
 
 117         document.fix_pa_akap()
 
 123             os.makedirs(gallery_path)
 
 127         add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)
 
 131             or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
 
 134         result = document.transform(style, css=css, **options)
 
 135         del document  # no longer needed large object :)
 
 137         if html_has_content(result):
 
 138             add_anchors(result.getroot())
 
 139             add_table_of_themes(result.getroot())
 
 140             add_table_of_contents(result.getroot())
 
 142             return OutputFile.from_bytes(etree.tostring(
 
 143                 result, method='html', xml_declaration=False,
 
 144                 pretty_print=True, encoding='utf-8'
 
 149         raise ValueError("'%s' is not a valid stylesheet.")
 
 150     except (XMLSyntaxError, XSLTApplyError) as e:
 
 155     def __init__(self, id, themes):
 
 156         super(Fragment, self).__init__()
 
 161     def append(self, event, element):
 
 162         self.events.append((event, element))
 
 164     def closed_events(self):
 
 166         for event, element in self.events:
 
 168                 stack.append(('end', element))
 
 173                     print('CLOSED NON-OPEN TAG:', element)
 
 176         return self.events + stack
 
 180         for event, element in self.closed_events():
 
 182                 result.append('<%s %s>' % (
 
 186                         for k, v in element.attrib.items()
 
 190                     result.append(element.text)
 
 192                 result.append('</%s>' % element.tag)
 
 194                     result.append(element.tail)
 
 196                 result.append(element)
 
 198         return ''.join(result)
 
 201         return self.to_string()
 
 204 def extract_fragments(input_filename):
 
 205     """Extracts theme fragments from input_filename."""
 
 207     closed_fragments = {}
 
 209     # iterparse would die on a HTML document
 
 210     parser = etree.HTMLParser(encoding='utf-8')
 
 212     buf.write(etree.tostring(
 
 213         etree.parse(input_filename, parser).getroot()[0][0],
 
 218     for event, element in etree.iterparse(buf, events=('start', 'end')):
 
 219         # Process begin and end elements
 
 220         if element.get('class', '') in ('theme-begin', 'theme-end'):
 
 221             if not event == 'end':
 
 222                 continue  # Process elements only once, on end event
 
 225             if element.get('class', '') == 'theme-begin':
 
 226                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 
 229                 parent = element.getparent()
 
 231                 while parent.get('id', None) != 'book-text':
 
 232                     cparent = copy.deepcopy(parent)
 
 234                     if 'id' in cparent.attrib:
 
 235                         del cparent.attrib['id']
 
 236                     parents.append(cparent)
 
 237                     parent = parent.getparent()
 
 240                 for parent in parents:
 
 241                     fragment.append('start', parent)
 
 243                 if fragment.id not in open_fragments:
 
 244                     open_fragments[fragment.id] = fragment
 
 246             # Close existing fragment
 
 249                     fragment = open_fragments[element.get('fid')]
 
 251                     print('%s:closed not open fragment #%s' % (
 
 252                         input_filename, element.get('fid')
 
 255                     closed_fragments[fragment.id] = fragment
 
 256                     del open_fragments[fragment.id]
 
 258             # Append element tail to lost_text
 
 259             # (we don't want to lose any text)
 
 261                 for fragment_id in open_fragments:
 
 262                     open_fragments[fragment_id].append('text', element.tail)
 
 264         # Process all elements except begin and end
 
 266             # Omit annotation tags
 
 267             if (len(element.get('name', '')) or
 
 268                     element.get('class', '') in ('annotation', 'anchor')):
 
 269                 if event == 'end' and element.tail:
 
 270                     for fragment_id in open_fragments:
 
 271                         open_fragments[fragment_id].append(
 
 275                 for fragment_id in open_fragments:
 
 276                     celem = copy.copy(element)
 
 277                     if 'id' in celem.attrib:
 
 278                         del celem.attrib['id']
 
 279                     open_fragments[fragment_id].append(
 
 283     return closed_fragments, open_fragments
 
 286 def add_anchor(element, prefix, with_link=True, with_target=True,
 
 288     parent = element.getparent()
 
 289     index = parent.index(element)
 
 292         if link_text is None:
 
 294         anchor = etree.Element('a', href='#%s' % prefix)
 
 295         anchor.set('class', 'anchor')
 
 296         anchor.text = str(link_text)
 
 297         parent.insert(index, anchor)
 
 300         anchor_target = etree.Element('a', name='%s' % prefix)
 
 301         anchor_target.set('class', 'target')
 
 302         anchor_target.text = ' '
 
 303         parent.insert(index, anchor_target)
 
 306 def any_ancestor(element, test):
 
 307     for ancestor in element.iterancestors():
 
 313 def add_anchors(root):
 
 316     for element in root.iterdescendants():
 
 320                     'note', 'motto', 'motto_podpis', 'dedication', 'frame'
 
 322                 or e.get('id') == 'nota_red'
 
 323                 or e.tag == 'blockquote'
 
 324                 or e.get('id') == 'footnotes'
 
 327         if element.get('class') == 'numeracja':
 
 329                 visible_counter = int(element.get('data-start'))
 
 333         if any_ancestor(element, f):
 
 336         if element.tag == 'div' and 'verse' in element.get('class', ''):
 
 337             if visible_counter == 1 or visible_counter % 5 == 0:
 
 338                 add_anchor(element, "f%d" % counter, link_text=visible_counter)
 
 341         elif 'paragraph' in element.get('class', ''):
 
 342             add_anchor(element, "f%d" % counter, link_text=visible_counter)
 
 347 def raw_printable_text(element):
 
 348     working = copy.deepcopy(element)
 
 349     for e in working.findall('a'):
 
 350         if e.get('class') in ('annotation', 'theme-begin'):
 
 352     return etree.tostring(working, method='text', encoding='unicode').strip()
 
 355 def add_table_of_contents(root):
 
 358     for element in root.iterdescendants():
 
 359         if element.tag in ('h2', 'h3'):
 
 362                     lambda e: e.get('id') in (
 
 363                         'footnotes', 'nota_red'
 
 364                     ) or e.get('class') in ('person-list',)):
 
 367             element_text = raw_printable_text(element)
 
 368             if (element.tag == 'h3' and len(sections)
 
 369                     and sections[-1][1] == 'h2'):
 
 370                 sections[-1][3].append(
 
 371                     (counter, element.tag, element_text, [])
 
 374                 sections.append((counter, element.tag, element_text, []))
 
 375             add_anchor(element, "s%d" % counter, with_link=False)
 
 378     toc = etree.Element('div')
 
 380     toc_header = etree.SubElement(toc, 'h2')
 
 381     toc_header.text = 'Spis treści'
 
 382     toc_list = etree.SubElement(toc, 'ol')
 
 384     for n, section, text, subsections in sections:
 
 385         section_element = etree.SubElement(toc_list, 'li')
 
 386         add_anchor(section_element, "s%d" % n, with_target=False,
 
 390             subsection_list = etree.SubElement(section_element, 'ol')
 
 391             for n1, subsection, subtext, _ in subsections:
 
 392                 subsection_element = etree.SubElement(subsection_list, 'li')
 
 393                 add_anchor(subsection_element, "s%d" % n1, with_target=False,
 
 399 def add_table_of_themes(root):
 
 401         from sortify import sortify
 
 407     for fragment in root.findall('.//a[@class="theme-begin"]'):
 
 408         if not fragment.text:
 
 410         theme_names = [s.strip() for s in fragment.text.split(',')]
 
 411         for theme_name in theme_names:
 
 412             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 
 413     book_themes = list(book_themes.items())
 
 414     book_themes.sort(key=lambda s: sortify(s[0]))
 
 415     themes_div = etree.Element('div', id="themes")
 
 416     themes_ol = etree.SubElement(themes_div, 'ol')
 
 417     for theme_name, fragments in book_themes:
 
 418         themes_li = etree.SubElement(themes_ol, 'li')
 
 419         themes_li.text = "%s: " % theme_name
 
 420         for i, fragment in enumerate(fragments):
 
 421             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 
 422             item.text = str(i + 1)
 
 424     root.insert(0, themes_div)
 
 427 def extract_annotations(html_path):
 
 428     """Extracts annotations from HTML for annotations dictionary.
 
 430     For each annotation, yields a tuple of:
 
 431     anchor, footnote type, valid qualifiers, text, html.
 
 434     from .fn_qualifiers import FN_QUALIFIERS
 
 436     parser = etree.HTMLParser(encoding='utf-8')
 
 437     tree = etree.parse(html_path, parser)
 
 438     footnotes = tree.find('//*[@id="footnotes"]')
 
 439     re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
 
 440     if footnotes is not None:
 
 441         for footnote in footnotes.findall('div'):
 
 442             fn_type = footnote.get('class').split('-')[1]
 
 443             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 
 446             if len(footnote) and footnote[-1].tail == '\n':
 
 447                 footnote[-1].tail = None
 
 448             text_str = etree.tostring(footnote, method='text',
 
 449                                       encoding='unicode').strip()
 
 450             html_str = etree.tostring(footnote, method='html',
 
 451                                       encoding='unicode').strip()
 
 453             match = re_qualifier.match(text_str)
 
 455                 qualifier_str = match.group(1)
 
 457                 for candidate in re.split('[;,]', qualifier_str):
 
 458                     candidate = candidate.strip()
 
 459                     if candidate in FN_QUALIFIERS:
 
 460                         qualifiers.append(candidate)
 
 461                     elif candidate.startswith('z '):
 
 462                         subcandidate = candidate.split()[1]
 
 463                         if subcandidate in FN_QUALIFIERS:
 
 464                             qualifiers.append(subcandidate)
 
 468             yield anchor, fn_type, qualifiers, text_str, html_str