1 # -*- coding: utf-8 -*-
 
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
 
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 
   6 from __future__ import print_function, unicode_literals
 
  12 from lxml import etree
 
  13 from librarian import XHTMLNS, ParseError, OutputFile
 
  14 from librarian import functions
 
  17 from lxml.etree import XMLSyntaxError, XSLTApplyError
 
  21 functions.reg_substitute_entities()
 
  22 functions.reg_person_name()
 
  25     'legacy': 'xslt/book2html.xslt',
 
  26     'full': 'xslt/wl2html_full.xslt',
 
  27     'partial': 'xslt/wl2html_partial.xslt'
 
  31 def get_stylesheet(name):
 
  32     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
 
  35 def html_has_content(text):
         # NOTE(review): the statement wrapping the expression below is not
         # visible in this excerpt; presumably the XPath is evaluated against
         # `text` to decide whether the document has any content -- confirm.
 
             # XPath union of <p> and <h1> elements, each listed both without
             # a namespace and qualified with the XHTMLNS namespace.
  37         '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
 
  41 def transform_abstrakt(abstrakt_element):
 
  42     style_filename = get_stylesheet('legacy')
 
  43     style = etree.parse(style_filename)
 
  44     xml = etree.tostring(abstrakt_element, encoding='unicode')
 
  45     document = etree.parse(six.StringIO(
 
  46         xml.replace('abstrakt', 'dlugi_cytat')
 
  48     result = document.xslt(style)
 
  49     html = re.sub('<a name="sec[0-9]*"/>', '',
 
  50                   etree.tostring(result, encoding='unicode'))
 
  51     return re.sub('</?blockquote[^>]*>', '', html)
 
  54 def add_image_sizes(tree, gallery_path, gallery_url, base_url):
         # For every <ilustr> element found in `tree`: resolve its `src`
         # against `base_url`, open that URL, and rewrite the element's
         # `srcset` (joined from the collected entries) and `src`
         # (set to `largest_url`) attributes.
         # NOTE(review): several statements (image decoding, the loop over
         # the widths, saving the resized image, the definition of
         # `largest_url`) are not visible in this excerpt.
 
         # Predefined candidate widths (presumably in pixels).
  55     widths = [360, 600, 1200, 1800, 2400]
 
  57     for i, ilustr in enumerate(tree.findall('//ilustr')):
 
  58         rel_path = ilustr.attrib['src']
 
             # The illustration source is resolved relative to base_url.
  59         img_url = six.moves.urllib.parse.urljoin(base_url, rel_path)
 
             # NOTE(review): no close() of this handle is visible here --
             # confirm that it is released.
  61         f = six.moves.urllib.request.urlopen(img_url)
 
             # File extension derived from the image format; anything other
             # than GIF or PNG maps to 'jpg'.
  63         ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg')
 
  66         # Needed widths: predefined and original, limited by
 
  67         # whichever is smaller.
 
  71                 set(widths + [img.size[0]])
 
  73             if w <= min(widths[-1], img.size[0])
 
                 # Variant file name: <illustration index>.W<width>.<extension>
  77             fname = '%d.W%d.%s' % (i, w, ext)
 
                 # Plain string concatenation: gallery_path has to end with a
                 # path separator already.
  78             fpath = gallery_path + fname
 
                 # The resize is only done when the variant file is missing.
  79             if not os.path.exists(fpath):
 
                     # Height scaled by the same factor as the width.
  80                 height = round(img.size[1] * w / img.size[0])
 
  81                 th = img.resize((w, height))
 
                 # Same concatenation rule as for fpath, on the URL side.
  83             th_url = gallery_url + fname
 
  84             srcset.append(" ".join((
 
  89         ilustr.attrib['srcset'] = ", ".join(srcset)
 
  90         ilustr.attrib['src'] = largest_url
 
  95 def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'):
         # NOTE(review): the docstring below mentions `output_filename`, which
         # is not a parameter of this function; the visible code returns the
         # result of OutputFile.from_bytes(...) when the document has content.
 
  96     """Transforms the WL document to XHTML.
 
  98     If output_filename is None, returns an XML,
 
  99     otherwise returns True if file has been written,False if it hasn't.
 
 100     File won't be written if it has no content.
 
             # Resolve the stylesheet name to a file path and parse the XSLT.
 104         style_filename = get_stylesheet(stylesheet)
 
 105         style = etree.parse(style_filename)
 
             # All following modifications are applied to a deep copy, not to
             # the object passed in by the caller.
 107         document = copy.deepcopy(wldoc)
 
 109         document.swap_endlines()
 
                     # Each flag becomes an attribute with the value 'yes' on
                     # the root element of the document.
 113                 document.edoc.getroot().set(flag, 'yes')
 
 115         document.clean_ed_note()
 
 116         document.clean_ed_note('abstrakt')
 
 117         document.fix_pa_akap()
 
 123             os.makedirs(gallery_path)
 
             # Rewrites the illustration elements of the document tree.
 127         add_image_sizes(document.edoc, gallery_path, gallery_url, base_url)
 
                 # Presumably the fallback used when `css` is falsy -- the
                 # left-hand side of this `or` is not visible here.
 131             or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
 
 134         result = document.transform(style, css=css, **options)
 
 135         del document  # no longer needed large object :)
 
             # Anchors, the table of themes and the table of contents are only
             # added when the transformed document has content.
 137         if html_has_content(result):
 
 138             add_anchors(result.getroot())
 
 139             add_table_of_themes(result.getroot())
 
 140             add_table_of_contents(result.getroot())
 
                 # Serialized as UTF-8 encoded HTML without an XML declaration.
 142             return OutputFile.from_bytes(etree.tostring(
 
 143                 result, method='html', xml_declaration=False,
 
 144                 pretty_print=True, encoding='utf-8'
 
             # NOTE(review): the message contains '%s' but nothing is
             # interpolated into it, so the literal placeholder ends up in the
             # error text; it probably should be formatted with `stylesheet`.
 149         raise ValueError("'%s' is not a valid stylesheet.")
 
         # The body of this handler is not visible in this excerpt.
 150     except (XMLSyntaxError, XSLTApplyError) as e:
 
 154 @six.python_2_unicode_compatible
 
 155 class Fragment(object):
         # Collects (event, element) pairs for one fragment and renders them
         # to a single string.
         # NOTE(review): parts of this class (the rest of __init__, two method
         # headers, several conditions) are not visible in this excerpt.
 
 156     def __init__(self, id, themes):
 
 157         super(Fragment, self).__init__()
 
         # Record one (event, element) pair at the end of the event list.
 162     def append(self, event, element):
 
 163         self.events.append((event, element))
 
         # Returns the recorded events followed by the entries left on `stack`
         # (presumably the 'end' events for elements that were started but
         # never ended -- the stack handling is only partly visible).
 165     def closed_events(self):
 
 167         for event, element in self.events:
 
 169                 stack.append(('end', element))
 
 174                     print('CLOSED NON-OPEN TAG:', element)
 
             # A new list is built; self.events itself is not modified here.
 177         return self.events + stack
 
             # Rendering loop (its method header is not visible here): writes
             # a start tag with attributes and the element text, an end tag
             # and the tail, or appends the entry itself; the conditions that
             # select between these cases are not visible.
 181         for event, element in self.closed_events():
 
 183                 result.append(u'<%s %s>' % (
 
 187                         for k, v in element.attrib.items()
 
 191                     result.append(element.text)
 
 193                 result.append(u'</%s>' % element.tag)
 
 195                     result.append(element.tail)
 
 197                 result.append(element)
 
 199         return ''.join(result)
 
             # Belongs to a method whose header is not visible here (given the
             # class decorator, presumably __str__ -- confirm).
 202         return self.to_string()
 
 205 def extract_fragments(input_filename):
 
 206     """Extracts theme fragments from input_filename."""
         # Returns a pair (closed_fragments, open_fragments).  closed_fragments
         # is keyed by fragment id; open_fragments is used the same way
         # (its initialisation is not visible in this excerpt).
 
 208     closed_fragments = {}
 
 210     # iterparse would die on a HTML document
 
 211     parser = etree.HTMLParser(encoding='utf-8')
 
         # The file is parsed with the HTML parser and the first child of the
         # root's first child is re-serialized into `buf`, which iterparse
         # reads below (presumably <html>/<body>/first element -- confirm).
 213     buf.write(etree.tostring(
 
 214         etree.parse(input_filename, parser).getroot()[0][0],
 
 219     for event, element in etree.iterparse(buf, events=('start', 'end')):
 
 220         # Process begin and end elements
 
 221         if element.get('class', '') in ('theme-begin', 'theme-end'):
 
 222             if not event == 'end':
 
 223                 continue  # Process elements only once, on end event
 
 226             if element.get('class', '') == 'theme-begin':
 
                     # The marker's `fid` attribute is the fragment id and its
                     # text is passed as the themes.
 227                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 
                     # Walk up the ancestors until the element with
                     # id="book-text", collecting a deep copy of each one with
                     # its `id` attribute removed.
 230                 parent = element.getparent()
 
 232                 while parent.get('id', None) != 'book-text':
 
 233                     cparent = copy.deepcopy(parent)
 
 235                     if 'id' in cparent.attrib:
 
 236                         del cparent.attrib['id']
 
 237                     parents.append(cparent)
 
 238                     parent = parent.getparent()
 
                     # The fragment starts with a 'start' event for every
                     # collected ancestor copy.
 241                 for parent in parents:
 
 242                     fragment.append('start', parent)
 
                     # A fragment id that is already open is not replaced.
 244                 if fragment.id not in open_fragments:
 
 245                     open_fragments[fragment.id] = fragment
 
 247             # Close existing fragment
 
 250                     fragment = open_fragments[element.get('fid')]
 
 252                     print('%s:closed not open fragment #%s' % (
 
 253                         input_filename, element.get('fid')
 
                         # Move the fragment from the open to the closed map.
 256                     closed_fragments[fragment.id] = fragment
 
 257                     del open_fragments[fragment.id]
 
 259             # Append element tail to lost_text
 
 260             # (we don't want to lose any text)
 
 262                 for fragment_id in open_fragments:
 
 263                     open_fragments[fragment_id].append('text', element.tail)
 
 265         # Process all elements except begin and end
 
 267             # Omit annotation tags
 
 268             if (len(element.get('name', '')) or
 
 269                     element.get('class', '') in ('annotation', 'anchor')):
 
                     # The omitted element's tail text is still passed on to
                     # every open fragment.
 270                 if event == 'end' and element.tail:
 
 271                     for fragment_id in open_fragments:
 
 272                         open_fragments[fragment_id].append(
 
                     # Every open fragment gets its own shallow copy of the
                     # element with the `id` attribute removed.
 276                 for fragment_id in open_fragments:
 
 277                     celem = copy.copy(element)
 
 278                     if 'id' in celem.attrib:
 
 279                         del celem.attrib['id']
 
 280                     open_fragments[fragment_id].append(
 
 284     return closed_fragments, open_fragments
 
 287 def add_anchor(element, prefix, with_link=True, with_target=True,
 
         # Inserts <a> elements into the parent of `element`, at the position
         # `element` occupied: a link <a class="anchor" href="#prefix"> and a
         # target <a class="target" name="prefix">.
         # NOTE(review): the end of the signature and the conditions guarding
         # the two parts (presumably `with_link` / `with_target`) are not
         # visible in this excerpt.
 289     parent = element.getparent()
 
 290     index = parent.index(element)
 
 293         if link_text is None:
 
 295         anchor = etree.Element('a', href='#%s' % prefix)
 
 296         anchor.set('class', 'anchor')
 
 297         anchor.text = six.text_type(link_text)
 
 298         parent.insert(index, anchor)
 
 301         anchor_target = etree.Element('a', name='%s' % prefix)
 
 302         anchor_target.set('class', 'target')
 
 303         anchor_target.text = u' '
 
             # Same index as the link above, so when both are inserted the
             # target ends up before the link.
 304         parent.insert(index, anchor_target)
 
 307 def any_ancestor(element, test):
         # Iterates over the ancestors of `element`.
         # NOTE(review): the loop body is not visible in this excerpt;
         # presumably it reports whether `test` holds for any ancestor.
 
 308     for ancestor in element.iterancestors():
 
 314 def add_anchors(root):
         # Walks all descendants of `root` and calls add_anchor() on verse
         # <div> elements and on elements whose class contains 'paragraph'.
         # NOTE(review): the counter initialisation/increments and the header
         # of the predicate `f` are not visible in this excerpt.
 
 317     for element in root.iterdescendants():
 
 321                     'note', 'motto', 'motto_podpis', 'dedication', 'frame'
 
 323                 or e.get('id') == 'nota_red'
 
 324                 or e.tag == 'blockquote'
 
 325                 or e.get('id') == 'footnotes'
 
             # A 'numeracja' element sets the visible counter from its
             # `data-start` attribute.
 328         if element.get('class') == 'numeracja':
 
 330                 visible_counter = int(element.get('data-start'))
 
             # Elements with an ancestor matching `f` are treated separately
             # (the branch body is not visible here).
 334         if any_ancestor(element, f):
 
             # Verses get an anchor only when the visible number is 1 or a
             # multiple of 5; paragraphs get one on every visit.  The anchor
             # name uses `counter`, the link text shows `visible_counter`.
 337         if element.tag == 'div' and 'verse' in element.get('class', ''):
 
 338             if visible_counter == 1 or visible_counter % 5 == 0:
 
 339                 add_anchor(element, "f%d" % counter, link_text=visible_counter)
 
 342         elif 'paragraph' in element.get('class', ''):
 
 343             add_anchor(element, "f%d" % counter, link_text=visible_counter)
 
 348 def raw_printable_text(element):
         # Returns the text serialization of `element`, stripped of leading
         # and trailing whitespace.  The work is done on a deep copy, so the
         # element passed in is left untouched.
 
 349     working = copy.deepcopy(element)
 
         # Only direct <a> children are visited; those with class 'annotation'
         # or 'theme-begin' get special handling (that statement is not
         # visible in this excerpt).
 350     for e in working.findall('a'):
 
 351         if e.get('class') in ('annotation', 'theme-begin'):
 
 353     return etree.tostring(working, method='text', encoding='unicode').strip()
 
 356 def add_table_of_contents(root):
         # Collects the <h2>/<h3> headings below `root`, gives each one an
         # "s<counter>" target anchor and builds a <div> containing a nested
         # <ol> table of contents.
         # NOTE(review): the initialisation of `sections`/`counter` and the
         # place where `toc` is attached to the document are not visible in
         # this excerpt.
 
 359     for element in root.iterdescendants():
 
 360         if element.tag in ('h2', 'h3'):
 
 363                     lambda e: e.get('id') in (
 
 364                         'footnotes', 'nota_red'
 
 365                     ) or e.get('class') in ('person-list',)):
 
 368             element_text = raw_printable_text(element)
 
                 # Entries are tuples (counter, tag, text, subsections).  An
                 # <h3> following an <h2> entry is stored as a subsection of
                 # that entry.
 369             if (element.tag == 'h3' and len(sections)
 
 370                     and sections[-1][1] == 'h2'):
 
 371                 sections[-1][3].append(
 
 372                     (counter, element.tag, element_text, [])
 
 375                 sections.append((counter, element.tag, element_text, []))
 
                 # Only the target is added next to the heading, no link.
 376             add_anchor(element, "s%d" % counter, with_link=False)
 
 379     toc = etree.Element('div')
 
 381     toc_header = etree.SubElement(toc, 'h2')
 
 382     toc_header.text = u'Spis treści'
 
 383     toc_list = etree.SubElement(toc, 'ol')
 
 385     for n, section, text, subsections in sections:
 
 386         section_element = etree.SubElement(toc_list, 'li')
 
             # NOTE(review): add_anchor() inserts into the parent of the
             # element it is given, so the link lands in the <ol> just before
             # this <li>, not inside it -- confirm this is intended.
 387         add_anchor(section_element, "s%d" % n, with_target=False,
 
 391             subsection_list = etree.SubElement(section_element, 'ol')
 
 392             for n1, subsection, subtext, _ in subsections:
 
 393                 subsection_element = etree.SubElement(subsection_list, 'li')
 
 394                 add_anchor(subsection_element, "s%d" % n1, with_target=False,
 
 400 def add_table_of_themes(root):
         # Builds <div id="themes"> with one <li> per theme name, each holding
         # numbered links to the fragments of that theme, and inserts it as
         # the first child of `root`.
         # NOTE(review): the statements around the sortify import and the
         # initialisation of `book_themes` are not visible in this excerpt.
 
 402         from sortify import sortify
 
         # Every theme-begin marker contributes its `name` attribute to each
         # of the comma-separated theme names found in its text.
 408     for fragment in root.findall('.//a[@class="theme-begin"]'):
 
 409         if not fragment.text:
 
 411         theme_names = [s.strip() for s in fragment.text.split(',')]
 
 412         for theme_name in theme_names:
 
 413             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 
         # From here on book_themes is a list of (theme name, fragment names)
         # pairs, ordered by the sortify() key of the theme name.
 414     book_themes = list(book_themes.items())
 
 415     book_themes.sort(key=lambda s: sortify(s[0]))
 
 416     themes_div = etree.Element('div', id="themes")
 
 417     themes_ol = etree.SubElement(themes_div, 'ol')
 
 418     for theme_name, fragments in book_themes:
 
 419         themes_li = etree.SubElement(themes_ol, 'li')
 
 420         themes_li.text = "%s: " % theme_name
 
             # Links are labelled 1, 2, 3, ... and point at the fragment name.
 421         for i, fragment in enumerate(fragments):
 
 422             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 
 423             item.text = str(i + 1)
 
 425     root.insert(0, themes_div)
 
 428 def extract_annotations(html_path):
 
 429     """Extracts annotations from HTML for annotations dictionary.
 
 431     For each annotation, yields a tuple of:
 
 432     anchor, footnote type, valid qualifiers, text, html.
 
 435     from .fn_qualifiers import FN_QUALIFIERS
 
 437     parser = etree.HTMLParser(encoding='utf-8')
 
 438     tree = etree.parse(html_path, parser)
 
         # Nothing is yielded when the document has no element with
         # id="footnotes".
 439     footnotes = tree.find('//*[@id="footnotes"]')
 
         # \u2014 is the em dash.  The pattern matches, from the start of the
         # text: a run without em dashes, whitespace, a parenthesised group
         # (captured), whitespace and an em dash.
 440     re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
 
 441     if footnotes is not None:
 
 442         for footnote in footnotes.findall('div'):
 
                 # Footnote type: the second '-'-separated part of the class.
 443             fn_type = footnote.get('class').split('-')[1]
 
                 # The href of the annotation link without its first
                 # character (presumably the leading '#').
 444             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 
                 # A tail consisting of a single newline after the last child
                 # is dropped before the footnote is serialized.
 447             if len(footnote) and footnote[-1].tail == '\n':
 
 448                 footnote[-1].tail = None
 
 449             text_str = etree.tostring(footnote, method='text',
 
 450                                       encoding='unicode').strip()
 
 451             html_str = etree.tostring(footnote, method='html',
 
 452                                       encoding='unicode').strip()
 
 454             match = re_qualifier.match(text_str)
 
 456                 qualifier_str = match.group(1)
 
                     # Candidates are separated by ';' or ','.  A candidate is
                     # kept when it is a known qualifier; for one starting
                     # with 'z ' the second word is checked instead.
 458                 for candidate in re.split('[;,]', qualifier_str):
 
 459                     candidate = candidate.strip()
 
 460                     if candidate in FN_QUALIFIERS:
 
 461                         qualifiers.append(candidate)
 
 462                     elif candidate.startswith('z '):
 
 463                         subcandidate = candidate.split()[1]
 
 464                         if subcandidate in FN_QUALIFIERS:
 
 465                             qualifiers.append(subcandidate)
 
 469             yield anchor, fn_type, qualifiers, text_str, html_str