src/librarian/html.py

   1 # -*- coding: utf-8 -*-
   2 #
   3 # This file is part of Librarian, licensed under GNU Affero GPLv3 or later.
   4 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   5 #
   6 from __future__ import print_function, unicode_literals
   7
   8 import os
   9 import re
  10 import copy
  11
  12 from lxml import etree
  13 from librarian import XHTMLNS, ParseError, OutputFile
  14 from librarian import functions
  15 from PIL import Image
  16
  17 from lxml.etree import XMLSyntaxError, XSLTApplyError
  18 import six
  19
  20
  21 functions.reg_substitute_entities()
  22 functions.reg_person_name()
  23
  24 STYLESHEETS = {
  25     'legacy': 'xslt/book2html.xslt',
  26     'full': 'xslt/wl2html_full.xslt',
  27     'partial': 'xslt/wl2html_partial.xslt'
  28 }
  29
  30
  31 def get_stylesheet(name):
  32     return os.path.join(os.path.dirname(__file__), STYLESHEETS[name])
  33
  34
  35 def html_has_content(text):
  36     return etree.ETXPath(
  37         '//p|//{%(ns)s}p|//h1|//{%(ns)s}h1' % {'ns': str(XHTMLNS)}
  38     )(text)
  39
  40
  41 def transform_abstrakt(abstrakt_element):
  42     style_filename = get_stylesheet('legacy')
  43     style = etree.parse(style_filename)
  44     xml = etree.tostring(abstrakt_element, encoding='unicode')
  45     document = etree.parse(six.StringIO(
  46         xml.replace('abstrakt', 'dlugi_cytat')
  47     ))  # HACK
  48     result = document.xslt(style)
  49     html = re.sub('<a name="sec[0-9]*"/>', '',
  50                   etree.tostring(result, encoding='unicode'))
  51     return re.sub('</?blockquote[^>]*>', '', html)
  52
  53
  54 def add_image_sizes(tree, gallery_path, gallery_url):
  55     widths = [360, 600, 1200, 1800]
  56     for ilustr in tree.findall('//ilustr'):
  57         rel_path = ilustr.attrib['src']
  58         img = Image.open(gallery_path + rel_path)
  59         srcset = []
  60         for w in widths:
  61             if w < img.size[0]:
  62                 height = round(img.size[1] * w / img.size[0])
  63                 th = img.resize((w, height))
  64
  65                 fname = ('.W%d.' % w).join(rel_path.rsplit('.', 1))
  66                 th.save(gallery_path + fname)
  67                 srcset.append(" ".join((
  68                     gallery_url + fname,
  69                     '%dw' % w
  70                     )))
  71         srcset.append(" ".join((
  72             gallery_url + rel_path,
  73             '%dw' % img.size[0]
  74         )))
  75         ilustr.attrib['srcset'] = ", ".join(srcset)
  76         ilustr.attrib['src'] = gallery_url + rel_path
  77
  78
  79 def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/'):
  80     """Transforms the WL document to XHTML.
  81
  82     If output_filename is None, returns an XML,
  83     otherwise returns True if file has been written,False if it hasn't.
  84     File won't be written if it has no content.
  85     """
  86     # Parse XSLT
  87     try:
  88         style_filename = get_stylesheet(stylesheet)
  89         style = etree.parse(style_filename)
  90
  91         document = copy.deepcopy(wldoc)
  92         del wldoc
  93         document.swap_endlines()
  94
  95         if flags:
  96             for flag in flags:
  97                 document.edoc.getroot().set(flag, 'yes')
  98
  99         document.clean_ed_note()
 100         document.clean_ed_note('abstrakt')
 101
 102         if not options:
 103             options = {}
 104
 105         add_image_sizes(document.edoc, gallery_path, gallery_url)
 106
 107         css = (
 108             css
 109             or 'https://static.wolnelektury.pl/css/compressed/book_text.css'
 110         )
 111         css = "'%s'" % css
 112         result = document.transform(style, css=css, **options)
 113         del document  # no longer needed large object :)
 114
 115         if html_has_content(result):
 116             add_anchors(result.getroot())
 117             add_table_of_themes(result.getroot())
 118             add_table_of_contents(result.getroot())
 119
 120             return OutputFile.from_bytes(etree.tostring(
 121                 result, method='html', xml_declaration=False,
 122                 pretty_print=True, encoding='utf-8'
 123             ))
 124         else:
 125             return None
 126     except KeyError:
 127         raise ValueError("'%s' is not a valid stylesheet.")
 128     except (XMLSyntaxError, XSLTApplyError) as e:
 129         raise ParseError(e)
 130
 131
 132 @six.python_2_unicode_compatible
 133 class Fragment(object):
 134     def __init__(self, id, themes):
 135         super(Fragment, self).__init__()
 136         self.id = id
 137         self.themes = themes
 138         self.events = []
 139
 140     def append(self, event, element):
 141         self.events.append((event, element))
 142
 143     def closed_events(self):
 144         stack = []
 145         for event, element in self.events:
 146             if event == 'start':
 147                 stack.append(('end', element))
 148             elif event == 'end':
 149                 try:
 150                     stack.pop()
 151                 except IndexError:
 152                     print('CLOSED NON-OPEN TAG:', element)
 153
 154         stack.reverse()
 155         return self.events + stack
 156
 157     def to_string(self):
 158         result = []
 159         for event, element in self.closed_events():
 160             if event == 'start':
 161                 result.append(u'<%s %s>' % (
 162                     element.tag,
 163                     ' '.join(
 164                         '%s="%s"' % (k, v)
 165                         for k, v in element.attrib.items()
 166                     )
 167                 ))
 168                 if element.text:
 169                     result.append(element.text)
 170             elif event == 'end':
 171                 result.append(u'</%s>' % element.tag)
 172                 if element.tail:
 173                     result.append(element.tail)
 174             else:
 175                 result.append(element)
 176
 177         return ''.join(result)
 178
 179     def __str__(self):
 180         return self.to_string()
 181
 182
 183 def extract_fragments(input_filename):
 184     """Extracts theme fragments from input_filename."""
 185     open_fragments = {}
 186     closed_fragments = {}
 187
 188     # iterparse would die on a HTML document
 189     parser = etree.HTMLParser(encoding='utf-8')
 190     buf = six.BytesIO()
 191     buf.write(etree.tostring(
 192         etree.parse(input_filename, parser).getroot()[0][0],
 193         encoding='utf-8'
 194     ))
 195     buf.seek(0)
 196
 197     for event, element in etree.iterparse(buf, events=('start', 'end')):
 198         # Process begin and end elements
 199         if element.get('class', '') in ('theme-begin', 'theme-end'):
 200             if not event == 'end':
 201                 continue  # Process elements only once, on end event
 202
 203             # Open new fragment
 204             if element.get('class', '') == 'theme-begin':
 205                 fragment = Fragment(id=element.get('fid'), themes=element.text)
 206
 207                 # Append parents
 208                 parent = element.getparent()
 209                 parents = []
 210                 while parent.get('id', None) != 'book-text':
 211                     cparent = copy.deepcopy(parent)
 212                     cparent.text = None
 213                     if 'id' in cparent.attrib:
 214                         del cparent.attrib['id']
 215                     parents.append(cparent)
 216                     parent = parent.getparent()
 217
 218                 parents.reverse()
 219                 for parent in parents:
 220                     fragment.append('start', parent)
 221
 222                 open_fragments[fragment.id] = fragment
 223
 224             # Close existing fragment
 225             else:
 226                 try:
 227                     fragment = open_fragments[element.get('fid')]
 228                 except KeyError:
 229                     print('%s:closed not open fragment #%s' % (
 230                         input_filename, element.get('fid')
 231                     ))
 232                 else:
 233                     closed_fragments[fragment.id] = fragment
 234                     del open_fragments[fragment.id]
 235
 236             # Append element tail to lost_text
 237             # (we don't want to lose any text)
 238             if element.tail:
 239                 for fragment_id in open_fragments:
 240                     open_fragments[fragment_id].append('text', element.tail)
 241
 242         # Process all elements except begin and end
 243         else:
 244             # Omit annotation tags
 245             if (len(element.get('name', '')) or
 246                     element.get('class', '') in ('annotation', 'anchor')):
 247                 if event == 'end' and element.tail:
 248                     for fragment_id in open_fragments:
 249                         open_fragments[fragment_id].append(
 250                             'text', element.tail
 251                         )
 252             else:
 253                 for fragment_id in open_fragments:
 254                     celem = copy.copy(element)
 255                     if 'id' in celem.attrib:
 256                         del celem.attrib['id']
 257                     open_fragments[fragment_id].append(
 258                         event, celem
 259                     )
 260
 261     return closed_fragments, open_fragments
 262
 263
 264 def add_anchor(element, prefix, with_link=True, with_target=True,
 265                link_text=None):
 266     parent = element.getparent()
 267     index = parent.index(element)
 268
 269     if with_link:
 270         if link_text is None:
 271             link_text = prefix
 272         anchor = etree.Element('a', href='#%s' % prefix)
 273         anchor.set('class', 'anchor')
 274         anchor.text = six.text_type(link_text)
 275         parent.insert(index, anchor)
 276
 277     if with_target:
 278         anchor_target = etree.Element('a', name='%s' % prefix)
 279         anchor_target.set('class', 'target')
 280         anchor_target.text = u' '
 281         parent.insert(index, anchor_target)
 282
 283
 284 def any_ancestor(element, test):
 285     for ancestor in element.iterancestors():
 286         if test(ancestor):
 287             return True
 288     return False
 289
 290
 291 def add_anchors(root):
 292     counter = 1
 293     for element in root.iterdescendants():
 294         def f(e):
 295             return (
 296                 e.get('class') in (
 297                     'note', 'motto', 'motto_podpis', 'dedication', 'frame'
 298                 )
 299                 or e.get('id') == 'nota_red'
 300                 or e.tag == 'blockquote'
 301             )
 302         if any_ancestor(element, f):
 303             continue
 304
 305         if element.tag == 'div' and 'verse' in element.get('class', ''):
 306             if counter == 1 or counter % 5 == 0:
 307                 add_anchor(element, "f%d" % counter, link_text=counter)
 308             counter += 1
 309         elif 'paragraph' in element.get('class', ''):
 310             add_anchor(element, "f%d" % counter, link_text=counter)
 311             counter += 1
 312
 313
 314 def raw_printable_text(element):
 315     working = copy.deepcopy(element)
 316     for e in working.findall('a'):
 317         if e.get('class') in ('annotation', 'theme-begin'):
 318             e.text = ''
 319     return etree.tostring(working, method='text', encoding='unicode').strip()
 320
 321
 322 def add_table_of_contents(root):
 323     sections = []
 324     counter = 1
 325     for element in root.iterdescendants():
 326         if element.tag in ('h2', 'h3'):
 327             if any_ancestor(
 328                     element,
 329                     lambda e: e.get('id') in (
 330                         'footnotes', 'nota_red'
 331                     ) or e.get('class') in ('person-list',)):
 332                 continue
 333
 334             element_text = raw_printable_text(element)
 335             if (element.tag == 'h3' and len(sections)
 336                     and sections[-1][1] == 'h2'):
 337                 sections[-1][3].append(
 338                     (counter, element.tag, element_text, [])
 339                 )
 340             else:
 341                 sections.append((counter, element.tag, element_text, []))
 342             add_anchor(element, "s%d" % counter, with_link=False)
 343             counter += 1
 344
 345     toc = etree.Element('div')
 346     toc.set('id', 'toc')
 347     toc_header = etree.SubElement(toc, 'h2')
 348     toc_header.text = u'Spis treści'
 349     toc_list = etree.SubElement(toc, 'ol')
 350
 351     for n, section, text, subsections in sections:
 352         section_element = etree.SubElement(toc_list, 'li')
 353         add_anchor(section_element, "s%d" % n, with_target=False,
 354                    link_text=text)
 355
 356         if len(subsections):
 357             subsection_list = etree.SubElement(section_element, 'ol')
 358             for n1, subsection, subtext, _ in subsections:
 359                 subsection_element = etree.SubElement(subsection_list, 'li')
 360                 add_anchor(subsection_element, "s%d" % n1, with_target=False,
 361                            link_text=subtext)
 362
 363     root.insert(0, toc)
 364
 365
 366 def add_table_of_themes(root):
 367     try:
 368         from sortify import sortify
 369     except ImportError:
 370         def sortify(x):
 371             return x
 372
 373     book_themes = {}
 374     for fragment in root.findall('.//a[@class="theme-begin"]'):
 375         if not fragment.text:
 376             continue
 377         theme_names = [s.strip() for s in fragment.text.split(',')]
 378         for theme_name in theme_names:
 379             book_themes.setdefault(theme_name, []).append(fragment.get('name'))
 380     book_themes = list(book_themes.items())
 381     book_themes.sort(key=lambda s: sortify(s[0]))
 382     themes_div = etree.Element('div', id="themes")
 383     themes_ol = etree.SubElement(themes_div, 'ol')
 384     for theme_name, fragments in book_themes:
 385         themes_li = etree.SubElement(themes_ol, 'li')
 386         themes_li.text = "%s: " % theme_name
 387         for i, fragment in enumerate(fragments):
 388             item = etree.SubElement(themes_li, 'a', href="#%s" % fragment)
 389             item.text = str(i + 1)
 390             item.tail = ' '
 391     root.insert(0, themes_div)
 392
 393
 394 def extract_annotations(html_path):
 395     """Extracts annotations from HTML for annotations dictionary.
 396
 397     For each annotation, yields a tuple of:
 398     anchor, footnote type, valid qualifiers, text, html.
 399
 400     """
 401     from .fn_qualifiers import FN_QUALIFIERS
 402
 403     parser = etree.HTMLParser(encoding='utf-8')
 404     tree = etree.parse(html_path, parser)
 405     footnotes = tree.find('//*[@id="footnotes"]')
 406     re_qualifier = re.compile(r'[^\u2014]+\s+\(([^\)]+)\)\s+\u2014')
 407     if footnotes is not None:
 408         for footnote in footnotes.findall('div'):
 409             fn_type = footnote.get('class').split('-')[1]
 410             anchor = footnote.find('a[@class="annotation"]').get('href')[1:]
 411             del footnote[:2]
 412             footnote.text = None
 413             if len(footnote) and footnote[-1].tail == '\n':
 414                 footnote[-1].tail = None
 415             text_str = etree.tostring(footnote, method='text',
 416                                       encoding='unicode').strip()
 417             html_str = etree.tostring(footnote, method='html',
 418                                       encoding='unicode').strip()
 419
 420             match = re_qualifier.match(text_str)
 421             if match:
 422                 qualifier_str = match.group(1)
 423                 qualifiers = []
 424                 for candidate in re.split('[;,]', qualifier_str):
 425                     candidate = candidate.strip()
 426                     if candidate in FN_QUALIFIERS:
 427                         qualifiers.append(candidate)
 428                     elif candidate.startswith('z '):
 429                         subcandidate = candidate.split()[1]
 430                         if subcandidate in FN_QUALIFIERS:
 431                             qualifiers.append(subcandidate)
 432             else:
 433                 qualifiers = []
 434
 435             yield anchor, fn_type, qualifiers, text_str, html_str