X-Git-Url: https://git.mdrn.pl/librarian.git/blobdiff_plain/716a9ab552bffbb7df2cb31ae41ee196902c7653..3a0c83394d5783715fab2be29fa1a9cfc3574e28:/src/librarian/html.py diff --git a/src/librarian/html.py b/src/librarian/html.py index 78f3dad..c2f41c0 100644 --- a/src/librarian/html.py +++ b/src/librarian/html.py @@ -1,20 +1,19 @@ -# -*- coding: utf-8 -*- -# # This file is part of Librarian, licensed under GNU Affero GPLv3 or later. -# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# Copyright © Fundacja Wolne Lektury. See NOTICE for more information. # -from __future__ import print_function, unicode_literals - +import io import os import re import copy +import urllib.parse +import urllib.request from lxml import etree from librarian import XHTMLNS, ParseError, OutputFile from librarian import functions +from PIL import Image from lxml.etree import XMLSyntaxError, XSLTApplyError -import six functions.reg_substitute_entities() @@ -22,8 +21,6 @@ functions.reg_person_name() STYLESHEETS = { 'legacy': 'xslt/book2html.xslt', - 'full': 'xslt/wl2html_full.xslt', - 'partial': 'xslt/wl2html_partial.xslt' } @@ -41,7 +38,7 @@ def transform_abstrakt(abstrakt_element): style_filename = get_stylesheet('legacy') style = etree.parse(style_filename) xml = etree.tostring(abstrakt_element, encoding='unicode') - document = etree.parse(six.StringIO( + document = etree.parse(io.StringIO( xml.replace('abstrakt', 'dlugi_cytat') )) # HACK result = document.xslt(style) @@ -50,7 +47,48 @@ def transform_abstrakt(abstrakt_element): return re.sub(']*>', '', html) -def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None): +def add_image_sizes(tree, gallery_path, gallery_url, base_url): + widths = [360, 600, 1200, 1800, 2400] + + for i, ilustr in enumerate(tree.findall('//ilustr')): + rel_path = ilustr.attrib['src'] + img_url = urllib.parse.urljoin(base_url, rel_path) + + f = urllib.request.urlopen(img_url) + img = Image.open(f) + ext = {'GIF': 'gif', 'PNG': 'png'}.get(img.format, 'jpg') + + srcset = [] + # Needed widths: predefined and original, limited by + # whichever is smaller. + img_widths = [ + w for w in + sorted( + set(widths + [img.size[0]]) + ) + if w <= min(widths[-1], img.size[0]) + ] + largest = None + for w in widths: + fname = '%d.W%d.%s' % (i, w, ext) + fpath = gallery_path + fname + if not os.path.exists(fpath): + height = round(img.size[1] * w / img.size[0]) + th = img.resize((w, height)) + th.save(fpath) + th_url = gallery_url + fname + srcset.append(" ".join(( + th_url, + '%dw' % w + ))) + largest_url = th_url + ilustr.attrib['srcset'] = ", ".join(srcset) + ilustr.attrib['src'] = largest_url + + f.close() + + +def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None, gallery_path='img/', gallery_url='img/', base_url='file://./'): """Transforms the WL document to XHTML. If output_filename is None, returns an XML, @@ -72,10 +110,17 @@ def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None): document.clean_ed_note() document.clean_ed_note('abstrakt') - + document.fix_pa_akap() + if not options: options = {} - options.setdefault('gallery', "''") + + try: + os.makedirs(gallery_path) + except OSError: + pass + + add_image_sizes(document.edoc, gallery_path, gallery_url, base_url) css = ( css @@ -102,8 +147,7 @@ def transform(wldoc, stylesheet='legacy', options=None, flags=None, css=None): raise ParseError(e) -@six.python_2_unicode_compatible -class Fragment(object): +class Fragment: def __init__(self, id, themes): super(Fragment, self).__init__() self.id = id @@ -131,7 +175,7 @@ class Fragment(object): result = [] for event, element in self.closed_events(): if event == 'start': - result.append(u'<%s %s>' % ( + result.append('<%s %s>' % ( element.tag, ' '.join( '%s="%s"' % (k, v) @@ -141,7 +185,7 @@ class Fragment(object): if element.text: result.append(element.text) elif event == 'end': - result.append(u'' % element.tag) + result.append('' % element.tag) if element.tail: result.append(element.tail) else: @@ -160,7 +204,7 @@ def extract_fragments(input_filename): # iterparse would die on a HTML document parser = etree.HTMLParser(encoding='utf-8') - buf = six.BytesIO() + buf = io.BytesIO() buf.write(etree.tostring( etree.parse(input_filename, parser).getroot()[0][0], encoding='utf-8' @@ -183,6 +227,8 @@ def extract_fragments(input_filename): while parent.get('id', None) != 'book-text': cparent = copy.deepcopy(parent) cparent.text = None + if 'id' in cparent.attrib: + del cparent.attrib['id'] parents.append(cparent) parent = parent.getparent() @@ -190,7 +236,8 @@ def extract_fragments(input_filename): for parent in parents: fragment.append('start', parent) - open_fragments[fragment.id] = fragment + if fragment.id not in open_fragments: + open_fragments[fragment.id] = fragment # Close existing fragment else: @@ -222,8 +269,11 @@ def extract_fragments(input_filename): ) else: for fragment_id in open_fragments: + celem = copy.copy(element) + if 'id' in celem.attrib: + del celem.attrib['id'] open_fragments[fragment_id].append( - event, copy.copy(element) + event, celem ) return closed_fragments, open_fragments @@ -239,13 +289,13 @@ def add_anchor(element, prefix, with_link=True, with_target=True, link_text = prefix anchor = etree.Element('a', href='#%s' % prefix) anchor.set('class', 'anchor') - anchor.text = six.text_type(link_text) + anchor.text = str(link_text) parent.insert(index, anchor) if with_target: anchor_target = etree.Element('a', name='%s' % prefix) anchor_target.set('class', 'target') - anchor_target.text = u' ' + anchor_target.text = ' ' parent.insert(index, anchor_target) @@ -258,6 +308,7 @@ def any_ancestor(element, test): def add_anchors(root): counter = 1 + visible_counter = 1 for element in root.iterdescendants(): def f(e): return ( @@ -266,17 +317,27 @@ def add_anchors(root): ) or e.get('id') == 'nota_red' or e.tag == 'blockquote' + or e.get('id') == 'footnotes' ) + + if element.get('class') == 'numeracja': + try: + visible_counter = int(element.get('data-start')) + except ValueError: + visible_counter = 1 + if any_ancestor(element, f): continue if element.tag == 'div' and 'verse' in element.get('class', ''): - if counter == 1 or counter % 5 == 0: - add_anchor(element, "f%d" % counter, link_text=counter) + if visible_counter == 1 or visible_counter % 5 == 0: + add_anchor(element, "f%d" % counter, link_text=visible_counter) counter += 1 + visible_counter += 1 elif 'paragraph' in element.get('class', ''): - add_anchor(element, "f%d" % counter, link_text=counter) + add_anchor(element, "f%d" % counter, link_text=visible_counter) counter += 1 + visible_counter += 1 def raw_printable_text(element): @@ -313,7 +374,7 @@ def add_table_of_contents(root): toc = etree.Element('div') toc.set('id', 'toc') toc_header = etree.SubElement(toc, 'h2') - toc_header.text = u'Spis treści' + toc_header.text = 'Spis treści' toc_list = etree.SubElement(toc, 'ol') for n, section, text, subsections in sections: