X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/d157af1061e9f03f59ea909d7d25f4a0b41f1c0e..5d3277b79ffef805948ca0e105135d13179926b6:/apps/search/custom.py diff --git a/apps/search/custom.py b/apps/search/custom.py index 86d387e02..b3b704d0b 100644 --- a/apps/search/custom.py +++ b/apps/search/custom.py @@ -1,4 +1,7 @@ - +# -*- coding: utf-8 -*- +# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# from sunburnt import sunburnt from lxml import etree import urllib @@ -6,6 +9,8 @@ import warnings from sunburnt import search import copy from httplib2 import socket +import re + class TermVectorOptions(search.Options): def __init__(self, schema, original=None): @@ -93,7 +98,6 @@ class CustomSolrInterface(sunburnt.SolrInterface): self.init_schema() except socket.error, e: raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e) - def _analyze(self, **kwargs): if not self.readable: @@ -134,6 +138,25 @@ class CustomSolrInterface(sunburnt.SolrInterface): terms = map(lambda n: unicode(n.text), terms) return terms + def expand_margins(self, text, start, end): + totlen = len(text) + + def is_boundary(x): + ws = re.compile(r"\W", re.UNICODE) + return bool(ws.match(x)) + + while start > 0: + if is_boundary(text[start - 1]): + break + start -= 1 + + while end < totlen - 1: + if is_boundary(text[end + 1]): + break + end += 1 + + return (start, end) + def substring(self, text, matches, margins=30, mark=("", "")): start = None end = None @@ -142,15 +165,21 @@ class CustomSolrInterface(sunburnt.SolrInterface): ((s, e), (max(0, s - margins), min(totlen, e + margins))), matches) + matches_margins = map(lambda (m, (s, e)): + (m, self.expand_margins(text, s, e)), + matches_margins) + + # lets start with first match (start, end) = matches_margins[0][1] - matches = [] + matches = [matches_margins[0][0]] + for (m, (s, e)) in matches_margins[1:]: if end < s or start > e: continue start = min(start, s) end = max(end, e) matches.append(m) - + snip = text[start:end] matches.sort(lambda a, b: cmp(b[0], a[0])) @@ -158,7 +187,5 @@ class CustomSolrInterface(sunburnt.SolrInterface): off = - start snip = snip[:e + off] + mark[1] + snip[e + off:] snip = snip[:s + off] + mark[0] + snip[s + off:] - # maybe break on word boundaries return snip -