X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/8b808f24709efb16f6b6eff6abb05b41341573c7..6b9c5843b31661562ee76ee75a360219de0397a6:/apps/search/custom.py diff --git a/apps/search/custom.py b/apps/search/custom.py index fcc3bace2..b3b704d0b 100644 --- a/apps/search/custom.py +++ b/apps/search/custom.py @@ -1,15 +1,18 @@ - +# -*- coding: utf-8 -*- +# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. +# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. +# from sunburnt import sunburnt from lxml import etree import urllib import warnings from sunburnt import search import copy +from httplib2 import socket +import re class TermVectorOptions(search.Options): - option_name = "tv" - def __init__(self, schema, original=None): self.schema = schema if original is None: @@ -30,7 +33,8 @@ class TermVectorOptions(search.Options): def options(self): opts = {} - opts['tv'] = 'true' + if self.positions or self.fields: + opts['tv'] = 'true' if self.positions: opts['tv.positions'] = 'true' if self.fields: @@ -72,12 +76,12 @@ def __term_vector(self, positions=False, fields=None): newself.term_vectorer.update(positions, fields) return newself setattr(search.SolrSearch, 'term_vector', __term_vector) -__original__init_common_modules = search.SolrSearch._init_common_modules def __patched__init_common_modules(self): __original__init_common_modules(self) self.term_vectorer = TermVectorOptions(self.schema) +__original__init_common_modules = search.SolrSearch._init_common_modules setattr(search.SolrSearch, '_init_common_modules', __patched__init_common_modules) @@ -86,11 +90,14 @@ class CustomSolrInterface(sunburnt.SolrInterface): def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1, max_length_get_url=sunburnt.MAX_LENGTH_GET_URL): self.conn = CustomSolrConnection(url, http_connection, retry_timeout, max_length_get_url) self.schemadoc = schemadoc - if mode == 'r': + if 'w' not in mode: self.writeable = False - elif mode == 'w': + elif 'r' not in mode: self.readable = False - self.init_schema() + try: + self.init_schema() + except socket.error, e: + raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e) def _analyze(self, **kwargs): if not self.readable: @@ -118,39 +125,67 @@ class CustomSolrInterface(sunburnt.SolrInterface): end = int(wrd.xpath("int[@name='end']")[0].text) matches.add((start, end)) - print matches if matches: return self.substring(kwargs['text'], matches, - margins=kwargs.get('margins', 30), - mark=kwargs.get('mark', ("", ""))) + margins=kwargs.get('margins', 30), + mark=kwargs.get('mark', ("", ""))) else: return None def analyze(self, **kwargs): - doc = self._analyze(self, **kwargs) - terms = doc.xpath("/lst[@name='index']/arr[last()]/lst/str[1]") + doc = self._analyze(**kwargs) + terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]") terms = map(lambda n: unicode(n.text), terms) return terms + def expand_margins(self, text, start, end): + totlen = len(text) + + def is_boundary(x): + ws = re.compile(r"\W", re.UNICODE) + return bool(ws.match(x)) + + while start > 0: + if is_boundary(text[start - 1]): + break + start -= 1 + + while end < totlen - 1: + if is_boundary(text[end + 1]): + break + end += 1 + + return (start, end) + def substring(self, text, matches, margins=30, mark=("", "")): start = None end = None totlen = len(text) - matches_margins = map(lambda (s, e): (max(0, s - margins), min(totlen, e + margins)), matches) - (start, end) = matches_margins[0] - - for (s, e) in matches_margins[1:]: + matches_margins = map(lambda (s, e): + ((s, e), + (max(0, s - margins), min(totlen, e + margins))), + matches) + matches_margins = map(lambda (m, (s, e)): + (m, self.expand_margins(text, s, e)), + matches_margins) + + # lets start with first match + (start, end) = matches_margins[0][1] + matches = [matches_margins[0][0]] + + for (m, (s, e)) in matches_margins[1:]: if end < s or start > e: continue start = min(start, s) end = max(end, e) + matches.append(m) snip = text[start:end] - matches = list(matches) matches.sort(lambda a, b: cmp(b[0], a[0])) + for (s, e) in matches: off = - start snip = snip[:e + off] + mark[1] + snip[e + off:] snip = snip[:s + off] + mark[0] + snip[s + off:] - # maybe break on word boundaries + return snip