X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/357027375ff8867f42ca34bcbfb5a78b5b185fc3..a4cf06ec9135a9375fbc9ed2c697bc60ad6052b5:/src/search/custom.py diff --git a/src/search/custom.py b/src/search/custom.py index b3b704d0b..933715719 100644 --- a/src/search/custom.py +++ b/src/search/custom.py @@ -1,116 +1,86 @@ -# -*- coding: utf-8 -*- # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later. # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information. # -from sunburnt import sunburnt -from lxml import etree -import urllib +import re +from urllib.parse import urlencode import warnings -from sunburnt import search -import copy from httplib2 import socket -import re +from lxml import etree +from scorched import connection, exc, search -class TermVectorOptions(search.Options): - def __init__(self, schema, original=None): - self.schema = schema - if original is None: - self.fields = set() - self.positions = False - else: - self.fields = copy.copy(original.fields) - self.positions = copy.copy(original.positions) - - def update(self, positions=False, fields=None): - if fields is None: - fields = [] - if isinstance(fields, basestring): - fields = [fields] - self.schema.check_fields(fields, {"stored": True}) - self.fields.update(fields) - self.positions = positions - - def options(self): - opts = {} - if self.positions or self.fields: - opts['tv'] = 'true' - if self.positions: - opts['tv.positions'] = 'true' - if self.fields: - opts['tv.fl'] = ','.join(sorted(self.fields)) - return opts - - -class CustomSolrConnection(sunburnt.SolrConnection): +class CustomSolrConnection(connection.SolrConnection): def __init__(self, *args, **kw): super(CustomSolrConnection, self).__init__(*args, **kw) self.analysis_url = self.url + "analysis/field/" def analyze(self, params): - qs = urllib.urlencode(params) + qs = urlencode(params) url = "%s?%s" % (self.analysis_url, qs) if len(url) > self.max_length_get_url: - warnings.warn("Long query URL encountered - POSTing instead of " - "GETting. This query will not be cached at the HTTP layer") + warnings.warn("Long query URL encountered - POSTing instead of GETting. " + "This query will not be cached at the HTTP layer") url = self.analysis_url kwargs = dict( method="POST", - body=qs, + data=qs, headers={"Content-Type": "application/x-www-form-urlencoded"}, ) else: kwargs = dict(method="GET") - r, c = self.request(url, **kwargs) - if r.status != 200: - raise sunburnt.SolrError(r, c) - return c - - -# monkey patching sunburnt SolrSearch -search.SolrSearch.option_modules += ('term_vectorer',) + response = self.request(url=url, **kwargs) + if response.status_code != 200: + raise exc.SolrError(response) + return response.content -def __term_vector(self, positions=False, fields=None): - newself = self.clone() - newself.term_vectorer.update(positions, fields) - return newself -setattr(search.SolrSearch, 'term_vector', __term_vector) - - -def __patched__init_common_modules(self): - __original__init_common_modules(self) - self.term_vectorer = TermVectorOptions(self.schema) -__original__init_common_modules = search.SolrSearch._init_common_modules -setattr(search.SolrSearch, '_init_common_modules', __patched__init_common_modules) - - -class CustomSolrInterface(sunburnt.SolrInterface): +class CustomSolrInterface(connection.SolrInterface): # just copied from parent and SolrConnection -> CustomSolrConnection - def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1, max_length_get_url=sunburnt.MAX_LENGTH_GET_URL): - self.conn = CustomSolrConnection(url, http_connection, retry_timeout, max_length_get_url) - self.schemadoc = schemadoc - if 'w' not in mode: - self.writeable = False - elif 'r' not in mode: - self.readable = False - try: - self.init_schema() - except socket.error, e: - raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e) + def __init__(self, url, http_connection=None, mode='', + retry_timeout=-1, max_length_get_url=connection.MAX_LENGTH_GET_URL, + search_timeout=()): + """ + :param url: url to Solr + :type url: str + :param http_connection: optional -- already existing connection + :type http_connection: requests connection + :param mode: optional -- mode (readable, writable) Solr + :type mode: str + :param retry_timeout: optional -- timeout until retry + :type retry_timeout: int + :param max_length_get_url: optional -- max length until switch to post + :type max_length_get_url: int + :param search_timeout: (optional) How long to wait for the server to + send data before giving up, as a float, or a + (connect timeout, read timeout) tuple. + :type search_timeout: float or tuple + """ + + self.conn = CustomSolrConnection( + url, http_connection, mode, retry_timeout, max_length_get_url) + self.schema = self.init_schema() + self._datefields = self._extract_datefields(self.schema) + def _analyze(self, **kwargs): - if not self.readable: + if not self.conn.readable: raise TypeError("This Solr instance is only for writing") args = { 'analysis_showmatch': True } - if 'field' in kwargs: args['analysis_fieldname'] = kwargs['field'] - if 'text' in kwargs: args['analysis_fieldvalue'] = kwargs['text'] - if 'q' in kwargs: args['q'] = kwargs['q'] - if 'query' in kwargs: args['q'] = kwargs['q'] - - params = map(lambda (k, v): (k.replace('_', '.'), v), sunburnt.params_from_dict(**args)) + if 'field' in kwargs: + args['analysis_fieldname'] = kwargs['field'] + if 'text' in kwargs: + args['analysis_fieldvalue'] = kwargs['text'] + if 'q' in kwargs: + args['q'] = kwargs['q'] + if 'query' in kwargs: + args['q'] = kwargs['q'] + + params = [ + (k.replace('_', '.'), v) + for (k, v) in search.params_from_dict(**args) + ] content = self.conn.analyze(params) doc = etree.fromstring(content) @@ -126,16 +96,15 @@ class CustomSolrInterface(sunburnt.SolrInterface): matches.add((start, end)) if matches: - return self.substring(kwargs['text'], matches, - margins=kwargs.get('margins', 30), - mark=kwargs.get('mark', ("", ""))) + return self.substring( + kwargs['text'], matches, margins=kwargs.get('margins', 30), mark=kwargs.get('mark', ("", ""))) else: return None def analyze(self, **kwargs): doc = self._analyze(**kwargs) terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]") - terms = map(lambda n: unicode(n.text), terms) + terms = map(lambda n: str(n.text), terms) return terms def expand_margins(self, text, start, end): @@ -155,37 +124,31 @@ class CustomSolrInterface(sunburnt.SolrInterface): break end += 1 - return (start, end) + return start, end def substring(self, text, matches, margins=30, mark=("", "")): - start = None - end = None totlen = len(text) - matches_margins = map(lambda (s, e): - ((s, e), - (max(0, s - margins), min(totlen, e + margins))), - matches) - matches_margins = map(lambda (m, (s, e)): - (m, self.expand_margins(text, s, e)), - matches_margins) - - # lets start with first match + matches_margins = [ + ((s, e), self.expand_margins(text, max(0, s - margins), min(totlen, e + margins))) for s, e in matches] + + # lets start with first match (start, end) = matches_margins[0][1] - matches = [matches_margins[0][0]] + new_matches = [matches_margins[0][0]] for (m, (s, e)) in matches_margins[1:]: if end < s or start > e: continue start = min(start, s) end = max(end, e) - matches.append(m) + new_matches.append(m) snip = text[start:end] - matches.sort(lambda a, b: cmp(b[0], a[0])) + new_matches.sort(key=lambda a: -a[0]) - for (s, e) in matches: - off = - start + for (s, e) in new_matches: + off = -start snip = snip[:e + off] + mark[1] + snip[e + off:] snip = snip[:s + off] + mark[0] + snip[s + off:] + snip = re.sub('%s[ \t\n]+%s' % (mark[1], mark[0]), " ", snip) return snip