diff --git a/src/search/custom.py b/src/search/custom.py
deleted file mode 100644
index da21e019e..000000000
--- a/src/search/custom.py
+++ /dev/null
@@ -1,189 +0,0 @@
-# -*- coding: utf-8 -*-
-# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
-# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
-#
-from sunburnt import sunburnt
-from lxml import etree
-import urllib
-import warnings
-from sunburnt import search
-import copy
-from httplib2 import socket
-import re
-
-
-class TermVectorOptions(search.Options):
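-    # Option module for Solr's TermVectorComponent: tracks which stored
-    # fields to return term vectors for and whether to include positions.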
- def __init__(self, schema, original=None):
- self.schema = schema
- if original is None:
- self.fields = set()
- self.positions = False
- else:
- self.fields = copy.copy(original.fields)
- self.positions = copy.copy(original.positions)
-
- def update(self, positions=False, fields=None):
- if fields is None:
- fields = []
- if isinstance(fields, basestring):
- fields = [fields]
- self.schema.check_fields(fields, {"stored": True})
- self.fields.update(fields)
- self.positions = positions
-
- def options(self):
- opts = {}
- if self.positions or self.fields:
- opts['tv'] = 'true'
- if self.positions:
- opts['tv.positions'] = 'true'
- if self.fields:
- opts['tv.fl'] = ','.join(sorted(self.fields))
- return opts
-
-
-class CustomSolrConnection(sunburnt.SolrConnection):
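-    # Adds an analyze() call on top of sunburnt's connection, targeting
-    # Solr's FieldAnalysisRequestHandler at <solr-url>analysis/field/.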
- def __init__(self, *args, **kw):
- super(CustomSolrConnection, self).__init__(*args, **kw)
- self.analysis_url = self.url + "analysis/field/"
-
- def analyze(self, params):
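-        # Mirror sunburnt's GET/POST switch: POST when the encoded query
-        # string would push the URL over max_length_get_url.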
- qs = urllib.urlencode(params)
- url = "%s?%s" % (self.analysis_url, qs)
- if len(url) > self.max_length_get_url:
- warnings.warn("Long query URL encountered - POSTing instead of GETting. "
- "This query will not be cached at the HTTP layer")
- url = self.analysis_url
- kwargs = dict(
- method="POST",
- body=qs,
- headers={"Content-Type": "application/x-www-form-urlencoded"},
- )
- else:
- kwargs = dict(method="GET")
- r, c = self.request(url, **kwargs)
- if r.status != 200:
- raise sunburnt.SolrError(r, c)
- return c
-
-
-# monkey patching sunburnt SolrSearch
-search.SolrSearch.option_modules += ('term_vectorer',)
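-# (sunburnt merges the options() dict of every registered option module
-# into the final request parameters)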
-
-
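-# Chainable builder in sunburnt's style: clone the search and record the
-# requested term-vector options on the copy.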
-def __term_vector(self, positions=False, fields=None):
- newself = self.clone()
- newself.term_vectorer.update(positions, fields)
- return newself
-setattr(search.SolrSearch, 'term_vector', __term_vector)
-
-
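-# Wrap _init_common_modules so every new SolrSearch instance also gets a
-# TermVectorOptions module.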
-def __patched__init_common_modules(self):
- __original__init_common_modules(self)
- self.term_vectorer = TermVectorOptions(self.schema)
-__original__init_common_modules = search.SolrSearch._init_common_modules
-setattr(search.SolrSearch, '_init_common_modules', __patched__init_common_modules)
-
-
-class CustomSolrInterface(sunburnt.SolrInterface):
-    # copied from the parent class, with SolrConnection replaced by CustomSolrConnection
- def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1,
- max_length_get_url=sunburnt.MAX_LENGTH_GET_URL):
- self.conn = CustomSolrConnection(url, http_connection, retry_timeout, max_length_get_url)
- self.schemadoc = schemadoc
- if 'w' not in mode:
- self.writeable = False
- elif 'r' not in mode:
- self.readable = False
- try:
- self.init_schema()
- except socket.error, e:
- raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e)
-
- def _analyze(self, **kwargs):
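-        # Translate keyword arguments into FieldAnalysisRequestHandler
-        # parameters and return the parsed XML response.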
- if not self.readable:
- raise TypeError("This Solr instance is only for writing")
- args = {
- 'analysis_showmatch': True
- }
- if 'field' in kwargs:
- args['analysis_fieldname'] = kwargs['field']
- if 'text' in kwargs:
- args['analysis_fieldvalue'] = kwargs['text']
- if 'q' in kwargs:
- args['q'] = kwargs['q']
-        if 'query' in kwargs:
-            args['q'] = kwargs['query']
-
- params = map(lambda (k, v): (k.replace('_', '.'), v), sunburnt.params_from_dict(**args))
-
- content = self.conn.analyze(params)
- doc = etree.fromstring(content)
- return doc
-
- def highlight(self, **kwargs):
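-        # Run field analysis with showmatch, collect the (start, end)
-        # offsets of tokens flagged as matching, and mark them in the text.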
- doc = self._analyze(**kwargs)
- analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
- matches = set()
- for wrd in analyzed:
- start = int(wrd.xpath("int[@name='start']")[0].text)
- end = int(wrd.xpath("int[@name='end']")[0].text)
- matches.add((start, end))
-
- if matches:
- return self.substring(
- kwargs['text'], matches, margins=kwargs.get('margins', 30), mark=kwargs.get('mark', ("", "")))
- else:
- return None
-
- def analyze(self, **kwargs):
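-        # Return the token texts produced by the last stage of the
-        # index-time analysis chain.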
- doc = self._analyze(**kwargs)
- terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
- terms = map(lambda n: unicode(n.text), terms)
- return terms
-
- def expand_margins(self, text, start, end):
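-        # Widen [start, end] outwards to the nearest non-word characters,
-        # so the snippet does not cut words in half.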
- totlen = len(text)
-
-        ws = re.compile(r"\W", re.UNICODE)
-
-        def is_boundary(x):
-            return bool(ws.match(x))
-
- while start > 0:
- if is_boundary(text[start - 1]):
- break
- start -= 1
-
- while end < totlen - 1:
- if is_boundary(text[end + 1]):
- break
- end += 1
-
- return start, end
-
- def substring(self, text, matches, margins=30, mark=("", "")):
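-        # Build one snippet: seed it with the first match's margin window,
-        # absorb every window that overlaps it, then wrap each retained
-        # match in the mark pair.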
- totlen = len(text)
- matches_margins = [
- ((s, e), self.expand_margins(text, max(0, s - margins), min(totlen, e + margins))) for s, e in matches]
-
-        # let's start with the first match's window
- (start, end) = matches_margins[0][1]
- new_matches = [matches_margins[0][0]]
-
- for (m, (s, e)) in matches_margins[1:]:
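-            # skip windows that do not overlap the snippet built so far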
- if end < s or start > e:
- continue
- start = min(start, s)
- end = max(end, e)
- new_matches.append(m)
-
- snip = text[start:end]
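-        # apply marks right-to-left so earlier offsets stay valid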
- new_matches.sort(lambda a, b: cmp(b[0], a[0]))
-
- for (s, e) in new_matches:
- off = -start
- snip = snip[:e + off] + mark[1] + snip[e + off:]
- snip = snip[:s + off] + mark[0] + snip[s + off:]
- snip = re.sub('%s[ \t\n]+%s' % (mark[1], mark[0]), " ", snip)
-
- return snip