1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 from scorched import connection
7 from urllib.parse import urlencode
9 from scorched import search
11 from httplib2 import socket
class TermVectorOptions(search.Options):
    """Option module producing Solr term-vector (``tv.*``) request parameters.

    State kept per instance:
      * ``fields``    -- names of the fields term vectors are requested for,
      * ``positions`` -- whether term positions are requested as well.
    """

    def __init__(self, schema, original=None):
        """Start with empty options, or copy the state of ``original``."""
        self.schema = schema
        if original is not None:
            self.fields = copy.copy(original.fields)
            self.positions = copy.copy(original.positions)
        else:
            # NOTE(review): the empty default was not visible in the reviewed
            # listing; a set is assumed because ``update`` calls
            # ``self.fields.update(...)`` -- confirm.
            self.fields = set()
            self.positions = False

    def update(self, positions=False, fields=None):
        """Add ``fields`` to the requested set and overwrite ``positions``.

        ``fields`` may be a single field name or an iterable of names; every
        name is checked against the schema with ``{"stored": True}``.
        """
        if fields is None:
            fields = []
        elif isinstance(fields, (str, bytes)):
            # A lone name is wrapped so it is not iterated character by character.
            fields = [fields]
        self.schema.check_fields(fields, {"stored": True})
        self.fields.update(fields)
        self.positions = positions

    # NOTE(review): this method's header and the statement enabling the
    # component were elided in the reviewed listing; the name ``options`` and
    # the ``tv`` switch are assumed -- confirm against the original.
    def options(self):
        """Return the request parameters for the current state as a dict."""
        if not (self.positions or self.fields):
            return {}
        opts = {'tv': 'true'}
        if self.positions:
            opts['tv.positions'] = 'true'
        if self.fields:
            # Sorted so the parameter value is stable between calls.
            opts['tv.fl'] = ','.join(sorted(self.fields))
        return opts
class CustomSolrConnection(connection.SolrConnection):
    """Solr connection that can additionally query ``analysis/field/``."""

    def __init__(self, *args, **kw):
        """Initialise the parent connection and derive the analysis URL."""
        super().__init__(*args, **kw)
        self.analysis_url = self.url + "analysis/field/"

    def analyze(self, params):
        """Send ``params`` to the field-analysis URL and return the response body.

        The request is a GET unless the resulting URL is longer than
        ``self.max_length_get_url``, in which case the parameters are POSTed
        as a form body instead.

        Raises:
            connection.SolrError: for a response that is not accepted.
        """
        query_string = urlencode(params)
        get_url = "%s?%s" % (self.analysis_url, query_string)

        if len(get_url) <= self.max_length_get_url:
            target = get_url
            request_kwargs = {"method": "GET"}
        else:
            warnings.warn("Long query URL encountered - POSTing instead of GETting. "
                          "This query will not be cached at the HTTP layer")
            target = self.analysis_url
            # NOTE(review): only the ``headers`` entry was visible in the
            # reviewed listing; ``method``/``body`` are assumed -- confirm.
            request_kwargs = {
                "method": "POST",
                "body": query_string,
                "headers": {"Content-Type": "application/x-www-form-urlencoded"},
            }

        response, content = self.request(target, **request_kwargs)
        # NOTE(review): the condition guarding the raise was elided in the
        # reviewed listing; a non-200 status check is assumed -- confirm.
        if response.status != 200:
            raise connection.SolrError(response, content)
        return content
# Monkey-patching scorched's SolrSearch: append 'term_vectorer' to its
# ``option_modules`` so the attribute of that name (assigned in the patched
# ``_init_common_modules`` in this file) is listed alongside the stock modules.
search.SolrSearch.option_modules += ('term_vectorer',)
def __term_vector(self, positions=False, fields=None):
    """Return a clone of this search with its term-vector options updated.

    ``positions`` and ``fields`` are passed straight to the clone's
    ``term_vectorer.update``; the search it is called on is left untouched.
    """
    configured = self.clone()
    configured.term_vectorer.update(positions, fields)
    # NOTE(review): the return statement was elided in the reviewed listing;
    # returning the clone is assumed -- confirm.
    return configured


# Expose the helper as ``SolrSearch.term_vector``.
search.SolrSearch.term_vector = __term_vector
# Keep a handle on the stock implementation so the replacement can delegate to it.
__original__init_common_modules = search.SolrSearch._init_common_modules


def __patched__init_common_modules(self):
    """Run the stock initialisation, then attach a fresh TermVectorOptions."""
    __original__init_common_modules(self)
    self.term_vectorer = TermVectorOptions(self.schema)


# Install the replacement on SolrSearch.
search.SolrSearch._init_common_modules = __patched__init_common_modules
88 class CustomSolrInterface(connection.SolrInterface):
# Copied from the parent class, with SolrConnection replaced by CustomSolrConnection.
def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1,
             max_length_get_url=connection.MAX_LENGTH_GET_URL):
    """Open a CustomSolrConnection to ``url`` and initialise the schema.

    Raises:
        socket.error: re-raised with an explanatory message when a socket
            error occurs during initialisation.
    """
    self.conn = CustomSolrConnection(url, http_connection, retry_timeout, max_length_get_url)
    self.schemadoc = schemadoc

    # NOTE(review): the mode conditions and the statement inside ``try`` were
    # elided in the reviewed listing; they are reconstructed on the assumption
    # that this mirrors the parent constructor -- confirm.
    may_write = 'w' in mode
    may_read = 'r' in mode
    if not may_write:
        self.writeable = False
    elif not may_read:
        self.readable = False

    try:
        self.init_schema()
    except socket.error as err:
        reason = str(err)
        raise socket.error("Cannot connect to Solr server, and search indexing is enabled (%s)" % reason)
def _analyze(self, **kwargs):
    """Run field analysis through the connection and return the parsed XML.

    Recognised keyword arguments:
      * ``field`` -- sent as ``analysis.fieldname``
      * ``text``  -- sent as ``analysis.fieldvalue``
      * ``q`` or ``query`` -- sent as ``q`` (``q`` wins when both are given)

    ``analysis.showmatch`` is always sent as true.

    Raises:
        TypeError: if this interface is not readable.
    """
    if not self.readable:
        raise TypeError("This Solr instance is only for writing")

    args = {
        'analysis_showmatch': True
    }
    if 'field' in kwargs:
        args['analysis_fieldname'] = kwargs['field']
    if 'text' in kwargs:
        args['analysis_fieldvalue'] = kwargs['text']
    # ``query`` is an alias for ``q``. It used to read kwargs['q'] and so
    # raised KeyError when only ``query`` was supplied. It is handled first so
    # that an explicit ``q`` still takes precedence, as before.
    if 'query' in kwargs:
        args['q'] = kwargs['query']
    if 'q' in kwargs:
        args['q'] = kwargs['q']

    # Keyword-friendly names use '_' where the request parameters use '.'.
    # Each item is unpacked as a (key, value) pair; the previous
    # ``map(lambda k, v: ...)`` handed the two-argument lambda a single tuple.
    params = [(k.replace('_', '.'), v) for k, v in connection.params_from_dict(**args)]

    content = self.conn.analyze(params)
    doc = etree.fromstring(content)
    return doc
def highlight(self, **kwargs):
    """Return a marked-up snippet of ``kwargs['text']`` around matched tokens.

    The analysis response is searched for index-time tokens carrying a
    ``match`` flag; their ``start``/``end`` offsets are collected and handed
    to ``substring`` together with the optional ``margins`` (default 30) and
    ``mark`` (default ``("<b>", "</b>")``) keyword arguments.
    """
    doc = self._analyze(**kwargs)
    matched_tokens = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")

    # A set, so tokens reported at identical offsets are marked only once.
    matches = {
        (
            int(token.xpath("int[@name='start']")[0].text),
            int(token.xpath("int[@name='end']")[0].text),
        )
        for token in matched_tokens
    }

    # NOTE(review): the guard around the return was elided in the reviewed
    # listing; returning None when nothing matched is assumed -- confirm.
    if not matches:
        return None
    return self.substring(
        kwargs['text'],
        matches,
        margins=kwargs.get('margins', 30),
        mark=kwargs.get('mark', ("<b>", "</b>")),
    )
def analyze(self, **kwargs):
    """Return the analysed terms as a list of strings.

    Accepts the same keyword arguments as ``_analyze``. The terms are taken
    from the first ``str`` child of each token in the last index-time
    analysis stage of the response.
    """
    doc = self._analyze(**kwargs)
    term_nodes = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
    # A real list rather than a lazy ``map`` object, which on Python 3 can be
    # consumed only once and supports neither len() nor indexing.
    return [str(node.text) for node in term_nodes]
def expand_margins(self, text, start, end):
    """Widen the half-open range ``[start, end)`` of ``text`` to whole words.

    ``start`` moves left until the character before it is a non-word
    character (or the beginning of ``text``); ``end`` moves right until the
    character at ``end`` is a non-word character (or the end of ``text``).

    Returns:
        The widened ``(start, end)`` pair, suitable for ``text[start:end]``.
    """
    totlen = len(text)
    # Compiled once per call rather than once per inspected character.
    is_boundary = re.compile(r"\W", re.UNICODE).match

    while start > 0 and not is_boundary(text[start - 1]):
        start -= 1

    # ``end`` is exclusive, so the next character to absorb is text[end].
    # Testing text[end + 1] (as before) stopped one character early and cut
    # the last letter off the trailing word.
    while end < totlen and not is_boundary(text[end]):
        end += 1

    return start, end
def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
    """Cut a snippet out of ``text`` around ``matches`` and mark the matches.

    Args:
        text: the full text the offsets refer to.
        matches: non-empty iterable of ``(start, end)`` offsets into ``text``.
        margins: number of characters of context kept on each side of a
            match before the window is widened by ``expand_margins``.
        mark: ``(opening, closing)`` strings wrapped around each match.

    Returns:
        The snippet around the earliest match, extended by every other match
        whose window overlaps it. Marked matches separated only by
        whitespace are merged into a single marked span.

    Raises:
        IndexError: if ``matches`` is empty.
    """
    totlen = len(text)
    # Sorted so the snippet is anchored on the earliest match instead of on
    # whatever an unordered collection (``highlight`` passes a set) yields first.
    matches_margins = [
        ((s, e), self.expand_margins(text, max(0, s - margins), min(totlen, e + margins)))
        for s, e in sorted(matches)
    ]

    # Start with the window of the first match ...
    (start, end) = matches_margins[0][1]
    new_matches = [matches_margins[0][0]]

    # ... and absorb every later match whose window overlaps the current one.
    for (m, (s, e)) in matches_margins[1:]:
        if end < s or start > e:
            continue
        start = min(start, s)
        end = max(end, e)
        new_matches.append(m)

    snip = text[start:end]
    # Markers are inserted from the rightmost match backwards so offsets of
    # the matches still to be processed stay valid. (The former
    # ``sort(lambda a, b: cmp(...))`` call is Python 2 only.)
    new_matches.sort(key=lambda match: match[0], reverse=True)

    for (s, e) in new_matches:
        # Offsets refer to ``text``; shift them into ``snip`` coordinates.
        snip = snip[:e - start] + mark[1] + snip[e - start:]
        snip = snip[:s - start] + mark[0] + snip[s - start:]

    # Collapse "closing mark, whitespace, opening mark" so neighbouring
    # matches share one span. The marks are escaped because they are
    # caller-supplied text, not regular expressions.
    snip = re.sub('%s[ \t\n]+%s' % (re.escape(mark[1]), re.escape(mark[0])), " ", snip)

    return snip