1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
5 from urllib.parse import urlencode
7 from httplib2 import socket
9 from scorched import connection, exc, search
class CustomSolrConnection(connection.SolrConnection):
    """scorched SolrConnection extended with Solr's field-analysis endpoint."""

    def __init__(self, *args, **kw):
        super(CustomSolrConnection, self).__init__(*args, **kw)
        # self.url is set by the parent constructor.
        self.analysis_url = self.url + "analysis/field/"

    def analyze(self, params):
        """Call Solr's /analysis/field handler.

        :param params: sequence of (key, value) pairs to url-encode
        :returns: raw response body (XML) from the analysis handler
        :raises exc.SolrError: on any non-200 HTTP status
        """
        qs = urlencode(params)
        url = "%s?%s" % (self.analysis_url, qs)
        if len(url) > self.max_length_get_url:
            warnings.warn("Long query URL encountered - POSTing instead of GETting. "
                          "This query will not be cached at the HTTP layer")
            # URL too long for a GET: send the encoded form as a POST body
            # instead (mirrors scorched's own long-URL fallback).
            url = self.analysis_url
            kwargs = dict(
                method="POST",
                data=qs,
                headers={"Content-Type": "application/x-www-form-urlencoded"},
            )
        else:
            kwargs = dict(method="GET")
        response = self.request(url=url, **kwargs)
        if response.status_code != 200:
            raise exc.SolrError(response)
        return response.content
37 class CustomSolrInterface(connection.SolrInterface):
38 # just copied from parent and SolrConnection -> CustomSolrConnection
39 def __init__(self, url, http_connection=None, mode='',
40 retry_timeout=-1, max_length_get_url=connection.MAX_LENGTH_GET_URL,
43 :param url: url to Solr
45 :param http_connection: optional -- already existing connection
46 :type http_connection: requests connection
47 :param mode: optional -- mode (readable, writable) Solr
49 :param retry_timeout: optional -- timeout until retry
50 :type retry_timeout: int
51 :param max_length_get_url: optional -- max length until switch to post
52 :type max_length_get_url: int
53 :param search_timeout: (optional) How long to wait for the server to
54 send data before giving up, as a float, or a
55 (connect timeout, read timeout) tuple.
56 :type search_timeout: float or tuple
59 self.conn = CustomSolrConnection(
60 url, http_connection, mode, retry_timeout, max_length_get_url)
61 self.schema = self.init_schema()
62 self._datefields = self._extract_datefields(self.schema)
65 def _analyze(self, **kwargs):
66 if not self.conn.readable:
67 raise TypeError("This Solr instance is only for writing")
69 'analysis_showmatch': True
72 args['analysis_fieldname'] = kwargs['field']
74 args['analysis_fieldvalue'] = kwargs['text']
76 args['q'] = kwargs['q']
78 args['q'] = kwargs['q']
81 (k.replace('_', '.'), v)
82 for (k, v) in search.params_from_dict(**args)
85 content = self.conn.analyze(params)
86 doc = etree.fromstring(content)
89 def highlight(self, **kwargs):
90 doc = self._analyze(**kwargs)
91 analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
94 start = int(wrd.xpath("int[@name='start']")[0].text)
95 end = int(wrd.xpath("int[@name='end']")[0].text)
96 matches.add((start, end))
99 return self.substring(
100 kwargs['text'], matches, margins=kwargs.get('margins', 30), mark=kwargs.get('mark', ("<b>", "</b>")))
104 def analyze(self, **kwargs):
105 doc = self._analyze(**kwargs)
106 terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
107 terms = map(lambda n: str(n.text), terms)
110 def expand_margins(self, text, start, end):
114 ws = re.compile(r"\W", re.UNICODE)
115 return bool(ws.match(x))
118 if is_boundary(text[start - 1]):
122 while end < totlen - 1:
123 if is_boundary(text[end + 1]):
129 def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
132 ((s, e), self.expand_margins(text, max(0, s - margins), min(totlen, e + margins))) for s, e in matches]
134 # lets start with first match
135 (start, end) = matches_margins[0][1]
136 new_matches = [matches_margins[0][0]]
138 for (m, (s, e)) in matches_margins[1:]:
139 if end < s or start > e:
141 start = min(start, s)
143 new_matches.append(m)
145 snip = text[start:end]
146 new_matches.sort(key=lambda a: -a[0])
148 for (s, e) in new_matches:
150 snip = snip[:e + off] + mark[1] + snip[e + off:]
151 snip = snip[:s + off] + mark[0] + snip[s + off:]
152 snip = re.sub('%s[ \t\n]+%s' % (mark[1], mark[0]), " ", snip)