2 from sunburnt import sunburnt
6 from sunburnt import search
8 from httplib2 import socket
12 class TermVectorOptions(search.Options):
def __init__(self, schema, original=None):
    """Create term-vector options, optionally copying from ``original``.

    ``schema`` is kept on the instance so that ``update`` can validate
    field names against it. When ``original`` is supplied, its ``fields``
    and ``positions`` are shallow-copied so the two objects do not share
    state.
    """
    # NOTE(review): this extract omits several lines of the constructor.
    # The schema assignment, the None guard and the empty-set default are
    # inferred from how update() and the option builder use self.schema
    # and self.fields - confirm against the full file.
    self.schema = schema
    if original is not None:
        self.fields = copy.copy(original.fields)
        self.positions = copy.copy(original.positions)
    else:
        self.fields = set()
        self.positions = False
def update(self, positions=False, fields=None):
    """Add ``fields`` to the requested set and set the positions flag.

    ``fields`` may be a single field name or an iterable of names. Every
    name is checked against the schema with ``{"stored": True}`` before
    being added. ``positions`` replaces the previous flag on every call,
    so calling with fields only resets it to False.
    """
    # NOTE(review): the normalisation of None and of a bare string is
    # inferred (those lines are omitted from this extract); confirm the
    # exact container type against the full file.
    if fields is None:
        requested = []
    elif isinstance(fields, basestring):
        requested = [fields]
    else:
        requested = fields
    self.schema.check_fields(requested, {"stored": True})
    self.fields.update(requested)
    self.positions = positions
33 if self.positions or self.fields:
36 opts['tv.positions'] = 'true'
38 opts['tv.fl'] = ','.join(sorted(self.fields))
42 class CustomSolrConnection(sunburnt.SolrConnection):
def __init__(self, *args, **kw):
    """Initialise the parent connection and derive the field-analysis URL.

    All arguments are passed through unchanged to
    ``sunburnt.SolrConnection.__init__``.
    """
    super(CustomSolrConnection, self).__init__(*args, **kw)
    # NOTE(review): plain concatenation - assumes self.url (set by the
    # parent class) already ends with "/"; confirm.
    self.analysis_url = self.url + "analysis/field/"
47 def analyze(self, params):
48 qs = urllib.urlencode(params)
49 url = "%s?%s" % (self.analysis_url, qs)
50 if len(url) > self.max_length_get_url:
51 warnings.warn("Long query URL encountered - POSTing instead of "
52 "GETting. This query will not be cached at the HTTP layer")
53 url = self.analysis_url
57 headers={"Content-Type": "application/x-www-form-urlencoded"},
60 kwargs = dict(method="GET")
61 r, c = self.request(url, **kwargs)
63 raise sunburnt.SolrError(r, c)
# monkey patching sunburnt SolrSearch
# Append 'term_vectorer' to the class-level tuple of option module names.
# NOTE(review): presumably SolrSearch iterates this tuple to clone and
# collect per-module options - confirm in sunburnt.search.
search.SolrSearch.option_modules += ('term_vectorer',)
def __term_vector(self, positions=False, fields=None):
    """Return a copy of this search with term-vector options applied.

    The receiver is left untouched: the options are updated on a clone.
    """
    cloned = self.clone()
    cloned.term_vectorer.update(positions, fields)
    # NOTE(review): the return statement is omitted from this extract; it
    # is inferred from the clone-then-mutate pattern, which is pointless
    # unless the clone is handed back.
    return cloned
# Expose the function as SolrSearch.term_vector.
search.SolrSearch.term_vector = __term_vector
def __patched__init_common_modules(self):
    """Run the stock ``_init_common_modules`` and then attach term-vector options.

    Sets ``self.term_vectorer`` to a fresh ``TermVectorOptions`` built from
    ``self.schema``.
    """
    __original__init_common_modules(self)
    self.term_vectorer = TermVectorOptions(self.schema)
# Capture the stock implementation BEFORE it is replaced below, so that the
# patched function can still delegate to it (the name is resolved at call
# time, which is why it may be bound after the def).
__original__init_common_modules = search.SolrSearch._init_common_modules
setattr(search.SolrSearch, '_init_common_modules', __patched__init_common_modules)
85 class CustomSolrInterface(sunburnt.SolrInterface):
86 # just copied from parent and SolrConnection -> CustomSolrConnection
87 def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1, max_length_get_url=sunburnt.MAX_LENGTH_GET_URL):
88 self.conn = CustomSolrConnection(url, http_connection, retry_timeout, max_length_get_url)
89 self.schemadoc = schemadoc
91 self.writeable = False
96 except socket.error, e:
97 raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e)
def _analyze(self, **kwargs):
    """Call the field-analysis endpoint and return the parsed XML document.

    Recognised keyword arguments:
      field -- sent as ``analysis.fieldname``
      text  -- sent as ``analysis.fieldvalue``
      q     -- sent as ``q``
      query -- alias for ``q``; used only when ``q`` is not given

    ``analysis.showmatch`` is always sent as True.

    Raises:
      TypeError -- when this interface is not readable.
    """
    if not self.readable:
        raise TypeError("This Solr instance is only for writing")
    args = {
        'analysis_showmatch': True
    }
    if 'field' in kwargs:
        args['analysis_fieldname'] = kwargs['field']
    if 'text' in kwargs:
        args['analysis_fieldvalue'] = kwargs['text']
    if 'q' in kwargs:
        args['q'] = kwargs['q']
    elif 'query' in kwargs:
        # Previously this branch read kwargs['q'], so passing only
        # `query` raised KeyError. `q` still takes precedence when both
        # are supplied, which matches the old effective behaviour.
        args['q'] = kwargs['query']

    # Keys are written with '_' in Python and translated to the dotted
    # parameter names only after sunburnt has expanded them.
    params = [(key.replace('_', '.'), value)
              for key, value in sunburnt.params_from_dict(**args)]

    content = self.conn.analyze(params)
    doc = etree.fromstring(content)
    # NOTE(review): the return statement is omitted from this extract; it
    # is inferred from highlight() and analyze(), which call .xpath() on
    # the value this method gives back.
    return doc
116 def highlight(self, **kwargs):
117 doc = self._analyze(**kwargs)
118 analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
121 start = int(wrd.xpath("int[@name='start']")[0].text)
122 end = int(wrd.xpath("int[@name='end']")[0].text)
123 matches.add((start, end))
126 return self.substring(kwargs['text'], matches,
127 margins=kwargs.get('margins', 30),
128 mark=kwargs.get('mark', ("<b>", "</b>")))
def analyze(self, **kwargs):
    """Return the analysed terms for the given input as unicode strings.

    Accepts the same keyword arguments as ``_analyze``. From the analysis
    response, takes the last ``<arr>`` under ``lst[@name='index']`` and
    collects the text of the first ``<str>`` child of each ``<lst>`` in it.
    """
    doc = self._analyze(**kwargs)
    term_nodes = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
    # NOTE(review): the return statement is omitted from this extract and
    # is inferred. Also note that a node with no text yields u'None'.
    return [unicode(node.text) for node in term_nodes]
def expand_margins(self, text, start, end):
    """Widen ``start``/``end`` outwards until a non-word character is adjacent.

    ``start`` moves left while the character before it is a word
    character; ``end`` moves right while the character at ``end + 1`` is a
    word character. Neither moves past the ends of ``text``.

    Returns the adjusted ``(start, end)`` tuple.
    """
    # NOTE(review): this extract omits the length lookup, the helper's def
    # line, the first loop header, the break/step lines and the return.
    # They are reconstructed from the visible conditions and from
    # substring(), which unpacks the result as a (start, end) pair.
    totlen = len(text)
    # Built once per call instead of once per inspected character.
    is_boundary = re.compile(r"\W", re.UNICODE).match

    while start > 0:
        if is_boundary(text[start - 1]):
            break
        start -= 1

    # NOTE(review): this looks one character ahead (text[end + 1]) while
    # substring() slices text[start:end] with an exclusive end, so the
    # last word character can be left out of the snippet. Kept as is -
    # confirm whether `end` is meant to be inclusive here.
    while end < totlen - 1:
        if is_boundary(text[end + 1]):
            break
        end += 1

    return (start, end)
157 def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
161 matches_margins = map(lambda (s, e):
163 (max(0, s - margins), min(totlen, e + margins))),
165 matches_margins = map(lambda (m, (s, e)):
166 (m, self.expand_margins(text, s, e)),
169 # lets start with first match
170 (start, end) = matches_margins[0][1]
171 matches = [matches_margins[0][0]]
173 for (m, (s, e)) in matches_margins[1:]:
174 if end < s or start > e:
176 start = min(start, s)
180 snip = text[start:end]
181 matches.sort(lambda a, b: cmp(b[0], a[0]))
184 for (s, e) in matches:
186 snip = snip[:e + off] + mark[1] + snip[e + off:]
187 snip = snip[:s + off] + mark[0] + snip[s + off:]