# $Id: sphinxapi.py,v 1.7 2007/04/01 21:38:13 shodan Exp $
# Python version of Sphinx searchd client (Python API)
# Copyright (c) 2006-2007, Andrew Aksyonoff
# Copyright (c) 2006, Mike Osadnik
# This program is free software; you can redistribute it and/or modify
# it under the terms of the GNU General Public License. You should have
# received a copy of the GPL license along with this program; if you
# did not, you can find it at http://www.gnu.org/

import socket
from struct import pack, unpack
# known searchd commands
SEARCHD_COMMAND_SEARCH = 0
SEARCHD_COMMAND_EXCERPT = 1
# current client-side command implementation versions
VER_COMMAND_SEARCH = 0x107
VER_COMMAND_EXCERPT = 0x100
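# For reference, every request to searchd starts with an 8-byte header built
# from a command code and command version plus the body length, exactly as the
# pack() calls in Query() and BuildExcerpts() below do. Minimal sketch:
#
#   header = pack('>2HL', SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, len(body))
#   sock.send(header + body)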
# known searchd status codes
SPH_MATCH_EXTENDED = 4
SPH_SORT_RELEVANCE = 0
SPH_SORT_ATTR_DESC = 1
SPH_SORT_TIME_SEGMENTS = 3
# known attribute types
SPH_ATTR_TIMESTAMP = 2
# known grouping functions
_host = 'localhost' # searchd host (default is "localhost")
_port = 3312 # searchd port (default is 3312)
_offset = 0 # how many records to skip from result-set start (default is 0)
_limit = 20 # how many records to return from result-set starting at offset (default is 20)
_mode = SPH_MATCH_ALL # query matching mode (default is SPH_MATCH_ALL)
_weights = [] # per-field weights (default is 1 for all fields)
_sort = SPH_SORT_RELEVANCE # match sorting mode (default is SPH_SORT_RELEVANCE)
_sortby = '' # attribute to sort by (default is "")
_min_id = 0 # min ID to match (default is 0)
_max_id = 0xFFFFFFFF # max ID to match (default is UINT_MAX)
_filters = [] # search filters
_groupby = '' # group-by attribute name
_groupfunc = SPH_GROUPBY_DAY # group-by function (to pre-process group-by attribute value with)
_groupsort = '@group desc' # group-by sorting clause (to sort groups in result set with)
_maxmatches = 1000 # max matches to retrieve
_error = '' # last error message
_warning = '' # last warning message
create a new client object and fill defaults
def GetLastError (self):
get last error message (string)
def GetLastWarning (self):
get last warning message (string)
def SetServer (self, host, port):
assert(isinstance(host, str))
assert(isinstance(port, int))
connect to searchd server
sock = socket.socket ( socket.AF_INET, socket.SOCK_STREAM )
sock.connect ( ( self._host, self._port ) )
except socket.error, msg:
self._error = 'connection to %s:%s failed (%s)' % ( self._host, self._port, msg )
v = unpack('>L', sock.recv(4))[0] # unpack() returns a tuple; take the version number itself
self._error = 'expected searchd protocol version, got %s' % v
# all ok, send my version
sock.send(pack('>L', 1))
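# Handshake sketch (mirrors the calls above, shown here only for clarity):
# searchd first sends its protocol version as a big-endian uint32, and the
# client answers with its own protocol version (1):
#
#   proto = unpack('>L', sock.recv(4))[0]   # version announced by searchd
#   sock.send(pack('>L', 1))                # announce client protocol version 1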
def _GetResponse (self, sock, client_ver):
get and check response packet from searchd server
(status, ver, length) = unpack('>2HL', sock.recv(8))
chunk = sock.recv(left)
if not response or read!=length:
self._error = 'failed to read searchd response (status=%s, ver=%s, len=%s, read=%s)' \
% (status, ver, length, read)
self._error = 'received zero-sized searchd response'
if status==SEARCHD_WARNING:
wend = 4 + unpack ( '>L', response[0:4] )[0]
self._warning = response[4:wend]
return response[wend:]
if status==SEARCHD_ERROR:
self._error = 'searchd error: '+response[4:]
if status==SEARCHD_RETRY:
self._error = 'temporary searchd error: '+response[4:]
if status!=SEARCHD_OK:
self._error = 'unknown status code %d' % status
self._warning = 'searchd command v.%d.%d older than client\'s v.%d.%d, some options might not work' \
% (ver>>8, ver&0xff, client_ver>>8, client_ver&0xff)
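# Response layout handled by _GetResponse(): an 8-byte header (status and
# version as big-endian uint16, body length as uint32) followed by `length`
# bytes of body; warning bodies are prefixed with a uint32 message length.
# Header read, for reference:
#
#   status, ver, length = unpack('>2HL', sock.recv(8))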
def SetLimits (self, offset, limit, maxmatches=0):
set match offset, count, and max number to retrieve
assert(isinstance(offset, int) and offset>=0)
assert(isinstance(limit, int) and limit>0)
assert(maxmatches>=0)
self._offset = offset
self._maxmatches = maxmatches
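# Usage sketch for SetLimits() (values are illustrative, `cl` is a client
# instance): skip the first 10 matches, return the next 20, and cap
# server-side matches at 1000:
#
#   cl.SetLimits(10, 20, 1000)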
def SetMatchMode (self, mode):
assert(mode in [SPH_MATCH_ALL, SPH_MATCH_ANY, SPH_MATCH_PHRASE, SPH_MATCH_BOOLEAN, SPH_MATCH_EXTENDED])
def SetSortMode ( self, mode, clause='' ):
assert ( mode in [SPH_SORT_RELEVANCE, SPH_SORT_ATTR_DESC, SPH_SORT_ATTR_ASC, SPH_SORT_TIME_SEGMENTS, SPH_SORT_EXTENDED] )
assert ( isinstance ( clause, str ) )
self._sortby = clause
def SetWeights (self, weights):
set per-field weights
assert(isinstance(weights, list))
assert(isinstance(w, int))
self._weights = weights
def SetIDRange (self, minid, maxid):
set IDs range to match
only match those records where document ID
is between minid and maxid (including minid and maxid)
assert(isinstance(minid, int))
assert(isinstance(maxid, int))
def SetFilter ( self, attribute, values, exclude=0 ):
only match those records where the 'attribute' column values
assert(isinstance(attribute, str))
assert(isinstance(values, list))
values = map(int, values)
self._filters.append ( { 'attr':attribute, 'exclude':exclude, 'values':values } )
def SetFilterRange (self, attribute, min_, max_, exclude=0 ):
only match those records where the 'attribute' column value
is between min_ and max_ (including min_ and max_)
assert(isinstance(attribute, str))
assert(isinstance(min_, int))
assert(isinstance(max_, int))
self._filters.append ( { 'attr':attribute, 'exclude':exclude, 'min':min_, 'max':max_ } )
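# Usage sketch for the two filter setters above; 'group_id' and 'published'
# are hypothetical attribute names, and `cl` is a client instance:
#
#   cl.SetFilter('group_id', [1, 5, 7])          # group_id must be 1, 5 or 7
#   cl.SetFilterRange('published', 1167600000, 1199140000)  # published between two Unix timestamps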
def SetGroupBy ( self, attribute, func, groupsort='@group desc' ):
set grouping attribute and function
in grouping mode, all matches are assigned to different groups
based on grouping function value.
each group keeps track of the total match count, and the best match
(in this group) according to current sorting function.
the final result set contains one best match per group, with
grouping function value and matches count attached.
groups in result set could be sorted by any sorting clause,
including both document attributes and the following special
internal Sphinx attributes:
- @id - match document ID;
- @weight, @rank, @relevance - match weight;
- @group - groupby function value;
- @count - number of matches in group.
the default mode is to sort by groupby value in descending order,
i.e. by "@group desc".
"total_found" would contain the total number of matching groups over
WARNING: grouping is done in fixed memory and thus its results
are only approximate; so there might be more groups reported
in total_found than actually present. @count might also
for example, if sorting by relevance and grouping by "published"
attribute with SPH_GROUPBY_DAY function, then the result set will
contain the single most relevant match for each day on which any
matches were published, with the day number and per-day match count attached,
and sorted by day number in descending order (i.e. recent days first).
assert(isinstance(attribute, str))
assert(func in [SPH_GROUPBY_DAY, SPH_GROUPBY_WEEK, SPH_GROUPBY_MONTH, SPH_GROUPBY_YEAR, SPH_GROUPBY_ATTR] )
assert(isinstance(groupsort, str))
self._groupby = attribute
self._groupfunc = func
self._groupsort = groupsort
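# Usage sketch matching the docstring example above: with relevance sorting,
# group matches by the 'published' attribute per day, recent days first
# (`cl` is a client instance, 'published' is a hypothetical attribute name):
#
#   cl.SetSortMode(SPH_SORT_RELEVANCE)
#   cl.SetGroupBy('published', SPH_GROUPBY_DAY, '@group desc')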
def Query (self, query, index='*'):
connect to searchd server and run given search query
"query" is query string
"index" is index name to query, default is "*" which means to query all indexes
returns false on failure
returns a hash which has the following keys on success:
an array of found matches represented as ( "id", "weight", "attrs" ) hashes
total number of matches retrieved (up to SPH_MAX_MATCHES, see sphinx.h)
total number of matching documents in index
an array of ( "word", "docs", "hits" ) hashes which contain
docs and hits counts for stemmed (!) query words
sock = self._Connect()
req = [pack('>4L', self._offset, self._limit, self._mode, self._sort)]
req.append(pack('>L', len(self._sortby)))
req.append(self._sortby)
req.append(pack('>L', len(query)))
req.append(pack('>L', len(self._weights)))
for w in self._weights:
req.append(pack('>L', w))
req.append(pack('>L', len(index)))
req.append(pack('>L', self._min_id))
req.append(pack('>L', self._max_id))
req.append ( pack ( '>L', len(self._filters) ) )
for f in self._filters:
req.append ( pack ( '>L', len(f['attr']) ) )
req.append ( f['attr'] )
if ( 'values' in f ):
req.append ( pack ( '>L', len(f['values']) ) )
for v in f['values']:
req.append ( pack ( '>L', v ) )
req.append ( pack ( '>3L', 0, f['min'], f['max'] ) )
req.append ( pack ( '>L', f['exclude'] ) )
# group-by, max-matches, group-sort
req.append ( pack ( '>2L', self._groupfunc, len(self._groupby) ) )
req.append ( self._groupby )
req.append ( pack ( '>2L', self._maxmatches, len(self._groupsort) ) )
req.append ( self._groupsort )
# send query, get response
req = pack('>2HL', SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, length)+req
response = self._GetResponse(sock, VER_COMMAND_SEARCH)
nfields = unpack('>L', response[p:p+4])[0]
while nfields>0 and p<max_:
length = unpack('>L', response[p:p+4])[0]
fields.append(response[p:p+length])
result['fields'] = fields
nattrs = unpack('>L', response[p:p+4])[0]
while nattrs>0 and p<max_:
length = unpack('>L', response[p:p+4])[0]
attr = response[p:p+length]
type_ = unpack('>L', response[p:p+4])[0]
attrs.append([attr,type_])
result['attrs'] = attrs
count = unpack('>L', response[p:p+4])[0]
result['matches'] = []
while count>0 and p<max_:
doc, weight = unpack('>2L', response[p:p+8])
match = { 'id':doc, 'weight':weight, 'attrs':{} }
for i in range(len(attrs)):
match['attrs'][attrs[i][0]] = unpack('>L', response[p:p+4])[0]
result['matches'].append ( match )
result['total'], result['total_found'], result['time'], words = \
unpack('>4L', response[p:p+16])
result['time'] = '%.3f' % (result['time']/1000.0)
length = unpack('>L', response[p:p+4])[0]
word = response[p:p+length]
docs, hits = unpack('>2L', response[p:p+8])
result['words'].append({'word':word, 'docs':docs, 'hits':hits})
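# Usage sketch for Query(); the index name 'test1' is illustrative, the client
# class SphinxClient is this module's class (its definition is not shown in
# this excerpt), and the result keys follow the docstring above (Python 2
# syntax, like the rest of this module):
#
#   cl = SphinxClient()
#   res = cl.Query('hello world', 'test1')
#   if not res:
#       print cl.GetLastError()
#   else:
#       print res['total_found'], res['time']
#       for m in res['matches']:
#           print m['id'], m['weight'], m['attrs']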
def BuildExcerpts (self, docs, index, words, opts=None):
connect to searchd server and generate excerpts from the given documents
"docs" is an array of strings which represent the documents' contents
"index" is a string specifying the index whose settings will be used
for stemming, lexing and case folding
"words" is a string which contains the words to highlight
"opts" is a hash which contains additional optional highlighting parameters:
a string to insert before a set of matching words, default is "<b>"
a string to insert after a set of matching words, default is "</b>"
a string to insert between excerpt chunks, default is " ... "
max excerpt size in symbols (codepoints), default is 256
how many words to highlight around each match, default is 5
returns false on failure
returns an array of string excerpts on success
assert(isinstance(docs, list))
assert(isinstance(index, str))
assert(isinstance(words, str))
assert(isinstance(opts, dict))
sock = self._Connect()
opts.setdefault('before_match', '<b>')
opts.setdefault('after_match', '</b>')
opts.setdefault('chunk_separator', ' ... ')
opts.setdefault('limit', 256)
opts.setdefault('around', 5)
# mode=0, flags=1 (remove spaces)
req = [pack('>2L', 0, 1)]
req.append(pack('>L', len(index)))
req.append(pack('>L', len(words)))
req.append(pack('>L', len(opts['before_match'])))
req.append(opts['before_match'])
req.append(pack('>L', len(opts['after_match'])))
req.append(opts['after_match'])
req.append(pack('>L', len(opts['chunk_separator'])))
req.append(opts['chunk_separator'])
req.append(pack('>L', int(opts['limit'])))
req.append(pack('>L', int(opts['around'])))
req.append(pack('>L', len(docs)))
assert(isinstance(doc, str))
req.append(pack('>L', len(doc)))
# send query, get response
req = pack('>2HL', SEARCHD_COMMAND_EXCERPT, VER_COMMAND_EXCERPT, length)+req
wrote = sock.send(req)
response = self._GetResponse(sock, VER_COMMAND_EXCERPT )
for i in range(len(docs)):
length = unpack('>L', response[pos:pos+4])[0]
if pos+length > rlen:
self._error = 'incomplete reply'
res.append(response[pos:pos+length])
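# Usage sketch for BuildExcerpts(); the document text, the index name 'test1',
# and the option values are illustrative (`cl` is a client instance):
#
#   docs = ['this is my test text to be highlighted']
#   res = cl.BuildExcerpts(docs, 'test1', 'test text', {'around':3, 'limit':200})
#   if res:
#       for excerpt in res:
#           print excerpt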
# $Id: sphinxapi.py,v 1.7 2007/04/01 21:38:13 shodan Exp $