Added branch 1.0.
[wolnelektury.git] / apps / djangosphinx / apis / api263 / __init__.py
1 #
2 # $Id: sphinxapi.py,v 1.7 2007/04/01 21:38:13 shodan Exp $
3 #
4 # Python version of Sphinx searchd client (Python API)
5 #
6 # Copyright (c) 2006-2007, Andrew Aksyonoff
7 # Copyright (c) 2006, Mike Osadnik
8 # All rights reserved
9 #
10 # This program is free software; you can redistribute it and/or modify
11 # it under the terms of the GNU General Public License. You should have
12 # received a copy of the GPL license along with this program; if you
13 # did not, you can find it at http://www.gnu.org/
14 #
15
16 import select
17 import socket
18 from struct import *
19
20
# searchd command ids (first field of every request header)
SEARCHD_COMMAND_SEARCH = 0
SEARCHD_COMMAND_EXCERPT = 1

# version of each command as implemented by this client
# (second field of every request header)
VER_COMMAND_SEARCH = 0x107
VER_COMMAND_EXCERPT = 0x100

# status codes carried in the searchd response header
SEARCHD_OK = 0
SEARCHD_ERROR = 1
SEARCHD_RETRY = 2
SEARCHD_WARNING = 3

# query matching modes
SPH_MATCH_ALL = 0
SPH_MATCH_ANY = 1
SPH_MATCH_PHRASE = 2
SPH_MATCH_BOOLEAN = 3
SPH_MATCH_EXTENDED = 4

# result sorting modes
SPH_SORT_RELEVANCE = 0
SPH_SORT_ATTR_DESC = 1
SPH_SORT_ATTR_ASC = 2
SPH_SORT_TIME_SEGMENTS = 3
SPH_SORT_EXTENDED = 4

# attribute types
SPH_ATTR_INTEGER = 1
SPH_ATTR_TIMESTAMP = 2

# grouping functions
SPH_GROUPBY_DAY = 0
SPH_GROUPBY_WEEK = 1
SPH_GROUPBY_MONTH = 2
SPH_GROUPBY_YEAR = 3
SPH_GROUPBY_ATTR = 4
59
60 class SphinxClient:
61         _host           = 'localhost'                   # searchd host (default is "localhost")
62         _port           = 3312                                  # searchd port (default is 3312)
63         _offset         = 0                                             # how much records to seek from result-set start (default is 0)
64         _limit          = 20                                    # how much records to return from result-set starting at offset (default is 20)
65         _mode           = SPH_MATCH_ALL                 # query matching mode (default is SPH_MATCH_ALL)
66         _weights        = []                                    # per-field weights (default is 1 for all fields)
67         _sort           = SPH_SORT_RELEVANCE    # match sorting mode (default is SPH_SORT_RELEVANCE)
68         _sortby         = ''                                    # attribute to sort by (defualt is "")
69         _min_id         = 0                                             # min ID to match (default is 0)
70         _max_id         = 0xFFFFFFFF                    # max ID to match (default is UINT_MAX)
71         _filters        = []                                    # search filters
72         _groupby        = ''                                    # group-by attribute name
73         _groupfunc      = SPH_GROUPBY_DAY               # group-by function (to pre-process group-by attribute value with)
74         _groupsort      = '@group desc'                 # group-by sorting clause (to sort groups in result set with)
75         _maxmatches     = 1000                                  # max matches to retrieve
76         _error          = ''                                    # last error message
77         _warning        = ''                                    # last warning message
78
79
80         def __init__ (self):
81                 """
82                 create a new client object and fill defaults
83                 """
84                 pass
85
86
87         def GetLastError (self):
88                 """
89                 get last error message (string)
90                 """
91                 return self._error
92
93
94         def GetLastWarning (self):
95                 """
96                 get last warning message (string)
97                 """
98                 return self._warning
99
100
101         def SetServer (self, host, port):
102                 """
103                 set searchd server
104                 """
105                 assert(isinstance(host, str))
106                 assert(isinstance(port, int))
107
108                 self._host = host
109                 self._port = port
110
111
112         def _Connect (self):
113                 """
114                 connect to searchd server
115                 """
116                 try:
117                         sock = socket.socket ( socket.AF_INET, socket.SOCK_STREAM )
118                         sock.connect ( ( self._host, self._port ) )
119                 except socket.error, msg:
120                         if sock:
121                                 sock.close()
122                         self._error = 'connection to %s:%s failed (%s)' % ( self._host, self._port, msg )
123                         return 0
124
125                 v = unpack('>L', sock.recv(4))
126                 if v<1:
127                         sock.close()
128                         self._error = 'expected searchd protocol version, got %s' % v
129                         return 0
130
131                 # all ok, send my version
132                 sock.send(pack('>L', 1))
133                 return sock
134
135
136         def _GetResponse (self, sock, client_ver):
137                 """
138                 get and check response packet from searchd server
139                 """
140                 (status, ver, length) = unpack('>2HL', sock.recv(8))
141                 response = ''
142                 left = length
143                 while left>0:
144                         chunk = sock.recv(left)
145                         if chunk:
146                                 response += chunk
147                                 left -= len(chunk)
148                         else:
149                                 break
150
151                 sock.close()
152
153                 # check response
154                 read = len(response)
155                 if not response or read!=length:
156                         if length:
157                                 self._error = 'failed to read searchd response (status=%s, ver=%s, len=%s, read=%s)' \
158                                         % (status, ver, length, read)
159                         else:
160                                 self._error = 'received zero-sized searchd response'
161                         return None
162
163                 # check status
164                 if status==SEARCHD_WARNING:
165                         wend = 4 + unpack ( '>L', response[0:4] )[0]
166                         self._warning = response[4:wend]
167                         return response[wend:]
168
169                 if status==SEARCHD_ERROR:
170                         self._error = 'searchd error: '+response[4:]
171                         return None
172
173                 if status==SEARCHD_RETRY:
174                         self._error = 'temporary searchd error: '+response[4:]
175                         return None
176
177                 if status!=SEARCHD_OK:
178                         self._error = 'unknown status code %d' % status
179                         return None
180
181                 # check version
182                 if ver<client_ver:
183                         self._warning = 'searchd command v.%d.%d older than client\'s v.%d.%d, some options might not work' \
184                                 % (ver>>8, ver&0xff, client_ver>>8, client_ver&0xff)
185
186                 return response
187
188
189         def SetLimits (self, offset, limit, maxmatches=0):
190                 """
191                 set match offset, count, and max number to retrieve
192                 """
193                 assert(isinstance(offset, int) and offset>=0)
194                 assert(isinstance(limit, int) and limit>0)
195                 assert(maxmatches>=0)
196                 self._offset = offset
197                 self._limit = limit
198                 if maxmatches>0:
199                         self._maxmatches = maxmatches
200
201
202         def SetMatchMode (self, mode):
203                 """
204                 set match mode
205                 """
206                 assert(mode in [SPH_MATCH_ALL, SPH_MATCH_ANY, SPH_MATCH_PHRASE, SPH_MATCH_BOOLEAN, SPH_MATCH_EXTENDED])
207                 self._mode = mode
208
209
210         def SetSortMode ( self, mode, clause='' ):
211                 """
212                 set sort mode
213                 """
214                 assert ( mode in [SPH_SORT_RELEVANCE, SPH_SORT_ATTR_DESC, SPH_SORT_ATTR_ASC, SPH_SORT_TIME_SEGMENTS, SPH_SORT_EXTENDED] )
215                 assert ( isinstance ( clause, str ) )
216                 self._sort = mode
217                 self._sortby = clause
218
219
220         def SetWeights (self, weights): 
221                 """
222                 set per-field weights
223                 """
224                 assert(isinstance(weights, list))
225                 for w in weights:
226                         assert(isinstance(w, int))
227                 self._weights = weights
228
229
230         def SetIDRange (self, minid, maxid):
231                 """
232                 set IDs range to match
233                 only match those records where document ID
234                 is beetwen minid and maxid (including minid and maxid)
235                 """
236                 assert(isinstance(minid, int))
237                 assert(isinstance(maxid, int))
238                 assert(minid<=maxid)
239                 self._min_id = minid
240                 self._max_id = maxid
241
242
243         def SetFilter ( self, attribute, values, exclude=0 ):
244                 """
245                 set values filter
246                 only match those records where $attribute column values
247                 are in specified set
248                 """
249                 assert(isinstance(attribute, str))
250                 assert(isinstance(values, list))
251                 assert(values)
252
253                 values = map(int, values)
254
255                 self._filters.append ( { 'attr':attribute, 'exclude':exclude, 'values':values } )
256
257
258         def SetFilterRange (self, attribute, min_, max_, exclude=0 ):
259                 """
260                 set range filter
261                 only match those records where $attribute column value
262                 is beetwen $min and $max (including $min and $max)
263                 """
264                 assert(isinstance(attribute, str))
265                 assert(isinstance(min_, int))
266                 assert(isinstance(max_, int))
267                 assert(min_<=max_)
268
269                 self._filters.append ( { 'attr':attribute, 'exclude':exclude, 'min':min_, 'max':max_ } )
270
271
272         def SetGroupBy ( self, attribute, func, groupsort='@group desc' ):
273                 """
274                 set grouping attribute and function
275
276                 in grouping mode, all matches are assigned to different groups
277                 based on grouping function value.
278
279                 each group keeps track of the total match count, and the best match
280                 (in this group) according to current sorting function.
281
282                 the final result set contains one best match per group, with
283                 grouping function value and matches count attached.
284
285                 groups in result set could be sorted by any sorting clause,
286                 including both document attributes and the following special
287                 internal Sphinx attributes:
288
289                 - @id - match document ID;
290                 - @weight, @rank, @relevance -  match weight;
291                 - @group - groupby function value;
292                 - @count - amount of matches in group.
293
294                 the default mode is to sort by groupby value in descending order,
295                 ie. by "@group desc".
296
297                 "total_found" would contain total amount of matching groups over
298                 the whole index.
299
300                 WARNING: grouping is done in fixed memory and thus its results
301                 are only approximate; so there might be more groups reported
302                 in total_found than actually present. @count might also
303                 be underestimated. 
304
305                 for example, if sorting by relevance and grouping by "published"
306                 attribute with SPH_GROUPBY_DAY function, then the result set will
307                 contain one most relevant match per each day when there were any
308                 matches published, with day number and per-day match count attached,
309                 and sorted by day number in descending order (ie. recent days first).
310                 """
311                 assert(isinstance(attribute, str))
312                 assert(func in [SPH_GROUPBY_DAY, SPH_GROUPBY_WEEK, SPH_GROUPBY_MONTH, SPH_GROUPBY_YEAR, SPH_GROUPBY_ATTR] )
313                 assert(isinstance(groupsort, str))
314
315                 self._groupby = attribute
316                 self._groupfunc = func
317                 self._groupsort = groupsort
318
319
320         def Query (self, query, index='*'):
321                 """
322                 connect to searchd server and run given search query
323
324                 "query" is query string
325                 "index" is index name to query, default is "*" which means to query all indexes
326
327                 returns false on failure
328                 returns hash which has the following keys on success:
329                         "matches"
330                                 an array of found matches represented as ( "id", "weight", "attrs" ) hashes
331                         "total"
332                                 total amount of matches retrieved (upto SPH_MAX_MATCHES, see sphinx.h)
333                         "total_found"
334                                 total amount of matching documents in index
335                         "time"
336                                 search time
337                         "words"
338                                 an array of ( "word", "docs", "hits" ) hashes which contains
339                                 docs and hits count for stemmed (!) query words
340                 """
341                 sock = self._Connect()
342                 if not sock:
343                         return {}
344
345                 # build request
346                 req = [pack('>4L', self._offset, self._limit, self._mode, self._sort)]
347
348                 req.append(pack('>L', len(self._sortby)))
349                 req.append(self._sortby)
350
351                 req.append(pack('>L', len(query)))
352                 req.append(query)
353
354                 req.append(pack('>L', len(self._weights)))
355                 for w in self._weights:
356                         req.append(pack('>L', w))
357
358                 req.append(pack('>L', len(index)))
359                 req.append(index)
360                 req.append(pack('>L', self._min_id))
361                 req.append(pack('>L', self._max_id))
362
363                 # filters
364                 req.append ( pack ( '>L', len(self._filters) ) )
365                 for f in self._filters:
366                         req.append ( pack ( '>L', len(f['attr']) ) )
367                         req.append ( f['attr'] )
368                         if ( 'values' in f ):
369                                 req.append ( pack ( '>L', len(f['values']) ) )
370                                 for v in f['values']:
371                                         req.append ( pack ( '>L', v ) )
372                         else:
373                                 req.append ( pack ( '>3L', 0, f['min'], f['max'] ) )
374                         req.append ( pack ( '>L', f['exclude'] ) )
375
376                 # group-by, max-matches, group-sort
377                 req.append ( pack ( '>2L', self._groupfunc, len(self._groupby) ) )
378                 req.append ( self._groupby )
379                 req.append ( pack ( '>2L', self._maxmatches, len(self._groupsort) ) )
380                 req.append ( self._groupsort )
381
382                 # send query, get response
383                 req = ''.join(req)
384
385                 length = len(req)
386                 req = pack('>2HL', SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, length)+req
387                 sock.send(req)
388                 response = self._GetResponse(sock, VER_COMMAND_SEARCH)
389                 if not response:
390                         return {}
391
392                 # parse response
393                 result = {}
394                 max_ = len(response)
395
396                 # read schema
397                 p = 0
398                 fields = []
399                 attrs = []
400
401                 nfields = unpack('>L', response[p:p+4])[0]
402                 p += 4
403                 while nfields>0 and p<max_:
404                         nfields -= 1
405                         length = unpack('>L', response[p:p+4])[0]
406                         p += 4
407                         fields.append(response[p:p+length])
408                         p += length
409
410                 result['fields'] = fields
411
412                 nattrs = unpack('>L', response[p:p+4])[0]
413                 p += 4
414                 while nattrs>0 and p<max_:
415                         nattrs -= 1
416                         length = unpack('>L', response[p:p+4])[0]
417                         p += 4
418                         attr = response[p:p+length]
419                         p += length
420                         type_ = unpack('>L', response[p:p+4])[0]
421                         p += 4
422                         attrs.append([attr,type_])
423
424                 result['attrs'] = attrs
425
426                 # read match count
427                 count = unpack('>L', response[p:p+4])[0]
428                 p += 4
429
430                 # read matches
431                 result['matches'] = []
432                 while count>0 and p<max_:
433                         count -= 1
434                         doc, weight = unpack('>2L', response[p:p+8])
435                         p += 8
436
437                         match = { 'id':doc, 'weight':weight, 'attrs':{} }
438                         for i in range(len(attrs)):
439                                 match['attrs'][attrs[i][0]] = unpack('>L', response[p:p+4])[0]
440                                 p += 4
441
442                         result['matches'].append ( match )
443
444                 result['total'], result['total_found'], result['time'], words = \
445                         unpack('>4L', response[p:p+16])
446
447                 result['time'] = '%.3f' % (result['time']/1000.0)
448                 p += 16
449
450                 result['words'] = []
451                 while words>0:
452                         words -= 1
453                         length = unpack('>L', response[p:p+4])[0]
454                         p += 4
455                         word = response[p:p+length]
456                         p += length
457                         docs, hits = unpack('>2L', response[p:p+8])
458                         p += 8
459
460                         result['words'].append({'word':word, 'docs':docs, 'hits':hits})
461
462                 sock.close()
463
464                 return result   
465
466
467         def BuildExcerpts (self, docs, index, words, opts=None):
468                 """
469                 connect to searchd server and generate exceprts from given documents
470
471                 "docs" is an array of strings which represent the documents' contents
472                 "index" is a string specifiying the index which settings will be used
473                         for stemming, lexing and case folding
474                 "words" is a string which contains the words to highlight
475                 "opts" is a hash which contains additional optional highlighting parameters:
476                         "before_match"
477                                 a string to insert before a set of matching words, default is "<b>"
478                         "after_match"
479                                 a string to insert after a set of matching words, default is "<b>"
480                         "chunk_separator"
481                                 a string to insert between excerpts chunks, default is " ... "
482                         "limit"
483                                 max excerpt size in symbols (codepoints), default is 256
484                         "around"
485                                 how much words to highlight around each match, default is 5
486
487                 returns false on failure
488                 returns an array of string excerpts on success
489                 """
490                 if not opts:
491                         opts = {}
492
493                 assert(isinstance(docs, list))
494                 assert(isinstance(index, str))
495                 assert(isinstance(words, str))
496                 assert(isinstance(opts, dict))
497
498                 sock = self._Connect()
499
500                 if not sock:
501                         return []
502
503                 # fixup options
504                 opts.setdefault('before_match', '<b>')
505                 opts.setdefault('after_match', '</b>')
506                 opts.setdefault('chunk_separator', ' ... ')
507                 opts.setdefault('limit', 256)
508                 opts.setdefault('around', 5)
509
510                 # build request
511                 # v.1.0 req
512
513                 # mode=0, flags=1 (remove spaces)
514                 req = [pack('>2L', 0, 1)]
515
516                 # req index
517                 req.append(pack('>L', len(index)))
518                 req.append(index)
519
520                 # req words
521                 req.append(pack('>L', len(words)))
522                 req.append(words)
523
524                 # options
525                 req.append(pack('>L', len(opts['before_match'])))
526                 req.append(opts['before_match'])
527
528                 req.append(pack('>L', len(opts['after_match'])))
529                 req.append(opts['after_match'])
530
531                 req.append(pack('>L', len(opts['chunk_separator'])))
532                 req.append(opts['chunk_separator'])
533
534                 req.append(pack('>L', int(opts['limit'])))
535                 req.append(pack('>L', int(opts['around'])))
536
537                 # documents
538                 req.append(pack('>L', len(docs)))
539                 for doc in docs:
540                         assert(isinstance(doc, str))
541                         req.append(pack('>L', len(doc)))
542                         req.append(doc)
543
544                 req = ''.join(req)
545
546                 # send query, get response
547                 length = len(req)
548
549                 # add header
550                 req = pack('>2HL', SEARCHD_COMMAND_EXCERPT, VER_COMMAND_EXCERPT, length)+req
551                 wrote = sock.send(req)
552
553                 response = self._GetResponse(sock, VER_COMMAND_EXCERPT )
554                 if not response:
555                         return []
556
557                 # parse response
558                 pos = 0
559                 res = []
560                 rlen = len(response)
561
562                 for i in range(len(docs)):
563                         length = unpack('>L', response[pos:pos+4])[0]
564                         pos += 4
565
566                         if pos+length > rlen:
567                                 self._error = 'incomplete reply'
568                                 return []
569
570                         res.append(response[pos:pos+length])
571                         pos += length
572
573                 return res
574
575 #
576 # $Id: sphinxapi.py,v 1.7 2007/04/01 21:38:13 shodan Exp $
577 #