apps/djangosphinx/apis/api263/__init__.py

   1 #
   2 # $Id: sphinxapi.py,v 1.7 2007/04/01 21:38:13 shodan Exp $
   3 #
   4 # Python version of Sphinx searchd client (Python API)
   5 #
   6 # Copyright (c) 2006-2007, Andrew Aksyonoff
   7 # Copyright (c) 2006, Mike Osadnik
   8 # All rights reserved
   9 #
  10 # This program is free software; you can redistribute it and/or modify
  11 # it under the terms of the GNU General Public License. You should have
  12 # received a copy of the GPL license along with this program; if you
  13 # did not, you can find it at http://www.gnu.org/
  14 #
  15
  16 import select
  17 import socket
  18 from struct import *
  19
  20
  21 # known searchd commands
  22 SEARCHD_COMMAND_SEARCH  = 0
  23 SEARCHD_COMMAND_EXCERPT = 1
  24
  25 # current client-side command implementation versions
  26 VER_COMMAND_SEARCH              = 0x107
  27 VER_COMMAND_EXCERPT             = 0x100
  28
  29 # known searchd status codes
  30 SEARCHD_OK                              = 0
  31 SEARCHD_ERROR                   = 1
  32 SEARCHD_RETRY                   = 2
  33 SEARCHD_WARNING                 = 3
  34
  35 # known match modes
  36 SPH_MATCH_ALL                   = 0
  37 SPH_MATCH_ANY                   = 1
  38 SPH_MATCH_PHRASE                = 2
  39 SPH_MATCH_BOOLEAN               = 3
  40 SPH_MATCH_EXTENDED              = 4
  41
  42 # known sort modes
  43 SPH_SORT_RELEVANCE              = 0
  44 SPH_SORT_ATTR_DESC              = 1
  45 SPH_SORT_ATTR_ASC               = 2
  46 SPH_SORT_TIME_SEGMENTS  = 3
  47 SPH_SORT_EXTENDED               = 4
  48
  49 # known attribute types
  50 SPH_ATTR_INTEGER                = 1
  51 SPH_ATTR_TIMESTAMP              = 2
  52
  53 # known grouping functions
  54 SPH_GROUPBY_DAY                 = 0
  55 SPH_GROUPBY_WEEK                = 1
  56 SPH_GROUPBY_MONTH               = 2
  57 SPH_GROUPBY_YEAR                = 3
  58 SPH_GROUPBY_ATTR                = 4
  59
  60 class SphinxClient:
  61         _host           = 'localhost'                   # searchd host (default is "localhost")
  62         _port           = 3312                                  # searchd port (default is 3312)
  63         _offset         = 0                                             # how much records to seek from result-set start (default is 0)
  64         _limit          = 20                                    # how much records to return from result-set starting at offset (default is 20)
  65         _mode           = SPH_MATCH_ALL                 # query matching mode (default is SPH_MATCH_ALL)
  66         _weights        = []                                    # per-field weights (default is 1 for all fields)
  67         _sort           = SPH_SORT_RELEVANCE    # match sorting mode (default is SPH_SORT_RELEVANCE)
  68         _sortby         = ''                                    # attribute to sort by (defualt is "")
  69         _min_id         = 0                                             # min ID to match (default is 0)
  70         _max_id         = 0xFFFFFFFF                    # max ID to match (default is UINT_MAX)
  71         _filters        = []                                    # search filters
  72         _groupby        = ''                                    # group-by attribute name
  73         _groupfunc      = SPH_GROUPBY_DAY               # group-by function (to pre-process group-by attribute value with)
  74         _groupsort      = '@group desc'                 # group-by sorting clause (to sort groups in result set with)
  75         _maxmatches     = 1000                                  # max matches to retrieve
  76         _error          = ''                                    # last error message
  77         _warning        = ''                                    # last warning message
  78
  79
  80         def __init__ (self):
  81                 """
  82                 create a new client object and fill defaults
  83                 """
  84                 pass
  85
  86
  87         def GetLastError (self):
  88                 """
  89                 get last error message (string)
  90                 """
  91                 return self._error
  92
  93
  94         def GetLastWarning (self):
  95                 """
  96                 get last warning message (string)
  97                 """
  98                 return self._warning
  99
 100
 101         def SetServer (self, host, port):
 102                 """
 103                 set searchd server
 104                 """
 105                 assert(isinstance(host, str))
 106                 assert(isinstance(port, int))
 107
 108                 self._host = host
 109                 self._port = port
 110
 111
 112         def _Connect (self):
 113                 """
 114                 connect to searchd server
 115                 """
 116                 try:
 117                         sock = socket.socket ( socket.AF_INET, socket.SOCK_STREAM )
 118                         sock.connect ( ( self._host, self._port ) )
 119                 except socket.error, msg:
 120                         if sock:
 121                                 sock.close()
 122                         self._error = 'connection to %s:%s failed (%s)' % ( self._host, self._port, msg )
 123                         return 0
 124
 125                 v = unpack('>L', sock.recv(4))
 126                 if v<1:
 127                         sock.close()
 128                         self._error = 'expected searchd protocol version, got %s' % v
 129                         return 0
 130
 131                 # all ok, send my version
 132                 sock.send(pack('>L', 1))
 133                 return sock
 134
 135
 136         def _GetResponse (self, sock, client_ver):
 137                 """
 138                 get and check response packet from searchd server
 139                 """
 140                 (status, ver, length) = unpack('>2HL', sock.recv(8))
 141                 response = ''
 142                 left = length
 143                 while left>0:
 144                         chunk = sock.recv(left)
 145                         if chunk:
 146                                 response += chunk
 147                                 left -= len(chunk)
 148                         else:
 149                                 break
 150
 151                 sock.close()
 152
 153                 # check response
 154                 read = len(response)
 155                 if not response or read!=length:
 156                         if length:
 157                                 self._error = 'failed to read searchd response (status=%s, ver=%s, len=%s, read=%s)' \
 158                                         % (status, ver, length, read)
 159                         else:
 160                                 self._error = 'received zero-sized searchd response'
 161                         return None
 162
 163                 # check status
 164                 if status==SEARCHD_WARNING:
 165                         wend = 4 + unpack ( '>L', response[0:4] )[0]
 166                         self._warning = response[4:wend]
 167                         return response[wend:]
 168
 169                 if status==SEARCHD_ERROR:
 170                         self._error = 'searchd error: '+response[4:]
 171                         return None
 172
 173                 if status==SEARCHD_RETRY:
 174                         self._error = 'temporary searchd error: '+response[4:]
 175                         return None
 176
 177                 if status!=SEARCHD_OK:
 178                         self._error = 'unknown status code %d' % status
 179                         return None
 180
 181                 # check version
 182                 if ver<client_ver:
 183                         self._warning = 'searchd command v.%d.%d older than client\'s v.%d.%d, some options might not work' \
 184                                 % (ver>>8, ver&0xff, client_ver>>8, client_ver&0xff)
 185
 186                 return response
 187
 188
 189         def SetLimits (self, offset, limit, maxmatches=0):
 190                 """
 191                 set match offset, count, and max number to retrieve
 192                 """
 193                 assert(isinstance(offset, int) and offset>=0)
 194                 assert(isinstance(limit, int) and limit>0)
 195                 assert(maxmatches>=0)
 196                 self._offset = offset
 197                 self._limit = limit
 198                 if maxmatches>0:
 199                         self._maxmatches = maxmatches
 200
 201
 202         def SetMatchMode (self, mode):
 203                 """
 204                 set match mode
 205                 """
 206                 assert(mode in [SPH_MATCH_ALL, SPH_MATCH_ANY, SPH_MATCH_PHRASE, SPH_MATCH_BOOLEAN, SPH_MATCH_EXTENDED])
 207                 self._mode = mode
 208
 209
 210         def SetSortMode ( self, mode, clause='' ):
 211                 """
 212                 set sort mode
 213                 """
 214                 assert ( mode in [SPH_SORT_RELEVANCE, SPH_SORT_ATTR_DESC, SPH_SORT_ATTR_ASC, SPH_SORT_TIME_SEGMENTS, SPH_SORT_EXTENDED] )
 215                 assert ( isinstance ( clause, str ) )
 216                 self._sort = mode
 217                 self._sortby = clause
 218
 219
 220         def SetWeights (self, weights):
 221                 """
 222                 set per-field weights
 223                 """
 224                 assert(isinstance(weights, list))
 225                 for w in weights:
 226                         assert(isinstance(w, int))
 227                 self._weights = weights
 228
 229
 230         def SetIDRange (self, minid, maxid):
 231                 """
 232                 set IDs range to match
 233                 only match those records where document ID
 234                 is beetwen minid and maxid (including minid and maxid)
 235                 """
 236                 assert(isinstance(minid, int))
 237                 assert(isinstance(maxid, int))
 238                 assert(minid<=maxid)
 239                 self._min_id = minid
 240                 self._max_id = maxid
 241
 242
 243         def SetFilter ( self, attribute, values, exclude=0 ):
 244                 """
 245                 set values filter
 246                 only match those records where $attribute column values
 247                 are in specified set
 248                 """
 249                 assert(isinstance(attribute, str))
 250                 assert(isinstance(values, list))
 251                 assert(values)
 252
 253                 values = map(int, values)
 254
 255                 self._filters.append ( { 'attr':attribute, 'exclude':exclude, 'values':values } )
 256
 257
 258         def SetFilterRange (self, attribute, min_, max_, exclude=0 ):
 259                 """
 260                 set range filter
 261                 only match those records where $attribute column value
 262                 is beetwen $min and $max (including $min and $max)
 263                 """
 264                 assert(isinstance(attribute, str))
 265                 assert(isinstance(min_, int))
 266                 assert(isinstance(max_, int))
 267                 assert(min_<=max_)
 268
 269                 self._filters.append ( { 'attr':attribute, 'exclude':exclude, 'min':min_, 'max':max_ } )
 270
 271
 272         def SetGroupBy ( self, attribute, func, groupsort='@group desc' ):
 273                 """
 274                 set grouping attribute and function
 275
 276                 in grouping mode, all matches are assigned to different groups
 277                 based on grouping function value.
 278
 279                 each group keeps track of the total match count, and the best match
 280                 (in this group) according to current sorting function.
 281
 282                 the final result set contains one best match per group, with
 283                 grouping function value and matches count attached.
 284
 285                 groups in result set could be sorted by any sorting clause,
 286                 including both document attributes and the following special
 287                 internal Sphinx attributes:
 288
 289                 - @id - match document ID;
 290                 - @weight, @rank, @relevance -  match weight;
 291                 - @group - groupby function value;
 292                 - @count - amount of matches in group.
 293
 294                 the default mode is to sort by groupby value in descending order,
 295                 ie. by "@group desc".
 296
 297                 "total_found" would contain total amount of matching groups over
 298                 the whole index.
 299
 300                 WARNING: grouping is done in fixed memory and thus its results
 301                 are only approximate; so there might be more groups reported
 302                 in total_found than actually present. @count might also
 303                 be underestimated.
 304
 305                 for example, if sorting by relevance and grouping by "published"
 306                 attribute with SPH_GROUPBY_DAY function, then the result set will
 307                 contain one most relevant match per each day when there were any
 308                 matches published, with day number and per-day match count attached,
 309                 and sorted by day number in descending order (ie. recent days first).
 310                 """
 311                 assert(isinstance(attribute, str))
 312                 assert(func in [SPH_GROUPBY_DAY, SPH_GROUPBY_WEEK, SPH_GROUPBY_MONTH, SPH_GROUPBY_YEAR, SPH_GROUPBY_ATTR] )
 313                 assert(isinstance(groupsort, str))
 314
 315                 self._groupby = attribute
 316                 self._groupfunc = func
 317                 self._groupsort = groupsort
 318
 319
 320         def Query (self, query, index='*'):
 321                 """
 322                 connect to searchd server and run given search query
 323
 324                 "query" is query string
 325                 "index" is index name to query, default is "*" which means to query all indexes
 326
 327                 returns false on failure
 328                 returns hash which has the following keys on success:
 329                         "matches"
 330                                 an array of found matches represented as ( "id", "weight", "attrs" ) hashes
 331                         "total"
 332                                 total amount of matches retrieved (upto SPH_MAX_MATCHES, see sphinx.h)
 333                         "total_found"
 334                                 total amount of matching documents in index
 335                         "time"
 336                                 search time
 337                         "words"
 338                                 an array of ( "word", "docs", "hits" ) hashes which contains
 339                                 docs and hits count for stemmed (!) query words
 340                 """
 341                 sock = self._Connect()
 342                 if not sock:
 343                         return {}
 344
 345                 # build request
 346                 req = [pack('>4L', self._offset, self._limit, self._mode, self._sort)]
 347
 348                 req.append(pack('>L', len(self._sortby)))
 349                 req.append(self._sortby)
 350
 351                 req.append(pack('>L', len(query)))
 352                 req.append(query)
 353
 354                 req.append(pack('>L', len(self._weights)))
 355                 for w in self._weights:
 356                         req.append(pack('>L', w))
 357
 358                 req.append(pack('>L', len(index)))
 359                 req.append(index)
 360                 req.append(pack('>L', self._min_id))
 361                 req.append(pack('>L', self._max_id))
 362
 363                 # filters
 364                 req.append ( pack ( '>L', len(self._filters) ) )
 365                 for f in self._filters:
 366                         req.append ( pack ( '>L', len(f['attr']) ) )
 367                         req.append ( f['attr'] )
 368                         if ( 'values' in f ):
 369                                 req.append ( pack ( '>L', len(f['values']) ) )
 370                                 for v in f['values']:
 371                                         req.append ( pack ( '>L', v ) )
 372                         else:
 373                                 req.append ( pack ( '>3L', 0, f['min'], f['max'] ) )
 374                         req.append ( pack ( '>L', f['exclude'] ) )
 375
 376                 # group-by, max-matches, group-sort
 377                 req.append ( pack ( '>2L', self._groupfunc, len(self._groupby) ) )
 378                 req.append ( self._groupby )
 379                 req.append ( pack ( '>2L', self._maxmatches, len(self._groupsort) ) )
 380                 req.append ( self._groupsort )
 381
 382                 # send query, get response
 383                 req = ''.join(req)
 384
 385                 length = len(req)
 386                 req = pack('>2HL', SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, length)+req
 387                 sock.send(req)
 388                 response = self._GetResponse(sock, VER_COMMAND_SEARCH)
 389                 if not response:
 390                         return {}
 391
 392                 # parse response
 393                 result = {}
 394                 max_ = len(response)
 395
 396                 # read schema
 397                 p = 0
 398                 fields = []
 399                 attrs = []
 400
 401                 nfields = unpack('>L', response[p:p+4])[0]
 402                 p += 4
 403                 while nfields>0 and p<max_:
 404                         nfields -= 1
 405                         length = unpack('>L', response[p:p+4])[0]
 406                         p += 4
 407                         fields.append(response[p:p+length])
 408                         p += length
 409
 410                 result['fields'] = fields
 411
 412                 nattrs = unpack('>L', response[p:p+4])[0]
 413                 p += 4
 414                 while nattrs>0 and p<max_:
 415                         nattrs -= 1
 416                         length = unpack('>L', response[p:p+4])[0]
 417                         p += 4
 418                         attr = response[p:p+length]
 419                         p += length
 420                         type_ = unpack('>L', response[p:p+4])[0]
 421                         p += 4
 422                         attrs.append([attr,type_])
 423
 424                 result['attrs'] = attrs
 425
 426                 # read match count
 427                 count = unpack('>L', response[p:p+4])[0]
 428                 p += 4
 429
 430                 # read matches
 431                 result['matches'] = []
 432                 while count>0 and p<max_:
 433                         count -= 1
 434                         doc, weight = unpack('>2L', response[p:p+8])
 435                         p += 8
 436
 437                         match = { 'id':doc, 'weight':weight, 'attrs':{} }
 438                         for i in range(len(attrs)):
 439                                 match['attrs'][attrs[i][0]] = unpack('>L', response[p:p+4])[0]
 440                                 p += 4
 441
 442                         result['matches'].append ( match )
 443
 444                 result['total'], result['total_found'], result['time'], words = \
 445                         unpack('>4L', response[p:p+16])
 446
 447                 result['time'] = '%.3f' % (result['time']/1000.0)
 448                 p += 16
 449
 450                 result['words'] = []
 451                 while words>0:
 452                         words -= 1
 453                         length = unpack('>L', response[p:p+4])[0]
 454                         p += 4
 455                         word = response[p:p+length]
 456                         p += length
 457                         docs, hits = unpack('>2L', response[p:p+8])
 458                         p += 8
 459
 460                         result['words'].append({'word':word, 'docs':docs, 'hits':hits})
 461
 462                 sock.close()
 463
 464                 return result
 465
 466
 467         def BuildExcerpts (self, docs, index, words, opts=None):
 468                 """
 469                 connect to searchd server and generate exceprts from given documents
 470
 471                 "docs" is an array of strings which represent the documents' contents
 472                 "index" is a string specifiying the index which settings will be used
 473                         for stemming, lexing and case folding
 474                 "words" is a string which contains the words to highlight
 475                 "opts" is a hash which contains additional optional highlighting parameters:
 476                         "before_match"
 477                                 a string to insert before a set of matching words, default is "<b>"
 478                         "after_match"
 479                                 a string to insert after a set of matching words, default is "<b>"
 480                         "chunk_separator"
 481                                 a string to insert between excerpts chunks, default is " ... "
 482                         "limit"
 483                                 max excerpt size in symbols (codepoints), default is 256
 484                         "around"
 485                                 how much words to highlight around each match, default is 5
 486
 487                 returns false on failure
 488                 returns an array of string excerpts on success
 489                 """
 490                 if not opts:
 491                         opts = {}
 492
 493                 assert(isinstance(docs, list))
 494                 assert(isinstance(index, str))
 495                 assert(isinstance(words, str))
 496                 assert(isinstance(opts, dict))
 497
 498                 sock = self._Connect()
 499
 500                 if not sock:
 501                         return []
 502
 503                 # fixup options
 504                 opts.setdefault('before_match', '<b>')
 505                 opts.setdefault('after_match', '</b>')
 506                 opts.setdefault('chunk_separator', ' ... ')
 507                 opts.setdefault('limit', 256)
 508                 opts.setdefault('around', 5)
 509
 510                 # build request
 511                 # v.1.0 req
 512
 513                 # mode=0, flags=1 (remove spaces)
 514                 req = [pack('>2L', 0, 1)]
 515
 516                 # req index
 517                 req.append(pack('>L', len(index)))
 518                 req.append(index)
 519
 520                 # req words
 521                 req.append(pack('>L', len(words)))
 522                 req.append(words)
 523
 524                 # options
 525                 req.append(pack('>L', len(opts['before_match'])))
 526                 req.append(opts['before_match'])
 527
 528                 req.append(pack('>L', len(opts['after_match'])))
 529                 req.append(opts['after_match'])
 530
 531                 req.append(pack('>L', len(opts['chunk_separator'])))
 532                 req.append(opts['chunk_separator'])
 533
 534                 req.append(pack('>L', int(opts['limit'])))
 535                 req.append(pack('>L', int(opts['around'])))
 536
 537                 # documents
 538                 req.append(pack('>L', len(docs)))
 539                 for doc in docs:
 540                         assert(isinstance(doc, str))
 541                         req.append(pack('>L', len(doc)))
 542                         req.append(doc)
 543
 544                 req = ''.join(req)
 545
 546                 # send query, get response
 547                 length = len(req)
 548
 549                 # add header
 550                 req = pack('>2HL', SEARCHD_COMMAND_EXCERPT, VER_COMMAND_EXCERPT, length)+req
 551                 wrote = sock.send(req)
 552
 553                 response = self._GetResponse(sock, VER_COMMAND_EXCERPT )
 554                 if not response:
 555                         return []
 556
 557                 # parse response
 558                 pos = 0
 559                 res = []
 560                 rlen = len(response)
 561
 562                 for i in range(len(docs)):
 563                         length = unpack('>L', response[pos:pos+4])[0]
 564                         pos += 4
 565
 566                         if pos+length > rlen:
 567                                 self._error = 'incomplete reply'
 568                                 return []
 569
 570                         res.append(response[pos:pos+length])
 571                         pos += length
 572
 573                 return res
 574
 575 #
 576 # $Id: sphinxapi.py,v 1.7 2007/04/01 21:38:13 shodan Exp $
 577 #