From: Marek Stępniowski Date: Mon, 6 Oct 2008 22:37:34 +0000 (+0200) Subject: Added branch 1.0. X-Git-Url: https://git.mdrn.pl/wolnelektury.git/commitdiff_plain/db833ba4517084f61a64907c6d15606e7c881edd?ds=inline Added branch 1.0. --- diff --git a/apps/catalogue/models.py b/apps/catalogue/models.py index 60d278580..9d072b44f 100644 --- a/apps/catalogue/models.py +++ b/apps/catalogue/models.py @@ -10,6 +10,7 @@ from django.core.urlresolvers import reverse from newtagging.models import TagBase from newtagging import managers +import djangosphinx from librarian import html, dcparser @@ -45,6 +46,8 @@ class Tag(TagBase): user = models.ForeignKey(User, blank=True, null=True) book_count = models.IntegerField(_('book count'), default=0, blank=False, null=False) + search = djangosphinx.SphinxSearch() + def has_description(self): return len(self.description) > 0 has_description.short_description = _('description') @@ -97,6 +100,8 @@ class Book(models.Model): objects = models.Manager() tagged = managers.ModelTaggedItemManager(Tag) tags = managers.TagDescriptor(Tag) + + search = djangosphinx.SphinxSearch() @property def name(self): @@ -259,6 +264,8 @@ class Fragment(models.Model): tagged = managers.ModelTaggedItemManager(Tag) tags = managers.TagDescriptor(Tag) + search = djangosphinx.SphinxSearch() + def short_html(self): if len(self._short_html): return mark_safe(self._short_html) diff --git a/apps/djangosphinx/__init__.py b/apps/djangosphinx/__init__.py new file mode 100644 index 000000000..176befc5a --- /dev/null +++ b/apps/djangosphinx/__init__.py @@ -0,0 +1,35 @@ +""" +Sphinx Search Engine ORM for Django models +http://www.sphinxsearch.com/ +Developed and maintained David Cramer + +To add a search manager to your model: + + search = SphinxSearch([index=, weight=[,], mode=]) + + +To query the engine and retrieve objects: + + MyModel.search.query('my string') + + +To use multiple index support, you need to define a "content_type" field in your SQL +clause. 
Each index also needs to have the exact same fields. The rules are almost identical +to that of an SQL UNION query. + + SELECT id, name, 1 as content_type FROM model_myapp + SELECT id, name, 2 as content_type FROM model_myotherapp + search_results = SphinxSearch() + search_results.on_index('model_myapp model_myotherapp') + search_results.query('hello') + + +default settings.py values + + SPHINX_SERVER = 'localhost' + SPHINX_PORT = 3312 + +""" + +from manager import SearchError, ConnectionError, SphinxSearch +from utils import generate_config_for_model, generate_config_for_models \ No newline at end of file diff --git a/apps/djangosphinx/apis/__init__.py b/apps/djangosphinx/apis/__init__.py new file mode 100644 index 000000000..e69de29bb diff --git a/apps/djangosphinx/apis/api263/__init__.py b/apps/djangosphinx/apis/api263/__init__.py new file mode 100644 index 000000000..d9a2d43f7 --- /dev/null +++ b/apps/djangosphinx/apis/api263/__init__.py @@ -0,0 +1,577 @@ +# +# $Id: sphinxapi.py,v 1.7 2007/04/01 21:38:13 shodan Exp $ +# +# Python version of Sphinx searchd client (Python API) +# +# Copyright (c) 2006-2007, Andrew Aksyonoff +# Copyright (c) 2006, Mike Osadnik +# All rights reserved +# +# This program is free software; you can redistribute it and/or modify +# it under the terms of the GNU General Public License. 
You should have +# received a copy of the GPL license along with this program; if you +# did not, you can find it at http://www.gnu.org/ +# + +import select +import socket +from struct import * + + +# known searchd commands +SEARCHD_COMMAND_SEARCH = 0 +SEARCHD_COMMAND_EXCERPT = 1 + +# current client-side command implementation versions +VER_COMMAND_SEARCH = 0x107 +VER_COMMAND_EXCERPT = 0x100 + +# known searchd status codes +SEARCHD_OK = 0 +SEARCHD_ERROR = 1 +SEARCHD_RETRY = 2 +SEARCHD_WARNING = 3 + +# known match modes +SPH_MATCH_ALL = 0 +SPH_MATCH_ANY = 1 +SPH_MATCH_PHRASE = 2 +SPH_MATCH_BOOLEAN = 3 +SPH_MATCH_EXTENDED = 4 + +# known sort modes +SPH_SORT_RELEVANCE = 0 +SPH_SORT_ATTR_DESC = 1 +SPH_SORT_ATTR_ASC = 2 +SPH_SORT_TIME_SEGMENTS = 3 +SPH_SORT_EXTENDED = 4 + +# known attribute types +SPH_ATTR_INTEGER = 1 +SPH_ATTR_TIMESTAMP = 2 + +# known grouping functions +SPH_GROUPBY_DAY = 0 +SPH_GROUPBY_WEEK = 1 +SPH_GROUPBY_MONTH = 2 +SPH_GROUPBY_YEAR = 3 +SPH_GROUPBY_ATTR = 4 + +class SphinxClient: + _host = 'localhost' # searchd host (default is "localhost") + _port = 3312 # searchd port (default is 3312) + _offset = 0 # how much records to seek from result-set start (default is 0) + _limit = 20 # how much records to return from result-set starting at offset (default is 20) + _mode = SPH_MATCH_ALL # query matching mode (default is SPH_MATCH_ALL) + _weights = [] # per-field weights (default is 1 for all fields) + _sort = SPH_SORT_RELEVANCE # match sorting mode (default is SPH_SORT_RELEVANCE) + _sortby = '' # attribute to sort by (defualt is "") + _min_id = 0 # min ID to match (default is 0) + _max_id = 0xFFFFFFFF # max ID to match (default is UINT_MAX) + _filters = [] # search filters + _groupby = '' # group-by attribute name + _groupfunc = SPH_GROUPBY_DAY # group-by function (to pre-process group-by attribute value with) + _groupsort = '@group desc' # group-by sorting clause (to sort groups in result set with) + _maxmatches = 1000 # max matches to retrieve + _error 
= '' # last error message + _warning = '' # last warning message + + + def __init__ (self): + """ + create a new client object and fill defaults + """ + pass + + + def GetLastError (self): + """ + get last error message (string) + """ + return self._error + + + def GetLastWarning (self): + """ + get last warning message (string) + """ + return self._warning + + + def SetServer (self, host, port): + """ + set searchd server + """ + assert(isinstance(host, str)) + assert(isinstance(port, int)) + + self._host = host + self._port = port + + + def _Connect (self): + """ + connect to searchd server + """ + try: + sock = socket.socket ( socket.AF_INET, socket.SOCK_STREAM ) + sock.connect ( ( self._host, self._port ) ) + except socket.error, msg: + if sock: + sock.close() + self._error = 'connection to %s:%s failed (%s)' % ( self._host, self._port, msg ) + return 0 + + v = unpack('>L', sock.recv(4)) + if v<1: + sock.close() + self._error = 'expected searchd protocol version, got %s' % v + return 0 + + # all ok, send my version + sock.send(pack('>L', 1)) + return sock + + + def _GetResponse (self, sock, client_ver): + """ + get and check response packet from searchd server + """ + (status, ver, length) = unpack('>2HL', sock.recv(8)) + response = '' + left = length + while left>0: + chunk = sock.recv(left) + if chunk: + response += chunk + left -= len(chunk) + else: + break + + sock.close() + + # check response + read = len(response) + if not response or read!=length: + if length: + self._error = 'failed to read searchd response (status=%s, ver=%s, len=%s, read=%s)' \ + % (status, ver, length, read) + else: + self._error = 'received zero-sized searchd response' + return None + + # check status + if status==SEARCHD_WARNING: + wend = 4 + unpack ( '>L', response[0:4] )[0] + self._warning = response[4:wend] + return response[wend:] + + if status==SEARCHD_ERROR: + self._error = 'searchd error: '+response[4:] + return None + + if status==SEARCHD_RETRY: + self._error = 'temporary 
searchd error: '+response[4:] + return None + + if status!=SEARCHD_OK: + self._error = 'unknown status code %d' % status + return None + + # check version + if ver>8, ver&0xff, client_ver>>8, client_ver&0xff) + + return response + + + def SetLimits (self, offset, limit, maxmatches=0): + """ + set match offset, count, and max number to retrieve + """ + assert(isinstance(offset, int) and offset>=0) + assert(isinstance(limit, int) and limit>0) + assert(maxmatches>=0) + self._offset = offset + self._limit = limit + if maxmatches>0: + self._maxmatches = maxmatches + + + def SetMatchMode (self, mode): + """ + set match mode + """ + assert(mode in [SPH_MATCH_ALL, SPH_MATCH_ANY, SPH_MATCH_PHRASE, SPH_MATCH_BOOLEAN, SPH_MATCH_EXTENDED]) + self._mode = mode + + + def SetSortMode ( self, mode, clause='' ): + """ + set sort mode + """ + assert ( mode in [SPH_SORT_RELEVANCE, SPH_SORT_ATTR_DESC, SPH_SORT_ATTR_ASC, SPH_SORT_TIME_SEGMENTS, SPH_SORT_EXTENDED] ) + assert ( isinstance ( clause, str ) ) + self._sort = mode + self._sortby = clause + + + def SetWeights (self, weights): + """ + set per-field weights + """ + assert(isinstance(weights, list)) + for w in weights: + assert(isinstance(w, int)) + self._weights = weights + + + def SetIDRange (self, minid, maxid): + """ + set IDs range to match + only match those records where document ID + is beetwen minid and maxid (including minid and maxid) + """ + assert(isinstance(minid, int)) + assert(isinstance(maxid, int)) + assert(minid<=maxid) + self._min_id = minid + self._max_id = maxid + + + def SetFilter ( self, attribute, values, exclude=0 ): + """ + set values filter + only match those records where $attribute column values + are in specified set + """ + assert(isinstance(attribute, str)) + assert(isinstance(values, list)) + assert(values) + + values = map(int, values) + + self._filters.append ( { 'attr':attribute, 'exclude':exclude, 'values':values } ) + + + def SetFilterRange (self, attribute, min_, max_, exclude=0 ): + """ + 
set range filter + only match those records where $attribute column value + is beetwen $min and $max (including $min and $max) + """ + assert(isinstance(attribute, str)) + assert(isinstance(min_, int)) + assert(isinstance(max_, int)) + assert(min_<=max_) + + self._filters.append ( { 'attr':attribute, 'exclude':exclude, 'min':min_, 'max':max_ } ) + + + def SetGroupBy ( self, attribute, func, groupsort='@group desc' ): + """ + set grouping attribute and function + + in grouping mode, all matches are assigned to different groups + based on grouping function value. + + each group keeps track of the total match count, and the best match + (in this group) according to current sorting function. + + the final result set contains one best match per group, with + grouping function value and matches count attached. + + groups in result set could be sorted by any sorting clause, + including both document attributes and the following special + internal Sphinx attributes: + + - @id - match document ID; + - @weight, @rank, @relevance - match weight; + - @group - groupby function value; + - @count - amount of matches in group. + + the default mode is to sort by groupby value in descending order, + ie. by "@group desc". + + "total_found" would contain total amount of matching groups over + the whole index. + + WARNING: grouping is done in fixed memory and thus its results + are only approximate; so there might be more groups reported + in total_found than actually present. @count might also + be underestimated. + + for example, if sorting by relevance and grouping by "published" + attribute with SPH_GROUPBY_DAY function, then the result set will + contain one most relevant match per each day when there were any + matches published, with day number and per-day match count attached, + and sorted by day number in descending order (ie. recent days first). 
+ """ + assert(isinstance(attribute, str)) + assert(func in [SPH_GROUPBY_DAY, SPH_GROUPBY_WEEK, SPH_GROUPBY_MONTH, SPH_GROUPBY_YEAR, SPH_GROUPBY_ATTR] ) + assert(isinstance(groupsort, str)) + + self._groupby = attribute + self._groupfunc = func + self._groupsort = groupsort + + + def Query (self, query, index='*'): + """ + connect to searchd server and run given search query + + "query" is query string + "index" is index name to query, default is "*" which means to query all indexes + + returns false on failure + returns hash which has the following keys on success: + "matches" + an array of found matches represented as ( "id", "weight", "attrs" ) hashes + "total" + total amount of matches retrieved (upto SPH_MAX_MATCHES, see sphinx.h) + "total_found" + total amount of matching documents in index + "time" + search time + "words" + an array of ( "word", "docs", "hits" ) hashes which contains + docs and hits count for stemmed (!) query words + """ + sock = self._Connect() + if not sock: + return {} + + # build request + req = [pack('>4L', self._offset, self._limit, self._mode, self._sort)] + + req.append(pack('>L', len(self._sortby))) + req.append(self._sortby) + + req.append(pack('>L', len(query))) + req.append(query) + + req.append(pack('>L', len(self._weights))) + for w in self._weights: + req.append(pack('>L', w)) + + req.append(pack('>L', len(index))) + req.append(index) + req.append(pack('>L', self._min_id)) + req.append(pack('>L', self._max_id)) + + # filters + req.append ( pack ( '>L', len(self._filters) ) ) + for f in self._filters: + req.append ( pack ( '>L', len(f['attr']) ) ) + req.append ( f['attr'] ) + if ( 'values' in f ): + req.append ( pack ( '>L', len(f['values']) ) ) + for v in f['values']: + req.append ( pack ( '>L', v ) ) + else: + req.append ( pack ( '>3L', 0, f['min'], f['max'] ) ) + req.append ( pack ( '>L', f['exclude'] ) ) + + # group-by, max-matches, group-sort + req.append ( pack ( '>2L', self._groupfunc, len(self._groupby) ) ) + 
req.append ( self._groupby ) + req.append ( pack ( '>2L', self._maxmatches, len(self._groupsort) ) ) + req.append ( self._groupsort ) + + # send query, get response + req = ''.join(req) + + length = len(req) + req = pack('>2HL', SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, length)+req + sock.send(req) + response = self._GetResponse(sock, VER_COMMAND_SEARCH) + if not response: + return {} + + # parse response + result = {} + max_ = len(response) + + # read schema + p = 0 + fields = [] + attrs = [] + + nfields = unpack('>L', response[p:p+4])[0] + p += 4 + while nfields>0 and pL', response[p:p+4])[0] + p += 4 + fields.append(response[p:p+length]) + p += length + + result['fields'] = fields + + nattrs = unpack('>L', response[p:p+4])[0] + p += 4 + while nattrs>0 and pL', response[p:p+4])[0] + p += 4 + attr = response[p:p+length] + p += length + type_ = unpack('>L', response[p:p+4])[0] + p += 4 + attrs.append([attr,type_]) + + result['attrs'] = attrs + + # read match count + count = unpack('>L', response[p:p+4])[0] + p += 4 + + # read matches + result['matches'] = [] + while count>0 and p2L', response[p:p+8]) + p += 8 + + match = { 'id':doc, 'weight':weight, 'attrs':{} } + for i in range(len(attrs)): + match['attrs'][attrs[i][0]] = unpack('>L', response[p:p+4])[0] + p += 4 + + result['matches'].append ( match ) + + result['total'], result['total_found'], result['time'], words = \ + unpack('>4L', response[p:p+16]) + + result['time'] = '%.3f' % (result['time']/1000.0) + p += 16 + + result['words'] = [] + while words>0: + words -= 1 + length = unpack('>L', response[p:p+4])[0] + p += 4 + word = response[p:p+length] + p += length + docs, hits = unpack('>2L', response[p:p+8]) + p += 8 + + result['words'].append({'word':word, 'docs':docs, 'hits':hits}) + + sock.close() + + return result + + + def BuildExcerpts (self, docs, index, words, opts=None): + """ + connect to searchd server and generate exceprts from given documents + + "docs" is an array of strings which represent the 
documents' contents + "index" is a string specifiying the index which settings will be used + for stemming, lexing and case folding + "words" is a string which contains the words to highlight + "opts" is a hash which contains additional optional highlighting parameters: + "before_match" + a string to insert before a set of matching words, default is "" + "after_match" + a string to insert after a set of matching words, default is "" + "chunk_separator" + a string to insert between excerpts chunks, default is " ... " + "limit" + max excerpt size in symbols (codepoints), default is 256 + "around" + how much words to highlight around each match, default is 5 + + returns false on failure + returns an array of string excerpts on success + """ + if not opts: + opts = {} + + assert(isinstance(docs, list)) + assert(isinstance(index, str)) + assert(isinstance(words, str)) + assert(isinstance(opts, dict)) + + sock = self._Connect() + + if not sock: + return [] + + # fixup options + opts.setdefault('before_match', '') + opts.setdefault('after_match', '') + opts.setdefault('chunk_separator', ' ... 
') + opts.setdefault('limit', 256) + opts.setdefault('around', 5) + + # build request + # v.1.0 req + + # mode=0, flags=1 (remove spaces) + req = [pack('>2L', 0, 1)] + + # req index + req.append(pack('>L', len(index))) + req.append(index) + + # req words + req.append(pack('>L', len(words))) + req.append(words) + + # options + req.append(pack('>L', len(opts['before_match']))) + req.append(opts['before_match']) + + req.append(pack('>L', len(opts['after_match']))) + req.append(opts['after_match']) + + req.append(pack('>L', len(opts['chunk_separator']))) + req.append(opts['chunk_separator']) + + req.append(pack('>L', int(opts['limit']))) + req.append(pack('>L', int(opts['around']))) + + # documents + req.append(pack('>L', len(docs))) + for doc in docs: + assert(isinstance(doc, str)) + req.append(pack('>L', len(doc))) + req.append(doc) + + req = ''.join(req) + + # send query, get response + length = len(req) + + # add header + req = pack('>2HL', SEARCHD_COMMAND_EXCERPT, VER_COMMAND_EXCERPT, length)+req + wrote = sock.send(req) + + response = self._GetResponse(sock, VER_COMMAND_EXCERPT ) + if not response: + return [] + + # parse response + pos = 0 + res = [] + rlen = len(response) + + for i in range(len(docs)): + length = unpack('>L', response[pos:pos+4])[0] + pos += 4 + + if pos+length > rlen: + self._error = 'incomplete reply' + return [] + + res.append(response[pos:pos+length]) + pos += length + + return res + +# +# $Id: sphinxapi.py,v 1.7 2007/04/01 21:38:13 shodan Exp $ +# diff --git a/apps/djangosphinx/apis/api275/__init__.py b/apps/djangosphinx/apis/api275/__init__.py new file mode 100644 index 000000000..236a5a20d --- /dev/null +++ b/apps/djangosphinx/apis/api275/__init__.py @@ -0,0 +1,855 @@ +# +# $Id: sphinxapi.py 1216 2008-03-14 23:25:39Z shodan $ +# +# Python version of Sphinx searchd client (Python API) +# +# Copyright (c) 2006-2008, Andrew Aksyonoff +# Copyright (c) 2006, Mike Osadnik +# All rights reserved +# +# This program is free software; you can 
redistribute it and/or modify +# it under the terms of the GNU General Public License. You should have +# received a copy of the GPL license along with this program; if you +# did not, you can find it at http://www.gnu.org/ +# + +import sys +import select +import socket +from struct import * + + +# known searchd commands +SEARCHD_COMMAND_SEARCH = 0 +SEARCHD_COMMAND_EXCERPT = 1 +SEARCHD_COMMAND_UPDATE = 2 +SEARCHD_COMMAND_KEYWORDS= 3 + +# current client-side command implementation versions +VER_COMMAND_SEARCH = 0x113 +VER_COMMAND_EXCERPT = 0x100 +VER_COMMAND_UPDATE = 0x101 +VER_COMMAND_KEYWORDS = 0x100 + +# known searchd status codes +SEARCHD_OK = 0 +SEARCHD_ERROR = 1 +SEARCHD_RETRY = 2 +SEARCHD_WARNING = 3 + +# known match modes +SPH_MATCH_ALL = 0 +SPH_MATCH_ANY = 1 +SPH_MATCH_PHRASE = 2 +SPH_MATCH_BOOLEAN = 3 +SPH_MATCH_EXTENDED = 4 +SPH_MATCH_FULLSCAN = 5 +SPH_MATCH_EXTENDED2 = 6 + +# known ranking modes (extended2 mode only) +SPH_RANK_PROXIMITY_BM25 = 0 # default mode, phrase proximity major factor and BM25 minor one +SPH_RANK_BM25 = 1 # statistical mode, BM25 ranking only (faster but worse quality) +SPH_RANK_NONE = 2 # no ranking, all matches get a weight of 1 +SPH_RANK_WORDCOUNT = 3 # simple word-count weighting, rank is a weighted sum of per-field keyword occurence counts + +# known sort modes +SPH_SORT_RELEVANCE = 0 +SPH_SORT_ATTR_DESC = 1 +SPH_SORT_ATTR_ASC = 2 +SPH_SORT_TIME_SEGMENTS = 3 +SPH_SORT_EXTENDED = 4 +SPH_SORT_EXPR = 5 + +# known filter types +SPH_FILTER_VALUES = 0 +SPH_FILTER_RANGE = 1 +SPH_FILTER_FLOATRANGE = 2 + +# known attribute types +SPH_ATTR_NONE = 0 +SPH_ATTR_INTEGER = 1 +SPH_ATTR_TIMESTAMP = 2 +SPH_ATTR_ORDINAL = 3 +SPH_ATTR_BOOL = 4 +SPH_ATTR_FLOAT = 5 +SPH_ATTR_MULTI = 0X40000000L + +# known grouping functions +SPH_GROUPBY_DAY = 0 +SPH_GROUPBY_WEEK = 1 +SPH_GROUPBY_MONTH = 2 +SPH_GROUPBY_YEAR = 3 +SPH_GROUPBY_ATTR = 4 + + +class SphinxClient: + def __init__ (self): + """ + Create a new client object, and fill defaults. 
+ """ + self._host = 'localhost' # searchd host (default is "localhost") + self._port = 3312 # searchd port (default is 3312) + self._offset = 0 # how much records to seek from result-set start (default is 0) + self._limit = 20 # how much records to return from result-set starting at offset (default is 20) + self._mode = SPH_MATCH_ALL # query matching mode (default is SPH_MATCH_ALL) + self._weights = [] # per-field weights (default is 1 for all fields) + self._sort = SPH_SORT_RELEVANCE # match sorting mode (default is SPH_SORT_RELEVANCE) + self._sortby = '' # attribute to sort by (defualt is "") + self._min_id = 0 # min ID to match (default is 0) + self._max_id = 0xFFFFFFFF # max ID to match (default is UINT_MAX) + self._filters = [] # search filters + self._groupby = '' # group-by attribute name + self._groupfunc = SPH_GROUPBY_DAY # group-by function (to pre-process group-by attribute value with) + self._groupsort = '@group desc' # group-by sorting clause (to sort groups in result set with) + self._groupdistinct = '' # group-by count-distinct attribute + self._maxmatches = 1000 # max matches to retrieve + self._cutoff = 0 # cutoff to stop searching at + self._retrycount = 0 # distributed retry count + self._retrydelay = 0 # distributed retry delay + self._anchor = {} # geographical anchor point + self._indexweights = {} # per-index weights + self._ranker = SPH_RANK_PROXIMITY_BM25 # ranking mode + self._maxquerytime = 0 # max query time, milliseconds (default is 0, do not limit) + self._fieldweights = {} # per-field-name weights + self._error = '' # last error message + self._warning = '' # last warning message + self._reqs = [] # requests array for multi-query + return + + + def GetLastError (self): + """ + Get last error message (string). + """ + return self._error + + + def GetLastWarning (self): + """ + Get last warning message (string). + """ + return self._warning + + + def SetServer (self, host, port): + """ + Set searchd server host and port. 
+ """ + assert(isinstance(host, str)) + assert(isinstance(port, int)) + self._host = host + self._port = port + + + def _Connect (self): + """ + INTERNAL METHOD, DO NOT CALL. Connects to searchd server. + """ + try: + sock = socket.socket ( socket.AF_INET, socket.SOCK_STREAM ) + sock.connect ( ( self._host, self._port ) ) + except socket.error, msg: + if sock: + sock.close() + self._error = 'connection to %s:%s failed (%s)' % ( self._host, self._port, msg ) + return 0 + + v = unpack('>L', sock.recv(4)) + if v<1: + sock.close() + self._error = 'expected searchd protocol version, got %s' % v + return 0 + + # all ok, send my version + sock.send(pack('>L', 1)) + return sock + + + def _GetResponse (self, sock, client_ver): + """ + INTERNAL METHOD, DO NOT CALL. Gets and checks response packet from searchd server. + """ + (status, ver, length) = unpack('>2HL', sock.recv(8)) + response = '' + left = length + while left>0: + chunk = sock.recv(left) + if chunk: + response += chunk + left -= len(chunk) + else: + break + + sock.close() + + # check response + read = len(response) + if not response or read!=length: + if length: + self._error = 'failed to read searchd response (status=%s, ver=%s, len=%s, read=%s)' \ + % (status, ver, length, read) + else: + self._error = 'received zero-sized searchd response' + return None + + # check status + if status==SEARCHD_WARNING: + wend = 4 + unpack ( '>L', response[0:4] )[0] + self._warning = response[4:wend] + return response[wend:] + + if status==SEARCHD_ERROR: + self._error = 'searchd error: '+response[4:] + return None + + if status==SEARCHD_RETRY: + self._error = 'temporary searchd error: '+response[4:] + return None + + if status!=SEARCHD_OK: + self._error = 'unknown status code %d' % status + return None + + # check version + if ver>8, ver&0xff, client_ver>>8, client_ver&0xff) + + return response + + + def SetLimits (self, offset, limit, maxmatches=0, cutoff=0): + """ + Set offset and count into result set, and optionally set 
max-matches and cutoff limits. + """ + assert(isinstance(offset, int) and offset>=0) + assert(isinstance(limit, int) and limit>0) + assert(maxmatches>=0) + self._offset = offset + self._limit = limit + if maxmatches>0: + self._maxmatches = maxmatches + if cutoff>=0: + self._cutoff = cutoff + + + def SetMaxQueryTime (self, maxquerytime): + """ + Set maximum query time, in milliseconds, per-index. 0 means 'do not limit'. + """ + assert(isinstance(maxquerytime,int) and maxquerytime>0) + self._maxquerytime = maxquerytime + + + def SetMatchMode (self, mode): + """ + Set matching mode. + """ + assert(mode in [SPH_MATCH_ALL, SPH_MATCH_ANY, SPH_MATCH_PHRASE, SPH_MATCH_BOOLEAN, SPH_MATCH_EXTENDED, SPH_MATCH_FULLSCAN, SPH_MATCH_EXTENDED2]) + self._mode = mode + + + def SetRankingMode (self, ranker): + """ + Set ranking mode. + """ + assert(ranker in [SPH_RANK_PROXIMITY_BM25, SPH_RANK_BM25, SPH_RANK_NONE, SPH_RANK_WORDCOUNT]) + self._ranker = ranker + + + def SetSortMode ( self, mode, clause='' ): + """ + Set sorting mode. + """ + assert ( mode in [SPH_SORT_RELEVANCE, SPH_SORT_ATTR_DESC, SPH_SORT_ATTR_ASC, SPH_SORT_TIME_SEGMENTS, SPH_SORT_EXTENDED, SPH_SORT_EXPR] ) + assert ( isinstance ( clause, str ) ) + self._sort = mode + self._sortby = clause + + + def SetWeights (self, weights): + """ + Set per-field weights. + WARNING, DEPRECATED; do not use it! use SetFieldWeights() instead + """ + assert(isinstance(weights, list)) + for w in weights: + assert(isinstance(w, int)) + self._weights = weights + + + def SetFieldWeights (self, weights): + """ + Bind per-field weights by name; expects (name,field_weight) dictionary as argument. + """ + assert(isinstance(weights,dict)) + for key,val in weights.items(): + assert(isinstance(key,str)) + assert(isinstance(val,int)) + self._fieldweights = weights + + + def SetIndexWeights (self, weights): + """ + Bind per-index weights by name; expects (name,index_weight) dictionary as argument. 
+ """ + assert(isinstance(weights,dict)) + for key,val in weights.items(): + assert(isinstance(key,str)) + assert(isinstance(val,int)) + self._indexweights = weights + + + def SetIDRange (self, minid, maxid): + """ + Set IDs range to match. + Only match records if document ID is beetwen $min and $max (inclusive). + """ + assert(isinstance(minid, int)) + assert(isinstance(maxid, int)) + assert(minid<=maxid) + self._min_id = minid + self._max_id = maxid + + + def SetFilter ( self, attribute, values, exclude=0 ): + """ + Set values set filter. + Only match records where 'attribute' value is in given 'values' set. + """ + assert(isinstance(attribute, str)) + assert(isinstance(values, list)) + assert(values) + + for value in values: + assert(isinstance(value, int)) + + self._filters.append ( { 'type':SPH_FILTER_VALUES, 'attr':attribute, 'exclude':exclude, 'values':values } ) + + + def SetFilterRange (self, attribute, min_, max_, exclude=0 ): + """ + Set range filter. + Only match records if 'attribute' value is beetwen 'min_' and 'max_' (inclusive). 
+ """ + assert(isinstance(attribute, str)) + assert(isinstance(min_, int)) + assert(isinstance(max_, int)) + assert(min_<=max_) + + self._filters.append ( { 'type':SPH_FILTER_RANGE, 'attr':attribute, 'exclude':exclude, 'min':min_, 'max':max_ } ) + + + def SetFilterFloatRange (self, attribute, min_, max_, exclude=0 ): + assert(isinstance(attribute,str)) + assert(isinstance(min_,float)) + assert(isinstance(max_,float)) + assert(min_ <= max_) + self._filters.append ( {'type':SPH_FILTER_FLOATRANGE, 'attr':attribute, 'exclude':exclude, 'min':min_, 'max':max_} ) + + + def SetGeoAnchor (self, attrlat, attrlong, latitude, longitude): + assert(isinstance(attrlat,str)) + assert(isinstance(attrlong,str)) + assert(isinstance(latitude,float)) + assert(isinstance(longitude,float)) + self._anchor['attrlat'] = attrlat + self._anchor['attrlong'] = attrlong + self._anchor['lat'] = latitude + self._anchor['long'] = longitude + + + def SetGroupBy ( self, attribute, func, groupsort='@group desc' ): + """ + Set grouping attribute and function. + """ + assert(isinstance(attribute, str)) + assert(func in [SPH_GROUPBY_DAY, SPH_GROUPBY_WEEK, SPH_GROUPBY_MONTH, SPH_GROUPBY_YEAR, SPH_GROUPBY_ATTR] ) + assert(isinstance(groupsort, str)) + + self._groupby = attribute + self._groupfunc = func + self._groupsort = groupsort + + + def SetGroupDistinct (self, attribute): + assert(isinstance(attribute,str)) + self._groupdistinct = attribute + + + def SetRetries (self, count, delay=0): + assert(isinstance(count,int) and count>=0) + assert(isinstance(delay,int) and delay>=0) + self._retrycount = count + self._retrydelay = delay + + + def ResetFilters (self): + """ + Clear all filters (for multi-queries). + """ + self._filters = [] + self._anchor = {} + + + def ResetGroupBy (self): + """ + Clear groupby settings (for multi-queries). 
+ """ + self._groupby = '' + self._groupfunc = SPH_GROUPBY_DAY + self._groupsort = '@group desc' + self._groupdistinct = '' + + + def Query (self, query, index='*', comment=''): + """ + Connect to searchd server and run given search query. + Returns None on failure; result set hash on success (see documentation for details). + """ + assert(len(self._reqs)==0) + self.AddQuery(query,index,comment) + results = self.RunQueries() + + if not results or len(results)==0: + return None + self._error = results[0]['error'] + self._warning = results[0]['warning'] + if results[0]['status'] == SEARCHD_ERROR: + return None + return results[0] + + + def AddQuery (self, query, index='*', comment=''): + """ + Add query to batch. + """ + # build request + req = [pack('>5L', self._offset, self._limit, self._mode, self._ranker, self._sort)] + req.append(pack('>L', len(self._sortby))) + req.append(self._sortby) + + if isinstance(query,unicode): + query = query.encode('utf-8') + assert(isinstance(query,str)) + + req.append(pack('>L', len(query))) + req.append(query) + + req.append(pack('>L', len(self._weights))) + for w in self._weights: + req.append(pack('>L', w)) + req.append(pack('>L', len(index))) + req.append(index) + req.append(pack('>L',0)) # id64 range marker FIXME! IMPLEMENT! 
+ req.append(pack('>L', self._min_id)) + req.append(pack('>L', self._max_id)) + + # filters + req.append ( pack ( '>L', len(self._filters) ) ) + for f in self._filters: + req.append ( pack ( '>L', len(f['attr'])) + f['attr']) + filtertype = f['type'] + req.append ( pack ( '>L', filtertype)) + if filtertype == SPH_FILTER_VALUES: + req.append ( pack ('>L', len(f['values']))) + for val in f['values']: + req.append ( pack ('>L', val)) + elif filtertype == SPH_FILTER_RANGE: + req.append ( pack ('>2L', f['min'], f['max'])) + elif filtertype == SPH_FILTER_FLOATRANGE: + req.append ( pack ('>2f', f['min'], f['max'])) + req.append ( pack ( '>L', f['exclude'] ) ) + + # group-by, max-matches, group-sort + req.append ( pack ( '>2L', self._groupfunc, len(self._groupby) ) ) + req.append ( self._groupby ) + req.append ( pack ( '>2L', self._maxmatches, len(self._groupsort) ) ) + req.append ( self._groupsort ) + req.append ( pack ( '>LLL', self._cutoff, self._retrycount, self._retrydelay)) + req.append ( pack ( '>L', len(self._groupdistinct))) + req.append ( self._groupdistinct) + + # anchor point + if len(self._anchor) == 0: + req.append ( pack ('>L', 0)) + else: + attrlat, attrlong = self._anchor['attrlat'], self._anchor['attrlong'] + latitude, longitude = self._anchor['lat'], self._anchor['long'] + req.append ( pack ('>L', 1)) + req.append ( pack ('>L', len(attrlat)) + attrlat) + req.append ( pack ('>L', len(attrlong)) + attrlong) + req.append ( pack ('>f', latitude) + pack ('>f', longitude)) + + # per-index weights + req.append ( pack ('>L',len(self._indexweights))) + for indx,weight in self._indexweights.items(): + req.append ( pack ('>L',len(indx)) + indx + pack ('>L',weight)) + + # max query time + req.append ( pack ('>L', self._maxquerytime) ) + + # per-field weights + req.append ( pack ('>L',len(self._fieldweights) ) ) + for field,weight in self._fieldweights.items(): + req.append ( pack ('>L',len(field)) + field + pack ('>L',weight) ) + + # comment + req.append ( 
def RunQueries (self):
    """
    Run queries batch.
    Returns None on network IO failure; or an array of result set hashes on success.

    The returned list holds one hash per queued query, in the order the
    queries were added. Every hash carries 'status', 'error' and 'warning'.
    A query that failed carries nothing else; any other query additionally
    carries 'fields', 'attrs', 'matches', 'total', 'total_found', 'time'
    and 'words'. The batch is cleared only once a response has been parsed.
    """
    if len(self._reqs)==0:
        self._error = 'no queries defined, issue AddQuery() first'
        return None

    sock = self._Connect()
    if not sock:
        return None

    req = ''.join(self._reqs)
    # the query count word packed right after the length belongs to the body
    length = len(req)+4
    req = pack('>HHLL', SEARCHD_COMMAND_SEARCH, VER_COMMAND_SEARCH, length, len(self._reqs))+req
    sock.send(req)

    response = self._GetResponse(sock, VER_COMMAND_SEARCH)
    if not response:
        return None

    nreqs = len(self._reqs)

    # parse response
    max_ = len(response)
    p = 0

    results = []
    for i in range(0,nreqs,1):
        result = {}
        # register the result before parsing: a failed query must still
        # occupy its slot so callers can read its 'error' and 'status'
        results.append(result)
        result['error'] = ''
        result['warning'] = ''
        status = unpack('>L', response[p:p+4])[0]
        p += 4
        result['status'] = status
        if status != SEARCHD_OK:
            length = unpack('>L', response[p:p+4])[0]
            p += 4
            message = response[p:p+length]
            p += length

            if status == SEARCHD_WARNING:
                # a warning is followed by a regular result set
                result['warning'] = message
            else:
                result['error'] = message
                continue

        # read schema
        fields = []
        attrs = []

        nfields = unpack('>L', response[p:p+4])[0]
        p += 4
        while nfields > 0 and p < max_:
            nfields -= 1
            length = unpack('>L', response[p:p+4])[0]
            p += 4
            fields.append(response[p:p+length])
            p += length

        result['fields'] = fields

        nattrs = unpack('>L', response[p:p+4])[0]
        p += 4
        while nattrs > 0 and p < max_:
            nattrs -= 1
            length = unpack('>L', response[p:p+4])[0]
            p += 4
            attr = response[p:p+length]
            p += length
            type_ = unpack('>L', response[p:p+4])[0]
            p += 4
            attrs.append([attr,type_])

        result['attrs'] = attrs

        # read match count
        count = unpack('>L', response[p:p+4])[0]
        p += 4
        id64 = unpack('>L', response[p:p+4])[0]
        p += 4

        # read matches
        result['matches'] = []
        while count > 0 and p < max_:
            count -= 1
            if id64:
                # 64-bit document id arrives as two 32-bit halves
                dochi, doc, weight = unpack('>3L', response[p:p+12])
                doc += (dochi << 32)
                p += 12
            else:
                doc, weight = unpack('>2L', response[p:p+8])
                p += 8

            match = { 'id':doc, 'weight':weight, 'attrs':{} }
            for attrname, attrtype in attrs:
                if attrtype == SPH_ATTR_FLOAT:
                    match['attrs'][attrname] = unpack('>f', response[p:p+4])[0]
                    p += 4
                elif attrtype == (SPH_ATTR_MULTI | SPH_ATTR_INTEGER):
                    # multi-value attribute: a count word, then that many values
                    nvals = unpack('>L', response[p:p+4])[0]
                    p += 4
                    values = []
                    for n in range(0,nvals,1):
                        values.append(unpack('>L', response[p:p+4])[0])
                        p += 4
                    match['attrs'][attrname] = values
                else:
                    match['attrs'][attrname] = unpack('>L', response[p:p+4])[0]
                    p += 4

            result['matches'].append ( match )

        result['total'], result['total_found'], result['time'], words = unpack('>4L', response[p:p+16])

        # server reports milliseconds; expose seconds as a formatted string
        result['time'] = '%.3f' % (result['time']/1000.0)
        p += 16

        result['words'] = []
        while words>0:
            words -= 1
            length = unpack('>L', response[p:p+4])[0]
            p += 4
            word = response[p:p+length]
            p += length
            docs, hits = unpack('>2L', response[p:p+8])
            p += 8

            result['words'].append({'word':word, 'docs':docs, 'hits':hits})

    self._reqs = []
    sock.close()
    return results
def UpdateAttributes ( self, index, attrs, values ):
    """
    Update given attribute values on given documents in given indexes.
    Returns amount of updated documents (0 or more) on success, or -1 on failure.

    'attrs' must be a list of strings.
    'values' must be a dict with int key (document ID) and list of int values (new attribute values).

    Failure covers both an unsuccessful connection and a missing response;
    both are reported as -1. Malformed arguments raise AssertionError.

    Example:
        res = cl.UpdateAttributes ( 'test1', [ 'group_id', 'date_added' ], { 2:[123,1000000000], 4:[456,1234567890] } )
    """
    assert ( isinstance ( index, str ) )
    assert ( isinstance ( attrs, list ) )
    assert ( isinstance ( values, dict ) )
    for attr in attrs:
        assert ( isinstance ( attr, str ) )
    for docid, entry in values.items():
        assert ( isinstance ( docid, int ) )
        assert ( isinstance ( entry, list ) )
        # one new value per attribute name, in the same order
        assert ( len(attrs)==len(entry) )
        for val in entry:
            assert ( isinstance ( val, int ) )

    # build request
    req = [ pack('>L',len(index)), index ]

    req.append ( pack('>L',len(attrs)) )
    for attr in attrs:
        req.append ( pack('>L',len(attr)) + attr )

    req.append ( pack('>L',len(values)) )
    for docid, entry in values.items():
        # document ids go out as 64-bit, attribute values as 32-bit
        req.append ( pack('>q',docid) )
        for val in entry:
            req.append ( pack('>L',val) )

    # connect, send query, get response
    sock = self._Connect()
    if not sock:
        # documented failure value (was None, which callers comparing
        # against -1 would have mistaken for something else)
        return -1

    req = ''.join(req)
    length = len(req)
    req = pack ( '>2HL', SEARCHD_COMMAND_UPDATE, VER_COMMAND_UPDATE, length ) + req
    sock.send ( req )

    response = self._GetResponse ( sock, VER_COMMAND_UPDATE )
    if not response:
        return -1

    # parse response
    updated = unpack ( '>L', response[0:4] )[0]
    return updated
+ """ + assert ( isinstance ( query, str ) ) + assert ( isinstance ( index, str ) ) + assert ( isinstance ( hits, int ) ) + + # build request + req = [ pack ( '>L', len(query) ) + query ] + req.append ( pack ( '>L', len(index) ) + index ) + req.append ( pack ( '>L', hits ) ) + + # connect, send query, get response + sock = self._Connect() + if not sock: + return None + + req = ''.join(req) + length = len(req) + req = pack ( '>2HL', SEARCHD_COMMAND_KEYWORDS, VER_COMMAND_KEYWORDS, length ) + req + wrote = sock.send ( req ) + + response = self._GetResponse ( sock, VER_COMMAND_KEYWORDS ) + if not response: + return None + + # parse response + res = [] + + nwords = unpack ( '>L', response[0:4] )[0] + p = 4 + max_ = len(response) + + while nwords>0 and pL', response[p:p+4] )[0] + p += 4 + tokenized = response[p:p+length] + p += length + + length = unpack ( '>L', response[p:p+4] )[0] + p += 4 + normalized = response[p:p+length] + p += length + + entry = { 'tokenized':tokenized, 'normalized':normalized } + if hits: + entry['docs'], entry['hits'] = unpack ( '>2L', response[p:p+8] ) + p += 8 + + res.append ( entry ) + + if nwords>0 or p>max_: + self._error = 'incomplete reply' + return None + + return res +# +# $Id: sphinxapi.py 1216 2008-03-14 23:25:39Z shodan $ +# \ No newline at end of file diff --git a/apps/djangosphinx/apis/api275/templates/source-multiple.conf b/apps/djangosphinx/apis/api275/templates/source-multiple.conf new file mode 100644 index 000000000..506e6f190 --- /dev/null +++ b/apps/djangosphinx/apis/api275/templates/source-multiple.conf @@ -0,0 +1,36 @@ +source {{ source_name }} +{ + type = {{ database_engine }} + strip_html = 0 + index_html_attrs = + sql_host = {{ database_host }} + sql_user = {{ database_user }} + sql_pass = {{ database_password }} + sql_db = {{ database_name }} + sql_port = {{ database_port }} + log = {{ log_file }} + + sql_query_pre = + sql_query_post = + sql_query = \ +{% for table_name, content_type in tables %} + SELECT {{ 
field_names|join:", " }}, {{ content_type.id }} as content_type \ + FROM `{{ table_name }}`{% if not loop.last %} UNION \{% endif %} +{% endfor %} +{% if group_columns %} + # ForeignKey's +{% for field_name in group_columns %} sql_attr_uint = {{ field_name }} +{% endfor %}{% endif %} +{% if date_columns %} + # DateField's and DateTimeField's +{% for field_name in date_columns %} sql_attr_timestamp = {{ field_name }} +{% endfor %}{% endif %} +{% if bool_columns %} + # BooleanField's +{% for field_name in bool_columns %} sql_attr_bool = {{ field_name }} +{% endfor %}{% endif %} +{% if float_columns %} + # FloatField's and DecimalField's +{% for field_name in float_columns %} sql_attr_float = {{ field_name }} +{% endfor %}{% endif %} +} \ No newline at end of file diff --git a/apps/djangosphinx/apis/api275/templates/source.conf b/apps/djangosphinx/apis/api275/templates/source.conf new file mode 100644 index 000000000..9108aeea0 --- /dev/null +++ b/apps/djangosphinx/apis/api275/templates/source.conf @@ -0,0 +1,32 @@ +source {{ source_name }} +{ + type = {{ database_engine }} + sql_host = {{ database_host }} + sql_user = {{ database_user }} + sql_pass = {{ database_password }} + sql_db = {{ database_name }} + sql_port = {{ database_port }} + + sql_query_pre = + sql_query_post = + sql_query = \ + SELECT {{ field_names|join:", " }} \ + FROM {{ table_name }} + sql_query_info = SELECT * FROM `{{ table_name }}` WHERE `{{ primary_key }}` = $id +{% if group_columns %} + # ForeignKey's +{% for field_name in group_columns %} sql_attr_uint = {{ field_name }} +{% endfor %}{% endif %} +{% if date_columns %} + # DateField's and DateTimeField's +{% for field_name in date_columns %} sql_attr_timestamp = {{ field_name }} +{% endfor %}{% endif %} +{% if bool_columns %} + # BooleanField's +{% for field_name in bool_columns %} sql_attr_bool = {{ field_name }} +{% endfor %}{% endif %} +{% if float_columns %} + # FloatField's and DecimalField's +{% for field_name in float_columns %} 
# Re-export the Sphinx client API under one module name: prefer a
# system-wide ``sphinxapi`` and fall back to the copy bundled under
# ``djangosphinx.apis`` that matches SPHINX_API_VERSION.
from djangosphinx.constants import *

try:
    from sphinxapi import *
except ImportError:
    # SPHINX_API_VERSION is an int, so 0x107 selects ``api263``.
    _module_name = 'djangosphinx.apis.api%d' % (SPHINX_API_VERSION,)
    # __import__ returns the top-level package; walk down to the leaf module
    sphinxapi = __import__(_module_name)
    for _part in _module_name.split('.')[1:]:
        sphinxapi = getattr(sphinxapi, _part)
    for _attr in dir(sphinxapi):
        # skip dunder names so this module keeps its own __name__,
        # __file__, __doc__ and __builtins__
        if not _attr.startswith('__'):
            globals()[_attr] = getattr(sphinxapi, _attr)
class SphinxProxy(object):
    """
    Acts exactly like a normal instance of an object except that
    it will handle any special sphinx attributes in a _sphinx class.

    The wrapped object is stored in the ``__instance__`` slot and the
    attributes given to the constructor in ``_sphinx``. Every other
    attribute access, assignment, deletion and operator is forwarded to
    the wrapped object.
    """
    __slots__ = ('__dict__', '__instance__', '_sphinx')

    def __init__(self, instance, attributes):
        # go through object.__setattr__: our own __setattr__ forwards
        # to the wrapped object, which does not exist yet
        object.__setattr__(self, '__instance__', instance)
        object.__setattr__(self, '_sphinx', attributes)

    def _get_current_object(self):
        """
        Return the current object.  This is useful if you want the real object
        behind the proxy at a time for performance reasons or because you want
        to pass the object into a different context.
        """
        return self.__instance__
    __current_object = property(_get_current_object)

    def __dict__(self):
        try:
            return self.__current_object.__dict__
        except RuntimeError:
            # raise, do not return, the exception object
            raise AttributeError('__dict__')
    __dict__ = property(__dict__)

    def __repr__(self):
        try:
            obj = self.__current_object
        except RuntimeError:
            return '<%s unbound>' % self.__class__.__name__
        return repr(obj)

    def __nonzero__(self):
        try:
            return bool(self.__current_object)
        except RuntimeError:
            return False
    # same truth test under the Python 3 protocol name
    __bool__ = __nonzero__

    def __dir__(self):
        try:
            return dir(self.__current_object)
        except RuntimeError:
            return []

    def __getattr__(self, name, value=None):
        # Only reached when normal lookup fails, i.e. for attributes that
        # are not defined on the proxy itself.
        if name == '__members__':
            return dir(self.__current_object)
        elif name in ('_sphinx', '__instance__'):
            # The slot is not populated. Raise the regular AttributeError
            # instead of forwarding: forwarding needs __instance__ and
            # would re-enter this method without end.
            return object.__getattribute__(self, name)
        return getattr(self.__current_object, name)

    def __setattr__(self, name, value):
        # _sphinx lives on the proxy; everything else on the wrapped object
        if name == '_sphinx':
            return object.__setattr__(self, '_sphinx', value)
        return setattr(self.__current_object, name, value)

    def __setitem__(self, key, value):
        self.__current_object[key] = value

    def __delitem__(self, key):
        del self.__current_object[key]

    def __setslice__(self, i, j, seq):
        self.__current_object[i:j] = seq

    def __delslice__(self, i, j):
        del self.__current_object[i:j]

    # Special methods are looked up on the type, never through
    # __getattr__, so each one is forwarded explicitly.
    __delattr__ = lambda x, n: delattr(x.__current_object, n)
    __str__ = lambda x: str(x.__current_object)
    __unicode__ = lambda x: unicode(x.__current_object)
    __lt__ = lambda x, o: x.__current_object < o
    __le__ = lambda x, o: x.__current_object <= o
    __eq__ = lambda x, o: x.__current_object == o
    __ne__ = lambda x, o: x.__current_object != o
    __gt__ = lambda x, o: x.__current_object > o
    __ge__ = lambda x, o: x.__current_object >= o
    __cmp__ = lambda x, o: cmp(x.__current_object, o)
    __hash__ = lambda x: hash(x.__current_object)
    # NOTE: the proxy is deliberately not callable; __call__ is not forwarded
    __len__ = lambda x: len(x.__current_object)
    __getitem__ = lambda x, i: x.__current_object[i]
    __iter__ = lambda x: iter(x.__current_object)
    __contains__ = lambda x, i: i in x.__current_object
    __getslice__ = lambda x, i, j: x.__current_object[i:j]
    __add__ = lambda x, o: x.__current_object + o
    __sub__ = lambda x, o: x.__current_object - o
    __mul__ = lambda x, o: x.__current_object * o
    __floordiv__ = lambda x, o: x.__current_object // o
    __mod__ = lambda x, o: x.__current_object % o
    __divmod__ = lambda x, o: x.__current_object.__divmod__(o)
    __pow__ = lambda x, o: x.__current_object ** o
    __lshift__ = lambda x, o: x.__current_object << o
    __rshift__ = lambda x, o: x.__current_object >> o
    __and__ = lambda x, o: x.__current_object & o
    __xor__ = lambda x, o: x.__current_object ^ o
    __or__ = lambda x, o: x.__current_object | o
    __div__ = lambda x, o: x.__current_object.__div__(o)
    __truediv__ = lambda x, o: x.__current_object.__truediv__(o)
    __neg__ = lambda x: -(x.__current_object)
    __pos__ = lambda x: +(x.__current_object)
    __abs__ = lambda x: abs(x.__current_object)
    __invert__ = lambda x: ~(x.__current_object)
    __complex__ = lambda x: complex(x.__current_object)
    __int__ = lambda x: int(x.__current_object)
    __long__ = lambda x: long(x.__current_object)
    __float__ = lambda x: float(x.__current_object)
    __oct__ = lambda x: oct(x.__current_object)
    __hex__ = lambda x: hex(x.__current_object)
    __index__ = lambda x: x.__current_object.__index__()
    # these three used to call themselves on the proxy and never returned
    __coerce__ = lambda x, o: x.__current_object.__coerce__(o)
    __enter__ = lambda x: x.__current_object.__enter__()
    __exit__ = lambda x, *a, **kw: x.__current_object.__exit__(*a, **kw)
+ if type(k) == slice: + if self._offset < k.start or k.stop-k.start > self._limit: + self._result_cache = None + else: + if k not in range(self._offset, self._limit+self._offset): + self._result_cache = None + if self._result_cache is None: + if type(k) == slice: + self._offset = k.start + self._limit = k.stop-k.start + return self._get_results() + else: + self._offset = k + self._limit = 1 + return self._get_results()[0] + else: + return self._result_cache[k] + + def set_options(self, **kwargs): + if 'rankmode' in kwargs: + if kwargs.get('rankmode') is None: + kwargs['rankmode'] = sphinxapi.SPH_RANK_NONE + for key in self.available_kwargs: + if key in kwargs: + setattr(self, '_%s' % (key,), kwargs[key]) + + def query(self, string): + return self._clone(_query=unicode(string).encode('utf-8')) + + def group_by(self, attribute, func, groupsort='@group desc'): + return self._clone(_groupby=attribute, _groupfunc=func, _groupsort=groupsort) + + def rank_none(self): + warnings.warn('`rank_none()` is deprecated. Use `set_options(rankmode=None)` instead.', DeprecationWarning) + return self._clone(_rankmode=sphinxapi.SPH_RANK_NONE) + + def mode(self, mode): + warnings.warn('`mode()` is deprecated. Use `set_options(mode='')` instead.', DeprecationWarning) + return self._clone(_mode=mode) + + def weights(self, weights): + warnings.warn('`mode()` is deprecated. Use `set_options(weights=[])` instead.', DeprecationWarning) + return self._clone(_weights=weights) + + def on_index(self, index): + warnings.warn('`mode()` is deprecated. 
Use `set_options(on_index=foo)` instead.', DeprecationWarning) + return self._clone(_index=index) + + # only works on attributes + def filter(self, **kwargs): + filters = self._filters.copy() + for k,v in kwargs.iteritems(): + if hasattr(v, 'next'): + v = list(v) + elif not (isinstance(v, list) or isinstance(v, tuple)): + v = [v,] + filters.setdefault(k, []).extend(map(to_sphinx, v)) + return self._clone(_filters=filters) + + def geoanchor(self, lat_attr, lng_attr, lat, lng): + assert(sphinxapi.VER_COMMAND_SEARCH >= 0x113, "You must upgrade sphinxapi to version 0.98 to use Geo Anchoring.") + return self._clone(_anchor=(lat_attr, lng_attr, float(lat), float(lng))) + + # this actually does nothing, its just a passthru to + # keep things looking/working generally the same + def all(self): + return self + + # only works on attributes + def exclude(self, **kwargs): + filters = self._excludes.copy() + for k,v in kwargs.iteritems(): + if hasattr(v, 'next'): + v = list(v) + elif not (isinstance(v, list) or isinstance(v, tuple)): + v = [v,] + filters.setdefault(k, []).extend(map(to_sphinx, v)) + return self._clone(_excludes=filters) + + # you cannot order by @weight (it always orders in descending) + # keywords are @id, @weight, @rank, and @relevance + def order_by(self, *args): + sort_by = [] + for arg in args: + sort = 'ASC' + if arg[0] == '-': + arg = arg[1:] + sort = 'DESC' + if arg == 'id': + arg = '@id' + sort_by.append('%s %s' % (arg, sort)) + if sort_by: + return self._clone(_sort=(sphinxapi.SPH_SORT_EXTENDED, ', '.join(sort_by))) + return self + + # pass these thru on the queryset and let django handle it + def select_related(self, *args, **kwargs): + _args = self._select_related_fields[:] + _args.extend(args) + _kwargs = self._select_related_args.copy() + _kwargs.update(kwargs) + + return self._clone( + _select_related=True, + _select_related_fields=_args, + _select_related_args=_kwargs, + ) + + def extra(self, **kwargs): + extra = self._extra.copy() + 
extra.update(kwargs) + return self._clone(_extra=extra) + + def count(self): + return min(self._sphinx.get('total_found', 0), self._maxmatches) + + def reset(self): + return self.__class__(self._model, self._index) + + # Internal methods + def _clone(self, **kwargs): + # Clones the queryset passing any changed args + c = self.__class__() + c.__dict__.update(self.__dict__) + c.__dict__.update(kwargs) + return c + + def _sphinx(self): + if not self.__metadata: + # We have to force execution if this is accessed beforehand + self._get_data() + return self.__metadata + _sphinx = property(_sphinx) + + def _get_data(self): + assert(self._index) + # need to find a way to make this work yet + if self._result_cache is None: + self._result_cache = list(self._get_results()) + return self._result_cache + + def _get_sphinx_results(self): + assert(self._offset + self._limit <= self._maxmatches) + + client = sphinxapi.SphinxClient() + client.SetServer(SPHINX_SERVER, SPHINX_PORT) + + if self._sort: + client.SetSortMode(*self._sort) + + if isinstance(self._weights, dict): + client.SetFieldWeights(self._weights) + else: + # assume its a list + client.SetWeights(map(int, self._weights)) + + client.SetMatchMode(self._mode) + + # 0.97 requires you to reset it + if hasattr(client, 'ResetFilters'): + client.ResetFilters() + if hasattr(client, 'ResetGroupBy'): + client.ResetGroupBy() + + def _handle_filters(filter_list, exclude=False): + for name, values in filter_list.iteritems(): + parts = len(name.split('__')) + if parts > 2: + raise NotImplementedError, 'Related object and/or multiple field lookups not supported' + elif parts == 2: + # The float handling for __gt and __lt is kind of ugly.. 
+ name, lookup = name.split('__', 1) + is_float = isinstance(values[0], float) + if lookup == 'gt': + value = is_float and values[0] + (1.0/MAX_INT) or values[0] - 1 + args = (name, value, MAX_INT, exclude) + elif lookup == 'gte': + args = (name, values[0], MAX_INT, exclude) + elif lookup == 'lt': + value = is_float and values[0] - (1.0/MAX_INT) or values[0] - 1 + args = (name, -MAX_INT, value, exclude) + elif lookup == 'lte': + args = (name, -MAX_INT, values[0], exclude) + elif lookup == 'range': + args = (name, values[0], values[1], exclude) + else: + raise NotImplementedError, 'Related object and/or field lookup "%s" not supported' % lookup + if is_float: + client.SetFilterFloatRange(*args) + elif not exclude and self._model and name == self._model._meta.pk.column: + client.SetIDRange(*args[1:3]) + else: + client.SetFilterRange(*args) + + else: + client.SetFilter(name, values, exclude) + + # Include filters + if self._filters: + _handle_filters(self._filters) + + # Exclude filters + if self._excludes: + _handle_filters(self._excludes, True) + + if self._groupby: + client.SetGroupBy(self._groupby, self._groupfunc, self._groupsort) + + if self._anchor: + client.SetGeoAnchor(*self._anchor) + + if self._rankmode: + client.SetRankingMode(self._rankmode) + + if not self._limit > 0: + # Fix for Sphinx throwing an assertion error when you pass it an empty limiter + return [] + + + if sphinxapi.VER_COMMAND_SEARCH >= 0x113: + client.SetRetries(SPHINX_RETRIES, SPHINX_RETRIES_DELAY) + + client.SetLimits(int(self._offset), int(self._limit), int(self._maxmatches)) + + results = client.Query(self._query, self._index) + + # The Sphinx API doesn't raise exceptions + if not results: + if client.GetLastError(): + raise SearchError, client.GetLastError() + elif client.GetLastWarning(): + raise SearchError, client.GetLastWarning() + return results + + def _get_results(self): + results = self._get_sphinx_results() + if not results or not results['matches']: + results = [] + elif 
self._model: + queryset = self._model.objects.all() + if self._select_related: + queryset = queryset.select_related(*self._select_related_fields, **self._select_related_args) + if self._extra: + queryset = queryset.extra(**self._extra) + pks = getattr(self._model._meta, 'pks', None) + if pks is None or len(pks) == 1: + queryset = queryset.filter(pk__in=[r['id'] for r in results['matches']]) + queryset = dict([(o.pk, o) for o in queryset]) + else: + for r in results['matches']: + r['id'] = ', '.join([unicode(r['attrs'][p.column]) for p in pks]) + q = reduce(operator.or_, [reduce(operator.and_, [Q(**{p.name: r['attrs'][p.column]}) for p in pks]) for r in results['matches']]) + if q: + queryset = queryset.filter(q) + queryset = dict([(', '.join([unicode(p) for p in o.pks]), o) for o in queryset]) + else: + queryset = None + + if queryset: + self.__metadata = { + 'total': results['total'], + 'total_found': results['total_found'], + 'words': results['words'], + } + results = [SphinxProxy(queryset[r['id']], r) for r in results['matches'] if r['id'] in queryset] + else: + results = [] + else: + "We did a query without a model, lets see if there's a content_type" + results['attrs'] = dict(results['attrs']) + if 'content_type' in results['attrs']: + "Now we have to do one query per content_type" + objcache = {} + for r in results['matches']: + ct = r['attrs']['content_type'] + if ct not in objcache: + objcache[ct] = {} + objcache[ct][r['id']] = None + for ct in objcache: + queryset = ContentType.objects.get(pk=ct).model_class().objects.filter(pk__in=objcache[ct]) + for o in queryset: + objcache[ct][o.id] = o + results = [objcache[r['attrs']['content_type']][r['id']] for r in results['matches']] + else: + results = results['matches'] + self._result_cache = results + return results + +class SphinxModelManager(object): + def __init__(self, model, **kwargs): + self._model = model + self._index = kwargs.pop('index', model._meta.db_table) + self._kwargs = kwargs + + def 
class SphinxInstanceManager(object):
    """Collection of tools useful for objects which are in a Sphinx index."""
    def __init__(self, instance, index):
        self._instance = instance
        self._index = index

    def update(self, **kwargs):
        """
        Push new attribute values for this instance into its Sphinx index.

        Each keyword is an attribute name; its value is converted with
        to_sphinx() before being sent. Returns whatever the client's
        UpdateAttributes() returns. Raises AssertionError when the bundled
        search API is older than 0x113.
        """
        # ``assert(cond, msg)`` asserts a non-empty tuple and never fires;
        # the statement form below actually checks the version.
        assert sphinxapi.VER_COMMAND_SEARCH >= 0x113, "You must upgrade sphinxapi to version 0.98 to update attributes."

        # UpdateAttributes is a client method, so a configured client is needed
        client = sphinxapi.SphinxClient()
        client.SetServer(SPHINX_SERVER, SPHINX_PORT)

        # fix one ordering so names and values line up
        attrs = list(kwargs.keys())
        values = [to_sphinx(kwargs[attr]) for attr in attrs]
        return client.UpdateAttributes(self._index, attrs, {self._instance.pk: values})


class SphinxSearch(object):
    """
    Descriptor that attaches Sphinx search to a model.

    Accessed through an instance it yields a SphinxInstanceManager bound to
    that instance; accessed through the class it yields the model-level
    manager created in contribute_to_class() (None before that call).
    """
    def __init__(self, index=None, **kwargs):
        self._kwargs = kwargs
        self._sphinx = None
        self._index = index
        self.model = None

    def __call__(self, index, **kwargs):
        warnings.warn('For non-model searches use a SphinxQuerySet instance.', DeprecationWarning)
        return SphinxQuerySet(index=index, **kwargs)

    def __get__(self, instance, model, **kwargs):
        # compare against None: a model instance may define its own truth value
        if instance is not None:
            return SphinxInstanceManager(instance, self._index)
        return self._sphinx

    def contribute_to_class(self, model, name, **kwargs):
        """
        Bind this search to ``model`` under attribute ``name``.

        Defaults the index name to the model's database table, records the
        index in ``model.__sphinx_indexes__`` and installs a
        SphinxModelManager as the class attribute.
        """
        if self._index is None:
            self._index = model._meta.db_table
        self._sphinx = SphinxModelManager(model, index=self._index, **self._kwargs)
        self.model = model
        if getattr(model, '__sphinx_indexes__', None) is None:
            setattr(model, '__sphinx_indexes__', [self._index])
        else:
            model.__sphinx_indexes__.append(self._index)
        setattr(model, name, self._sphinx)
min(self._sphinx['attrs']['@count'], self._maxmatches) + +class SphinxRelation(SphinxSearch): + """ + Adds "related model" support to django-sphinx -- + http://code.google.com/p/django-sphinx/ + http://www.sphinxsearch.com/ + + Example -- + + class MySearch(SphinxSearch): + myrelatedobject = SphinxRelation(RelatedModel) + anotherone = SphinxRelation(AnotherModel) + ... + + class MyModel(models.Model): + search = MySearch('index') + + """ + def __init__(self, model=None, attr=None, sort='@count desc', **kwargs): + if model: + self._related_model = model + self._related_attr = attr or model.__name__.lower() + self._related_sort = sort + super(SphinxRelation, self).__init__(**kwargs) + + def __get__(self, instance, instance_model, **kwargs): + self._mode = instance._mode + self._rankmode = instance._rankmode + self._index = instance._index + self._query = instance._query + self._filters = instance._filters + self._excludes = instance._excludes + self._model = self._related_model + self._groupby = self._related_attr + self._groupsort = self._related_sort + self._groupfunc = sphinxapi.SPH_GROUPBY_ATTR + return self + + def _get_results(self): + results = self._get_sphinx_results() + if not results: return [] + if results['matches'] and self._model: + ids = [] + for r in results['matches']: + value = r['attrs']['@groupby'] + if isinstance(value, (int, long)): + ids.append(value) + else: + ids.extend() + qs = self._model.objects.filter(pk__in=set(ids)) + if self._select_related: + qs = qs.select_related(*self._select_related_fields, + **self._select_related_args) + if self._extra: + qs = qs.extra(**self._extra) + queryset = dict([(o.id, o) for o in qs]) + self.__metadata = { + 'total': results['total'], + 'total_found': results['total_found'], + 'words': results['words'], + } + results = [ SphinxRelationProxy(queryset[k['attrs']['@groupby']], k) \ + for k in results['matches'] \ + if k['attrs']['@groupby'] in queryset ] + else: + results = [] + self._result_cache = 
results + return results + + def _sphinx(self): + if not self.__metadata: + # We have to force execution if this is accessed beforehand + self._get_data() + return self.__metadata + _sphinx = property(_sphinx) \ No newline at end of file diff --git a/apps/djangosphinx/templates/index-multiple.conf b/apps/djangosphinx/templates/index-multiple.conf new file mode 100644 index 000000000..3516e4619 --- /dev/null +++ b/apps/djangosphinx/templates/index-multiple.conf @@ -0,0 +1,12 @@ +index {{ index_name }} +{ + source = {{ source_name }} + path = /var/data/{{ index_name }} + docinfo = extern + morphology = none + stopwords = + min_word_len = 2 + charset_type = sbcs + min_prefix_len = 0 + min_infix_len = 0 +} \ No newline at end of file diff --git a/apps/djangosphinx/templates/index.conf b/apps/djangosphinx/templates/index.conf new file mode 100644 index 000000000..3516e4619 --- /dev/null +++ b/apps/djangosphinx/templates/index.conf @@ -0,0 +1,12 @@ +index {{ index_name }} +{ + source = {{ source_name }} + path = /var/data/{{ index_name }} + docinfo = extern + morphology = none + stopwords = + min_word_len = 2 + charset_type = sbcs + min_prefix_len = 0 + min_infix_len = 0 +} \ No newline at end of file diff --git a/apps/djangosphinx/templates/source-multiple.conf b/apps/djangosphinx/templates/source-multiple.conf new file mode 100644 index 000000000..6f525dcdb --- /dev/null +++ b/apps/djangosphinx/templates/source-multiple.conf @@ -0,0 +1,31 @@ +source {{ source_name }} +{ + type = {{ database_engine }} + strip_html = 0 + index_html_attrs = + sql_host = {{ database_host }} + sql_user = {{ database_user }} + sql_pass = {{ database_password }} + sql_db = {{ database_name }} + sql_port = {{ database_port }} + + sql_query_pre = + sql_query_post = + sql_query = \ +{% for table_name, content_type in tables %} + SELECT {{ field_names|join:", " }}, {{ content_type.id }} as content_type \ + FROM `{{ table_name }}`{% if not forloop.last %} UNION \{% endif %} +{% endfor %} +{% if 
group_columns %} + # ForeignKey's +{% for field_name in group_columns %} sql_group_column = {{ field_name }} +{% endfor %}{% endif %} +{% if bool_columns %} + # BooleanField's +{% for field_name in bool_columns %} sql_group_column = {{ field_name }} +{% endfor %}{% endif %} +{% if date_columns %} + # DateField's and DateTimeField's +{% for field_name in date_columns %} sql_date_column = {{ field_name }} +{% endfor %}{% endif %} +} \ No newline at end of file diff --git a/apps/djangosphinx/templates/source.conf b/apps/djangosphinx/templates/source.conf new file mode 100644 index 000000000..a991f6415 --- /dev/null +++ b/apps/djangosphinx/templates/source.conf @@ -0,0 +1,31 @@ +source {{ source_name }} +{ + type = {{ database_engine }} + strip_html = 0 + index_html_attrs = + sql_host = {{ database_host }} + sql_user = {{ database_user }} + sql_pass = {{ database_password }} + sql_db = {{ database_name }} + sql_port = {{ database_port }} + log = {{ log_file }} + + sql_query_pre = + sql_query_post = + sql_query = \ + SELECT {{ field_names|join:", " }} \ + FROM {{ table_name }} + sql_query_info = SELECT * FROM `{{ table_name }}` WHERE `{{ primary_key }}` = $id +{% if group_columns %} + # ForeignKey's +{% for field_name in group_columns %} sql_group_column = {{ field_name }} +{% endfor %}{% endif %} +{% if bool_columns %} + # BooleanField's +{% for field_name in bool_columns %} sql_group_column = {{ field_name }} +{% endfor %}{% endif %} +{% if date_columns %} + # DateField's and DateTimeField's +{% for field_name in date_columns %} sql_date_column = {{ field_name }} +{% endfor %}{% endif %} +} \ No newline at end of file diff --git a/apps/djangosphinx/utils/__init__.py b/apps/djangosphinx/utils/__init__.py new file mode 100644 index 000000000..635cf5678 --- /dev/null +++ b/apps/djangosphinx/utils/__init__.py @@ -0,0 +1 @@ +from config import * \ No newline at end of file diff --git a/apps/djangosphinx/utils/config.py b/apps/djangosphinx/utils/config.py new file mode 
100644 index 000000000..d73abd141 --- /dev/null +++ b/apps/djangosphinx/utils/config.py @@ -0,0 +1,182 @@ +from django.conf import settings +from django.template import Template, Context + +from django.db import models +from django.contrib.contenttypes.models import ContentType + +import os.path + +import djangosphinx.apis.current as sphinxapi + +__all__ = ('generate_config_for_model', 'generate_config_for_models') + +def _get_database_engine(): + if settings.DATABASE_ENGINE == 'mysql': + return settings.DATABASE_ENGINE + elif settings.DATABASE_ENGINE.startswith('postgresql'): + return 'pgsql' + raise ValueError, "Only MySQL and PostgreSQL engines are supported by Sphinx." + +def _get_template(name): + paths = ( + os.path.join(os.path.dirname(__file__), '../apis/api%s/templates/' % (sphinxapi.VER_COMMAND_SEARCH,)), + os.path.join(os.path.dirname(__file__), '../templates/'), + ) + for path in paths: + try: + fp = open(path + name, 'r') + except IOError: + continue + try: + t = Template(fp.read()) + return t + finally: + fp.close() + raise ValueError, "Template matching name does not exist: %s." 
% (name,) + +def _is_sourcable_field(field): + # We can use float fields in 0.98 + if sphinxapi.VER_COMMAND_SEARCH >= 0x113 and (isinstance(field, models.FloatField) or isinstance(field, models.DecimalField)): + return True + if isinstance(field, models.ForeignKey): + return True + if isinstance(field, models.IntegerField) and field.choices: + return True + if not field.rel: + return True + return False + +# No trailing slashes on paths +DEFAULT_SPHINX_PARAMS = { + 'database_engine': _get_database_engine(), + 'database_host': settings.DATABASE_HOST, + 'database_port': settings.DATABASE_PORT, + 'database_name': settings.DATABASE_NAME, + 'database_user': settings.DATABASE_USER, + 'database_password': settings.DATABASE_PASSWORD, + 'log_file': '/var/log/sphinx/searchd.log', + 'data_path': '/var/data', +} + +# Generate for single models + +def generate_config_for_model(model_class, index=None, sphinx_params={}): + """ + Generates a sample configuration including an index and source for + the given model which includes all attributes and date fields. 
+ """ + return generate_source_for_model(model_class, index, sphinx_params) + "\n\n" + generate_index_for_model(model_class, index, sphinx_params) + +def generate_index_for_model(model_class, index=None, sphinx_params={}): + """Generates a source configmration for a model.""" + t = _get_template('index.conf') + + if index is None: + index = model_class._meta.db_table + + params = DEFAULT_SPHINX_PARAMS + params.update(sphinx_params) + params.update({ + 'index_name': index, + 'source_name': index, + }) + + c = Context(params) + + return t.render(c) + + +def generate_source_for_model(model_class, index=None, sphinx_params={}): + """Generates a source configmration for a model.""" + t = _get_template('source.conf') + + valid_fields = [f for f in model_class._meta.fields if _is_sourcable_field(f)] + + # Hackish solution for a bug I've introduced into composite pks branch + pk = model_class._meta.get_field(model_class._meta.pk.name) + + if pk not in valid_fields: + valid_fields.insert(0, model_class._meta.pk) + + if index is None: + index = model_class._meta.db_table + + params = DEFAULT_SPHINX_PARAMS + params.update(sphinx_params) + params.update({ + 'source_name': index, + 'index_name': index, + 'table_name': index, + 'primary_key': pk.column, + 'field_names': [f.column for f in valid_fields], + 'group_columns': [f.column for f in valid_fields if (f.rel or isinstance(f, models.BooleanField) or isinstance(f, models.IntegerField)) and not f.primary_key], + 'date_columns': [f.column for f in valid_fields if isinstance(f, models.DateTimeField) or isinstance(f, models.DateField)], + 'float_columns': [f.column for f in valid_fields if isinstance(f, models.FloatField) or isinstance(f, models.DecimalField)], + }) + + c = Context(params) + + return t.render(c) + +# Generate for multiple models (search UNIONs) + +def generate_config_for_models(model_classes, index=None, sphinx_params={}): + """ + Generates a sample configuration including an index and source for + the given 
model which includes all attributes and date fields. + """ + return generate_source_for_models(model_classes, index, sphinx_params) + "\n\n" + generate_index_for_models(model_classes, index, sphinx_params) + +def generate_index_for_models(model_classes, index=None, sphinx_params={}): + """Generates a source configmration for a model.""" + t = _get_template('index-multiple.conf') + + if index is None: + index = '_'.join(m._meta.db_table for m in model_classes) + + params = DEFAULT_SPHINX_PARAMS + params.update(sphinx_params) + params.update({ + 'index_name': index, + 'source_name': index, + }) + + c = Context(params) + + return t.render(c) + +def generate_source_for_models(model_classes, index=None, sphinx_params={}): + """Generates a source configmration for a model.""" + t = _get_template('source-multiple.conf') + + # We need to loop through each model and find only the fields that exist *exactly* the + # same across models. + def _the_tuple(f): + return (f.__class__, f.column, getattr(f.rel, 'to', None), f.choices) + + valid_fields = [_the_tuple(f) for f in model_classes[0]._meta.fields if _is_sourcable_field(f)] + for model_class in model_classes[1:]: + valid_fields = [_the_tuple(f) for f in model_class._meta.fields if _the_tuple(f) in valid_fields] + + tables = [] + for model_class in model_classes: + tables.append((model_class._meta.db_table, ContentType.objects.get_for_model(model_class))) + + if index is None: + index = '_'.join(m._meta.db_table for m in model_classes) + + params = DEFAULT_SPHINX_PARAMS + params.update(sphinx_params) + params.update({ + 'tables': tables, + 'source_name': index, + 'index_name': index, + 'field_names': [f[1] for f in valid_fields], + 'group_columns': [f[1] for f in valid_fields if f[2] or isinstance(f[0], models.BooleanField) or isinstance(f[0], models.IntegerField)], + 'date_columns': [f[1] for f in valid_fields if issubclass(f[0], models.DateTimeField) or issubclass(f[0], models.DateField)], + 'float_columns': [f[1] for f in 
valid_fields if isinstance(f[0], models.FloatField) or isinstance(f[0], models.DecimalField)], + }) + + c = Context(params) + + return t.render(c) \ No newline at end of file diff --git a/wolnelektury/settings.py b/wolnelektury/settings.py index bdbc9923b..2b9d542d0 100644 --- a/wolnelektury/settings.py +++ b/wolnelektury/settings.py @@ -96,6 +96,7 @@ INSTALLED_APPS = [ # external 'south', + 'djangosphinx', 'newtagging', 'pagination', 'chunks', @@ -135,6 +136,9 @@ COMPRESS_JS = { COMPRESS_CSS_FILTERS = None +SPHINX_SERVER = 'localhost' +SPHINX_PORT = 3312 + # Load localsettings, if they exist try: diff --git a/wolnelektury/sphinx.conf b/wolnelektury/sphinx.conf new file mode 100644 index 000000000..80c411286 --- /dev/null +++ b/wolnelektury/sphinx.conf @@ -0,0 +1,127 @@ +searchd { + port = 3312 + log = /Users/zuber/Projekty/wolnelektury.pl-sphinx/wolnelektury/sphinx/searchd.log + query_log = /Users/zuber/Projekty/wolnelektury.pl-sphinx/wolnelektury/sphinx/query.log + read_timeout = 5 + max_children = 30 + pid_file = /Users/zuber/Projekty/wolnelektury.pl-sphinx/wolnelektury/sphinx/searchd.pid + max_matches = 1000 + seamless_rotate = 1 + preopen_indexes = 0 + unlink_old = 1 +} + +source catalogue_book +{ + type = mysql + strip_html = 0 + index_html_attrs = + sql_host = + sql_user = root + sql_pass = + sql_db = wolnelektury + sql_port = + + sql_query_pre = + sql_query_post = + sql_query = \ + SELECT id, title, slug, description, created_at, _short_html, parent_number, xml_file, html_file, pdf_file, odt_file, txt_file, parent_id \ + FROM catalogue_book + sql_query_info = SELECT * FROM `catalogue_book` WHERE `id` = $id + + # ForeignKey's + sql_group_column = parent_number + sql_group_column = parent_id + + + + # DateField's and DateTimeField's + sql_date_column = created_at + +} + +index catalogue_book +{ + source = catalogue_book + path = /Users/zuber/Projekty/wolnelektury.pl-sphinx/wolnelektury/sphinx/catalogue_book + docinfo = extern + morphology = none + stopwords = 
+ min_word_len = 2 + charset_type = sbcs + min_prefix_len = 0 + min_infix_len = 0 +} + + +source catalogue_fragment +{ + type = mysql + strip_html = 0 + index_html_attrs = + sql_host = + sql_user = root + sql_pass = + sql_db = wolnelektury + sql_port = + + sql_query_pre = + sql_query_post = + sql_query = \ + SELECT id, text, short_text, _short_html, anchor, book_id \ + FROM catalogue_fragment + sql_query_info = SELECT * FROM `catalogue_fragment` WHERE `id` = $id + + # ForeignKey's + sql_group_column = book_id +} + +index catalogue_fragment +{ + source = catalogue_fragment + path = /Users/zuber/Projekty/wolnelektury.pl-sphinx/wolnelektury/sphinx/catalogue_fragment + docinfo = extern + morphology = none + stopwords = + min_word_len = 2 + charset_type = sbcs + min_prefix_len = 0 + min_infix_len = 0 +} + +source catalogue_tag +{ + type = mysql + strip_html = 0 + index_html_attrs = + sql_host = + sql_user = root + sql_pass = + sql_db = wolnelektury + sql_port = + + sql_query_pre = + sql_query_post = + sql_query = \ + SELECT id, name, slug, sort_key, category, description, main_page, user_id, book_count \ + FROM catalogue_tag + sql_query_info = SELECT * FROM `catalogue_tag` WHERE `id` = $id + + # ForeignKey's + sql_group_column = main_page + sql_group_column = user_id + sql_group_column = book_count +} + +index catalogue_tag +{ + source = catalogue_tag + path = /Users/zuber/Projekty/wolnelektury.pl-sphinx/wolnelektury/sphinx/catalogue_tag + docinfo = extern + morphology = none + stopwords = + min_word_len = 2 + charset_type = sbcs + min_prefix_len = 0 + min_infix_len = 0 +}