# Recovered from a gitweb "blobdiff_plain" page (new file in that diff):
# X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/9b97d5a2faa6f56b439dcefe9c7bb23e0a84b39c..db833ba4517084f61a64907c6d15606e7c881edd:/apps/djangosphinx/manager.py
# Path in the repository: apps/djangosphinx/manager.py
import select
import socket
import time
import struct
import warnings
import operator
import apis.current as sphinxapi

try:
    import decimal
except ImportError:
    from django.utils import _decimal as decimal # for Python 2.3

from django.db.models.query import QuerySet, Q
from django.conf import settings

__all__ = ('SearchError', 'ConnectionError', 'SphinxSearch', 'SphinxRelation')

from django.contrib.contenttypes.models import ContentType
from datetime import datetime, date

# server settings
SPHINX_SERVER = getattr(settings, 'SPHINX_SERVER', 'localhost')
SPHINX_PORT = int(getattr(settings, 'SPHINX_PORT', 3312))

# These require search API 275 (Sphinx 0.9.8)
SPHINX_RETRIES = int(getattr(settings, 'SPHINX_RETRIES', 0))
SPHINX_RETRIES_DELAY = int(getattr(settings, 'SPHINX_RETRIES_DELAY', 5))

# Largest signed 32-bit integer; used as the open end of range filters.
MAX_INT = int(2**31-1)


class SearchError(Exception):
    """Raised when the Sphinx client reports an error or warning for a query."""
    pass


class ConnectionError(Exception):
    """Connection-related error type exported by this module."""
    pass


class SphinxProxy(object):
    """
    Acts exactly like a normal instance of an object except that
    it will handle any special sphinx attributes in a _sphinx class.

    Every attribute access, item access and operator is forwarded to the
    wrapped instance; only ``_sphinx`` (the raw Sphinx match data handed to
    the constructor) is stored on the proxy itself.
    """
    __slots__ = ('__dict__', '__instance__', '_sphinx')

    def __init__(self, instance, attributes):
        # Bypass our own __setattr__, which would forward to the instance.
        object.__setattr__(self, '__instance__', instance)
        object.__setattr__(self, '_sphinx', attributes)

    def _get_current_object(self):
        """
        Return the current object.  This is useful if you want the real object
        behind the proxy at a time for performance reasons or because you want
        to pass the object into a different context.

        Raises RuntimeError when no instance is bound to the proxy, which is
        what the ``except RuntimeError`` handlers below expect.
        """
        try:
            # object.__getattribute__ never falls back to __getattr__, so an
            # unbound proxy fails here instead of recursing until the
            # interpreter's recursion limit is hit.
            return object.__getattribute__(self, '__instance__')
        except AttributeError:
            raise RuntimeError('no object bound to %s' % self.__class__.__name__)
    __current_object = property(_get_current_object)

    def __dict__(self):
        try:
            return self.__current_object.__dict__
        except RuntimeError:
            # Raise (not return) so attribute lookup fails the normal way.
            raise AttributeError('__dict__')
    __dict__ = property(__dict__)

    def __repr__(self):
        try:
            obj = self.__current_object
        except RuntimeError:
            return '<%s unbound>' % self.__class__.__name__
        return repr(obj)

    def __nonzero__(self):
        try:
            return bool(self.__current_object)
        except RuntimeError:
            return False

    def __unicode__(self):
        try:
            return unicode(self.__current_object)
        except RuntimeError:
            return repr(self)

    def __dir__(self):
        try:
            return dir(self.__current_object)
        except RuntimeError:
            return []

    def __getattr__(self, name, value=None):
        # Only called when normal lookup on the proxy itself has failed.
        if name == '__members__':
            return dir(self.__current_object)
        elif name == '_sphinx':
            # The slot is unset; there is nothing to forward this to.
            raise AttributeError(name)
        return getattr(self.__current_object, name)

    def __setattr__(self, name, value):
        if name == '_sphinx':
            return object.__setattr__(self, '_sphinx', value)
        return setattr(self.__current_object, name, value)

    def __setitem__(self, key, value):
        self.__current_object[key] = value

    def __delitem__(self, key):
        del self.__current_object[key]

    def __setslice__(self, i, j, seq):
        self.__current_object[i:j] = seq

    def __delslice__(self, i, j):
        del self.__current_object[i:j]

    # Plain forwarding of the remaining special methods to the wrapped object.
    __delattr__ = lambda x, n: delattr(x.__current_object, n)
    __str__ = lambda x: str(x.__current_object)
    __lt__ = lambda x, o: x.__current_object < o
    __le__ = lambda x, o: x.__current_object <= o
    __eq__ = lambda x, o: x.__current_object == o
    __ne__ = lambda x, o: x.__current_object != o
    __gt__ = lambda x, o: x.__current_object > o
    __ge__ = lambda x, o: x.__current_object >= o
    __cmp__ = lambda x, o: cmp(x.__current_object, o)
    __hash__ = lambda x: hash(x.__current_object)
    # attributes are currently not callable
    # __call__ = lambda x, *a, **kw: x.__current_object(*a, **kw)
    __len__ = lambda x: len(x.__current_object)
    __getitem__ = lambda x, i: x.__current_object[i]
    __iter__ = lambda x: iter(x.__current_object)
    __contains__ = lambda x, i: i in x.__current_object
    __getslice__ = lambda x, i, j: x.__current_object[i:j]
    __add__ = lambda x, o: x.__current_object + o
    __sub__ = lambda x, o: x.__current_object - o
    __mul__ = lambda x, o: x.__current_object * o
    __floordiv__ = lambda x, o: x.__current_object // o
    __mod__ = lambda x, o: x.__current_object % o
    __divmod__ = lambda x, o: x.__current_object.__divmod__(o)
    __pow__ = lambda x, o: x.__current_object ** o
    __lshift__ = lambda x, o: x.__current_object << o
    __rshift__ = lambda x, o: x.__current_object >> o
    __and__ = lambda x, o: x.__current_object & o
    __xor__ = lambda x, o: x.__current_object ^ o
    __or__ = lambda x, o: x.__current_object | o
    __div__ = lambda x, o: x.__current_object.__div__(o)
    __truediv__ = lambda x, o: x.__current_object.__truediv__(o)
    __neg__ = lambda x: -(x.__current_object)
    __pos__ = lambda x: +(x.__current_object)
    __abs__ = lambda x: abs(x.__current_object)
    __invert__ = lambda x: ~(x.__current_object)
    __complex__ = lambda x: complex(x.__current_object)
    __int__ = lambda x: int(x.__current_object)
    __long__ = lambda x: long(x.__current_object)
    __float__ = lambda x: float(x.__current_object)
    __oct__ = lambda x: oct(x.__current_object)
    __hex__ = lambda x: hex(x.__current_object)
    __index__ = lambda x: x.__current_object.__index__()
    # These three must delegate to the wrapped object; calling the same
    # special method on the proxy itself would recurse forever.
    __coerce__ = lambda x, o: x.__current_object.__coerce__(o)
    __enter__ = lambda x: x.__current_object.__enter__()
    __exit__ = lambda x, *a, **kw: x.__current_object.__exit__(*a, **kw)


def to_sphinx(value):
    """
    Convert a value into a sphinx query value.

    Dates and datetimes become integer timestamps (via ``time.mktime``, i.e.
    interpreted in local time), Decimals and floats become floats, and
    everything else is passed through ``int()``.
    """
    if isinstance(value, (date, datetime)):
        return int(time.mktime(value.timetuple()))
    elif isinstance(value, (decimal.Decimal, float)):
        return float(value)
    return int(value)

class SphinxQuerySet(object):
    """
    Lazy, chainable description of a Sphinx search.

    Refinement methods (``query``, ``filter``, ``order_by`` ...) return a
    modified clone; the search is executed when the set is iterated, indexed,
    measured with ``len()`` or when ``_sphinx``/``count()`` is read.
    """
    available_kwargs = ('rankmode', 'mode', 'weights', 'maxmatches')

    def __init__(self, model=None, **kwargs):
        """
        ``model`` is the Django model whose rows are loaded for each match
        (optional).  Keyword arguments listed in ``available_kwargs`` set the
        matching options; ``index`` names the Sphinx index and defaults to the
        model's database table.
        """
        self._select_related = False
        self._select_related_args = {}
        self._select_related_fields = []
        self._filters = {}
        self._excludes = {}
        self._extra = {}
        self._query = ''
        self.__metadata = {}
        self._offset = 0
        self._limit = 20

        self._groupby = None
        # Only read when _groupby is set (see group_by()).
        self._groupfunc = None
        self._groupsort = None
        self._sort = None
        self._weights = [1, 100]

        self._maxmatches = 1000
        self._result_cache = None
        self._mode = sphinxapi.SPH_MATCH_ALL
        self._rankmode = getattr(sphinxapi, 'SPH_RANK_PROXIMITY_BM25', None)
        self._model = model
        self._anchor = {}

        self.set_options(**kwargs)

        if model:
            self._index = kwargs.get('index', model._meta.db_table)
        else:
            self._index = kwargs.get('index')

    def __repr__(self):
        if self._result_cache is not None:
            return repr(self._get_data())
        else:
            return '<%s instance>' % (self.__class__.__name__,)

    def __len__(self):
        return len(self._get_data())

    def __iter__(self):
        return iter(self._get_data())

    def __getitem__(self, k):
        """
        Return one result (integer index) or a list of results (slice).

        The requested window is served from the cached results when it lies
        inside the cached offset/limit window; otherwise the offset and limit
        of this queryset are changed and the search is executed again.
        An open-ended slice stops at ``maxmatches``.
        """
        if not isinstance(k, (slice, int, long)):
            raise TypeError
        assert (not isinstance(k, slice) and (k >= 0)) \
            or (isinstance(k, slice) and (k.start is None or k.start >= 0) and (k.stop is None or k.stop >= 0)), \
            "Negative indexing is not supported."

        is_slice = isinstance(k, slice)
        if is_slice:
            start = k.start
            if start is None:
                start = 0
            stop = k.stop
            if stop is None:
                stop = self._maxmatches
            # An inverted slice is simply empty.
            stop = max(stop, start)
        else:
            start, stop = k, k + 1

        cached_from = self._offset
        cached_to = self._offset + self._limit
        if self._result_cache is None or start < cached_from or stop > cached_to:
            # Requested window is not (fully) cached: fetch exactly that window.
            self._result_cache = None
            self._offset = start
            self._limit = stop - start
            window = self._get_results()
            first = 0
        else:
            # Cache positions are relative to the cached offset.
            window = self._result_cache
            first = start - cached_from

        if is_slice:
            return window[first:first + (stop - start):k.step]
        return window[first]

    def set_options(self, **kwargs):
        """Set any of ``available_kwargs`` on this queryset (in place)."""
        if 'rankmode' in kwargs:
            if kwargs.get('rankmode') is None:
                kwargs['rankmode'] = sphinxapi.SPH_RANK_NONE
        for key in self.available_kwargs:
            if key in kwargs:
                setattr(self, '_%s' % (key,), kwargs[key])

    def query(self, string):
        """Return a clone searching for ``string`` (sent UTF-8 encoded)."""
        return self._clone(_query=unicode(string).encode('utf-8'))

    def group_by(self, attribute, func, groupsort='@group desc'):
        """Return a clone grouping matches by ``attribute`` using ``func``."""
        return self._clone(_groupby=attribute, _groupfunc=func, _groupsort=groupsort)

    def rank_none(self):
        warnings.warn('`rank_none()` is deprecated. Use `set_options(rankmode=None)` instead.', DeprecationWarning)
        return self._clone(_rankmode=sphinxapi.SPH_RANK_NONE)

    def mode(self, mode):
        warnings.warn("`mode()` is deprecated. Use `set_options(mode='')` instead.", DeprecationWarning)
        return self._clone(_mode=mode)

    def weights(self, weights):
        warnings.warn('`weights()` is deprecated. Use `set_options(weights=[])` instead.', DeprecationWarning)
        return self._clone(_weights=weights)

    def on_index(self, index):
        # set_options() does not handle the index, so point at the constructor.
        warnings.warn('`on_index()` is deprecated. Pass `index=` to the queryset constructor instead.', DeprecationWarning)
        return self._clone(_index=index)

    def _merge_lookups(self, current, lookups):
        """
        Return a new lookup dict: ``current`` plus the values in ``lookups``.

        Value lists are copied so that the queryset ``current`` belongs to is
        never modified through the clone.
        """
        merged = {}
        for name, values in current.iteritems():
            merged[name] = list(values)
        for name, value in lookups.iteritems():
            if hasattr(value, 'next'):
                # An iterator: materialise it.
                value = list(value)
            elif not isinstance(value, (list, tuple)):
                value = [value]
            merged.setdefault(name, []).extend(map(to_sphinx, value))
        return merged

    # only works on attributes
    def filter(self, **kwargs):
        """Return a clone that only matches documents with these attribute values."""
        return self._clone(_filters=self._merge_lookups(self._filters, kwargs))

    def geoanchor(self, lat_attr, lng_attr, lat, lng):
        """Return a clone with a geo anchor point set."""
        assert sphinxapi.VER_COMMAND_SEARCH >= 0x113, "You must upgrade sphinxapi to version 0.98 to use Geo Anchoring."
        return self._clone(_anchor=(lat_attr, lng_attr, float(lat), float(lng)))

    # this actually does nothing, its just a passthru to
    # keep things looking/working generally the same
    def all(self):
        return self

    # only works on attributes
    def exclude(self, **kwargs):
        """Return a clone that rejects documents with these attribute values."""
        return self._clone(_excludes=self._merge_lookups(self._excludes, kwargs))

    # you cannot order by @weight (it always orders in descending)
    # keywords are @id, @weight, @rank, and @relevance
    def order_by(self, *args):
        """
        Return a clone sorted by the given attributes; a leading ``-`` means
        descending and ``id`` is translated to ``@id``.
        """
        sort_by = []
        for arg in args:
            sort = 'ASC'
            if arg.startswith('-'):
                arg = arg[1:]
                sort = 'DESC'
            if arg == 'id':
                arg = '@id'
            sort_by.append('%s %s' % (arg, sort))
        if sort_by:
            return self._clone(_sort=(sphinxapi.SPH_SORT_EXTENDED, ', '.join(sort_by)))
        return self

    # pass these thru on the queryset and let django handle it
    def select_related(self, *args, **kwargs):
        _args = self._select_related_fields[:]
        _args.extend(args)
        _kwargs = self._select_related_args.copy()
        _kwargs.update(kwargs)

        return self._clone(
            _select_related=True,
            _select_related_fields=_args,
            _select_related_args=_kwargs,
        )

    def extra(self, **kwargs):
        """Return a clone whose model queryset gets ``.extra(**kwargs)`` applied."""
        extra = self._extra.copy()
        extra.update(kwargs)
        return self._clone(_extra=extra)

    def count(self):
        """Return the number of documents found, capped at ``maxmatches``."""
        return min(self._sphinx.get('total_found', 0), self._maxmatches)

    def reset(self):
        """Return a fresh queryset for the same model and index."""
        return self.__class__(self._model, index=self._index)

    # Internal methods
    def _clone(self, **kwargs):
        # Clones the queryset passing any changed args
        c = self.__class__()
        c.__dict__.update(self.__dict__)
        c.__dict__.update(kwargs)
        # The clone describes a different search: never reuse our results.
        c._result_cache = None
        c.__metadata = {}
        return c

    def _sphinx(self):
        if not self.__metadata:
            # We have to force execution if this is accessed beforehand
            self._get_data()
        return self.__metadata
    _sphinx = property(_sphinx)

    def _get_data(self):
        assert self._index
        # need to find a way to make this work yet
        if self._result_cache is None:
            self._result_cache = list(self._get_results())
        return self._result_cache

    def _apply_filters(self, client, filter_list, exclude=False):
        """
        Translate ``name`` / ``name__lookup`` entries into client filters.

        Supported lookups: gt, gte, lt, lte, range.  Anything else raises
        NotImplementedError.
        """
        for name, values in filter_list.iteritems():
            parts = name.split('__')
            if len(parts) > 2:
                raise NotImplementedError('Related object and/or multiple field lookups not supported')
            if len(parts) == 1:
                client.SetFilter(name, values, exclude)
                continue

            name, lookup = parts
            first = values[0]
            is_float = isinstance(first, float)
            # Sphinx ranges are inclusive, so strict comparisons nudge the
            # bound by one (integers) or by a tiny epsilon (floats).
            if lookup == 'gt':
                if is_float:
                    lower = first + (1.0/MAX_INT)
                else:
                    lower = first + 1
                args = (name, lower, MAX_INT, exclude)
            elif lookup == 'gte':
                args = (name, first, MAX_INT, exclude)
            elif lookup == 'lt':
                if is_float:
                    upper = first - (1.0/MAX_INT)
                else:
                    upper = first - 1
                args = (name, -MAX_INT, upper, exclude)
            elif lookup == 'lte':
                args = (name, -MAX_INT, first, exclude)
            elif lookup == 'range':
                args = (name, first, values[1], exclude)
            else:
                raise NotImplementedError('Related object and/or field lookup "%s" not supported' % lookup)

            if is_float:
                client.SetFilterFloatRange(*args)
            elif not exclude and self._model and name == self._model._meta.pk.column:
                # Ranges on the primary key map onto the document id range.
                client.SetIDRange(*args[1:3])
            else:
                client.SetFilterRange(*args)

    def _get_sphinx_results(self):
        """
        Configure a Sphinx client from this queryset and run the query.

        Returns the client's result, or ``[]`` when the limit is not positive.
        Raises SearchError when the client returns nothing and reports an
        error or warning.
        """
        assert self._offset + self._limit <= self._maxmatches

        client = sphinxapi.SphinxClient()
        client.SetServer(SPHINX_SERVER, SPHINX_PORT)

        if self._sort:
            client.SetSortMode(*self._sort)

        if isinstance(self._weights, dict):
            client.SetFieldWeights(self._weights)
        else:
            # assume its a list
            client.SetWeights(map(int, self._weights))

        client.SetMatchMode(self._mode)

        # 0.97 requires you to reset it
        if hasattr(client, 'ResetFilters'):
            client.ResetFilters()
        if hasattr(client, 'ResetGroupBy'):
            client.ResetGroupBy()

        # Include filters
        if self._filters:
            self._apply_filters(client, self._filters)

        # Exclude filters
        if self._excludes:
            self._apply_filters(client, self._excludes, True)

        if self._groupby:
            client.SetGroupBy(self._groupby, self._groupfunc, self._groupsort)

        if self._anchor:
            client.SetGeoAnchor(*self._anchor)

        if self._rankmode:
            client.SetRankingMode(self._rankmode)

        if not self._limit > 0:
            # Fix for Sphinx throwing an assertion error when you pass it an empty limiter
            return []

        if sphinxapi.VER_COMMAND_SEARCH >= 0x113:
            client.SetRetries(SPHINX_RETRIES, SPHINX_RETRIES_DELAY)

        client.SetLimits(int(self._offset), int(self._limit), int(self._maxmatches))

        results = client.Query(self._query, self._index)

        # The Sphinx API doesn't raise exceptions
        if not results:
            if client.GetLastError():
                raise SearchError(client.GetLastError())
            elif client.GetLastWarning():
                raise SearchError(client.GetLastWarning())
        return results

    def _load_model_results(self, results):
        """Load the model rows for the matches and wrap them in SphinxProxy, in match order."""
        matches = results['matches']
        queryset = self._model.objects.all()
        if self._select_related:
            queryset = queryset.select_related(*self._select_related_fields, **self._select_related_args)
        if self._extra:
            queryset = queryset.extra(**self._extra)

        pks = getattr(self._model._meta, 'pks', None)
        if pks is None or len(pks) == 1:
            queryset = queryset.filter(pk__in=[r['id'] for r in matches])
            objects = dict([(o.pk, o) for o in queryset])
        else:
            # Composite key: identify each match by its joined key columns.
            for r in matches:
                r['id'] = ', '.join([unicode(r['attrs'][p.column]) for p in pks])
            q = reduce(operator.or_, [reduce(operator.and_, [Q(**{p.name: r['attrs'][p.column]}) for p in pks]) for r in matches])
            if q:
                queryset = queryset.filter(q)
                objects = dict([(', '.join([unicode(p) for p in o.pks]), o) for o in queryset])
            else:
                objects = {}

        # Matches whose row no longer exists are silently dropped.
        return [SphinxProxy(objects[r['id']], r) for r in matches if r['id'] in objects]

    def _load_generic_results(self, results):
        """
        Resolve matches of a model-less query.

        When the index exposes a ``content_type`` attribute, one query per
        content type loads the objects (missing objects become ``None``);
        otherwise the raw match dicts are returned.
        """
        matches = results['matches']
        results['attrs'] = dict(results['attrs'])
        if 'content_type' not in results['attrs']:
            return matches

        # Now we have to do one query per content_type
        objcache = {}
        for r in matches:
            objcache.setdefault(r['attrs']['content_type'], {})[r['id']] = None
        for ct in objcache:
            queryset = ContentType.objects.get(pk=ct).model_class().objects.filter(pk__in=objcache[ct].keys())
            for o in queryset:
                objcache[ct][o.id] = o
        return [objcache[r['attrs']['content_type']][r['id']] for r in matches]

    def _get_results(self):
        """Run the search, resolve the matches, cache and return them."""
        results = self._get_sphinx_results()
        if results:
            # Keep the totals for count()/_sphinx regardless of how (or
            # whether) the matches can be resolved to objects.
            self.__metadata = {
                'total': results['total'],
                'total_found': results['total_found'],
                'words': results['words'],
            }
        if not results or not results['matches']:
            results = []
        elif self._model:
            results = self._load_model_results(results)
        else:
            # We did a query without a model, lets see if there's a content_type
            results = self._load_generic_results(results)
        self._result_cache = results
        return results


class SphinxModelManager(object):
    """Manager-like entry point that builds SphinxQuerySets for one model."""

    def __init__(self, model, **kwargs):
        self._model = model
        self._index = kwargs.pop('index', model._meta.db_table)
        self._kwargs = kwargs

    def _get_query_set(self):
        return SphinxQuerySet(self._model, index=self._index, **self._kwargs)

    def get_index(self):
        return self._index

    def all(self):
        return self._get_query_set()

    def filter(self, **kwargs):
        return self._get_query_set().filter(**kwargs)

    def query(self, *args, **kwargs):
        return self._get_query_set().query(*args, **kwargs)

    def on_index(self, *args, **kwargs):
        return self._get_query_set().on_index(*args, **kwargs)

    def geoanchor(self, *args, **kwargs):
        return self._get_query_set().geoanchor(*args, **kwargs)


class SphinxInstanceManager(object):
    """Collection of tools useful for objects which are in a Sphinx index."""

    def __init__(self, instance, index):
        self._instance = instance
        self._index = index

    def update(self, **kwargs):
        """
        Update Sphinx attributes of this instance's document.

        Each keyword is an attribute name; values are converted with
        ``to_sphinx``.  Returns whatever the client's ``UpdateAttributes``
        call returns.
        """
        assert sphinxapi.VER_COMMAND_SEARCH >= 0x113, "You must upgrade sphinxapi to version 0.98 to update attributes."
        # Build names and values from the same ordering so they line up.
        names = kwargs.keys()
        values = [to_sphinx(kwargs[name]) for name in names]
        client = sphinxapi.SphinxClient()
        client.SetServer(SPHINX_SERVER, SPHINX_PORT)
        return client.UpdateAttributes(self._index, names, {self._instance.pk: values})


class SphinxSearch(object):
    """
    Model attribute that installs a SphinxModelManager on the model class
    (see ``contribute_to_class``).
    """

    def __init__(self, index=None, **kwargs):
        self._kwargs = kwargs
        self._sphinx = None
        self._index = index
        self.model = None

    def __call__(self, index, **kwargs):
        warnings.warn('For non-model searches use a SphinxQuerySet instance.', DeprecationWarning)
        return SphinxQuerySet(index=index, **kwargs)

    def __get__(self, instance, model, **kwargs):
        # Instance access yields per-object tools, class access the manager.
        if instance is not None:
            return SphinxInstanceManager(instance, self._index)
        return self._sphinx

    def contribute_to_class(self, model, name, **kwargs):
        if self._index is None:
            self._index = model._meta.db_table
        self._sphinx = SphinxModelManager(model, index=self._index, **self._kwargs)
        self.model = model
        if getattr(model, '__sphinx_indexes__', None) is None:
            setattr(model, '__sphinx_indexes__', [self._index])
        else:
            model.__sphinx_indexes__.append(self._index)
        setattr(model, name, self._sphinx)


class SphinxRelationProxy(SphinxProxy):
    """Proxy for a related object found through a grouped search."""

    def count(self):
        """
        Return the ``@count`` attribute of the match, capped at the wrapped
        object's ``_maxmatches`` when it defines one.
        """
        found = self._sphinx['attrs']['@count']
        # Attribute access on the proxy is forwarded to the wrapped object,
        # which normally has no _maxmatches; do not fail in that case.
        limit = getattr(self._get_current_object(), '_maxmatches', None)
        if limit is None:
            return found
        return min(found, limit)


class SphinxRelation(SphinxSearch):
    """
    Adds "related model" support to django-sphinx --
    http://code.google.com/p/django-sphinx/
    http://www.sphinxsearch.com/

    Example --

    class MySearch(SphinxSearch):
        myrelatedobject = SphinxRelation(RelatedModel)
        anotherone = SphinxRelation(AnotherModel)
        ...

    class MyModel(models.Model):
        search = MySearch('index')

    NOTE(review): _get_results() and _sphinx call self._get_sphinx_results()
    and self._get_data(), which SphinxSearch does not define in this file;
    presumably they are expected from a queryset-style base class -- confirm
    before relying on this class.
    """
    def __init__(self, model=None, attr=None, sort='@count desc', **kwargs):
        # Defaults for everything the methods below read.
        self._related_model = None
        self._related_attr = attr
        self._related_sort = sort
        self._select_related = False
        self._select_related_fields = []
        self._select_related_args = {}
        self._extra = {}
        self._result_cache = None
        self.__metadata = {}
        if model:
            self._related_model = model
            self._related_attr = attr or model.__name__.lower()
        super(SphinxRelation, self).__init__(**kwargs)

    def __get__(self, instance, instance_model, **kwargs):
        if instance is None:
            # Class-level access: nothing to copy the search state from.
            return self
        self._mode = instance._mode
        self._rankmode = instance._rankmode
        self._index = instance._index
        self._query = instance._query
        self._filters = instance._filters
        self._excludes = instance._excludes
        self._model = self._related_model
        self._groupby = self._related_attr
        self._groupsort = self._related_sort
        self._groupfunc = sphinxapi.SPH_GROUPBY_ATTR
        return self

    def _get_results(self):
        results = self._get_sphinx_results()
        if not results:
            return []
        if results['matches'] and self._model:
            ids = []
            for r in results['matches']:
                value = r['attrs']['@groupby']
                if isinstance(value, (int, long)):
                    ids.append(value)
                else:
                    # A sequence of ids.
                    ids.extend(value)
            qs = self._model.objects.filter(pk__in=set(ids))
            if self._select_related:
                qs = qs.select_related(*self._select_related_fields,
                                       **self._select_related_args)
            if self._extra:
                qs = qs.extra(**self._extra)
            queryset = dict([(o.id, o) for o in qs])
            self.__metadata = {
                'total': results['total'],
                'total_found': results['total_found'],
                'words': results['words'],
            }
            results = [SphinxRelationProxy(queryset[k['attrs']['@groupby']], k)
                       for k in results['matches']
                       if k['attrs']['@groupby'] in queryset]
        else:
            results = []
        self._result_cache = results
        return results

    def _get_sphinx(self):
        if not self.__metadata:
            # We have to force execution if this is accessed beforehand
            self._get_data()
        return self.__metadata

    def _set_sphinx(self, value):
        # SphinxSearch assigns self._sphinx (its model manager); accept the
        # assignment instead of failing on a read-only property.
        self._manager = value

    _sphinx = property(_get_sphinx, _set_sphinx)