apps/search/custom.py

   1
   2 from sunburnt import sunburnt
   3 from lxml import etree
   4 import urllib
   5 import warnings
   6 from sunburnt import search
   7 import copy
   8
   9
  10 class TermVectorOptions(search.Options):
  11     def __init__(self, schema, original=None):
  12         self.schema = schema
  13         if original is None:
  14             self.fields = set()
  15             self.positions = False
  16         else:
  17             self.fields = copy.copy(original.fields)
  18             self.positions = copy.copy(original.positions)
  19
  20     def update(self, positions=False, fields=None):
  21         if fields is None:
  22             fields = []
  23         if isinstance(fields, basestring):
  24             fields = [fields]
  25         self.schema.check_fields(fields, {"stored": True})
  26         self.fields.update(fields)
  27         self.positions = positions
  28
  29     def options(self):
  30         opts = {}
  31         if self.positions or self.fields:
  32             opts['tv'] = 'true'
  33         if self.positions:
  34             opts['tv.positions'] = 'true'
  35         if self.fields:
  36             opts['tv.fl'] = ','.join(sorted(self.fields))
  37         return opts
  38
  39
  40 class CustomSolrConnection(sunburnt.SolrConnection):
  41     def __init__(self, *args, **kw):
  42         super(CustomSolrConnection, self).__init__(*args, **kw)
  43         self.analysis_url = self.url + "analysis/field/"
  44
  45     def analyze(self, params):
  46         qs = urllib.urlencode(params)
  47         url = "%s?%s" % (self.analysis_url, qs)
  48         if len(url) > self.max_length_get_url:
  49             warnings.warn("Long query URL encountered - POSTing instead of "
  50                 "GETting. This query will not be cached at the HTTP layer")
  51             url = self.analysis_url
  52             kwargs = dict(
  53                 method="POST",
  54                 body=qs,
  55                 headers={"Content-Type": "application/x-www-form-urlencoded"},
  56             )
  57         else:
  58             kwargs = dict(method="GET")
  59         r, c = self.request(url, **kwargs)
  60         if r.status != 200:
  61             raise sunburnt.SolrError(r, c)
  62         return c
  63
  64
  65 # monkey patching sunburnt SolrSearch
  66 search.SolrSearch.option_modules += ('term_vectorer',)
  67
  68
  69 def __term_vector(self, positions=False, fields=None):
  70     newself = self.clone()
  71     newself.term_vectorer.update(positions, fields)
  72     return newself
  73 setattr(search.SolrSearch, 'term_vector', __term_vector)
  74
  75
  76 def __patched__init_common_modules(self):
  77     __original__init_common_modules(self)
  78     self.term_vectorer = TermVectorOptions(self.schema)
  79 __original__init_common_modules = search.SolrSearch._init_common_modules
  80 setattr(search.SolrSearch, '_init_common_modules', __patched__init_common_modules)
  81
  82
  83 class CustomSolrInterface(sunburnt.SolrInterface):
  84     # just copied from parent and SolrConnection -> CustomSolrConnection
  85     def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1, max_length_get_url=sunburnt.MAX_LENGTH_GET_URL):
  86         self.conn = CustomSolrConnection(url, http_connection, retry_timeout, max_length_get_url)
  87         self.schemadoc = schemadoc
  88         if 'w' not in mode:
  89             self.writeable = False
  90         elif 'r' not in mode:
  91             self.readable = False
  92         self.init_schema()
  93
  94     def _analyze(self, **kwargs):
  95         if not self.readable:
  96             raise TypeError("This Solr instance is only for writing")
  97         args = {
  98             'analysis_showmatch': True
  99             }
 100         if 'field' in kwargs: args['analysis_fieldname'] = kwargs['field']
 101         if 'text' in kwargs: args['analysis_fieldvalue'] = kwargs['text']
 102         if 'q' in kwargs: args['q'] = kwargs['q']
 103         if 'query' in kwargs: args['q'] = kwargs['q']
 104
 105         params = map(lambda (k, v): (k.replace('_', '.'), v), sunburnt.params_from_dict(**args))
 106
 107         content = self.conn.analyze(params)
 108         doc = etree.fromstring(content)
 109         return doc
 110
 111     def highlight(self, **kwargs):
 112         doc = self._analyze(**kwargs)
 113         analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
 114         matches = set()
 115         for wrd in analyzed:
 116             start = int(wrd.xpath("int[@name='start']")[0].text)
 117             end = int(wrd.xpath("int[@name='end']")[0].text)
 118             matches.add((start, end))
 119
 120         if matches:
 121             return self.substring(kwargs['text'], matches,
 122                 margins=kwargs.get('margins', 30),
 123                 mark=kwargs.get('mark', ("<b>", "</b>")))
 124         else:
 125             return None
 126
 127     def analyze(self, **kwargs):
 128         doc = self._analyze(**kwargs)
 129         terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
 130         terms = map(lambda n: unicode(n.text), terms)
 131         return terms
 132
 133     def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
 134         start = None
 135         end = None
 136         totlen = len(text)
 137         matches_margins = map(lambda (s, e): (max(0, s - margins), min(totlen, e + margins)), matches)
 138         (start, end) = matches_margins[0]
 139
 140         for (s, e) in matches_margins[1:]:
 141             if end < s or start > e:
 142                 continue
 143             start = min(start, s)
 144             end = max(end, e)
 145
 146         snip = text[start:end]
 147         matches = list(matches)
 148         matches.sort(lambda a, b: cmp(b[0], a[0]))
 149         for (s, e) in matches:
 150             off = - start
 151             snip = snip[:e + off] + mark[1] + snip[e + off:]
 152             snip = snip[:s + off] + mark[0] + snip[s + off:]
 153             # maybe break on word boundaries
 154
 155         return snip
 156