apps/search/highlight.py

   1
   2 from sunburnt import sunburnt
   3 from lxml import etree
   4 import urllib
   5 import warnings
   6
   7
   8 class HLSolrConnection(sunburnt.SolrConnection):
   9     def __init__(self, *args, **kw):
  10         super(HLSolrConnection, self).__init__(*args, **kw)
  11         self.analysis_url = self.url + "analysis/field/"
  12
  13     def highlight(self, params):
  14         qs = urllib.urlencode(params)
  15         url = "%s?%s" % (self.analysis_url, qs)
  16         if len(url) > self.max_length_get_url:
  17             warnings.warn("Long query URL encountered - POSTing instead of "
  18                 "GETting. This query will not be cached at the HTTP layer")
  19             url = self.analysis_url
  20             kwargs = dict(
  21                 method="POST",
  22                 body=qs,
  23                 headers={"Content-Type": "application/x-www-form-urlencoded"},
  24             )
  25         else:
  26             kwargs = dict(method="GET")
  27         r, c = self.request(url, **kwargs)
  28         if r.status != 200:
  29             raise sunburnt.SolrError(r, c)
  30         return c
  31
  32
  33 class HLSolrInterface(sunburnt.SolrInterface):
  34     # just copied from parent and SolrConnection -> HLSolrConnection
  35     def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1, max_length_get_url=sunburnt.MAX_LENGTH_GET_URL):
  36         self.conn = HLSolrConnection(url, http_connection, retry_timeout, max_length_get_url)
  37         self.schemadoc = schemadoc
  38         if mode == 'r':
  39             self.writeable = False
  40         elif mode == 'w':
  41             self.readable = False
  42         self.init_schema()
  43
  44     def highlight(self, **kwargs):
  45         if not self.readable:
  46             raise TypeError("This Solr instance is only for writing")
  47         args = {
  48             'analysis_fieldname': kwargs['field'],
  49             'analysis_showmatch': True,
  50             'analysis_fieldvalue': kwargs['text'],
  51             'q': kwargs['q']
  52             }
  53         params = map(lambda (k, v): (k.replace('_', '.'), v), sunburnt.params_from_dict(**args))
  54
  55         content = self.conn.highlight(params)
  56         doc = etree.fromstring(content)
  57         analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
  58         matches = set()
  59         for wrd in analyzed:
  60             start = int(wrd.xpath("int[@name='start']")[0].text)
  61             end = int(wrd.xpath("int[@name='end']")[0].text)
  62             matches.add((start, end))
  63
  64         print matches
  65         return self.substring(kwargs['text'], matches,
  66                             margins=kwargs.get('margins', 30),
  67             mark=kwargs.get('mark', ("<b>", "</b>")))
  68
  69     def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
  70         start = None
  71         end = None
  72         totlen = len(text)
  73         matches_margins = map(lambda (s, e): (max(0, s - margins), min(totlen, e + margins)), matches)
  74         (start, end) = matches_margins[0]
  75
  76         for (s, e) in matches_margins[1:]:
  77             if end < s or start > e:
  78                 continue
  79             start = min(start, s)
  80             end = max(end, e)
  81
  82         snip = text[start:end]
  83         matches = list(matches)
  84         matches.sort(lambda a, b: cmp(b[0], a[0]))
  85         for (s, e) in matches:
  86             off = - start
  87             snip = text[:e + off] + mark[1] + snip[e + off:]
  88             snip = text[:s + off] + mark[0] + snip[s + off:]
  89             # maybe break on word boundaries
  90         return snip