src/search/custom.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from sunburnt import sunburnt
   6 from lxml import etree
   7 import urllib
   8 import warnings
   9 from sunburnt import search
  10 import copy
  11 from httplib2 import socket
  12 import re
  13
  14
  15 class TermVectorOptions(search.Options):
  16     def __init__(self, schema, original=None):
  17         self.schema = schema
  18         if original is None:
  19             self.fields = set()
  20             self.positions = False
  21         else:
  22             self.fields = copy.copy(original.fields)
  23             self.positions = copy.copy(original.positions)
  24
  25     def update(self, positions=False, fields=None):
  26         if fields is None:
  27             fields = []
  28         if isinstance(fields, basestring):
  29             fields = [fields]
  30         self.schema.check_fields(fields, {"stored": True})
  31         self.fields.update(fields)
  32         self.positions = positions
  33
  34     def options(self):
  35         opts = {}
  36         if self.positions or self.fields:
  37             opts['tv'] = 'true'
  38         if self.positions:
  39             opts['tv.positions'] = 'true'
  40         if self.fields:
  41             opts['tv.fl'] = ','.join(sorted(self.fields))
  42         return opts
  43
  44
  45 class CustomSolrConnection(sunburnt.SolrConnection):
  46     def __init__(self, *args, **kw):
  47         super(CustomSolrConnection, self).__init__(*args, **kw)
  48         self.analysis_url = self.url + "analysis/field/"
  49
  50     def analyze(self, params):
  51         qs = urllib.urlencode(params)
  52         url = "%s?%s" % (self.analysis_url, qs)
  53         if len(url) > self.max_length_get_url:
  54             warnings.warn("Long query URL encountered - POSTing instead of "
  55                 "GETting. This query will not be cached at the HTTP layer")
  56             url = self.analysis_url
  57             kwargs = dict(
  58                 method="POST",
  59                 body=qs,
  60                 headers={"Content-Type": "application/x-www-form-urlencoded"},
  61             )
  62         else:
  63             kwargs = dict(method="GET")
  64         r, c = self.request(url, **kwargs)
  65         if r.status != 200:
  66             raise sunburnt.SolrError(r, c)
  67         return c
  68
  69
  70 # monkey patching sunburnt SolrSearch
  71 search.SolrSearch.option_modules += ('term_vectorer',)
  72
  73
  74 def __term_vector(self, positions=False, fields=None):
  75     newself = self.clone()
  76     newself.term_vectorer.update(positions, fields)
  77     return newself
  78 setattr(search.SolrSearch, 'term_vector', __term_vector)
  79
  80
  81 def __patched__init_common_modules(self):
  82     __original__init_common_modules(self)
  83     self.term_vectorer = TermVectorOptions(self.schema)
  84 __original__init_common_modules = search.SolrSearch._init_common_modules
  85 setattr(search.SolrSearch, '_init_common_modules', __patched__init_common_modules)
  86
  87
  88 class CustomSolrInterface(sunburnt.SolrInterface):
  89     # just copied from parent and SolrConnection -> CustomSolrConnection
  90     def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1, max_length_get_url=sunburnt.MAX_LENGTH_GET_URL):
  91         self.conn = CustomSolrConnection(url, http_connection, retry_timeout, max_length_get_url)
  92         self.schemadoc = schemadoc
  93         if 'w' not in mode:
  94             self.writeable = False
  95         elif 'r' not in mode:
  96             self.readable = False
  97         try:
  98             self.init_schema()
  99         except socket.error, e:
 100             raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e)
 101
 102     def _analyze(self, **kwargs):
 103         if not self.readable:
 104             raise TypeError("This Solr instance is only for writing")
 105         args = {
 106             'analysis_showmatch': True
 107             }
 108         if 'field' in kwargs: args['analysis_fieldname'] = kwargs['field']
 109         if 'text' in kwargs: args['analysis_fieldvalue'] = kwargs['text']
 110         if 'q' in kwargs: args['q'] = kwargs['q']
 111         if 'query' in kwargs: args['q'] = kwargs['q']
 112
 113         params = map(lambda (k, v): (k.replace('_', '.'), v), sunburnt.params_from_dict(**args))
 114
 115         content = self.conn.analyze(params)
 116         doc = etree.fromstring(content)
 117         return doc
 118
 119     def highlight(self, **kwargs):
 120         doc = self._analyze(**kwargs)
 121         analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
 122         matches = set()
 123         for wrd in analyzed:
 124             start = int(wrd.xpath("int[@name='start']")[0].text)
 125             end = int(wrd.xpath("int[@name='end']")[0].text)
 126             matches.add((start, end))
 127
 128         if matches:
 129             return self.substring(kwargs['text'], matches,
 130                 margins=kwargs.get('margins', 30),
 131                 mark=kwargs.get('mark', ("<b>", "</b>")))
 132         else:
 133             return None
 134
 135     def analyze(self, **kwargs):
 136         doc = self._analyze(**kwargs)
 137         terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
 138         terms = map(lambda n: unicode(n.text), terms)
 139         return terms
 140
 141     def expand_margins(self, text, start, end):
 142         totlen = len(text)
 143
 144         def is_boundary(x):
 145             ws = re.compile(r"\W", re.UNICODE)
 146             return bool(ws.match(x))
 147
 148         while start > 0:
 149             if is_boundary(text[start - 1]):
 150                 break
 151             start -= 1
 152
 153         while end < totlen - 1:
 154             if is_boundary(text[end + 1]):
 155                 break
 156             end += 1
 157
 158         return (start, end)
 159
 160     def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
 161         start = None
 162         end = None
 163         totlen = len(text)
 164         matches_margins = map(lambda (s, e):
 165                               ((s, e),
 166                                (max(0, s - margins), min(totlen, e + margins))),
 167                                   matches)
 168         matches_margins = map(lambda (m, (s, e)):
 169                               (m, self.expand_margins(text, s, e)),
 170             matches_margins)
 171
 172             # lets start with first match
 173         (start, end) = matches_margins[0][1]
 174         matches = [matches_margins[0][0]]
 175
 176         for (m, (s, e)) in matches_margins[1:]:
 177             if end < s or start > e:
 178                 continue
 179             start = min(start, s)
 180             end = max(end, e)
 181             matches.append(m)
 182
 183         snip = text[start:end]
 184         matches.sort(lambda a, b: cmp(b[0], a[0]))
 185
 186         for (s, e) in matches:
 187             off = - start
 188             snip = snip[:e + off] + mark[1] + snip[e + off:]
 189             snip = snip[:s + off] + mark[0] + snip[s + off:]
 190
 191         return snip