src/search/custom.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 import re
   5 from urllib.parse import urlencode
   6 import warnings
   7 from httplib2 import socket
   8 from lxml import etree
   9 from scorched import connection, exc, search
  10
  11
  12 class CustomSolrConnection(connection.SolrConnection):
  13     def __init__(self, *args, **kw):
  14         super(CustomSolrConnection, self).__init__(*args, **kw)
  15         self.analysis_url = self.url + "analysis/field/"
  16
  17     def analyze(self, params):
  18         qs = urlencode(params)
  19         url = "%s?%s" % (self.analysis_url, qs)
  20         if len(url) > self.max_length_get_url:
  21             warnings.warn("Long query URL encountered - POSTing instead of GETting. "
  22                           "This query will not be cached at the HTTP layer")
  23             url = self.analysis_url
  24             kwargs = dict(
  25                 method="POST",
  26                 data=qs,
  27                 headers={"Content-Type": "application/x-www-form-urlencoded"},
  28             )
  29         else:
  30             kwargs = dict(method="GET")
  31         response = self.request(url=url, **kwargs)
  32         if response.status_code != 200:
  33             raise exc.SolrError(response)
  34         return response.content
  35
  36
  37 class CustomSolrInterface(connection.SolrInterface):
    # Copied from the parent class, with SolrConnection replaced by CustomSolrConnection.
  39     def __init__(self, url, http_connection=None, mode='',
  40                  retry_timeout=-1, max_length_get_url=connection.MAX_LENGTH_GET_URL,
  41                  search_timeout=()):
  42         """
  43         :param url: url to Solr
  44         :type url: str
  45         :param http_connection: optional -- already existing connection
  46         :type http_connection: requests connection
  47         :param mode: optional -- mode (readable, writable) Solr
  48         :type mode: str
  49         :param retry_timeout: optional -- timeout until retry
  50         :type retry_timeout: int
  51         :param max_length_get_url: optional -- max length until switch to post
  52         :type max_length_get_url: int
  53         :param search_timeout: (optional) How long to wait for the server to
  54                                send data before giving up, as a float, or a
  55                                (connect timeout, read timeout) tuple.
  56         :type search_timeout: float or tuple
  57         """
  58
  59         self.conn = CustomSolrConnection(
  60             url, http_connection, mode, retry_timeout, max_length_get_url)
  61         self.schema = self.init_schema()
  62         self._datefields = self._extract_datefields(self.schema)
  63
  64
  65     def _analyze(self, **kwargs):
  66         if not self.conn.readable:
  67             raise TypeError("This Solr instance is only for writing")
  68         args = {
  69             'analysis_showmatch': True
  70             }
  71         if 'field' in kwargs:
  72             args['analysis_fieldname'] = kwargs['field']
  73         if 'text' in kwargs:
  74             args['analysis_fieldvalue'] = kwargs['text']
  75         if 'q' in kwargs:
  76             args['q'] = kwargs['q']
  77         if 'query' in kwargs:
  78             args['q'] = kwargs['q']
  79
  80         params = [
  81             (k.replace('_', '.'), v)
  82             for (k, v) in search.params_from_dict(**args)
  83         ]
  84
  85         content = self.conn.analyze(params)
  86         doc = etree.fromstring(content)
  87         return doc
  88
  89     def highlight(self, **kwargs):
  90         doc = self._analyze(**kwargs)
  91         analyzed = doc.xpath("//lst[@name='index']/arr[last()]/lst[bool/@name='match']")
  92         matches = set()
  93         for wrd in analyzed:
  94             start = int(wrd.xpath("int[@name='start']")[0].text)
  95             end = int(wrd.xpath("int[@name='end']")[0].text)
  96             matches.add((start, end))
  97
  98         if matches:
  99             return self.substring(
 100                 kwargs['text'], matches, margins=kwargs.get('margins', 30), mark=kwargs.get('mark', ("<b>", "</b>")))
 101         else:
 102             return None
 103
 104     def analyze(self, **kwargs):
 105         doc = self._analyze(**kwargs)
 106         terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
 107         terms = map(lambda n: str(n.text), terms)
 108         return terms
 109
 110     def expand_margins(self, text, start, end):
 111         totlen = len(text)
 112
 113         def is_boundary(x):
 114             ws = re.compile(r"\W", re.UNICODE)
 115             return bool(ws.match(x))
 116
 117         while start > 0:
 118             if is_boundary(text[start - 1]):
 119                 break
 120             start -= 1
 121
 122         while end < totlen - 1:
 123             if is_boundary(text[end + 1]):
 124                 break
 125             end += 1
 126
 127         return start, end
 128
 129     def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
 130         totlen = len(text)
 131         matches_margins = [
 132             ((s, e), self.expand_margins(text, max(0, s - margins), min(totlen, e + margins))) for s, e in matches]
 133
 134         # lets start with first match
 135         (start, end) = matches_margins[0][1]
 136         new_matches = [matches_margins[0][0]]
 137
 138         for (m, (s, e)) in matches_margins[1:]:
 139             if end < s or start > e:
 140                 continue
 141             start = min(start, s)
 142             end = max(end, e)
 143             new_matches.append(m)
 144
 145         snip = text[start:end]
 146         new_matches.sort(key=lambda a: -a[0])
 147
 148         for (s, e) in new_matches:
 149             off = -start
 150             snip = snip[:e + off] + mark[1] + snip[e + off:]
 151             snip = snip[:s + off] + mark[0] + snip[s + off:]
 152         snip = re.sub('%s[ \t\n]+%s' % (mark[1], mark[0]), " ", snip)
 153
 154         return snip