X-Git-Url: https://git.mdrn.pl/wolnelektury.git/blobdiff_plain/8b808f24709efb16f6b6eff6abb05b41341573c7..aae232dcf008ab39db553bc1b20573fa00e3b191:/apps/search/custom.py
diff --git a/apps/search/custom.py b/apps/search/custom.py
index fcc3bace2..b3b704d0b 100644
--- a/apps/search/custom.py
+++ b/apps/search/custom.py
@@ -1,15 +1,18 @@
-
+# -*- coding: utf-8 -*-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
from sunburnt import sunburnt
from lxml import etree
import urllib
import warnings
from sunburnt import search
import copy
+from httplib2 import socket
+import re
class TermVectorOptions(search.Options):
- option_name = "tv"
-
def __init__(self, schema, original=None):
self.schema = schema
if original is None:
@@ -30,7 +33,8 @@ class TermVectorOptions(search.Options):
def options(self):
opts = {}
- opts['tv'] = 'true'
+ if self.positions or self.fields:
+ opts['tv'] = 'true'
if self.positions:
opts['tv.positions'] = 'true'
if self.fields:
@@ -72,12 +76,12 @@ def __term_vector(self, positions=False, fields=None):
newself.term_vectorer.update(positions, fields)
return newself
setattr(search.SolrSearch, 'term_vector', __term_vector)
-__original__init_common_modules = search.SolrSearch._init_common_modules
def __patched__init_common_modules(self):
__original__init_common_modules(self)
self.term_vectorer = TermVectorOptions(self.schema)
+__original__init_common_modules = search.SolrSearch._init_common_modules
setattr(search.SolrSearch, '_init_common_modules', __patched__init_common_modules)
@@ -86,11 +90,14 @@ class CustomSolrInterface(sunburnt.SolrInterface):
def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1, max_length_get_url=sunburnt.MAX_LENGTH_GET_URL):
self.conn = CustomSolrConnection(url, http_connection, retry_timeout, max_length_get_url)
self.schemadoc = schemadoc
- if mode == 'r':
+ if 'w' not in mode:
self.writeable = False
- elif mode == 'w':
+ elif 'r' not in mode:
self.readable = False
- self.init_schema()
+ try:
+ self.init_schema()
+ except socket.error, e:
+ raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e)
def _analyze(self, **kwargs):
if not self.readable:
@@ -118,39 +125,67 @@ class CustomSolrInterface(sunburnt.SolrInterface):
end = int(wrd.xpath("int[@name='end']")[0].text)
matches.add((start, end))
- print matches
if matches:
return self.substring(kwargs['text'], matches,
- margins=kwargs.get('margins', 30),
- mark=kwargs.get('mark', ("", "")))
+ margins=kwargs.get('margins', 30),
+ mark=kwargs.get('mark', ("", "")))
else:
return None
def analyze(self, **kwargs):
- doc = self._analyze(self, **kwargs)
- terms = doc.xpath("/lst[@name='index']/arr[last()]/lst/str[1]")
+ doc = self._analyze(**kwargs)
+ terms = doc.xpath("//lst[@name='index']/arr[last()]/lst/str[1]")
terms = map(lambda n: unicode(n.text), terms)
return terms
+ def expand_margins(self, text, start, end):
+ totlen = len(text)
+
+ def is_boundary(x):
+ ws = re.compile(r"\W", re.UNICODE)
+ return bool(ws.match(x))
+
+ while start > 0:
+ if is_boundary(text[start - 1]):
+ break
+ start -= 1
+
+ while end < totlen - 1:
+ if is_boundary(text[end + 1]):
+ break
+ end += 1
+
+ return (start, end)
+
def substring(self, text, matches, margins=30, mark=("", "")):
start = None
end = None
totlen = len(text)
- matches_margins = map(lambda (s, e): (max(0, s - margins), min(totlen, e + margins)), matches)
- (start, end) = matches_margins[0]
-
- for (s, e) in matches_margins[1:]:
+ matches_margins = map(lambda (s, e):
+ ((s, e),
+ (max(0, s - margins), min(totlen, e + margins))),
+ matches)
+ matches_margins = map(lambda (m, (s, e)):
+ (m, self.expand_margins(text, s, e)),
+ matches_margins)
+
+ # lets start with first match
+ (start, end) = matches_margins[0][1]
+ matches = [matches_margins[0][0]]
+
+ for (m, (s, e)) in matches_margins[1:]:
if end < s or start > e:
continue
start = min(start, s)
end = max(end, e)
+ matches.append(m)
snip = text[start:end]
- matches = list(matches)
matches.sort(lambda a, b: cmp(b[0], a[0]))
+
for (s, e) in matches:
off = - start
snip = snip[:e + off] + mark[1] + snip[e + off:]
snip = snip[:s + off] + mark[0] + snip[s + off:]
- # maybe break on word boundaries
+
return snip