-
+# -*- coding: utf-8 -*-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
from sunburnt import sunburnt
from lxml import etree
import urllib
import warnings
from sunburnt import search
import copy
+from httplib2 import socket
+import re
class TermVectorOptions(search.Options):
def __init__(self, url, schemadoc=None, http_connection=None, mode='',
             retry_timeout=-1,
             max_length_get_url=sunburnt.MAX_LENGTH_GET_URL):
    """Set up the Solr connection, access flags and schema.

    ``url``, ``http_connection``, ``retry_timeout`` and
    ``max_length_get_url`` are handed unchanged to ``CustomSolrConnection``.
    ``schemadoc`` is stored on the instance before ``init_schema()`` runs.

    ``mode`` is a string of access letters: the interface is marked
    non-writeable when it has no ``'w'``, otherwise non-readable when it
    has no ``'r'``.

    Raises ``socket.error`` (with a message naming the Solr server) when
    ``init_schema()`` fails with a ``socket.error``.
    """
    self.conn = CustomSolrConnection(url, http_connection, retry_timeout,
                                     max_length_get_url)
    self.schemadoc = schemadoc
    # NOTE(review): with the default mode='' the first branch is taken
    # ('w' is not in ''), so the instance ends up non-writeable; because of
    # the elif, both flags are never cleared together.  Confirm intended.
    if 'w' not in mode:
        self.writeable = False
    elif 'r' not in mode:
        self.readable = False
    try:
        self.init_schema()
    except socket.error as e:
        # Re-raise the same exception type with a message that points at
        # the Solr server.  The "as" / call forms parse on Python 2.6+ and 3.
        raise socket.error(
            "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e))
def _analyze(self, **kwargs):
if not self.readable:
if matches:
return self.substring(kwargs['text'], matches,
- margins=kwargs.get('margins', 30),
- mark=kwargs.get('mark', ("<b>", "</b>")))
+ margins=kwargs.get('margins', 30),
+ mark=kwargs.get('mark', ("<b>", "</b>")))
else:
return None
terms = map(lambda n: unicode(n.text), terms)
return terms
def expand_margins(self, text, start, end):
    """Widen the slice ``text[start:end]`` so it does not cut words in half.

    ``start`` moves left while the character just before it is a word
    character.  ``end`` is an exclusive slice index and moves right while
    the character at ``end`` (the first one left out of the slice) is a
    word character.  A word character is anything the Unicode-aware regex
    class "non-word" does not match.

    Returns the widened ``(start, end)`` pair; ``text[start:end]`` then
    begins and ends on a word boundary or at an end of ``text``.
    """
    # Compiled once per call instead of once per inspected character.
    boundary = re.compile(r"\W", re.UNICODE)
    totlen = len(text)

    while start > 0 and not boundary.match(text[start - 1]):
        start -= 1

    # ``end`` is exclusive, so the character to test is text[end] itself;
    # testing text[end + 1] would drop the last letter of the final word.
    while end < totlen and not boundary.match(text[end]):
        end += 1

    return (start, end)
+
def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
    """Build a highlighted snippet of ``text`` around the first match.

    ``matches`` is an iterable of ``(start, end)`` offsets into ``text``
    (``end`` exclusive).  Each match gets a window of ``margins``
    characters on both sides, clamped to the text and then widened by
    ``self.expand_margins``.  The snippet starts as the first match's
    window and grows to cover every later window that touches it; matches
    whose window lies entirely outside the snippet built so far are
    skipped.  Every match that made it into the snippet is wrapped in
    ``mark[0]`` / ``mark[1]``.

    Returns the marked-up snippet, or None when ``matches`` is empty.
    """
    matches = list(matches)
    if not matches:
        return None

    totlen = len(text)
    # Pair every match with its clamped, word-aligned context window.
    windows = []
    for (s, e) in matches:
        window = self.expand_margins(text,
                                     max(0, s - margins),
                                     min(totlen, e + margins))
        windows.append(((s, e), window))

    # Start with the first match and absorb every window that touches
    # the snippet accumulated so far.
    (start, end) = windows[0][1]
    shown = [windows[0][0]]
    for (m, (s, e)) in windows[1:]:
        if end < s or start > e:
            continue
        start = min(start, s)
        end = max(end, e)
        shown.append(m)

    snip = text[start:end]
    # Insert marks from the rightmost match backwards so that the offsets
    # of the matches still to be processed stay valid.
    shown.sort(key=lambda m: m[0], reverse=True)
    for (s, e) in shown:
        snip = snip[:e - start] + mark[1] + snip[e - start:]
        snip = snip[:s - start] + mark[0] + snip[s - start:]
    return snip