-
+# -*- coding: utf-8 -*-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
from sunburnt import sunburnt
from lxml import etree
import urllib
import warnings
from sunburnt import search
import copy
+from httplib2 import socket
+import re
class TermVectorOptions(search.Options):
self.writeable = False
elif 'r' not in mode:
self.readable = False
- self.init_schema()
+ try:
+ self.init_schema()
+ except socket.error, e:
+ raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e)
def _analyze(self, **kwargs):
if not self.readable:
terms = map(lambda n: unicode(n.text), terms)
return terms
def expand_margins(self, text, start, end):
    """Widen the span ``[start, end)`` of *text* to whole-word boundaries.

    ``start`` is moved left and ``end`` is moved right until each one sits
    next to a non-word character (anything matching ``\\W`` under
    ``re.UNICODE``) or reaches the corresponding edge of *text*.

    ``end`` is an *exclusive* bound, which is how ``substring`` consumes
    the result (``text[start:end]``).  Consequently the first character
    examined on the right is ``text[end]`` itself: if it is already a
    boundary the span is left untouched, otherwise the span grows until
    the whole word is covered.

    Parameters:
        text:  the string being excerpted.
        start: index of the first character of the span.
        end:   index one past the last character of the span.

    Returns:
        A ``(start, end)`` tuple with the adjusted indices.
    """
    totlen = len(text)
    # Compile once per call instead of once per inspected character.
    is_boundary = re.compile(r"\W", re.UNICODE).match

    # Grow left while the character just before the span is a word character.
    while start > 0 and not is_boundary(text[start - 1]):
        start -= 1

    # Grow right while the first character outside the span is still part
    # of a word; stopping at ``totlen`` keeps the bound valid for slicing.
    while end < totlen and not is_boundary(text[end]):
        end += 1

    return (start, end)
+
def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
start = None
end = None
((s, e),
(max(0, s - margins), min(totlen, e + margins))),
matches)
+ matches_margins = map(lambda (m, (s, e)):
+ (m, self.expand_margins(text, s, e)),
+ matches_margins)
+
+ # lets start with first match
(start, end) = matches_margins[0][1]
- matches = []
+ matches = [matches_margins[0][0]]
+
for (m, (s, e)) in matches_margins[1:]:
if end < s or start > e:
continue
start = min(start, s)
end = max(end, e)
matches.append(m)
-
+
snip = text[start:end]
matches.sort(lambda a, b: cmp(b[0], a[0]))
off = - start
snip = snip[:e + off] + mark[1] + snip[e + off:]
snip = snip[:s + off] + mark[0] + snip[s + off:]
- # maybe break on word boundaries
return snip
-