pipeline bug

[wolnelektury.git] / apps / search / custom.py
diff --git a/apps/search/custom.py b/apps/search/custom.py

index 6c16f22..b3b704d 100644 (file)
--- a/apps/search/custom.py
+++ b/apps/search/custom.py
@@ -1,10 +1,15 @@
-
+# -*- coding: utf-8 -*-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
  from sunburnt import sunburnt
  from lxml import etree
  import urllib
  import warnings
  from sunburnt import search
  import copy
  from sunburnt import sunburnt
  from lxml import etree
  import urllib
  import warnings
  from sunburnt import search
  import copy
+from httplib2 import socket
+import re
  
  
  class TermVectorOptions(search.Options):
  
  
  class TermVectorOptions(search.Options):
@@ -89,7 +94,10 @@ class CustomSolrInterface(sunburnt.SolrInterface):
              self.writeable = False
          elif 'r' not in mode:
              self.readable = False
              self.writeable = False
          elif 'r' not in mode:
              self.readable = False
-        self.init_schema()
+        try:
+            self.init_schema()
+        except socket.error, e:
+            raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e)
  
      def _analyze(self, **kwargs):
          if not self.readable:
  
      def _analyze(self, **kwargs):
          if not self.readable:
@@ -130,27 +138,54 @@ class CustomSolrInterface(sunburnt.SolrInterface):
          terms = map(lambda n: unicode(n.text), terms)
          return terms
  
          terms = map(lambda n: unicode(n.text), terms)
          return terms
  
+    def expand_margins(self, text, start, end):
+        """Widen the slice text[start:end] outwards to whole-word boundaries.
+
+        `start` is moved left while the character before it is a word
+        character; `end` (exclusive) is moved right while the character at
+        it is a word character.  Both stop at the edges of `text`.
+
+        Returns the adjusted `(start, end)` tuple.
+        """
+        totlen = len(text)
+        # Compiled once per call instead of once per inspected character.
+        boundary = re.compile(r"\W", re.UNICODE)
+
+        while start > 0 and not boundary.match(text[start - 1]):
+            start -= 1
+
+        # text[end] is the first character outside the slice, so that is the
+        # one to test; looking at text[end + 1] instead leaves the slice one
+        # character short and cuts the last letter off the trailing word.
+        while end < totlen and not boundary.match(text[end]):
+            end += 1
+
+        return (start, end)
+
      def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
          start = None
          end = None
          totlen = len(text)
      def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
          start = None
          end = None
          totlen = len(text)
-        matches_margins = map(lambda (s, e): (max(0, s - margins), min(totlen, e + margins)), matches)
-        (start, end) = matches_margins[0]
-
-        for (s, e) in matches_margins[1:]:
+        matches_margins = map(lambda (s, e):
+                              ((s, e),
+                               (max(0, s - margins), min(totlen, e + margins))),
+                                  matches)
+        matches_margins = map(lambda (m, (s, e)):
+                              (m, self.expand_margins(text, s, e)),
+            matches_margins)
+
+            # let's start with the first match
+        (start, end) = matches_margins[0][1]
+        matches = [matches_margins[0][0]]
+
+        for (m, (s, e)) in matches_margins[1:]:
              if end < s or start > e:
                  continue
              start = min(start, s)
              end = max(end, e)
              if end < s or start > e:
                  continue
              start = min(start, s)
              end = max(end, e)
+            matches.append(m)
  
          snip = text[start:end]
  
          snip = text[start:end]
-        matches = list(matches)
          matches.sort(lambda a, b: cmp(b[0], a[0]))
          matches.sort(lambda a, b: cmp(b[0], a[0]))
+
          for (s, e) in matches:
              off = - start
              snip = snip[:e + off] + mark[1] + snip[e + off:]
              snip = snip[:s + off] + mark[0] + snip[s + off:]
          for (s, e) in matches:
              off = - start
              snip = snip[:e + off] + mark[1] + snip[e + off:]
              snip = snip[:s + off] + mark[0] + snip[s + off:]
-            # maybe break on word boundaries
  
          return snip
  
          return snip
-