pipeline bug

[wolnelektury.git] / apps / search / custom.py
diff --git a/apps/search/custom.py b/apps/search/custom.py

index 33ce47d..b3b704d 100644 (file)
--- a/apps/search/custom.py
+++ b/apps/search/custom.py
@@ -1,10 +1,15 @@
-
+# -*- coding: utf-8 -*-
+# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
+# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
+#
  from sunburnt import sunburnt
  from lxml import etree
  import urllib
  import warnings
  from sunburnt import search
  import copy
+from httplib2 import socket
+import re
  
  
  class TermVectorOptions(search.Options):
@@ -85,11 +90,14 @@ class CustomSolrInterface(sunburnt.SolrInterface):
      def __init__(self, url, schemadoc=None, http_connection=None, mode='', retry_timeout=-1, max_length_get_url=sunburnt.MAX_LENGTH_GET_URL):
          self.conn = CustomSolrConnection(url, http_connection, retry_timeout, max_length_get_url)
          self.schemadoc = schemadoc
-        if mode == 'r':
+        if 'w' not in mode:
              self.writeable = False
-        elif mode == 'w':
+        elif 'r' not in mode:
              self.readable = False
-        self.init_schema()
+        try:
+            self.init_schema()
+        except socket.error, e:
+            raise socket.error, "Cannot connect to Solr server, and search indexing is enabled (%s)" % str(e)
  
      def _analyze(self, **kwargs):
          if not self.readable:
@@ -119,8 +127,8 @@ class CustomSolrInterface(sunburnt.SolrInterface):
  
          if matches:
              return self.substring(kwargs['text'], matches,
-                            margins=kwargs.get('margins', 30),
-            mark=kwargs.get('mark', ("<b>", "</b>")))
+                margins=kwargs.get('margins', 30),
+                mark=kwargs.get('mark', ("<b>", "</b>")))
          else:
              return None
  
@@ -130,26 +138,54 @@ class CustomSolrInterface(sunburnt.SolrInterface):
          terms = map(lambda n: unicode(n.text), terms)
          return terms
  
+    def expand_margins(self, text, start, end):
+        totlen = len(text)
+
+        def is_boundary(x):
+            ws = re.compile(r"\W", re.UNICODE)
+            return bool(ws.match(x))
+
+        while start > 0:
+            if is_boundary(text[start - 1]):
+                break
+            start -= 1
+
+        while end < totlen - 1:
+            if is_boundary(text[end + 1]):
+                break
+            end += 1
+
+        return (start, end)
+
      def substring(self, text, matches, margins=30, mark=("<b>", "</b>")):
          start = None
          end = None
          totlen = len(text)
-        matches_margins = map(lambda (s, e): (max(0, s - margins), min(totlen, e + margins)), matches)
-        (start, end) = matches_margins[0]
-
-        for (s, e) in matches_margins[1:]:
+        matches_margins = map(lambda (s, e):
+                              ((s, e),
+                               (max(0, s - margins), min(totlen, e + margins))),
+                                  matches)
+        matches_margins = map(lambda (m, (s, e)):
+                              (m, self.expand_margins(text, s, e)),
+            matches_margins)
+
+            # lets start with first match
+        (start, end) = matches_margins[0][1]
+        matches = [matches_margins[0][0]]
+
+        for (m, (s, e)) in matches_margins[1:]:
              if end < s or start > e:
                  continue
              start = min(start, s)
              end = max(end, e)
+            matches.append(m)
  
          snip = text[start:end]
-        matches = list(matches)
          matches.sort(lambda a, b: cmp(b[0], a[0]))
+
          for (s, e) in matches:
              off = - start
              snip = snip[:e + off] + mark[1] + snip[e + off:]
              snip = snip[:s + off] + mark[0] + snip[s + off:]
-            # maybe break on word boundaries
-        return snip
  
  
+        return snip