# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
 
from functools import reduce, total_ordering
from itertools import chain
import logging
import operator
import os
import re

from django.conf import settings
from librarian import dcparser
import librarian.meta.types.date
import librarian.meta.types.person
import librarian.meta.types.text
from librarian.parser import WLDocument
from lxml import etree
import scorched
import catalogue.models
import picture.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from wolnelektury.utils import makedirs
from . import custom

log = logging.getLogger('search')
 
stopwords = set()
if os.path.isfile(settings.SOLR_STOPWORDS):
    stopwords = set(
        line.strip()
        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
 
class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
 
class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in Lucene index fields.
    """
 
    SNIPPET_DIR = "snippets"
 
    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None
 
    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
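    # Illustrative example: snippets for book 123 without a revision live in
    # <SEARCH_INDEX>/snippets/123; with revision 2, in .../snippets/123.2.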
 
    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            if os.path.exists(self.path):
                self.revision = 1
                while True:
                    if not os.path.exists(self.path):
                        break
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self
 
    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos
 
    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        try:
            txt = self.file.read(pos[1]).decode('utf-8')
        except UnicodeDecodeError:
            return ''
        return txt
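    # Round-trip example (illustrative values): add('Ala') returns (0, 3) and
    # add('kot') returns (3, 3); get((3, 3)) then reads back 'kot'. Positions
    # and lengths count bytes of the UTF-8 encoding, not characters.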
 
    def close(self):
        """Close snippet file"""
        if self.file:
            self.file.close()

    def remove(self):
        """Remove the snippet file from disk.

        A minimal sketch, assuming only the current revision's file
        needs removing."""
        try:
            os.unlink(self.path)
        except OSError:
            pass
 
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')
 
    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            # FIXME: With Solr API change, this doesn't work.
            #self.index.delete(uids)
            return True
        else:
            return False
 
    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all or just one passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                self.index.add(doc)
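    # Usage (illustrative): Index().index_tags() re-indexes the full tag list,
    # while index_tags(some_tag, remove_only=True) only removes that one tag
    # from the index without re-adding it.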
 
    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc
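    # For a child book this yields e.g. {'book_id': 5, 'parent_id': 3}
    # (illustrative ids); for a standalone book, just {'book_id': 5}.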
 
    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from search index.
        book - Book instance."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()
 
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields)
 
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
    ]
 
    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]
 
    footnote_tags = ['pa', 'pt', 'pr', 'pe']
 
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
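    # This picks up the trailing run of digits in source_name, e.g.
    # "Czytelnik, Warszawa 1990." -> "1990" (illustrative value).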
 
    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                type_indicator = field.value_type
                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif issubclass(type_indicator, librarian.meta.types.person.Person):
                    p = getattr(book_info, field.name)
                    if isinstance(p, librarian.meta.types.person.Person):
                        persons = str(p)
                    else:
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons
                elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields
 
    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
    #     This allows for phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
 
    def index_content(self, book, book_fields):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
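        # walker() flattens the subtree into a stream of (start_node, text,
        # end_node) events; exactly one of the three is non-None, so consumers
        # can react to element openings, text chunks and element closings in
        # document order.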
 
        def fix_format(text):
            # separator = [" ", "\t", ".", ";", ","]
            if isinstance(text, list):
                # need to join it first
                # (was: filtering the enclosing `content` variable, which was
                # wrong when called with a fragment's own text list)
                text = filter(lambda s: s is not None, text)
                text = ' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, " ")

            return re.sub("(?m)/$", "", text)
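        # Example (illustrative): fix_format(['Ala/', None, 'kot/']) joins the
        # non-None parts into 'Ala/ kot/', then the (?m)/$ substitution strips
        # the verse-end slash at each line end, giving 'Ala/ kot'.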
 
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc
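        # The uid makes each part document unique per book, header position,
        # span and fragment anchor, e.g. "part123-4-2-f7" (illustrative values).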
 
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote != [] and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(str, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect content
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                self.index.add(doc)
        finally:
            snippets.close()
 
    def remove_picture(self, picture_or_id):
        """Removes a picture from search index."""
        if isinstance(picture_or_id, picture.models.Picture):
            picture_id = picture_or_id.id
        else:
            picture_id = picture_or_id
        self.delete_query(self.index.Q(picture_id=picture_id))
 
    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Indexes the picture.
        Creates a lucene document for extracted metadata
        and calls self.index_area() to index the contents of the picture.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])

        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)
 
    def index_area(self, area, picture_fields):
        """
        Indexes themes and objects on the area.
        """
        doc = dict(picture_fields)
        doc['area_id'] = area.id
        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
        doc['uid'] = 'area%s' % area.id
        self.index.add(doc)
 
@total_ordering
class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)
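    # Each hit is a (position, fragment, score, other) tuple, where position
    # is itself (header_type, header_index, header_span); the POSITION,
    # FRAGMENT, SCORE and OTHER constants below index into this tuple.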
 
    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result
 
    def __str__(self):
        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __bytes__(self):
        return str(self).encode('utf-8')
 
    @property
    def score(self):
        return self._score * self.boost
 
    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d"
                % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self
 
    def get_book(self):
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)

    # indices into the hit tuple and its position sub-tuple
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3
 
    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split hits into sections and fragments
        # (materialized as a list, since it is iterated more than once below)
        frags = list(filter(lambda r: r[self.FRAGMENT] is not None, self._hits))

        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # keep only sections not covered by any fragment hit
        sect = filter(lambda s: 0 == len(list(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)

        def remove_duplicates(lst, keyfn, larger):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if larger(els[eif], e):
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids, keeping the best-scored one
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing if better scored
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                continue

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # was str.split(r' +'), which splits on the literal ' +';
                    # a regex split was clearly intended
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = list(map(str.lower, tms))
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = list(filter(lambda t: t.name == n, themes))
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits
 
    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()
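    # Usage (illustrative): SearchResult.aggregate(title_hits, text_hits)
    # merges results referring to the same book_id across both lists into a
    # single SearchResult per book.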
 
    def get_sort_key(self):
        return (-self.score,
                self.published_date,
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')
 
    def __lt__(self, other):
        # Note: the comparison is deliberately inverted, so sorted() yields
        # results in descending sort-key order.
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()

    def __len__(self):
        return len(self.hits)
 
    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
 
@total_ordering
class PictureResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)
 
    def __str__(self):
        return "<PR id=%d score=%f >" % (self.picture_id, self._score)

    @property
    def score(self):
        return self._score * self.boost
 
    def merge(self, other):
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d"
                % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    # indices into the (score, other) hit tuple
    SCORE = 0
    OTHER = 1

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            try:
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                continue

            # Figure out if we were searching for a token matching some word in theme name.
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    # regex split, as in SearchResult.hits
                    tms = re.split(r' +', hit[self.OTHER]['themes'][i]) + hit[self.OTHER]['themes_pl'][i].split(' ')
                    tms = list(map(str.lower, tms))
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(hit[self.OTHER]['themes'][i])
                            break

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(hit[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits
 
    def get_picture(self):
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)
 
    @staticmethod
    def aggregate(*result_lists):
        pictures = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in pictures:
                    pictures[r.picture_id].merge(r)
                else:
                    pictures[r.picture_id] = r
        return pictures.values()
 
    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score
 
class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')
 
    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - the operator used to join the term queries.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(" ")), q)

        return q
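    # For example, make_term_query('ala ma kota') builds
    # Q(text='ala') | Q(text='ma') | Q(text='kota'); passing
    # modal=operator.and_ would require all terms to match instead.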
 
    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        for word in words:
            # \m and \M are PostgreSQL regex word boundaries
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
 
    def search_words(self, words, fields, required=None, book=True, picture=False):
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        filters = []
        for word in words:
            if book or picture or (word not in stopwords):
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)
        if required:
            required_filter = None
            for field in required:
                for word in words:
                    if book or picture or (word not in stopwords):
                        q = self.index.Q(**{field: word})
                        if required_filter is None:
                            required_filter = q
                        else:
                            required_filter |= q
            filters.append(required_filter)
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
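    # Usage (illustrative): search_words(['ala', 'kot'], ['text', 'title'])
    # ORs each word across the given fields, ANDs the per-word filters
    # together via apply_filters(), and wraps the returned Solr documents in
    # SearchResult (or PictureResult) objects.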
 
    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for a found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    # advance, so a missing position cannot loop forever
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1
        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips
 
    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query
 
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search