src/search/index.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from functools import reduce, total_ordering
   5 from itertools import chain
   6 import logging
   7 import operator
   8 import os
   9 import re
  10 from django.conf import settings
  11 from librarian import dcparser
  12 import librarian.meta.types.date
  13 import librarian.meta.types.person
  14 import librarian.meta.types.text
  15 from librarian.parser import WLDocument
  16 from lxml import etree
  17 import scorched
  18 import catalogue.models
  19 import picture.models
  20 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  21 from wolnelektury.utils import makedirs
  22 from . import custom
  23
  24 log = logging.getLogger('search')
  25
  26
  27 if os.path.isfile(settings.SOLR_STOPWORDS):
  28     stopwords = set(
  29         line.strip()
  30         for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
  31 else:
  32     stopwords = set()
  33
  34
  35 class SolrIndex(object):
  36     def __init__(self, mode=None):
  37         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  38
  39
  40 class Snippets(object):
  41     """
  42     This class manages snippet files for indexed object (book)
  43     the snippets are concatenated together, and their positions and
  44     lengths are kept in lucene index fields.
  45     """
  46     SNIPPET_DIR = "snippets"
  47
  48     def __init__(self, book_id, revision=None):
  49         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  50         self.book_id = book_id
  51         self.revision = revision
  52         self.file = None
  53         self.position = None
  54
  55     @property
  56     def path(self):
  57         if self.revision:
  58             fn = "%d.%d" % (self.book_id, self.revision)
  59         else:
  60             fn = "%d" % self.book_id
  61
  62         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  63
  64     def open(self, mode='r'):
  65         """
  66         Open the snippet file. Call .close() afterwards.
  67         """
  68         if 'b' not in mode:
  69             mode += 'b'
  70
  71         if 'w' in mode:
  72             if os.path.exists(self.path):
  73                 self.revision = 1
  74                 while True:
  75                     if not os.path.exists(self.path):
  76                         break
  77                     self.revision += 1
  78
  79         self.file = open(self.path, mode)
  80         self.position = 0
  81         return self
  82
  83     def add(self, snippet):
  84         """
  85         Append a snippet (unicode) to the snippet file.
  86         Return a (position, length) tuple
  87         """
  88         txt = snippet.encode('utf-8')
  89         l = len(txt)
  90         self.file.write(txt)
  91         pos = (self.position, l)
  92         self.position += l
  93         return pos
  94
  95     def get(self, pos):
  96         """
  97         Given a tuple of (position, length) return an unicode
  98         of the snippet stored there.
  99         """
 100         self.file.seek(pos[0], 0)
 101         try:
 102             txt = self.file.read(pos[1]).decode('utf-8')
 103         except:
 104             return ''
 105         return txt
 106
 107     def close(self):
 108         """Close snippet file"""
 109         if self.file:
 110             self.file.close()
 111
 112     def remove(self):
 113         self.revision = None
 114         try:
 115             os.unlink(self.path)
 116             self.revision = 0
 117             while True:
 118                 self.revision += 1
 119                 os.unlink(self.path)
 120         except OSError:
 121             pass
 122
 123
 124 class Index(SolrIndex):
 125     """
 126     Class indexing books.
 127     """
 128     def __init__(self):
 129         super(Index, self).__init__(mode='rw')
 130
 131     def delete_query(self, *queries):
 132         """
 133         index.delete(queries=...) doesn't work, so let's reimplement it
 134         using deletion of list of uids.
 135         """
 136         uids = set()
 137         for q in queries:
 138             if isinstance(q, scorched.search.LuceneQuery):
 139                 q = self.index.query(q)
 140             q.field_limiter.update(['uid'])
 141             st = 0
 142             rows = 100
 143             while True:
 144                 ids = q.paginate(start=st, rows=rows).execute()
 145                 if not len(ids):
 146                     break
 147                 for res in ids:
 148                     uids.add(res['uid'])
 149                 st += rows
 150         if uids:
 151             # FIXME: With Solr API change, this doesn't work.
 152             #self.index.delete(uids)
 153             return True
 154         else:
 155             return False
 156
 157     def index_tags(self, *tags, **kw):
 158         """
 159         Re-index global tag list.
 160         Removes all tags from index, then index them again.
 161         Indexed fields include: id, name (with and without polish stems), category
 162         """
 163         log.debug("Indexing tags")
 164         remove_only = kw.get('remove_only', False)
 165         # first, remove tags from index.
 166         if tags:
 167             tag_qs = []
 168             for tag in tags:
 169                 q_id = self.index.Q(tag_id=tag.id)
 170
 171                 if isinstance(tag, PDCounterAuthor):
 172                     q_cat = self.index.Q(tag_category='pd_author')
 173                 elif isinstance(tag, PDCounterBook):
 174                     q_cat = self.index.Q(tag_category='pd_book')
 175                 else:
 176                     q_cat = self.index.Q(tag_category=tag.category)
 177
 178                 q_id_cat = self.index.Q(q_id & q_cat)
 179                 tag_qs.append(q_id_cat)
 180             self.delete_query(*tag_qs)
 181         else:  # all
 182             q = self.index.Q(tag_id__any=True)
 183             self.delete_query(q)
 184
 185         if not remove_only:
 186             # then add them [all or just one passed]
 187             if not tags:
 188                 tags = chain(
 189                     catalogue.models.Tag.objects.exclude(category='set'),
 190                     PDCounterAuthor.objects.all(),
 191                     PDCounterBook.objects.all())
 192
 193             for tag in tags:
 194                 if isinstance(tag, PDCounterAuthor):
 195                     doc = {
 196                         "tag_id": int(tag.id),
 197                         "tag_name": tag.name,
 198                         "tag_name_pl": tag.name,
 199                         "tag_category": 'pd_author',
 200                         "is_pdcounter": True,
 201                         "uid": "tag%d_pd_a" % tag.id
 202                         }
 203                 elif isinstance(tag, PDCounterBook):
 204                     doc = {
 205                         "tag_id": int(tag.id),
 206                         "tag_name": tag.title,
 207                         "tag_name_pl": tag.title,
 208                         "tag_category": 'pd_book',
 209                         "is_pdcounter": True,
 210                         "uid": "tag%d_pd_b" % tag.id
 211                         }
 212                 else:
 213                     doc = {
 214                         "tag_id": int(tag.id),
 215                         "tag_name": tag.name,
 216                         "tag_name_pl": tag.name,
 217                         "tag_category": tag.category,
 218                         "is_pdcounter": False,
 219                         "uid": "tag%d" % tag.id
 220                         }
 221                 self.index.add(doc)
 222
 223     def create_book_doc(self, book):
 224         """
 225         Create a lucene document referring book id.
 226         """
 227         doc = {'book_id': int(book.id)}
 228         if book.parent is not None:
 229             doc['parent_id'] = int(book.parent.id)
 230         return doc
 231
 232     def remove_book(self, book_or_id, remove_snippets=True):
 233         """Removes a book from search index.
 234         book - Book instance."""
 235         if isinstance(book_or_id, catalogue.models.Book):
 236             book_id = book_or_id.id
 237         else:
 238             book_id = book_or_id
 239
 240         self.delete_query(self.index.Q(book_id=book_id))
 241
 242         if remove_snippets:
 243             snippets = Snippets(book_id)
 244             snippets.remove()
 245
 246     def index_book(self, book, book_info=None, overwrite=True):
 247         """
 248         Indexes the book.
 249         Creates a lucene document for extracted metadata
 250         and calls self.index_content() to index the contents of the book.
 251         """
 252         if overwrite:
 253             # we don't remove snippets, since they might be still needed by
 254             # threads using not reopened index
 255             self.remove_book(book, remove_snippets=False)
 256
 257         book_doc = self.create_book_doc(book)
 258         meta_fields = self.extract_metadata(book, book_info, dc_only=[
 259             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
 260         # let's not index it - it's only used for extracting publish date
 261         if 'source_name' in meta_fields:
 262             del meta_fields['source_name']
 263
 264         for n, f in meta_fields.items():
 265             book_doc[n] = f
 266
 267         book_doc['uid'] = "book%s" % book_doc['book_id']
 268         self.index.add(book_doc)
 269         del book_doc
 270         book_fields = {
 271             'title': meta_fields['title'],
 272             'authors': meta_fields['authors'],
 273             'published_date': meta_fields['published_date']
 274             }
 275
 276         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
 277             if tag_name in meta_fields:
 278                 book_fields[tag_name] = meta_fields[tag_name]
 279
 280         self.index_content(book, book_fields=book_fields)
 281
 282     master_tags = [
 283         'opowiadanie',
 284         'powiesc',
 285         'dramat_wierszowany_l',
 286         'dramat_wierszowany_lp',
 287         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 288         'wywiad',
 289     ]
 290
 291     ignore_content_tags = [
 292         'uwaga', 'extra', 'nota_red', 'abstrakt',
 293         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 294         'didaskalia',
 295         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 296     ]
 297
 298     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 299
 300     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 301                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 302
 303     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 304
 305     def extract_metadata(self, book, book_info=None, dc_only=None):
 306         """
 307         Extract metadata from book and returns a map of fields keyed by fieldname
 308         """
 309         fields = {}
 310
 311         if book_info is None:
 312             book_info = dcparser.parse(open(book.xml_file.path))
 313
 314         fields['slug'] = book.slug
 315         fields['is_book'] = True
 316
 317         # validator, name
 318         for field in dcparser.BookInfo.FIELDS:
 319             if dc_only and field.name not in dc_only:
 320                 continue
 321             if hasattr(book_info, field.name):
 322                 if not getattr(book_info, field.name):
 323                     continue
 324                 type_indicator = field.value_type
 325                 if issubclass(type_indicator, librarian.meta.types.text.TextValue):
 326                     s = getattr(book_info, field.name)
 327                     if field.multiple:
 328                         s = ', '.join(s)
 329                     fields[field.name] = s
 330                 elif issubclass(type_indicator, librarian.meta.types.person.Person):
 331                     p = getattr(book_info, field.name)
 332                     if isinstance(p, librarian.meta.types.person.Person):
 333                         persons = str(p)
 334                     else:
 335                         persons = ', '.join(map(str, p))
 336                     fields[field.name] = persons
 337                 elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
 338                     dt = getattr(book_info, field.name)
 339                     fields[field.name] = dt
 340
 341         # get published date
 342         pd = None
 343         if hasattr(book_info, 'source_name') and book_info.source_name:
 344             match = self.published_date_re.search(book_info.source_name)
 345             if match is not None:
 346                 pd = str(match.groups()[0])
 347         if not pd:
 348             pd = ""
 349         fields["published_date"] = pd
 350
 351         return fields
 352
 353     # def add_gaps(self, fields, fieldname):
 354     #     """
 355     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 356     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 357     #     """
 358     #     def gap():
 359     #         while True:
 360     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 361     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 362
 363     def get_master(self, root):
 364         """
 365         Returns the first master tag from an etree.
 366         """
 367         for master in root.iter():
 368             if master.tag in self.master_tags:
 369                 return master
 370
 371     def index_content(self, book, book_fields):
 372         """
 373         Walks the book XML and extract content from it.
 374         Adds parts for each header tag and for each fragment.
 375         """
 376         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 377         root = wld.edoc.getroot()
 378
 379         master = self.get_master(root)
 380         if master is None:
 381             return []
 382
 383         def walker(node):
 384             if node.tag not in self.ignore_content_tags:
 385                 yield node, None, None
 386                 if node.text is not None:
 387                     yield None, node.text, None
 388                 for child in list(node):
 389                     for b, t, e in walker(child):
 390                         yield b, t, e
 391                 yield None, None, node
 392
 393             if node.tail is not None:
 394                 yield None, node.tail, None
 395             return
 396
 397         def fix_format(text):
 398             # separator = [" ", "\t", ".", ";", ","]
 399             if isinstance(text, list):
 400                 # need to join it first
 401                 text = filter(lambda s: s is not None, content)
 402                 text = ' '.join(text)
 403                 # for i in range(len(text)):
 404                 #     if i > 0:
 405                 #         if text[i][0] not in separator\
 406                 #             and text[i - 1][-1] not in separator:
 407                 #          text.insert(i, " ")
 408
 409             return re.sub("(?m)/$", "", text)
 410
 411         def add_part(snippets, **fields):
 412             doc = self.create_book_doc(book)
 413             for n, v in book_fields.items():
 414                 doc[n] = v
 415
 416             doc['header_index'] = fields["header_index"]
 417             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 418             doc['header_type'] = fields['header_type']
 419
 420             doc['text'] = fields['text']
 421
 422             # snippets
 423             snip_pos = snippets.add(fields["text"])
 424
 425             doc['snippets_position'] = snip_pos[0]
 426             doc['snippets_length'] = snip_pos[1]
 427             if snippets.revision:
 428                 doc["snippets_revision"] = snippets.revision
 429
 430             if 'fragment_anchor' in fields:
 431                 doc["fragment_anchor"] = fields['fragment_anchor']
 432
 433             if 'themes' in fields:
 434                 doc['themes'] = fields['themes']
 435             doc['uid'] = "part%s-%s-%s-%s" % (
 436                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
 437             return doc
 438
 439         fragments = {}
 440         snippets = Snippets(book.id).open('w')
 441         try:
 442             for header, position in zip(list(master), range(len(master))):
 443
 444                 if header.tag in self.skip_header_tags:
 445                     continue
 446                 if header.tag is etree.Comment:
 447                     continue
 448
 449                 # section content
 450                 content = []
 451                 footnote = []
 452
 453                 def all_content(text):
 454                     for frag in fragments.values():
 455                         frag['text'].append(text)
 456                     content.append(text)
 457                 handle_text = [all_content]
 458
 459                 for start, text, end in walker(header):
 460                     # handle footnotes
 461                     if start is not None and start.tag in self.footnote_tags:
 462                         footnote = []
 463
 464                         def collect_footnote(t):
 465                             footnote.append(t)
 466
 467                         handle_text.append(collect_footnote)
 468                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 469                         handle_text.pop()
 470                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 471                                        text=''.join(footnote),
 472                                        is_footnote=True)
 473                         self.index.add(doc)
 474                         footnote = []
 475
 476                     # handle fragments and themes.
 477                     if start is not None and start.tag == 'begin':
 478                         fid = start.attrib['id'][1:]
 479                         fragments[fid] = {
 480                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 481
 482                     # themes for this fragment
 483                     elif start is not None and start.tag == 'motyw':
 484                         fid = start.attrib['id'][1:]
 485                         handle_text.append(lambda text: None)
 486                         if start.text is not None:
 487                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
 488                     elif end is not None and end.tag == 'motyw':
 489                         handle_text.pop()
 490
 491                     elif start is not None and start.tag == 'end':
 492                         fid = start.attrib['id'][1:]
 493                         if fid not in fragments:
 494                             continue  # a broken <end> node, skip it
 495                         frag = fragments[fid]
 496                         if not frag['themes']:
 497                             continue  # empty themes list.
 498                         del fragments[fid]
 499
 500                         doc = add_part(snippets,
 501                                        header_type=frag['start_header'],
 502                                        header_index=frag['start_section'],
 503                                        header_span=position - frag['start_section'] + 1,
 504                                        fragment_anchor=fid,
 505                                        text=fix_format(frag['text']),
 506                                        themes=frag['themes'])
 507                         self.index.add(doc)
 508
 509                         # Collect content.
 510
 511                     if text is not None and handle_text is not []:
 512                         hdl = handle_text[-1]
 513                         hdl(text)
 514
 515                         # in the end, add a section text.
 516                 doc = add_part(snippets, header_index=position,
 517                                header_type=header.tag, text=fix_format(content))
 518
 519                 self.index.add(doc)
 520
 521         finally:
 522             snippets.close()
 523
 524     def remove_picture(self, picture_or_id):
 525         """Removes a picture from search index."""
 526         if isinstance(picture_or_id, picture.models.Picture):
 527             picture_id = picture_or_id.id
 528         else:
 529             picture_id = picture_or_id
 530         self.delete_query(self.index.Q(picture_id=picture_id))
 531
 532     def index_picture(self, picture, picture_info=None, overwrite=True):
 533         """
 534         Indexes the picture.
 535         Creates a lucene document for extracted metadata
 536         and calls self.index_area() to index the contents of the picture.
 537         """
 538         if overwrite:
 539             # we don't remove snippets, since they might be still needed by
 540             # threads using not reopened index
 541             self.remove_picture(picture)
 542
 543         picture_doc = {'picture_id': int(picture.id)}
 544         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
 545             'authors', 'title', 'epochs', 'kinds', 'genres'])
 546
 547         picture_doc.update(meta_fields)
 548
 549         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
 550         self.index.add(picture_doc)
 551         del picture_doc['is_book']
 552         for area in picture.areas.all():
 553             self.index_area(area, picture_fields=picture_doc)
 554
 555     def index_area(self, area, picture_fields):
 556         """
 557         Indexes themes and objects on the area.
 558         """
 559         doc = dict(picture_fields)
 560         doc['area_id'] = area.id
 561         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
 562         doc['uid'] = 'area%s' % area.id
 563         self.index.add(doc)
 564
 565
 566 @total_ordering
 567 class SearchResult(object):
 568     def __init__(self, doc, how_found=None, query_terms=None):
 569         self.boost = 1.0
 570         self._hits = []
 571         self._processed_hits = None  # processed hits
 572         self.snippets = []
 573         self.query_terms = query_terms
 574         self._book = None
 575
 576         if 'score' in doc:
 577             self._score = doc['score']
 578         else:
 579             self._score = 0
 580
 581         self.book_id = int(doc["book_id"])
 582
 583         try:
 584             self.published_date = int(doc.get("published_date"))
 585         except ValueError:
 586             self.published_date = 0
 587
 588         # content hits
 589         header_type = doc.get("header_type", None)
 590         # we have a content hit in some header of fragment
 591         if header_type is not None:
 592             sec = (header_type, int(doc["header_index"]))
 593             header_span = doc['header_span']
 594             header_span = header_span is not None and int(header_span) or 1
 595             fragment = doc.get("fragment_anchor", None)
 596             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 597             snippets_rev = doc.get('snippets_revision', None)
 598
 599             hit = (sec + (header_span,), fragment, self._score, {
 600                 'how_found': how_found,
 601                 'snippets_pos': snippets_pos,
 602                 'snippets_revision': snippets_rev,
 603                 'themes': doc.get('themes', []),
 604                 'themes_pl': doc.get('themes_pl', [])
 605                 })
 606
 607             self._hits.append(hit)
 608
 609     @classmethod
 610     def from_book(cls, book, how_found=None, query_terms=None):
 611         doc = {
 612             'score': book.popularity.count,
 613             'book_id': book.id,
 614             'published_date': 0,
 615         }
 616         result = cls(doc, how_found=how_found, query_terms=query_terms)
 617         result._book = book
 618         return result
 619
 620     def __str__(self):
 621         return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 622             (self.book_id, len(self._hits),
 623              len(self._processed_hits) if self._processed_hits else -1,
 624              self._score, len(self.snippets))
 625
 626     def __bytes__(self):
 627         return str(self).encode('utf-8')
 628
 629     @property
 630     def score(self):
 631         return self._score * self.boost
 632
 633     def merge(self, other):
 634         if self.book_id != other.book_id:
 635             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
 636         self._hits += other._hits
 637         self._score += max(other._score, 0)
 638         return self
 639
 640     def get_book(self):
 641         if self._book is not None:
 642             return self._book
 643         try:
 644             self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
 645         except catalogue.models.Book.DoesNotExist:
 646             self._book = None
 647         return self._book
 648
 649     book = property(get_book)
 650
 651     POSITION = 0
 652     FRAGMENT = 1
 653     POSITION_INDEX = 1
 654     POSITION_SPAN = 2
 655     SCORE = 2
 656     OTHER = 3
 657
 658     @property
 659     def hits(self):
 660         if self._processed_hits is not None:
 661             return self._processed_hits
 662
 663         # to sections and fragments
 664         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 665
 666         sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
 667
 668         # sections not covered by fragments
 669         sect = filter(lambda s: 0 == len(list(filter(
 670             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 671                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
 672
 673         def remove_duplicates(lst, keyfn, larger):
 674             els = {}
 675             for e in lst:
 676                 eif = keyfn(e)
 677                 if eif in els:
 678                     if larger(els[eif], e):
 679                         continue
 680                 els[eif] = e
 681             return els.values()
 682
 683         # remove fragments with duplicated fid's and duplicated snippets
 684         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
 685
 686         # remove duplicate sections
 687         sections = {}
 688
 689         for s in sect:
 690             si = s[self.POSITION][self.POSITION_INDEX]
 691             # skip existing
 692             if si in sections:
 693                 if sections[si]['score'] >= s[self.SCORE]:
 694                     continue
 695
 696             m = {'score': s[self.SCORE],
 697                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 698                  }
 699             m.update(s[self.OTHER])
 700             sections[si] = m
 701
 702         hits = list(sections.values())
 703
 704         for f in frags:
 705             try:
 706                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 707             except catalogue.models.Fragment.DoesNotExist:
 708                 # stale index
 709                 continue
 710             # Figure out if we were searching for a token matching some word in theme name.
 711             themes = frag.tags.filter(category='theme')
 712             themes_hit = set()
 713             if self.query_terms is not None:
 714                 for i in range(0, len(f[self.OTHER]['themes'])):
 715                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 716                     tms = map(str.lower, tms)
 717                     for qt in self.query_terms:
 718                         if qt in tms:
 719                             themes_hit.add(f[self.OTHER]['themes'][i])
 720                             break
 721
 722             def theme_by_name(n):
 723                 th = list(filter(lambda t: t.name == n, themes))
 724                 if th:
 725                     return th[0]
 726                 else:
 727                     return None
 728             themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
 729
 730             m = {'score': f[self.SCORE],
 731                  'fragment': frag,
 732                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 733                  'themes': themes,
 734                  'themes_hit': themes_hit
 735                  }
 736             m.update(f[self.OTHER])
 737             hits.append(m)
 738
 739         hits.sort(key=lambda h: h['score'], reverse=True)
 740
 741         self._processed_hits = hits
 742
 743         return hits
 744
 745     @staticmethod
 746     def aggregate(*result_lists):
 747         books = {}
 748         for rl in result_lists:
 749             for r in rl:
 750                 if r.book_id in books:
 751                     books[r.book_id].merge(r)
 752                 else:
 753                     books[r.book_id] = r
 754         return books.values()
 755
 756     def get_sort_key(self):
 757         return (-self.score,
 758                 self.published_date,
 759                 self.book.sort_key_author if self.book else '',
 760                 self.book.sort_key if self.book else '')
 761
 762     def __lt__(self, other):
 763         return self.get_sort_key() > other.get_sort_key()
 764
 765     def __eq__(self, other):
 766         return self.get_sort_key() == other.get_sort_key()
 767
 768     def __len__(self):
 769         return len(self.hits)
 770
 771     def snippet_pos(self, idx=0):
 772         return self.hits[idx]['snippets_pos']
 773
 774     def snippet_revision(self, idx=0):
 775         try:
 776             return self.hits[idx]['snippets_revision']
 777         except (IndexError, KeyError):
 778             return None
 779
 780
 781 @total_ordering
 782 class PictureResult(object):
 783     def __init__(self, doc, how_found=None, query_terms=None):
 784         self.boost = 1.0
 785         self.query_terms = query_terms
 786         self._picture = None
 787         self._hits = []
 788         self._processed_hits = None
 789
 790         if 'score' in doc:
 791             self._score = doc['score']
 792         else:
 793             self._score = 0
 794
 795         self.picture_id = int(doc["picture_id"])
 796
 797         if doc.get('area_id'):
 798             hit = (self._score, {
 799                 'how_found': how_found,
 800                 'area_id': doc['area_id'],
 801                 'themes': doc.get('themes', []),
 802                 'themes_pl': doc.get('themes_pl', []),
 803             })
 804
 805             self._hits.append(hit)
 806
 807     def __str__(self):
 808         return "<PR id=%d score=%f >" % (self.picture_id, self._score)
 809
 810     def __repr__(self):
 811         return str(self)
 812
 813     @property
 814     def score(self):
 815         return self._score * self.boost
 816
 817     def merge(self, other):
 818         if self.picture_id != other.picture_id:
 819             raise ValueError(
 820                 "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
 821         self._hits += other._hits
 822         self._score += max(other._score, 0)
 823         return self
 824
 825     SCORE = 0
 826     OTHER = 1
 827
 828     @property
 829     def hits(self):
 830         if self._processed_hits is not None:
 831             return self._processed_hits
 832
 833         hits = []
 834         for hit in self._hits:
 835             try:
 836                 area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
 837             except picture.models.PictureArea.DoesNotExist:
 838                 # stale index
 839                 continue
 840             # Figure out if we were searching for a token matching some word in theme name.
 841             themes_hit = set()
 842             if self.query_terms is not None:
 843                 for i in range(0, len(hit[self.OTHER]['themes'])):
 844                     tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
 845                     tms = map(str.lower, tms)
 846                     for qt in self.query_terms:
 847                         if qt in tms:
 848                             themes_hit.add(hit[self.OTHER]['themes'][i])
 849                             break
 850
 851             m = {
 852                 'score': hit[self.SCORE],
 853                 'area': area,
 854                 'themes_hit': themes_hit,
 855             }
 856             m.update(hit[self.OTHER])
 857             hits.append(m)
 858
 859         hits.sort(key=lambda h: h['score'], reverse=True)
 860         hits = hits[:1]
 861         self._processed_hits = hits
 862         return hits
 863
 864     def get_picture(self):
 865         if self._picture is None:
 866             self._picture = picture.models.Picture.objects.get(id=self.picture_id)
 867         return self._picture
 868
 869     picture = property(get_picture)
 870
 871     @staticmethod
 872     def aggregate(*result_lists):
 873         books = {}
 874         for rl in result_lists:
 875             for r in rl:
 876                 if r.picture_id in books:
 877                     books[r.picture_id].merge(r)
 878                 else:
 879                     books[r.picture_id] = r
 880         return books.values()
 881
 882     def __lt__(self, other):
 883         return self.score < other.score
 884
 885     def __eq__(self, other):
 886         return self.score == other.score
 887
 888
 889 class Search(SolrIndex):
 890     """
 891     Search facilities.
 892     """
 893     def __init__(self, default_field="text"):
 894         super(Search, self).__init__(mode='r')
 895
 896     def make_term_query(self, query, field='text', modal=operator.or_):
 897         """
 898         Returns term queries joined by boolean query.
 899         modal - applies to boolean query
 900         fuzzy - should the query by fuzzy.
 901         """
 902         if query is None:
 903             query = ''
 904         q = self.index.Q()
 905         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 906
 907         return q
 908
 909     def search_by_author(self, words):
 910         from catalogue.models import Book
 911         books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
 912         for word in words:
 913             books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
 914         return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
 915
 916     def search_words(self, words, fields, required=None, book=True, picture=False):
 917         if book and not picture and fields == ['authors']:
 918             return self.search_by_author(words)
 919         filters = []
 920         for word in words:
 921             if book or picture or (word not in stopwords):
 922                 word_filter = None
 923                 for field in fields:
 924                     q = self.index.Q(**{field: word})
 925                     if word_filter is None:
 926                         word_filter = q
 927                     else:
 928                         word_filter |= q
 929                 filters.append(word_filter)
 930         if required:
 931             required_filter = None
 932             for field in required:
 933                 for word in words:
 934                     if book or picture or (word not in stopwords):
 935                         q = self.index.Q(**{field: word})
 936                         if required_filter is None:
 937                             required_filter = q
 938                         else:
 939                             required_filter |= q
 940             filters.append(required_filter)
 941         if not filters:
 942             return []
 943         params = {}
 944         if book:
 945             params['is_book'] = True
 946         if picture:
 947             params['picture_id__gt'] = 0
 948         else:
 949             params['book_id__gt'] = 0
 950         query = self.index.query(**params)
 951         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 952         result_class = PictureResult if picture else SearchResult
 953         return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
 954
 955     def get_snippets(self, searchresult, query, field='text', num=1):
 956         """
 957         Returns a snippet for found scoreDoc.
 958         """
 959         maxnum = len(searchresult)
 960         if num is None or num < 0 or num > maxnum:
 961             num = maxnum
 962         book_id = searchresult.book_id
 963         revision = searchresult.snippet_revision()
 964         snippets = Snippets(book_id, revision=revision)
 965         snips = [None] * maxnum
 966         try:
 967             snippets.open()
 968             idx = 0
 969             while idx < maxnum and num > 0:
 970                 position, length = searchresult.snippet_pos(idx)
 971                 if position is None or length is None:
 972                     continue
 973                 text = snippets.get((int(position),
 974                                      int(length)))
 975                 snip = self.index.highlight(text=text, field=field, q=query)
 976                 if not snip and field == 'text':
 977                     snip = self.index.highlight(text=text, field='text_nonstem', q=query)
 978                 if snip not in snips:
 979                     snips[idx] = snip
 980                     if snip:
 981                         num -= 1
 982                 idx += 1
 983
 984         except IOError as e:
 985             book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
 986             if not book:
 987                 log.error("Book does not exist for book id = %d" % book_id)
 988             elif not book.get().children.exists():
 989                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 990             return []
 991         finally:
 992             snippets.close()
 993
 994         # remove verse end markers..
 995         snips = [s.replace("/\n", "\n") if s else s for s in snips]
 996
 997         searchresult.snippets = snips
 998
 999         return snips
1000
1001     @staticmethod
1002     def apply_filters(query, filters):
1003         """
1004         Apply filters to a query
1005         """
1006         if filters is None:
1007             filters = []
1008         filters = filter(lambda x: x is not None, filters)
1009         for f in filters:
1010             query = query.query(f)
1011         return query
1012
1013
1014 if getattr(settings, 'SEARCH_MOCK', False):
1015     from .mock_search import Search