src/search/index.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from functools import reduce, total_ordering
   5 from itertools import chain
   6 import logging
   7 import operator
   8 import os
   9 import re
  10 from django.conf import settings
  11 from librarian import dcparser
  12 import librarian.meta.types.date
  13 import librarian.meta.types.person
  14 import librarian.meta.types.text
  15 from librarian.parser import WLDocument
  16 from lxml import etree
  17 import scorched
  18 import catalogue.models
  19 import picture.models
  20 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  21 from wolnelektury.utils import makedirs
  22 from . import custom
  23
  24 log = logging.getLogger('search')
  25
  26
  27 if os.path.isfile(settings.SOLR_STOPWORDS):
  28     stopwords = set(
  29         line.strip()
  30         for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
  31 else:
  32     stopwords = set()
  33
  34
  35 class SolrIndex(object):
  36     def __init__(self, mode=None):
  37         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  38
  39
  40 class Snippets(object):
  41     """
  42     This class manages snippet files for indexed object (book)
  43     the snippets are concatenated together, and their positions and
  44     lengths are kept in lucene index fields.
  45     """
  46     SNIPPET_DIR = "snippets"
  47
  48     def __init__(self, book_id, revision=None):
  49         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  50         self.book_id = book_id
  51         self.revision = revision
  52         self.file = None
  53         self.position = None
  54
  55     @property
  56     def path(self):
  57         if self.revision:
  58             fn = "%d.%d" % (self.book_id, self.revision)
  59         else:
  60             fn = "%d" % self.book_id
  61
  62         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  63
  64     def open(self, mode='r'):
  65         """
  66         Open the snippet file. Call .close() afterwards.
  67         """
  68         if 'b' not in mode:
  69             mode += 'b'
  70
  71         if 'w' in mode:
  72             if os.path.exists(self.path):
  73                 self.revision = 1
  74                 while True:
  75                     if not os.path.exists(self.path):
  76                         break
  77                     self.revision += 1
  78
  79         self.file = open(self.path, mode)
  80         self.position = 0
  81         return self
  82
  83     def add(self, snippet):
  84         """
  85         Append a snippet (unicode) to the snippet file.
  86         Return a (position, length) tuple
  87         """
  88         txt = snippet.encode('utf-8')
  89         l = len(txt)
  90         self.file.write(txt)
  91         pos = (self.position, l)
  92         self.position += l
  93         return pos
  94
  95     def get(self, pos):
  96         """
  97         Given a tuple of (position, length) return an unicode
  98         of the snippet stored there.
  99         """
 100         self.file.seek(pos[0], 0)
 101         try:
 102             txt = self.file.read(pos[1]).decode('utf-8')
 103         except:
 104             return ''
 105         return txt
 106
 107     def close(self):
 108         """Close snippet file"""
 109         if self.file:
 110             self.file.close()
 111
 112     def remove(self):
 113         self.revision = None
 114         try:
 115             os.unlink(self.path)
 116             self.revision = 0
 117             while True:
 118                 self.revision += 1
 119                 os.unlink(self.path)
 120         except OSError:
 121             pass
 122
 123
 124 class Index(SolrIndex):
 125     """
 126     Class indexing books.
 127     """
 128     def __init__(self):
 129         super(Index, self).__init__(mode='rw')
 130
 131     def remove_snippets(self, book):
 132         book.snippet_set.all().delete()
 133
 134     def add_snippet(self, book, doc):
 135         assert book.id == doc.pop('book_id')
 136         # Fragments already exist and can be indexed where they live.
 137         if 'fragment_anchor' in doc:
 138             return
 139
 140         text = doc.pop('text')
 141         header_index = doc.pop('header_index')
 142         book.snippet_set.create(
 143             sec=header_index,
 144             text=text,
 145         )
 146
 147     def delete_query(self, *queries):
 148         """
 149         index.delete(queries=...) doesn't work, so let's reimplement it
 150         using deletion of list of uids.
 151         """
 152         uids = set()
 153         for q in queries:
 154             if isinstance(q, scorched.search.LuceneQuery):
 155                 q = self.index.query(q)
 156             q.field_limiter.update(['uid'])
 157             st = 0
 158             rows = 100
 159             while True:
 160                 ids = q.paginate(start=st, rows=rows).execute()
 161                 if not len(ids):
 162                     break
 163                 for res in ids:
 164                     uids.add(res['uid'])
 165                 st += rows
 166         if uids:
 167             # FIXME: With Solr API change, this doesn't work.
 168             #self.index.delete(uids)
 169             return True
 170         else:
 171             return False
 172
 173     def index_tags(self, *tags, **kw):
 174         """
 175         Re-index global tag list.
 176         Removes all tags from index, then index them again.
 177         Indexed fields include: id, name (with and without polish stems), category
 178         """
 179         log.debug("Indexing tags")
 180         remove_only = kw.get('remove_only', False)
 181         # first, remove tags from index.
 182         if tags:
 183             tag_qs = []
 184             for tag in tags:
 185                 q_id = self.index.Q(tag_id=tag.id)
 186
 187                 if isinstance(tag, PDCounterAuthor):
 188                     q_cat = self.index.Q(tag_category='pd_author')
 189                 elif isinstance(tag, PDCounterBook):
 190                     q_cat = self.index.Q(tag_category='pd_book')
 191                 else:
 192                     q_cat = self.index.Q(tag_category=tag.category)
 193
 194                 q_id_cat = self.index.Q(q_id & q_cat)
 195                 tag_qs.append(q_id_cat)
 196             self.delete_query(*tag_qs)
 197         else:  # all
 198             q = self.index.Q(tag_id__any=True)
 199             self.delete_query(q)
 200
 201         if not remove_only:
 202             # then add them [all or just one passed]
 203             if not tags:
 204                 tags = chain(
 205                     catalogue.models.Tag.objects.exclude(category='set'),
 206                     PDCounterAuthor.objects.all(),
 207                     PDCounterBook.objects.all())
 208
 209             for tag in tags:
 210                 if isinstance(tag, PDCounterAuthor):
 211                     doc = {
 212                         "tag_id": int(tag.id),
 213                         "tag_name": tag.name,
 214                         "tag_name_pl": tag.name,
 215                         "tag_category": 'pd_author',
 216                         "is_pdcounter": True,
 217                         "uid": "tag%d_pd_a" % tag.id
 218                         }
 219                 elif isinstance(tag, PDCounterBook):
 220                     doc = {
 221                         "tag_id": int(tag.id),
 222                         "tag_name": tag.title,
 223                         "tag_name_pl": tag.title,
 224                         "tag_category": 'pd_book',
 225                         "is_pdcounter": True,
 226                         "uid": "tag%d_pd_b" % tag.id
 227                         }
 228                 else:
 229                     doc = {
 230                         "tag_id": int(tag.id),
 231                         "tag_name": tag.name,
 232                         "tag_name_pl": tag.name,
 233                         "tag_category": tag.category,
 234                         "is_pdcounter": False,
 235                         "uid": "tag%d" % tag.id
 236                         }
 237                 self.index.add(doc)
 238
 239     def create_book_doc(self, book):
 240         """
 241         Create a lucene document referring book id.
 242         """
 243         doc = {'book_id': int(book.id)}
 244         if book.parent is not None:
 245             doc['parent_id'] = int(book.parent.id)
 246         return doc
 247
 248     def remove_book(self, book, remove_snippets=True, legacy=True):
 249         """Removes a book from search index.
 250         book - Book instance."""
 251         if legacy:
 252           self.delete_query(self.index.Q(book_id=book.id))
 253
 254           if remove_snippets:
 255             snippets = Snippets(book.id)
 256             snippets.remove()
 257         self.remove_snippets(book)
 258
 259     def index_book(self, book, book_info=None, overwrite=True, legacy=True):
 260         """
 261         Indexes the book.
 262         Creates a lucene document for extracted metadata
 263         and calls self.index_content() to index the contents of the book.
 264         """
 265         if not book.xml_file: return
 266
 267         if overwrite:
 268             # we don't remove snippets, since they might be still needed by
 269             # threads using not reopened index
 270             self.remove_book(book, remove_snippets=False, legacy=legacy)
 271
 272         book_doc = self.create_book_doc(book)
 273         meta_fields = self.extract_metadata(book, book_info, dc_only=[
 274             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
 275         # let's not index it - it's only used for extracting publish date
 276         if 'source_name' in meta_fields:
 277             del meta_fields['source_name']
 278
 279         for n, f in meta_fields.items():
 280             book_doc[n] = f
 281
 282         book_doc['uid'] = "book%s" % book_doc['book_id']
 283         if legacy:
 284             self.index.add(book_doc)
 285         del book_doc
 286         book_fields = {
 287             'title': meta_fields['title'],
 288             'authors': meta_fields['authors'],
 289             'published_date': meta_fields['published_date']
 290             }
 291
 292         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
 293             if tag_name in meta_fields:
 294                 book_fields[tag_name] = meta_fields[tag_name]
 295
 296         self.index_content(book, book_fields=book_fields, legacy=legacy)
 297
 298     master_tags = [
 299         'opowiadanie',
 300         'powiesc',
 301         'dramat_wierszowany_l',
 302         'dramat_wierszowany_lp',
 303         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 304         'wywiad',
 305     ]
 306
 307     ignore_content_tags = [
 308         'uwaga', 'extra', 'nota_red', 'abstrakt',
 309         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 310         'didaskalia',
 311         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 312     ]
 313
 314     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 315
 316     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 317                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 318
 319     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 320
 321     def extract_metadata(self, book, book_info=None, dc_only=None):
 322         """
 323         Extract metadata from book and returns a map of fields keyed by fieldname
 324         """
 325         fields = {}
 326
 327         if book_info is None:
 328             book_info = dcparser.parse(open(book.xml_file.path, 'rb'))
 329
 330         fields['slug'] = book.slug
 331         fields['is_book'] = True
 332
 333         # validator, name
 334         for field in dcparser.BookInfo.FIELDS:
 335             if dc_only and field.name not in dc_only:
 336                 continue
 337             if hasattr(book_info, field.name):
 338                 if not getattr(book_info, field.name):
 339                     continue
 340                 type_indicator = field.value_type
 341                 if issubclass(type_indicator, librarian.meta.types.text.TextValue):
 342                     s = getattr(book_info, field.name)
 343                     if field.multiple:
 344                         s = ', '.join(s)
 345                     fields[field.name] = s
 346                 elif issubclass(type_indicator, librarian.meta.types.person.Person):
 347                     p = getattr(book_info, field.name)
 348                     if isinstance(p, librarian.meta.types.person.Person):
 349                         persons = str(p)
 350                     else:
 351                         persons = ', '.join(map(str, p))
 352                     fields[field.name] = persons
 353                 elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
 354                     dt = getattr(book_info, field.name)
 355                     fields[field.name] = dt
 356
 357         # get published date
 358         pd = None
 359         if hasattr(book_info, 'source_name') and book_info.source_name:
 360             match = self.published_date_re.search(book_info.source_name)
 361             if match is not None:
 362                 pd = str(match.groups()[0])
 363         if not pd:
 364             pd = ""
 365         fields["published_date"] = pd
 366
 367         return fields
 368
 369     # def add_gaps(self, fields, fieldname):
 370     #     """
 371     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 372     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 373     #     """
 374     #     def gap():
 375     #         while True:
 376     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 377     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 378
 379     def get_master(self, root):
 380         """
 381         Returns the first master tag from an etree.
 382         """
 383         for master in root.iter():
 384             if master.tag in self.master_tags:
 385                 return master
 386
 387     def index_content(self, book, book_fields, legacy=True):
 388         """
 389         Walks the book XML and extract content from it.
 390         Adds parts for each header tag and for each fragment.
 391         """
 392         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 393         root = wld.edoc.getroot()
 394
 395         master = self.get_master(root)
 396         if master is None:
 397             return []
 398
 399         def walker(node):
 400             if node.tag not in self.ignore_content_tags:
 401                 yield node, None, None
 402                 if node.text is not None:
 403                     yield None, node.text, None
 404                 for child in list(node):
 405                     for b, t, e in walker(child):
 406                         yield b, t, e
 407                 yield None, None, node
 408
 409             if node.tail is not None:
 410                 yield None, node.tail, None
 411             return
 412
 413         def fix_format(text):
 414             # separator = [" ", "\t", ".", ";", ","]
 415             if isinstance(text, list):
 416                 # need to join it first
 417                 text = filter(lambda s: s is not None, content)
 418                 text = ' '.join(text)
 419                 # for i in range(len(text)):
 420                 #     if i > 0:
 421                 #         if text[i][0] not in separator\
 422                 #             and text[i - 1][-1] not in separator:
 423                 #          text.insert(i, " ")
 424
 425             return re.sub("(?m)/$", "", text)
 426
 427         def add_part(snippets, **fields):
 428             doc = self.create_book_doc(book)
 429             for n, v in book_fields.items():
 430                 doc[n] = v
 431
 432             doc['header_index'] = fields["header_index"]
 433             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 434             doc['header_type'] = fields['header_type']
 435
 436             doc['text'] = fields['text']
 437
 438             # snippets
 439             snip_pos = snippets.add(fields["text"])
 440
 441             doc['snippets_position'] = snip_pos[0]
 442             doc['snippets_length'] = snip_pos[1]
 443             if snippets.revision:
 444                 doc["snippets_revision"] = snippets.revision
 445
 446             if 'fragment_anchor' in fields:
 447                 doc["fragment_anchor"] = fields['fragment_anchor']
 448
 449             if 'themes' in fields:
 450                 doc['themes'] = fields['themes']
 451             doc['uid'] = "part%s-%s-%s-%s" % (
 452                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
 453             return doc
 454
 455         fragments = {}
 456         snippets = Snippets(book.id).open('w')
 457         try:
 458             for header, position in zip(list(master), range(len(master))):
 459
 460                 if header.tag in self.skip_header_tags:
 461                     continue
 462                 if header.tag is etree.Comment:
 463                     continue
 464
 465                 # section content
 466                 content = []
 467                 footnote = []
 468
 469                 def all_content(text):
 470                     for frag in fragments.values():
 471                         frag['text'].append(text)
 472                     content.append(text)
 473                 handle_text = [all_content]
 474
 475                 for start, text, end in walker(header):
 476                     # handle footnotes
 477                     if start is not None and start.tag in self.footnote_tags:
 478                         footnote = []
 479
 480                         def collect_footnote(t):
 481                             footnote.append(t)
 482
 483                         handle_text.append(collect_footnote)
 484                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 485                         handle_text.pop()
 486                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 487                                        text=''.join(footnote))
 488                         self.add_snippet(book, doc)
 489                         if legacy:
 490                             self.index.add(doc)
 491                         footnote = []
 492
 493                     # handle fragments and themes.
 494                     if start is not None and start.tag == 'begin':
 495                         fid = start.attrib['id'][1:]
 496                         fragments[fid] = {
 497                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 498
 499                     # themes for this fragment
 500                     elif start is not None and start.tag == 'motyw':
 501                         fid = start.attrib['id'][1:]
 502                         handle_text.append(lambda text: None)
 503                         if start.text is not None:
 504                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
 505                     elif end is not None and end.tag == 'motyw':
 506                         handle_text.pop()
 507
 508                     elif start is not None and start.tag == 'end':
 509                         fid = start.attrib['id'][1:]
 510                         if fid not in fragments:
 511                             continue  # a broken <end> node, skip it
 512                         frag = fragments[fid]
 513                         if not frag['themes']:
 514                             continue  # empty themes list.
 515                         del fragments[fid]
 516
 517                         doc = add_part(snippets,
 518                                        header_type=frag['start_header'],
 519                                        header_index=frag['start_section'],
 520                                        header_span=position - frag['start_section'] + 1,
 521                                        fragment_anchor=fid,
 522                                        text=fix_format(frag['text']),
 523                                        themes=frag['themes'])
 524                         # Add searchable fragment
 525                         self.add_snippet(book, doc)
 526                         if legacy:
 527                             self.index.add(doc)
 528
 529                         # Collect content.
 530
 531                     if text is not None and handle_text is not []:
 532                         hdl = handle_text[-1]
 533                         hdl(text)
 534
 535                         # in the end, add a section text.
 536                 doc = add_part(snippets, header_index=position,
 537                                header_type=header.tag, text=fix_format(content))
 538
 539                 self.add_snippet(book, doc)
 540                 if legacy:
 541                     self.index.add(doc)
 542
 543         finally:
 544             snippets.close()
 545
 546     def remove_picture(self, picture_or_id):
 547         """Removes a picture from search index."""
 548         if isinstance(picture_or_id, picture.models.Picture):
 549             picture_id = picture_or_id.id
 550         else:
 551             picture_id = picture_or_id
 552         self.delete_query(self.index.Q(picture_id=picture_id))
 553
 554     def index_picture(self, picture, picture_info=None, overwrite=True):
 555         """
 556         Indexes the picture.
 557         Creates a lucene document for extracted metadata
 558         and calls self.index_area() to index the contents of the picture.
 559         """
 560         if overwrite:
 561             # we don't remove snippets, since they might be still needed by
 562             # threads using not reopened index
 563             self.remove_picture(picture)
 564
 565         picture_doc = {'picture_id': int(picture.id)}
 566         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
 567             'authors', 'title', 'epochs', 'kinds', 'genres'])
 568
 569         picture_doc.update(meta_fields)
 570
 571         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
 572         self.index.add(picture_doc)
 573         del picture_doc['is_book']
 574         for area in picture.areas.all():
 575             self.index_area(area, picture_fields=picture_doc)
 576
 577     def index_area(self, area, picture_fields):
 578         """
 579         Indexes themes and objects on the area.
 580         """
 581         doc = dict(picture_fields)
 582         doc['area_id'] = area.id
 583         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
 584         doc['uid'] = 'area%s' % area.id
 585         self.index.add(doc)
 586
 587
 588 @total_ordering
 589 class SearchResult(object):
 590     def __init__(self, doc, how_found=None, query_terms=None):
 591         self.boost = 1.0
 592         self._hits = []
 593         self._processed_hits = None  # processed hits
 594         self.snippets = []
 595         self.query_terms = query_terms
 596         self._book = None
 597
 598         if 'score' in doc:
 599             self._score = doc['score']
 600         else:
 601             self._score = 0
 602
 603         self.book_id = int(doc["book_id"])
 604
 605         try:
 606             self.published_date = int(doc.get("published_date"))
 607         except ValueError:
 608             self.published_date = 0
 609
 610         # content hits
 611         header_type = doc.get("header_type", None)
 612         # we have a content hit in some header of fragment
 613         if header_type is not None:
 614             sec = (header_type, int(doc["header_index"]))
 615             header_span = doc['header_span']
 616             header_span = header_span is not None and int(header_span) or 1
 617             fragment = doc.get("fragment_anchor", None)
 618             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 619             snippets_rev = doc.get('snippets_revision', None)
 620
 621             hit = (sec + (header_span,), fragment, self._score, {
 622                 'how_found': how_found,
 623                 'snippets_pos': snippets_pos,
 624                 'snippets_revision': snippets_rev,
 625                 'themes': doc.get('themes', []),
 626                 'themes_pl': doc.get('themes_pl', [])
 627                 })
 628
 629             self._hits.append(hit)
 630
 631     @classmethod
 632     def from_book(cls, book, how_found=None, query_terms=None):
 633         doc = {
 634             'score': book.popularity.count,
 635             'book_id': book.id,
 636             'published_date': 0,
 637         }
 638         result = cls(doc, how_found=how_found, query_terms=query_terms)
 639         result._book = book
 640         return result
 641
 642     def __str__(self):
 643         return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 644             (self.book_id, len(self._hits),
 645              len(self._processed_hits) if self._processed_hits else -1,
 646              self._score, len(self.snippets))
 647
 648     def __bytes__(self):
 649         return str(self).encode('utf-8')
 650
 651     @property
 652     def score(self):
 653         return self._score * self.boost
 654
 655     def merge(self, other):
 656         if self.book_id != other.book_id:
 657             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
 658         self._hits += other._hits
 659         self._score += max(other._score, 0)
 660         return self
 661
 662     def get_book(self):
 663         if self._book is not None:
 664             return self._book
 665         try:
 666             self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
 667         except catalogue.models.Book.DoesNotExist:
 668             self._book = None
 669         return self._book
 670
 671     book = property(get_book)
 672
 673     POSITION = 0
 674     FRAGMENT = 1
 675     POSITION_INDEX = 1
 676     POSITION_SPAN = 2
 677     SCORE = 2
 678     OTHER = 3
 679
 680     @property
 681     def hits(self):
 682         if self._processed_hits is not None:
 683             return self._processed_hits
 684
 685         # to sections and fragments
 686         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 687
 688         sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
 689
 690         # sections not covered by fragments
 691         sect = filter(lambda s: 0 == len(list(filter(
 692             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 693                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
 694
 695         def remove_duplicates(lst, keyfn, larger):
 696             els = {}
 697             for e in lst:
 698                 eif = keyfn(e)
 699                 if eif in els:
 700                     if larger(els[eif], e):
 701                         continue
 702                 els[eif] = e
 703             return els.values()
 704
 705         # remove fragments with duplicated fid's and duplicated snippets
 706         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
 707
 708         # remove duplicate sections
 709         sections = {}
 710
 711         for s in sect:
 712             si = s[self.POSITION][self.POSITION_INDEX]
 713             # skip existing
 714             if si in sections:
 715                 if sections[si]['score'] >= s[self.SCORE]:
 716                     continue
 717
 718             m = {'score': s[self.SCORE],
 719                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 720                  }
 721             m.update(s[self.OTHER])
 722             sections[si] = m
 723
 724         hits = list(sections.values())
 725
 726         for f in frags:
 727             try:
 728                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 729             except catalogue.models.Fragment.DoesNotExist:
 730                 # stale index
 731                 continue
 732             # Figure out if we were searching for a token matching some word in theme name.
 733             themes = frag.tags.filter(category='theme')
 734             themes_hit = set()
 735             if self.query_terms is not None:
 736                 for i in range(0, len(f[self.OTHER]['themes'])):
 737                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 738                     tms = map(str.lower, tms)
 739                     for qt in self.query_terms:
 740                         if qt in tms:
 741                             themes_hit.add(f[self.OTHER]['themes'][i])
 742                             break
 743
 744             def theme_by_name(n):
 745                 th = list(filter(lambda t: t.name == n, themes))
 746                 if th:
 747                     return th[0]
 748                 else:
 749                     return None
 750             themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
 751
 752             m = {'score': f[self.SCORE],
 753                  'fragment': frag,
 754                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 755                  'themes': themes,
 756                  'themes_hit': themes_hit
 757                  }
 758             m.update(f[self.OTHER])
 759             hits.append(m)
 760
 761         hits.sort(key=lambda h: h['score'], reverse=True)
 762
 763         self._processed_hits = hits
 764
 765         return hits
 766
 767     @staticmethod
 768     def aggregate(*result_lists):
 769         books = {}
 770         for rl in result_lists:
 771             for r in rl:
 772                 if r.book_id in books:
 773                     books[r.book_id].merge(r)
 774                 else:
 775                     books[r.book_id] = r
 776         return books.values()
 777
 778     def get_sort_key(self):
 779         return (-self.score,
 780                 self.published_date,
 781                 self.book.sort_key_author if self.book else '',
 782                 self.book.sort_key if self.book else '')
 783
 784     def __lt__(self, other):
 785         return self.get_sort_key() > other.get_sort_key()
 786
 787     def __eq__(self, other):
 788         return self.get_sort_key() == other.get_sort_key()
 789
 790     def __len__(self):
 791         return len(self.hits)
 792
 793     def snippet_pos(self, idx=0):
 794         return self.hits[idx]['snippets_pos']
 795
 796     def snippet_revision(self, idx=0):
 797         try:
 798             return self.hits[idx]['snippets_revision']
 799         except (IndexError, KeyError):
 800             return None
 801
 802
 803 @total_ordering
 804 class PictureResult(object):
 805     def __init__(self, doc, how_found=None, query_terms=None):
 806         self.boost = 1.0
 807         self.query_terms = query_terms
 808         self._picture = None
 809         self._hits = []
 810         self._processed_hits = None
 811
 812         if 'score' in doc:
 813             self._score = doc['score']
 814         else:
 815             self._score = 0
 816
 817         self.picture_id = int(doc["picture_id"])
 818
 819         if doc.get('area_id'):
 820             hit = (self._score, {
 821                 'how_found': how_found,
 822                 'area_id': doc['area_id'],
 823                 'themes': doc.get('themes', []),
 824                 'themes_pl': doc.get('themes_pl', []),
 825             })
 826
 827             self._hits.append(hit)
 828
 829     def __str__(self):
 830         return "<PR id=%d score=%f >" % (self.picture_id, self._score)
 831
 832     def __repr__(self):
 833         return str(self)
 834
 835     @property
 836     def score(self):
 837         return self._score * self.boost
 838
 839     def merge(self, other):
 840         if self.picture_id != other.picture_id:
 841             raise ValueError(
 842                 "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
 843         self._hits += other._hits
 844         self._score += max(other._score, 0)
 845         return self
 846
 847     SCORE = 0
 848     OTHER = 1
 849
 850     @property
 851     def hits(self):
 852         if self._processed_hits is not None:
 853             return self._processed_hits
 854
 855         hits = []
 856         for hit in self._hits:
 857             try:
 858                 area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
 859             except picture.models.PictureArea.DoesNotExist:
 860                 # stale index
 861                 continue
 862             # Figure out if we were searching for a token matching some word in theme name.
 863             themes_hit = set()
 864             if self.query_terms is not None:
 865                 for i in range(0, len(hit[self.OTHER]['themes'])):
 866                     tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
 867                     tms = map(str.lower, tms)
 868                     for qt in self.query_terms:
 869                         if qt in tms:
 870                             themes_hit.add(hit[self.OTHER]['themes'][i])
 871                             break
 872
 873             m = {
 874                 'score': hit[self.SCORE],
 875                 'area': area,
 876                 'themes_hit': themes_hit,
 877             }
 878             m.update(hit[self.OTHER])
 879             hits.append(m)
 880
 881         hits.sort(key=lambda h: h['score'], reverse=True)
 882         hits = hits[:1]
 883         self._processed_hits = hits
 884         return hits
 885
 886     def get_picture(self):
 887         if self._picture is None:
 888             self._picture = picture.models.Picture.objects.get(id=self.picture_id)
 889         return self._picture
 890
 891     picture = property(get_picture)
 892
 893     @staticmethod
 894     def aggregate(*result_lists):
 895         books = {}
 896         for rl in result_lists:
 897             for r in rl:
 898                 if r.picture_id in books:
 899                     books[r.picture_id].merge(r)
 900                 else:
 901                     books[r.picture_id] = r
 902         return books.values()
 903
 904     def __lt__(self, other):
 905         return self.score < other.score
 906
 907     def __eq__(self, other):
 908         return self.score == other.score
 909
 910
 911 class Search(SolrIndex):
 912     """
 913     Search facilities.
 914     """
 915     def __init__(self, default_field="text"):
 916         super(Search, self).__init__(mode='r')
 917
 918     def make_term_query(self, query, field='text', modal=operator.or_):
 919         """
 920         Returns term queries joined by boolean query.
 921         modal - applies to boolean query
 922         fuzzy - should the query by fuzzy.
 923         """
 924         if query is None:
 925             query = ''
 926         q = self.index.Q()
 927         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 928
 929         return q
 930
 931     def search_by_author(self, words):
 932         from catalogue.models import Book
 933         books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
 934         for word in words:
 935             books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
 936         return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
 937
 938     def search_words(self, words, fields, required=None, book=True, picture=False):
 939         if book and not picture and fields == ['authors']:
 940             return self.search_by_author(words)
 941         filters = []
 942         for word in words:
 943             if book or picture or (word not in stopwords):
 944                 word_filter = None
 945                 for field in fields:
 946                     q = self.index.Q(**{field: word})
 947                     if word_filter is None:
 948                         word_filter = q
 949                     else:
 950                         word_filter |= q
 951                 filters.append(word_filter)
 952         if required:
 953             required_filter = None
 954             for field in required:
 955                 for word in words:
 956                     if book or picture or (word not in stopwords):
 957                         q = self.index.Q(**{field: word})
 958                         if required_filter is None:
 959                             required_filter = q
 960                         else:
 961                             required_filter |= q
 962             filters.append(required_filter)
 963         if not filters:
 964             return []
 965         params = {}
 966         if book:
 967             params['is_book'] = True
 968         if picture:
 969             params['picture_id__gt'] = 0
 970         else:
 971             params['book_id__gt'] = 0
 972         query = self.index.query(**params)
 973         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 974         result_class = PictureResult if picture else SearchResult
 975         return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
 976
 977     def get_snippets(self, searchresult, query, field='text', num=1):
 978         """
 979         Returns a snippet for found scoreDoc.
 980         """
 981         maxnum = len(searchresult)
 982         if num is None or num < 0 or num > maxnum:
 983             num = maxnum
 984         book_id = searchresult.book_id
 985         revision = searchresult.snippet_revision()
 986         snippets = Snippets(book_id, revision=revision)
 987         snips = [None] * maxnum
 988         try:
 989             snippets.open()
 990             idx = 0
 991             while idx < maxnum and num > 0:
 992                 position, length = searchresult.snippet_pos(idx)
 993                 if position is None or length is None:
 994                     continue
 995                 text = snippets.get((int(position),
 996                                      int(length)))
 997                 snip = self.index.highlight(text=text, field=field, q=query)
 998                 if not snip and field == 'text':
 999                     snip = self.index.highlight(text=text, field='text_nonstem', q=query)
1000                 if snip not in snips:
1001                     snips[idx] = snip
1002                     if snip:
1003                         num -= 1
1004                 idx += 1
1005
1006         except IOError as e:
1007             book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
1008             if not book:
1009                 log.error("Book does not exist for book id = %d" % book_id)
1010             elif not book.get().children.exists():
1011                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
1012             return []
1013         finally:
1014             snippets.close()
1015
1016         # remove verse end markers..
1017         snips = [s.replace("/\n", "\n") if s else s for s in snips]
1018
1019         searchresult.snippets = snips
1020
1021         return snips
1022
1023     @staticmethod
1024     def apply_filters(query, filters):
1025         """
1026         Apply filters to a query
1027         """
1028         if filters is None:
1029             filters = []
1030         filters = filter(lambda x: x is not None, filters)
1031         for f in filters:
1032             query = query.query(f)
1033         return query
1034
1035
1036 if getattr(settings, 'SEARCH_MOCK', False):
1037     from .mock_search import Search