1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 from functools import reduce, total_ordering
5 from itertools import chain
10 from django.conf import settings
11 from librarian import dcparser
12 import librarian.meta.types.date
13 import librarian.meta.types.person
14 import librarian.meta.types.text
15 from librarian.parser import WLDocument
16 from lxml import etree
18 import catalogue.models
20 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
21 from wolnelektury.utils import makedirs
# Module-level logger for the search subsystem.
log = logging.getLogger('search')

# Build the stopword set from the Solr stopwords file (if present),
# skipping comment lines.  NOTE(review): the opening of this statement
# (presumably `stopwords = set(line.strip()` ...) and the no-file fallback
# are not visible in this excerpt — confirm against the full file.
if os.path.isfile(settings.SOLR_STOPWORDS):
    # ... [intervening lines elided from this excerpt] ...
        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
class SolrIndex(object):
    """Base class holding a configured Solr interface for subclasses."""

    def __init__(self, mode=None):
        # mode is forwarded to the Solr interface; subclasses pass
        # 'rw' for indexing (Index) and 'r' for searching (Search).
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
class Snippets(object):
    """
    Manage the snippet file for an indexed object (book).

    Snippets are concatenated together in one file per book; their
    positions and lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Ensure the snippet directory exists before any file access.
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        # ... [remaining initialisation elided from this excerpt] ...

    # NOTE(review): the header of the path accessor (presumably a
    # @property def path(self) with an `if self.revision:` branch) is not
    # visible in this excerpt; the lines below belong to it.  Revisioned
    # snippet files are named "<book_id>.<revision>", plain ones "<book_id>".
        fn = "%d.%d" % (self.book_id, self.revision)
        # ... [else branch header elided] ...
        fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        # ... [mode normalisation elided from this excerpt] ...
        if os.path.exists(self.path):
        # ... [branch body elided] ...
        if not os.path.exists(self.path):
        # ... [branch body elided] ...
        self.file = open(self.path, mode)
        # ... [position bookkeeping / return elided] ...

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        # ... [length computation and file write elided; `l` presumably
        # holds len(txt)] ...
        pos = (self.position, l)
        # ... [position advance / return pos elided] ...

    # NOTE(review): `def get(self, pos)` header not visible in excerpt.
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        # ... [return txt elided] ...

    # NOTE(review): `def close(self)` header not visible in excerpt.
        """Close snippet file"""
class Index(SolrIndex):
    """
    Class indexing books.
    """
    # NOTE(review): the __init__ header is not visible in this excerpt;
    # it opens the Solr interface in read-write mode for indexing.
        super(Index, self).__init__(mode='rw')

    def remove_snippets(self, book):
        # Drop all DB-stored snippets for the book.
        book.snippet_set.all().delete()

    def add_snippet(self, book, doc):
        # Persist the text of an indexed part as a DB snippet row.
        assert book.id == doc.pop('book_id')
        # Fragments already exist and can be indexed where they live.
        if 'fragment_anchor' in doc:
            # ... [early return elided from this excerpt] ...
        text = doc.pop('text')
        header_index = doc.pop('header_index')
        book.snippet_set.create(
            # ... [create() keyword arguments elided from this excerpt] ...

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.
        """
        # ... [uid accumulation setup and loop over queries elided] ...
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            # Fetch only the 'uid' field; results are paged through below.
            q.field_limiter.update(['uid'])
            # ... [pagination loop header elided] ...
                ids = q.paginate(start=st, rows=rows).execute()
            # ... [uid collection / paging advance elided] ...
        # FIXME: With Solr API change, this doesn't work.
        #self.index.delete(uids)

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        # ... [branch on specific tags vs. all, and per-tag loop, elided] ...
            q_id = self.index.Q(tag_id=tag.id)

            # PDCounter rows get their own pseudo-categories in the index.
            if isinstance(tag, PDCounterAuthor):
                q_cat = self.index.Q(tag_category='pd_author')
            elif isinstance(tag, PDCounterBook):
                q_cat = self.index.Q(tag_category='pd_book')
            # [else: regular catalogue tag]
                q_cat = self.index.Q(tag_category=tag.category)

            q_id_cat = self.index.Q(q_id & q_cat)
            tag_qs.append(q_id_cat)
        self.delete_query(*tag_qs)
        # [no specific tags passed: remove every tag document]
            q = self.index.Q(tag_id__any=True)
        # ... [delete-all call elided] ...

        # then add them [all or just one passed]
        # ... [chain(...) over these three querysets, elided] ...
                catalogue.models.Tag.objects.exclude(category='set'),
                PDCounterAuthor.objects.all(),
                PDCounterBook.objects.all())
        # ... [per-tag loop header elided] ...
            if isinstance(tag, PDCounterAuthor):
                # ... [doc dict literal opening elided] ...
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": 'pd_author',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_a" % tag.id
            elif isinstance(tag, PDCounterBook):
                # ... [doc dict literal opening elided; note: uses .title] ...
                    "tag_id": int(tag.id),
                    "tag_name": tag.title,
                    "tag_name_pl": tag.title,
                    "tag_category": 'pd_book',
                    "is_pdcounter": True,
                    "uid": "tag%d_pd_b" % tag.id
            # [else: regular catalogue tag]
                    "tag_id": int(tag.id),
                    "tag_name": tag.name,
                    "tag_name_pl": tag.name,
                    "tag_category": tag.category,
                    "is_pdcounter": False,
                    "uid": "tag%d" % tag.id
        # ... [self.index.add(doc) elided] ...

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        # ... [return doc elided] ...

    def remove_book(self, book, remove_snippets=True):
        """Removes a book from search index.
        book - Book instance."""
        self.delete_query(self.index.Q(book_id=book.id))
        # ... [remove_snippets guard elided] ...
            snippets = Snippets(book.id)
            # ... [snippet file removal elided] ...
            self.remove_snippets(book)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Index a single book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if not book.xml_file: return

        # ... [overwrite guard elided] ...
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            # ... [copy each field into book_doc, elided] ...

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        # ... [book_fields dict opening elided] ...
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]
        self.index_content(book, book_fields=book_fields)

    # Master (top-level text container) tags; list opening elided in
    # this excerpt.
            'dramat_wierszowany_l',
            'dramat_wierszowany_lp',
            'dramat_wspolczesny', 'liryka_l', 'liryka_lp',

    # Tags whose textual content is not indexed (editorial notes,
    # separators, headers re-indexed elsewhere).  Closing bracket elided.
    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',

    # Footnote container tags.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Header-level tags that are never indexed as sections.
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Trailing year-like number in source_name, e.g. "... 1984]." -> "1984".
    published_date_re = re.compile("([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname.
        """
        # ... [fields dict initialisation elided] ...
        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path, 'rb'))

        fields['slug'] = book.slug
        fields['is_book'] = True

        # Walk all Dublin Core fields, converting each by its value type.
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                # ... [continue elided] ...
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    # ... [continue elided] ...
                type_indicator = field.value_type
                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
                    s = getattr(book_info, field.name)
                    # ... [multi-value join elided] ...
                    fields[field.name] = s
                elif issubclass(type_indicator, librarian.meta.types.person.Person):
                    p = getattr(book_info, field.name)
                    if isinstance(p, librarian.meta.types.person.Person):
                        # ... [single-person case elided] ...
                    # [else: list of persons]
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons
                elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # Published date: pulled out of the trailing year in source_name.
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        # ... [default for pd elided] ...
        fields["published_date"] = pd
        # ... [return fields elided] ...

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #         yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                # ... [return master elided; presumably returns None when
                # no master tag is found — confirm] ...

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        # ... [no-master early return and walker() generator header elided] ...
            # walker emits (start_node, text, end_node) event triples for
            # the subtree, skipping ignored content tags entirely.
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        # ... [re-yield of (b, t, e) elided] ...
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            # ... [return elided] ...

        def fix_format(text):
            # separator = [" ", "\t", ".", ";", ","]
            if isinstance(text, list):
                # need to join it first
                # NOTE(review): this reads the enclosing scope's `content`
                # variable, not the `text` parameter — it only works
                # because fix_format is called after `content` is built;
                # looks like a latent bug worth confirming/fixing.
                text = filter(lambda s: s is not None, content)
                text = ' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #         and text[i - 1][-1] not in separator:
                #         text.insert(i, " ")

            # Strip verse-end markers ("/") at end of lines.
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Build and add one "part" document (a section or fragment).
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                # ... [copy book-level field into doc, elided] ...

            doc['header_index'] = fields["header_index"]
            # header_span defaults to 1 when absent/falsy.
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # Persist the raw text in the snippet file and record where.
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            # ... [index add / return doc elided] ...

        # ... [fragments dict setup and try: elided] ...
        snippets = Snippets(book.id).open('w')
        # ... [loop setup elided] ...
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    # ... [continue elided] ...
                if header.tag is etree.Comment:
                    # ... [continue elided] ...

                # Default text handler: feed text to all open fragments
                # (and, presumably, to the section's `content` list).
                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    # ... [append to section content, elided] ...
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        # ... [footnote reset elided] ...
                        def collect_footnote(t):
                            # ... [body elided: collects footnote text] ...
                        handle_text.append(collect_footnote)
                    # NOTE(review): `footnote is not []` is an identity
                    # comparison against a fresh list literal, so it is
                    # always True; presumably `footnote != []` was meant.
                    elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
                        # ... [pop handler elided] ...
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=''.join(footnote))
                        self.add_snippet(book, doc)
                        # ... [footnote reset elided] ...

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        # ... [fragments[fid] = { ... opening elided] ...
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # Theme names are not book text — swallow them.
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        # ... [pop handler elided] ...

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue # empty themes list.
                        # ... [del fragments[fid] elided] ...

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       # ... [fragment_anchor kwarg elided] ...
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        # Add searchable fragment
                        self.add_snippet(book, doc)

                    # NOTE(review): as above, `handle_text is not []` is
                    # always True — presumably `!= []` was intended.
                    if text is not None and handle_text is not []:
                        hdl = handle_text[-1]
                        # ... [hdl(text) call elided] ...

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.add_snippet(book, doc)
        # ... [finally: snippets.close() elided] ...

    def remove_picture(self, picture_or_id):
        """Removes a picture from search index."""
        if isinstance(picture_or_id, picture.models.Picture):
            picture_id = picture_or_id.id
        # [else: caller passed a raw id]
            picture_id = picture_or_id
        self.delete_query(self.index.Q(picture_id=picture_id))

    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Index a single picture.
        Creates a lucene document for extracted metadata
        and calls self.index_area() to index the contents of the picture.
        """
        # ... [overwrite guard elided] ...
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])
        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        # Area docs inherit the picture's fields, but not the is_book flag.
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)

    def index_area(self, area, picture_fields):
        """
        Indexes themes and objects on the area.
        """
        doc = dict(picture_fields)
        doc['area_id'] = area.id
        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
        doc['uid'] = 'area%s' % area.id
        # ... [self.index.add(doc) elided] ...
class SearchResult(object):
    """A single book's search hits (sections and fragments), built from
    Solr result documents and mergeable across queries."""

    def __init__(self, doc, how_found=None, query_terms=None):
        # ... [boost / hit-list initialisation elided from this excerpt] ...
        self._processed_hits = None  # processed hits

        self.query_terms = query_terms

        # ... [try: elided] ...
            self._score = doc['score']
        # ... [except / fallback elided] ...

        self.book_id = int(doc["book_id"])

        # ... [try: elided] ...
            self.published_date = int(doc.get("published_date"))
        # ... [except (ValueError/TypeError presumably): elided] ...
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            # Default span to 1 when the field is missing/None.
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            # Hit layout: (position-tuple, fragment-anchor, score, extras).
            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            # ... [closing of the hit tuple elided] ...

            self._hits.append(hit)

    # NOTE(review): presumably decorated @classmethod in the full file.
    def from_book(cls, book, how_found=None, query_terms=None):
        # Build a result directly from a Book, scored by popularity.
        # ... [doc dict opening elided] ...
            'score': book.popularity.count,
        # ... [remaining doc fields elided] ...
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        # ... [result tweaks / return elided] ...

    # __str__/__repr__ (header elided from this excerpt):
        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    # __bytes__ (header elided):
        return str(self).encode('utf-8')

    # score accessor (header elided):
        return self._score * self.boost

    def merge(self, other):
        # Fold another result for the same book into this one.
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        # Negative scores never lower the aggregate.
        self._score += max(other._score, 0)
        # ... [return self elided] ...

    # get_book — lazily fetch and cache the Book model (header elided):
        if self._book is not None:
            # ... [return cached book elided] ...
        # ... [try: elided] ...
            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
        except catalogue.models.Book.DoesNotExist:
            # ... [cache-miss handling elided] ...

    book = property(get_book)

    # hits processing (property header and the POSITION / FRAGMENT /
    # SCORE / OTHER index constants are elided from this excerpt):
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(list(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)

        def remove_duplicates(lst, keyfn, larger):
            # Keep, per key, the element preferred by `larger`.
            # ... [accumulator dict and loop elided] ...
                if larger(els[eif], e):
                    # ... [skip / replace logic elided] ...

        # remove fragments with duplicated fid's and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

        # remove duplicate sections
        # ... [sections dict init and loop header elided] ...
            si = s[self.POSITION][self.POSITION_INDEX]
            # Keep only the highest-scoring hit per section index.
            if sections[si]['score'] >= s[self.SCORE]:
                # ... [continue elided] ...

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
            # ... [closing of dict elided] ...
            m.update(s[self.OTHER])
            # ... [sections[si] = m elided] ...

        hits = list(sections.values())

        # ... [per-fragment loop and try: elided] ...
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # ... [skip missing fragment elided] ...

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            # ... [themes_hit set init elided] ...
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # NOTE(review): str.split takes a literal separator,
                    # not a regex — r' +' splits on the two characters
                    # " +". Probably a latent bug; confirm intent.
                    tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(str.lower, tms)
                    for qt in self.query_terms:
                        # ... [membership test elided] ...
                            themes_hit.add(f[self.OTHER]['themes'][i])
                        # ... [break elided] ...

            def theme_by_name(n):
                # Resolve a theme name back to its Tag object (or None).
                th = list(filter(lambda t: t.name == n, themes))
                # ... [return first-or-None elided] ...
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
            # ... [fragment key elided] ...
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
            # ... [themes key elided] ...
                 'themes_hit': themes_hit
            # ... [closing of dict elided] ...
            m.update(f[self.OTHER])
            # ... [hits.append(m) elided] ...

        # Best hits first.
        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        # ... [return hits elided] ...

    # NOTE(review): presumably decorated @staticmethod in the full file.
    def aggregate(*result_lists):
        # Merge results across lists so each book appears exactly once.
        # ... [books dict init elided] ...
        for rl in result_lists:
            # ... [inner loop over results elided] ...
                if r.book_id in books:
                    books[r.book_id].merge(r)
                # ... [else: store r elided] ...
        return books.values()

    def get_sort_key(self):
        # ... [tuple opening elided — presumably includes the score] ...
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        # Inverted on purpose: "less than" means a better sort key, so an
        # ascending sort yields best-first ordering.
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()

    # __len__ (header elided):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        # ... [try: elided] ...
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            # ... [fallback return elided] ...
class PictureResult(object):
    """A single picture's search hits (areas), built from Solr docs;
    the picture-side analogue of SearchResult."""

    def __init__(self, doc, how_found=None, query_terms=None):
        # ... [boost / hit-list initialisation elided from this excerpt] ...
        self.query_terms = query_terms
        # ... [picture cache init elided] ...
        self._processed_hits = None

        # ... [try: elided] ...
            self._score = doc['score']
        # ... [except / fallback elided] ...

        self.picture_id = int(doc["picture_id"])

        # Only area documents carry hit details.
        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            # ... [closing of the hit tuple elided] ...

            self._hits.append(hit)

    # __str__/__repr__ (header elided):
        return "<PR id=%d score=%f >" % (self.picture_id, self._score)

    # score accessor (header elided):
        return self._score * self.boost

    def merge(self, other):
        # Fold another result for the same picture into this one.
        if self.picture_id != other.picture_id:
            # ... [raise ValueError( opening elided] ...
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        # Negative scores never lower the aggregate.
        self._score += max(other._score, 0)
        # ... [return self elided] ...

    # hits processing (property header and SCORE / OTHER constants
    # elided from this excerpt):
        if self._processed_hits is not None:
            return self._processed_hits

        # ... [hits list init elided] ...
        for hit in self._hits:
            # ... [try: elided] ...
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # ... [skip missing area elided] ...

            # Figure out if we were searching for a token matching some word in theme name.
            # ... [themes_hit set init elided] ...
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    # NOTE(review): str.split takes a literal separator —
                    # r' +' splits on the two characters " +", not a regex.
                    tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(str.lower, tms)
                    for qt in self.query_terms:
                        # ... [membership test elided] ...
                            themes_hit.add(hit[self.OTHER]['themes'][i])
                        # ... [break elided] ...

            # ... [m dict opening elided] ...
                'score': hit[self.SCORE],
            # ... [area key elided] ...
                'themes_hit': themes_hit,
            # ... [closing of dict elided] ...
            m.update(hit[self.OTHER])
            # ... [hits.append(m) elided] ...

        # Best hits first.
        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        # ... [return hits elided] ...

    def get_picture(self):
        # Lazily fetch and cache the Picture model instance.
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        # ... [return self._picture elided] ...

    picture = property(get_picture)

    # NOTE(review): presumably decorated @staticmethod in the full file.
    def aggregate(*result_lists):
        # Merge results across lists so each picture appears exactly once.
        # ... [books dict init and inner loop elided] ...
        for rl in result_lists:
            # ... [loop over results elided] ...
                if r.picture_id in books:
                    books[r.picture_id].merge(r)
                # [else: first occurrence of this picture]
                    books[r.picture_id] = r
        return books.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score
class Search(SolrIndex):
    """Read-only search facade over the Solr index."""

    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.
        modal - applies to boolean query
        fuzzy - should the query by fuzzy.
        """
        # ... [initial q elided] ...
        # NOTE(review): query.split(r" ") splits on a literal single
        # space — the raw-string prefix is redundant here.
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
        # ... [return q elided] ...

    def search_by_author(self, words):
        # Top-level findable books whose cached author matches each word,
        # ordered by popularity; capped at 30 results.
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        # ... [loop over words elided] ...
            # \m / \M are PostgreSQL regex word-boundary markers.
            books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, required=None, book=True, picture=False):
        # Author-only book searches are delegated to the DB-backed path.
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)
        # ... [filters list init and word/field loop headers elided;
        # stopwords are only skipped for plain word searches] ...
                if book or picture or (word not in stopwords):
                    # ... [inner field loop elided] ...
                        q = self.index.Q(**{field: word})
                        if word_filter is None:
                            # ... [assign / OR-combine elided] ...
            filters.append(word_filter)
        # [if required: same accumulation, AND-ed into the final query]
            required_filter = None
            for field in required:
                # ... [word loop elided] ...
                    if book or picture or (word not in stopwords):
                        q = self.index.Q(**{field: word})
                        if required_filter is None:
                            # ... [assign / OR-combine elided] ...
            filters.append(required_filter)

        # ... [empty-filter guard and params init elided] ...
        # [if book:]
            params['is_book'] = True
        # [if picture:]
            params['picture_id__gt'] = 0
        # [else:]
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for found scoreDoc.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            # ... [clamp num elided] ...
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        # ... [try: snippets.open(); idx init elided] ...
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    # ... [skip this hit elided] ...
                text = snippets.get((int(position),
                # ... [int(length)) closing elided] ...
                snip = self.index.highlight(text=text, field=field, q=query)
                # Fall back to the non-stemmed field when nothing matched.
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    # ... [store snip / decrement num elided] ...
                # ... [idx advance elided] ...

        except IOError as e:
            # Distinguish a missing book from a genuinely broken file;
            # parent books legitimately have no snippet file.
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            # ... [if not book: elided] ...
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
        # ... [finally: snippets.close() elided] ...

        # remove verse end markers..
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips
        # ... [return snips elided] ...

    # NOTE(review): presumably decorated @staticmethod in the full file.
    def apply_filters(query, filters):
        """
        Apply filters to a query
        """
        # Drop empty filters, then AND each remaining one into the query.
        filters = filter(lambda x: x is not None, filters)
        # ... [loop header elided] ...
            query = query.query(f)
        # ... [return query elided] ...
# Optionally replace Search with a mock implementation (e.g. for tests
# or dev setups without a running Solr), controlled by settings.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search