src/search/index.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from functools import reduce, total_ordering
   5 from itertools import chain
   6 import logging
   7 import operator
   8 import os
   9 import re
  10 from django.conf import settings
  11 from librarian import dcparser
  12 from librarian.parser import WLDocument
  13 from lxml import etree
  14 import scorched
  15 import catalogue.models
  16 import picture.models
  17 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  18 from wolnelektury.utils import makedirs
  19 from . import custom
  20
  21 log = logging.getLogger('search')
  22
  23
  24 if os.path.isfile(settings.SOLR_STOPWORDS):
  25     stopwords = set(
  26         line.strip()
  27         for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
  28 else:
  29     stopwords = set()
  30
  31
  32 class SolrIndex(object):
  33     def __init__(self, mode=None):
  34         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  35
  36
  37 class Snippets(object):
  38     """
  39     This class manages snippet files for indexed object (book)
  40     the snippets are concatenated together, and their positions and
  41     lengths are kept in lucene index fields.
  42     """
  43     SNIPPET_DIR = "snippets"
  44
  45     def __init__(self, book_id, revision=None):
  46         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  47         self.book_id = book_id
  48         self.revision = revision
  49         self.file = None
  50         self.position = None
  51
  52     @property
  53     def path(self):
  54         if self.revision:
  55             fn = "%d.%d" % (self.book_id, self.revision)
  56         else:
  57             fn = "%d" % self.book_id
  58
  59         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  60
  61     def open(self, mode='r'):
  62         """
  63         Open the snippet file. Call .close() afterwards.
  64         """
  65         if 'b' not in mode:
  66             mode += 'b'
  67
  68         if 'w' in mode:
  69             if os.path.exists(self.path):
  70                 self.revision = 1
  71                 while True:
  72                     if not os.path.exists(self.path):
  73                         break
  74                     self.revision += 1
  75
  76         self.file = open(self.path, mode)
  77         self.position = 0
  78         return self
  79
  80     def add(self, snippet):
  81         """
  82         Append a snippet (unicode) to the snippet file.
  83         Return a (position, length) tuple
  84         """
  85         txt = snippet.encode('utf-8')
  86         l = len(txt)
  87         self.file.write(txt)
  88         pos = (self.position, l)
  89         self.position += l
  90         return pos
  91
  92     def get(self, pos):
  93         """
  94         Given a tuple of (position, length) return an unicode
  95         of the snippet stored there.
  96         """
  97         self.file.seek(pos[0], 0)
  98         try:
  99             txt = self.file.read(pos[1]).decode('utf-8')
 100         except:
 101             return ''
 102         return txt
 103
 104     def close(self):
 105         """Close snippet file"""
 106         if self.file:
 107             self.file.close()
 108
 109     def remove(self):
 110         self.revision = None
 111         try:
 112             os.unlink(self.path)
 113             self.revision = 0
 114             while True:
 115                 self.revision += 1
 116                 os.unlink(self.path)
 117         except OSError:
 118             pass
 119
 120
 121 class Index(SolrIndex):
 122     """
 123     Class indexing books.
 124     """
 125     def __init__(self):
 126         super(Index, self).__init__(mode='rw')
 127
 128     def delete_query(self, *queries):
 129         """
 130         index.delete(queries=...) doesn't work, so let's reimplement it
 131         using deletion of list of uids.
 132         """
 133         uids = set()
 134         for q in queries:
 135             if isinstance(q, scorched.search.LuceneQuery):
 136                 q = self.index.query(q)
 137             q.field_limiter.update(['uid'])
 138             st = 0
 139             rows = 100
 140             while True:
 141                 ids = q.paginate(start=st, rows=rows).execute()
 142                 if not len(ids):
 143                     break
 144                 for res in ids:
 145                     uids.add(res['uid'])
 146                 st += rows
 147         if uids:
 148             # FIXME: With Solr API change, this doesn't work.
 149             #self.index.delete(uids)
 150             return True
 151         else:
 152             return False
 153
 154     def index_tags(self, *tags, **kw):
 155         """
 156         Re-index global tag list.
 157         Removes all tags from index, then index them again.
 158         Indexed fields include: id, name (with and without polish stems), category
 159         """
 160         log.debug("Indexing tags")
 161         remove_only = kw.get('remove_only', False)
 162         # first, remove tags from index.
 163         if tags:
 164             tag_qs = []
 165             for tag in tags:
 166                 q_id = self.index.Q(tag_id=tag.id)
 167
 168                 if isinstance(tag, PDCounterAuthor):
 169                     q_cat = self.index.Q(tag_category='pd_author')
 170                 elif isinstance(tag, PDCounterBook):
 171                     q_cat = self.index.Q(tag_category='pd_book')
 172                 else:
 173                     q_cat = self.index.Q(tag_category=tag.category)
 174
 175                 q_id_cat = self.index.Q(q_id & q_cat)
 176                 tag_qs.append(q_id_cat)
 177             self.delete_query(*tag_qs)
 178         else:  # all
 179             q = self.index.Q(tag_id__any=True)
 180             self.delete_query(q)
 181
 182         if not remove_only:
 183             # then add them [all or just one passed]
 184             if not tags:
 185                 tags = chain(
 186                     catalogue.models.Tag.objects.exclude(category='set'),
 187                     PDCounterAuthor.objects.all(),
 188                     PDCounterBook.objects.all())
 189
 190             for tag in tags:
 191                 if isinstance(tag, PDCounterAuthor):
 192                     doc = {
 193                         "tag_id": int(tag.id),
 194                         "tag_name": tag.name,
 195                         "tag_name_pl": tag.name,
 196                         "tag_category": 'pd_author',
 197                         "is_pdcounter": True,
 198                         "uid": "tag%d_pd_a" % tag.id
 199                         }
 200                 elif isinstance(tag, PDCounterBook):
 201                     doc = {
 202                         "tag_id": int(tag.id),
 203                         "tag_name": tag.title,
 204                         "tag_name_pl": tag.title,
 205                         "tag_category": 'pd_book',
 206                         "is_pdcounter": True,
 207                         "uid": "tag%d_pd_b" % tag.id
 208                         }
 209                 else:
 210                     doc = {
 211                         "tag_id": int(tag.id),
 212                         "tag_name": tag.name,
 213                         "tag_name_pl": tag.name,
 214                         "tag_category": tag.category,
 215                         "is_pdcounter": False,
 216                         "uid": "tag%d" % tag.id
 217                         }
 218                 self.index.add(doc)
 219
 220     def create_book_doc(self, book):
 221         """
 222         Create a lucene document referring book id.
 223         """
 224         doc = {'book_id': int(book.id)}
 225         if book.parent is not None:
 226             doc['parent_id'] = int(book.parent.id)
 227         return doc
 228
 229     def remove_book(self, book_or_id, remove_snippets=True):
 230         """Removes a book from search index.
 231         book - Book instance."""
 232         if isinstance(book_or_id, catalogue.models.Book):
 233             book_id = book_or_id.id
 234         else:
 235             book_id = book_or_id
 236
 237         self.delete_query(self.index.Q(book_id=book_id))
 238
 239         if remove_snippets:
 240             snippets = Snippets(book_id)
 241             snippets.remove()
 242
 243     def index_book(self, book, book_info=None, overwrite=True):
 244         """
 245         Indexes the book.
 246         Creates a lucene document for extracted metadata
 247         and calls self.index_content() to index the contents of the book.
 248         """
 249         if overwrite:
 250             # we don't remove snippets, since they might be still needed by
 251             # threads using not reopened index
 252             self.remove_book(book, remove_snippets=False)
 253
 254         book_doc = self.create_book_doc(book)
 255         meta_fields = self.extract_metadata(book, book_info, dc_only=[
 256             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
 257         # let's not index it - it's only used for extracting publish date
 258         if 'source_name' in meta_fields:
 259             del meta_fields['source_name']
 260
 261         for n, f in meta_fields.items():
 262             book_doc[n] = f
 263
 264         book_doc['uid'] = "book%s" % book_doc['book_id']
 265         self.index.add(book_doc)
 266         del book_doc
 267         book_fields = {
 268             'title': meta_fields['title'],
 269             'authors': meta_fields['authors'],
 270             'published_date': meta_fields['published_date']
 271             }
 272
 273         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
 274             if tag_name in meta_fields:
 275                 book_fields[tag_name] = meta_fields[tag_name]
 276
 277         self.index_content(book, book_fields=book_fields)
 278
 279     master_tags = [
 280         'opowiadanie',
 281         'powiesc',
 282         'dramat_wierszowany_l',
 283         'dramat_wierszowany_lp',
 284         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 285         'wywiad',
 286     ]
 287
 288     ignore_content_tags = [
 289         'uwaga', 'extra', 'nota_red', 'abstrakt',
 290         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 291         'didaskalia',
 292         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 293     ]
 294
 295     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 296
 297     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 298                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 299
 300     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 301
 302     def extract_metadata(self, book, book_info=None, dc_only=None):
 303         """
 304         Extract metadata from book and returns a map of fields keyed by fieldname
 305         """
 306         fields = {}
 307
 308         if book_info is None:
 309             book_info = dcparser.parse(open(book.xml_file.path))
 310
 311         fields['slug'] = book.slug
 312         fields['is_book'] = True
 313
 314         # validator, name
 315         for field in dcparser.BookInfo.FIELDS:
 316             if dc_only and field.name not in dc_only:
 317                 continue
 318             if hasattr(book_info, field.name):
 319                 if not getattr(book_info, field.name):
 320                     continue
 321                 # since no type information is available, we use validator
 322                 type_indicator = field.validator
 323                 if type_indicator == dcparser.as_unicode:
 324                     s = getattr(book_info, field.name)
 325                     if field.multiple:
 326                         s = ', '.join(s)
 327                     fields[field.name] = s
 328                 elif type_indicator == dcparser.as_person:
 329                     p = getattr(book_info, field.name)
 330                     if isinstance(p, dcparser.Person):
 331                         persons = str(p)
 332                     else:
 333                         persons = ', '.join(map(str, p))
 334                     fields[field.name] = persons
 335                 elif type_indicator == dcparser.as_date:
 336                     dt = getattr(book_info, field.name)
 337                     fields[field.name] = dt
 338
 339         # get published date
 340         pd = None
 341         if hasattr(book_info, 'source_name') and book_info.source_name:
 342             match = self.published_date_re.search(book_info.source_name)
 343             if match is not None:
 344                 pd = str(match.groups()[0])
 345         if not pd:
 346             pd = ""
 347         fields["published_date"] = pd
 348
 349         return fields
 350
 351     # def add_gaps(self, fields, fieldname):
 352     #     """
 353     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 354     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 355     #     """
 356     #     def gap():
 357     #         while True:
 358     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 359     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 360
 361     def get_master(self, root):
 362         """
 363         Returns the first master tag from an etree.
 364         """
 365         for master in root.iter():
 366             if master.tag in self.master_tags:
 367                 return master
 368
 369     def index_content(self, book, book_fields):
 370         """
 371         Walks the book XML and extract content from it.
 372         Adds parts for each header tag and for each fragment.
 373         """
 374         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 375         root = wld.edoc.getroot()
 376
 377         master = self.get_master(root)
 378         if master is None:
 379             return []
 380
 381         def walker(node):
 382             if node.tag not in self.ignore_content_tags:
 383                 yield node, None, None
 384                 if node.text is not None:
 385                     yield None, node.text, None
 386                 for child in list(node):
 387                     for b, t, e in walker(child):
 388                         yield b, t, e
 389                 yield None, None, node
 390
 391             if node.tail is not None:
 392                 yield None, node.tail, None
 393             return
 394
 395         def fix_format(text):
 396             # separator = [" ", "\t", ".", ";", ","]
 397             if isinstance(text, list):
 398                 # need to join it first
 399                 text = filter(lambda s: s is not None, content)
 400                 text = ' '.join(text)
 401                 # for i in range(len(text)):
 402                 #     if i > 0:
 403                 #         if text[i][0] not in separator\
 404                 #             and text[i - 1][-1] not in separator:
 405                 #          text.insert(i, " ")
 406
 407             return re.sub("(?m)/$", "", text)
 408
 409         def add_part(snippets, **fields):
 410             doc = self.create_book_doc(book)
 411             for n, v in book_fields.items():
 412                 doc[n] = v
 413
 414             doc['header_index'] = fields["header_index"]
 415             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 416             doc['header_type'] = fields['header_type']
 417
 418             doc['text'] = fields['text']
 419
 420             # snippets
 421             snip_pos = snippets.add(fields["text"])
 422
 423             doc['snippets_position'] = snip_pos[0]
 424             doc['snippets_length'] = snip_pos[1]
 425             if snippets.revision:
 426                 doc["snippets_revision"] = snippets.revision
 427
 428             if 'fragment_anchor' in fields:
 429                 doc["fragment_anchor"] = fields['fragment_anchor']
 430
 431             if 'themes' in fields:
 432                 doc['themes'] = fields['themes']
 433             doc['uid'] = "part%s-%s-%s-%s" % (
 434                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
 435             return doc
 436
 437         fragments = {}
 438         snippets = Snippets(book.id).open('w')
 439         try:
 440             for header, position in zip(list(master), range(len(master))):
 441
 442                 if header.tag in self.skip_header_tags:
 443                     continue
 444                 if header.tag is etree.Comment:
 445                     continue
 446
 447                 # section content
 448                 content = []
 449                 footnote = []
 450
 451                 def all_content(text):
 452                     for frag in fragments.values():
 453                         frag['text'].append(text)
 454                     content.append(text)
 455                 handle_text = [all_content]
 456
 457                 for start, text, end in walker(header):
 458                     # handle footnotes
 459                     if start is not None and start.tag in self.footnote_tags:
 460                         footnote = []
 461
 462                         def collect_footnote(t):
 463                             footnote.append(t)
 464
 465                         handle_text.append(collect_footnote)
 466                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 467                         handle_text.pop()
 468                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 469                                        text=''.join(footnote),
 470                                        is_footnote=True)
 471                         self.index.add(doc)
 472                         footnote = []
 473
 474                     # handle fragments and themes.
 475                     if start is not None and start.tag == 'begin':
 476                         fid = start.attrib['id'][1:]
 477                         fragments[fid] = {
 478                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 479
 480                     # themes for this fragment
 481                     elif start is not None and start.tag == 'motyw':
 482                         fid = start.attrib['id'][1:]
 483                         handle_text.append(lambda text: None)
 484                         if start.text is not None:
 485                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
 486                     elif end is not None and end.tag == 'motyw':
 487                         handle_text.pop()
 488
 489                     elif start is not None and start.tag == 'end':
 490                         fid = start.attrib['id'][1:]
 491                         if fid not in fragments:
 492                             continue  # a broken <end> node, skip it
 493                         frag = fragments[fid]
 494                         if not frag['themes']:
 495                             continue  # empty themes list.
 496                         del fragments[fid]
 497
 498                         doc = add_part(snippets,
 499                                        header_type=frag['start_header'],
 500                                        header_index=frag['start_section'],
 501                                        header_span=position - frag['start_section'] + 1,
 502                                        fragment_anchor=fid,
 503                                        text=fix_format(frag['text']),
 504                                        themes=frag['themes'])
 505                         self.index.add(doc)
 506
 507                         # Collect content.
 508
 509                     if text is not None and handle_text is not []:
 510                         hdl = handle_text[-1]
 511                         hdl(text)
 512
 513                         # in the end, add a section text.
 514                 doc = add_part(snippets, header_index=position,
 515                                header_type=header.tag, text=fix_format(content))
 516
 517                 self.index.add(doc)
 518
 519         finally:
 520             snippets.close()
 521
 522     def remove_picture(self, picture_or_id):
 523         """Removes a picture from search index."""
 524         if isinstance(picture_or_id, picture.models.Picture):
 525             picture_id = picture_or_id.id
 526         else:
 527             picture_id = picture_or_id
 528         self.delete_query(self.index.Q(picture_id=picture_id))
 529
 530     def index_picture(self, picture, picture_info=None, overwrite=True):
 531         """
 532         Indexes the picture.
 533         Creates a lucene document for extracted metadata
 534         and calls self.index_area() to index the contents of the picture.
 535         """
 536         if overwrite:
 537             # we don't remove snippets, since they might be still needed by
 538             # threads using not reopened index
 539             self.remove_picture(picture)
 540
 541         picture_doc = {'picture_id': int(picture.id)}
 542         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
 543             'authors', 'title', 'epochs', 'kinds', 'genres'])
 544
 545         picture_doc.update(meta_fields)
 546
 547         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
 548         self.index.add(picture_doc)
 549         del picture_doc['is_book']
 550         for area in picture.areas.all():
 551             self.index_area(area, picture_fields=picture_doc)
 552
 553     def index_area(self, area, picture_fields):
 554         """
 555         Indexes themes and objects on the area.
 556         """
 557         doc = dict(picture_fields)
 558         doc['area_id'] = area.id
 559         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
 560         doc['uid'] = 'area%s' % area.id
 561         self.index.add(doc)
 562
 563
 564 @total_ordering
 565 class SearchResult(object):
 566     def __init__(self, doc, how_found=None, query_terms=None):
 567         self.boost = 1.0
 568         self._hits = []
 569         self._processed_hits = None  # processed hits
 570         self.snippets = []
 571         self.query_terms = query_terms
 572         self._book = None
 573
 574         if 'score' in doc:
 575             self._score = doc['score']
 576         else:
 577             self._score = 0
 578
 579         self.book_id = int(doc["book_id"])
 580
 581         try:
 582             self.published_date = int(doc.get("published_date"))
 583         except ValueError:
 584             self.published_date = 0
 585
 586         # content hits
 587         header_type = doc.get("header_type", None)
 588         # we have a content hit in some header of fragment
 589         if header_type is not None:
 590             sec = (header_type, int(doc["header_index"]))
 591             header_span = doc['header_span']
 592             header_span = header_span is not None and int(header_span) or 1
 593             fragment = doc.get("fragment_anchor", None)
 594             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 595             snippets_rev = doc.get('snippets_revision', None)
 596
 597             hit = (sec + (header_span,), fragment, self._score, {
 598                 'how_found': how_found,
 599                 'snippets_pos': snippets_pos,
 600                 'snippets_revision': snippets_rev,
 601                 'themes': doc.get('themes', []),
 602                 'themes_pl': doc.get('themes_pl', [])
 603                 })
 604
 605             self._hits.append(hit)
 606
 607     @classmethod
 608     def from_book(cls, book, how_found=None, query_terms=None):
 609         doc = {
 610             'score': book.popularity.count,
 611             'book_id': book.id,
 612             'published_date': 0,
 613         }
 614         result = cls(doc, how_found=how_found, query_terms=query_terms)
 615         result._book = book
 616         return result
 617
 618     def __str__(self):
 619         return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 620             (self.book_id, len(self._hits),
 621              len(self._processed_hits) if self._processed_hits else -1,
 622              self._score, len(self.snippets))
 623
 624     def __bytes__(self):
 625         return str(self).encode('utf-8')
 626
 627     @property
 628     def score(self):
 629         return self._score * self.boost
 630
 631     def merge(self, other):
 632         if self.book_id != other.book_id:
 633             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
 634         self._hits += other._hits
 635         self._score += max(other._score, 0)
 636         return self
 637
 638     def get_book(self):
 639         if self._book is not None:
 640             return self._book
 641         try:
 642             self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
 643         except catalogue.models.Book.DoesNotExist:
 644             self._book = None
 645         return self._book
 646
 647     book = property(get_book)
 648
 649     POSITION = 0
 650     FRAGMENT = 1
 651     POSITION_INDEX = 1
 652     POSITION_SPAN = 2
 653     SCORE = 2
 654     OTHER = 3
 655
 656     @property
 657     def hits(self):
 658         if self._processed_hits is not None:
 659             return self._processed_hits
 660
 661         # to sections and fragments
 662         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 663
 664         sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
 665
 666         # sections not covered by fragments
 667         sect = filter(lambda s: 0 == len(list(filter(
 668             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 669                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
 670
 671         def remove_duplicates(lst, keyfn, larger):
 672             els = {}
 673             for e in lst:
 674                 eif = keyfn(e)
 675                 if eif in els:
 676                     if larger(els[eif], e):
 677                         continue
 678                 els[eif] = e
 679             return els.values()
 680
 681         # remove fragments with duplicated fid's and duplicated snippets
 682         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
 683
 684         # remove duplicate sections
 685         sections = {}
 686
 687         for s in sect:
 688             si = s[self.POSITION][self.POSITION_INDEX]
 689             # skip existing
 690             if si in sections:
 691                 if sections[si]['score'] >= s[self.SCORE]:
 692                     continue
 693
 694             m = {'score': s[self.SCORE],
 695                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 696                  }
 697             m.update(s[self.OTHER])
 698             sections[si] = m
 699
 700         hits = list(sections.values())
 701
 702         for f in frags:
 703             try:
 704                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 705             except catalogue.models.Fragment.DoesNotExist:
 706                 # stale index
 707                 continue
 708             # Figure out if we were searching for a token matching some word in theme name.
 709             themes = frag.tags.filter(category='theme')
 710             themes_hit = set()
 711             if self.query_terms is not None:
 712                 for i in range(0, len(f[self.OTHER]['themes'])):
 713                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 714                     tms = map(str.lower, tms)
 715                     for qt in self.query_terms:
 716                         if qt in tms:
 717                             themes_hit.add(f[self.OTHER]['themes'][i])
 718                             break
 719
 720             def theme_by_name(n):
 721                 th = list(filter(lambda t: t.name == n, themes))
 722                 if th:
 723                     return th[0]
 724                 else:
 725                     return None
 726             themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
 727
 728             m = {'score': f[self.SCORE],
 729                  'fragment': frag,
 730                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 731                  'themes': themes,
 732                  'themes_hit': themes_hit
 733                  }
 734             m.update(f[self.OTHER])
 735             hits.append(m)
 736
 737         hits.sort(key=lambda h: h['score'], reverse=True)
 738
 739         self._processed_hits = hits
 740
 741         return hits
 742
 743     @staticmethod
 744     def aggregate(*result_lists):
 745         books = {}
 746         for rl in result_lists:
 747             for r in rl:
 748                 if r.book_id in books:
 749                     books[r.book_id].merge(r)
 750                 else:
 751                     books[r.book_id] = r
 752         return books.values()
 753
 754     def get_sort_key(self):
 755         return (-self.score,
 756                 self.published_date,
 757                 self.book.sort_key_author if self.book else '',
 758                 self.book.sort_key if self.book else '')
 759
 760     def __lt__(self, other):
 761         return self.get_sort_key() > other.get_sort_key()
 762
 763     def __eq__(self, other):
 764         return self.get_sort_key() == other.get_sort_key()
 765
 766     def __len__(self):
 767         return len(self.hits)
 768
 769     def snippet_pos(self, idx=0):
 770         return self.hits[idx]['snippets_pos']
 771
 772     def snippet_revision(self, idx=0):
 773         try:
 774             return self.hits[idx]['snippets_revision']
 775         except (IndexError, KeyError):
 776             return None
 777
 778
 779 @total_ordering
 780 class PictureResult(object):
 781     def __init__(self, doc, how_found=None, query_terms=None):
 782         self.boost = 1.0
 783         self.query_terms = query_terms
 784         self._picture = None
 785         self._hits = []
 786         self._processed_hits = None
 787
 788         if 'score' in doc:
 789             self._score = doc['score']
 790         else:
 791             self._score = 0
 792
 793         self.picture_id = int(doc["picture_id"])
 794
 795         if doc.get('area_id'):
 796             hit = (self._score, {
 797                 'how_found': how_found,
 798                 'area_id': doc['area_id'],
 799                 'themes': doc.get('themes', []),
 800                 'themes_pl': doc.get('themes_pl', []),
 801             })
 802
 803             self._hits.append(hit)
 804
 805     def __str__(self):
 806         return "<PR id=%d score=%f >" % (self.picture_id, self._score)
 807
 808     def __repr__(self):
 809         return str(self)
 810
 811     @property
 812     def score(self):
 813         return self._score * self.boost
 814
 815     def merge(self, other):
 816         if self.picture_id != other.picture_id:
 817             raise ValueError(
 818                 "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
 819         self._hits += other._hits
 820         self._score += max(other._score, 0)
 821         return self
 822
 823     SCORE = 0
 824     OTHER = 1
 825
 826     @property
 827     def hits(self):
 828         if self._processed_hits is not None:
 829             return self._processed_hits
 830
 831         hits = []
 832         for hit in self._hits:
 833             try:
 834                 area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
 835             except picture.models.PictureArea.DoesNotExist:
 836                 # stale index
 837                 continue
 838             # Figure out if we were searching for a token matching some word in theme name.
 839             themes_hit = set()
 840             if self.query_terms is not None:
 841                 for i in range(0, len(hit[self.OTHER]['themes'])):
 842                     tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
 843                     tms = map(str.lower, tms)
 844                     for qt in self.query_terms:
 845                         if qt in tms:
 846                             themes_hit.add(hit[self.OTHER]['themes'][i])
 847                             break
 848
 849             m = {
 850                 'score': hit[self.SCORE],
 851                 'area': area,
 852                 'themes_hit': themes_hit,
 853             }
 854             m.update(hit[self.OTHER])
 855             hits.append(m)
 856
 857         hits.sort(key=lambda h: h['score'], reverse=True)
 858         hits = hits[:1]
 859         self._processed_hits = hits
 860         return hits
 861
 862     def get_picture(self):
 863         if self._picture is None:
 864             self._picture = picture.models.Picture.objects.get(id=self.picture_id)
 865         return self._picture
 866
 867     picture = property(get_picture)
 868
 869     @staticmethod
 870     def aggregate(*result_lists):
 871         books = {}
 872         for rl in result_lists:
 873             for r in rl:
 874                 if r.picture_id in books:
 875                     books[r.picture_id].merge(r)
 876                 else:
 877                     books[r.picture_id] = r
 878         return books.values()
 879
 880     def __lt__(self, other):
 881         return self.score < other.score
 882
 883     def __eq__(self, other):
 884         return self.score == other.score
 885
 886
 887 class Search(SolrIndex):
 888     """
 889     Search facilities.
 890     """
 891     def __init__(self, default_field="text"):
 892         super(Search, self).__init__(mode='r')
 893
 894     def make_term_query(self, query, field='text', modal=operator.or_):
 895         """
 896         Returns term queries joined by boolean query.
 897         modal - applies to boolean query
 898         fuzzy - should the query by fuzzy.
 899         """
 900         if query is None:
 901             query = ''
 902         q = self.index.Q()
 903         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 904
 905         return q
 906
 907     def search_by_author(self, words):
 908         from catalogue.models import Book
 909         books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
 910         for word in words:
 911             books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
 912         return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
 913
 914     def search_words(self, words, fields, required=None, book=True, picture=False):
 915         if book and not picture and fields == ['authors']:
 916             return self.search_by_author(words)
 917         filters = []
 918         for word in words:
 919             if book or picture or (word not in stopwords):
 920                 word_filter = None
 921                 for field in fields:
 922                     q = self.index.Q(**{field: word})
 923                     if word_filter is None:
 924                         word_filter = q
 925                     else:
 926                         word_filter |= q
 927                 filters.append(word_filter)
 928         if required:
 929             required_filter = None
 930             for field in required:
 931                 for word in words:
 932                     if book or picture or (word not in stopwords):
 933                         q = self.index.Q(**{field: word})
 934                         if required_filter is None:
 935                             required_filter = q
 936                         else:
 937                             required_filter |= q
 938             filters.append(required_filter)
 939         if not filters:
 940             return []
 941         params = {}
 942         if book:
 943             params['is_book'] = True
 944         if picture:
 945             params['picture_id__gt'] = 0
 946         else:
 947             params['book_id__gt'] = 0
 948         query = self.index.query(**params)
 949         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 950         result_class = PictureResult if picture else SearchResult
 951         return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
 952
 953     def get_snippets(self, searchresult, query, field='text', num=1):
 954         """
 955         Returns a snippet for found scoreDoc.
 956         """
 957         maxnum = len(searchresult)
 958         if num is None or num < 0 or num > maxnum:
 959             num = maxnum
 960         book_id = searchresult.book_id
 961         revision = searchresult.snippet_revision()
 962         snippets = Snippets(book_id, revision=revision)
 963         snips = [None] * maxnum
 964         try:
 965             snippets.open()
 966             idx = 0
 967             while idx < maxnum and num > 0:
 968                 position, length = searchresult.snippet_pos(idx)
 969                 if position is None or length is None:
 970                     continue
 971                 text = snippets.get((int(position),
 972                                      int(length)))
 973                 snip = self.index.highlight(text=text, field=field, q=query)
 974                 if not snip and field == 'text':
 975                     snip = self.index.highlight(text=text, field='text_nonstem', q=query)
 976                 if snip not in snips:
 977                     snips[idx] = snip
 978                     if snip:
 979                         num -= 1
 980                 idx += 1
 981
 982         except IOError as e:
 983             book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
 984             if not book:
 985                 log.error("Book does not exist for book id = %d" % book_id)
 986             elif not book.get().children.exists():
 987                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 988             return []
 989         finally:
 990             snippets.close()
 991
 992         # remove verse end markers..
 993         snips = [s.replace("/\n", "\n") if s else s for s in snips]
 994
 995         searchresult.snippets = snips
 996
 997         return snips
 998
 999     @staticmethod
1000     def apply_filters(query, filters):
1001         """
1002         Apply filters to a query
1003         """
1004         if filters is None:
1005             filters = []
1006         filters = filter(lambda x: x is not None, filters)
1007         for f in filters:
1008             query = query.query(f)
1009         return query
1010
1011
1012 if getattr(settings, 'SEARCH_MOCK', False):
1013     from .mock_search import Search