# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
import logging
import operator
import os
import re

from functools import reduce, total_ordering
from itertools import chain

from django.conf import settings
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import scorched

import catalogue.models
import picture.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from wolnelektury.utils import makedirs
from . import custom

log = logging.getLogger('search')

if os.path.isfile(settings.SOLR_STOPWORDS):
    stopwords = set(
        line.strip()
        for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
else:
    stopwords = set()
32 class SolrIndex(object):
33 def __init__(self, mode=None):
34 self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    Snippets are concatenated together; their positions and lengths
    are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode). Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        if 'w' in mode and os.path.exists(self.path):
            # Find an unused revision for a fresh snippet file.
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1
        self.file = open(self.path, mode)
        self.position = 0
        return self
    def add(self, snippet):
        """
        Append a snippet (unicode string) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        self.file.write(txt)
        pos = (self.position, len(txt))
        self.position += len(txt)
        return pos
    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode snippet
        stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        if self.file:
            self.file.close()

    def remove(self):
        """Remove the snippet file and forget the revision."""
        self.file = None
        self.revision = None
        try:
            os.unlink(self.path)
        except OSError:
            pass
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')
    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st, rows = 0, 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            # FIXME: With Solr API change, this doesn't work.
            # self.index.delete(uids)
            return True
        return False
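    # For example, removing all index documents for one book (sketch):
    #
    #   idx = Index()
    #   idx.delete_query(idx.index.Q(book_id=1234))
    #
    # Each query is paginated through Solr 100 uids at a time; the actual
    # delete call is currently disabled (see the FIXME above).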
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them back [all, or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    self.index.add({
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    })
                elif isinstance(tag, PDCounterBook):
                    self.index.add({
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    })
                else:
                    self.index.add({
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    })
    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc
    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book_or_id - Book instance or book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id
        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index it - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields)
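    # The indexed book document is a flat dict; roughly (values illustrative):
    #
    #   {'book_id': 520, 'parent_id': 489, 'title': 'Pan Tadeusz',
    #    'authors': 'Adam Mickiewicz', 'published_date': '1834', 'uid': 'book520'}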
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
    ]

    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
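    # Matches a trailing year (with optional "]", "." or spaces after it) in
    # a source description, e.g. "Czytelnik, Warszawa 1967" -> "1967",
    # "Kraków 1898]." -> "1898".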
    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by
        field name.
        """
        fields = {}

        if book_info is None:
            with open(book.xml_file.path) as f:
                book_info = dcparser.parse(f)

        fields['slug'] = book.slug
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = str(p)
                    else:
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields
348 # def add_gaps(self, fields, fieldname):
350 # Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
351 # This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
355 # yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
356 # return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
    def index_content(self, book, book_fields):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
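        # walker() flattens a subtree into (start_node, text, end_node)
        # events; e.g. for <strofa>Ala <slowo>ma</slowo> kota</strofa> it
        # yields, in order: (strofa, None, None), (None, "Ala ", None),
        # (slowo, None, None), (None, "ma", None), (None, None, slowo),
        # (None, " kota", None), (None, None, strofa).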
        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub(r"(?m)/$", "", text)
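        # fix_format() joins list content and strips trailing "/" verse
        # markers at line ends, e.g. fix_format(u"wersu/\ndalej") ->
        # u"wersu\ndalej", and fix_format([None, u"a", u"b"]) -> u"a b".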
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc
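        # Each part document gets a uid like "part<book>-<index>-<span>-<anchor>",
        # e.g. "part520-12-1-" for a section or "part520-12-3-f7" for a
        # fragment (values illustrative).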
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(str, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                self.index.add(doc)
        finally:
            snippets.close()
    def remove_picture(self, picture_or_id):
        """Removes a picture from the search index."""
        if isinstance(picture_or_id, picture.models.Picture):
            picture_id = picture_or_id.id
        else:
            picture_id = picture_or_id
        self.delete_query(self.index.Q(picture_id=picture_id))
    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Indexes the picture.
        Creates a Lucene document for the extracted metadata
        and calls self.index_area() to index the contents of the picture.
        """
        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])

        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)
    def index_area(self, area, picture_fields):
        """
        Indexes themes and objects on the area.
        """
        doc = dict(picture_fields)
        doc['area_id'] = area.id
        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
        doc['uid'] = 'area%s' % area.id
        self.index.add(doc)
@total_ordering
class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span) if header_span is not None else 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)
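        # A hit is a tuple:
        #   ((header_type, header_index, header_span), fragment_anchor, score, other_dict)
        # -- this layout is what the POSITION/FRAGMENT/SCORE/OTHER indices
        # defined below refer to.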
    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result
616 return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
617 (self.book_id, len(self._hits),
618 len(self._processed_hits) if self._processed_hits else -1,
619 self._score, len(self.snippets))
622 return str(self).encode('utf-8')
626 return self._score * self.boost
    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
    def get_book(self):
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split the hits into sections and fragments
        # (materialize frags: a bare filter() iterator would be exhausted
        # after its first use below)
        frags = list(filter(lambda r: r[self.FRAGMENT] is not None, self._hits))

        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # keep only sections not covered by any fragment
        sect = filter(lambda s: 0 == len(list(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
            f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        def cmp_score(a, b):
            # Python 3 has no cmp(); emulate it for score comparison.
            return (a[self.SCORE] > b[self.SCORE]) - (a[self.SCORE] < b[self.SCORE])
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], cmp_score)
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip an existing section unless this hit scores higher
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())
        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in a theme's name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = list(map(str.lower, tms))
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = list(filter(lambda t: t.name == n, themes))
                if th:
                    return th[0]
                return None
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits
    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()
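    # Usage sketch (illustrative): collapse several per-query hit lists into
    # a single SearchResult per book:
    #
    #   results = SearchResult.aggregate(author_hits, content_hits)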
    def get_sort_key(self):
        return (-self.score,
                self.published_date,
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()
    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
@total_ordering
class PictureResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)
805 return u"<PR id=%d score=%f >" % (self.picture_id, self._score)
812 return self._score * self.boost
814 def merge(self, other):
815 if self.picture_id != other.picture_id:
817 "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
818 self._hits += other._hits
819 self._score += max(other._score, 0)
    SCORE = 0
    OTHER = 1

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            try:
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in a theme's name.
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    tms = re.split(r' +', hit[self.OTHER]['themes'][i]) + hit[self.OTHER]['themes_pl'][i].split(' ')
                    tms = list(map(str.lower, tms))
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(hit[self.OTHER]['themes'][i])
                            break

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(hit[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits
    def get_picture(self):
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)
    @staticmethod
    def aggregate(*result_lists):
        results = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in results:
                    results[r.picture_id].merge(r)
                else:
                    results[r.picture_id] = r
        return results.values()
879 def __lt__(self, other):
880 return self.score < other.score
882 def __eq__(self, other):
883 return self.score == other.score
class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')
    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns one term query per word in `query`, joined with the boolean
        operator given as `modal`.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split()), q)

        return q
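    # E.g. make_term_query(u"ala ma kota") builds roughly
    # (text:ala OR text:ma OR text:kota); pass modal=operator.and_ to
    # require all terms.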
    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None).order_by('-popularity__count')
        for word in words:
            # \m and \M are PostgreSQL regex word-boundary markers.
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
    def search_words(self, words, fields, required=None, book=True, picture=False):
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        filters = []
        for word in words:
            if book or picture or (word not in stopwords):
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)
        if required:
            required_filter = None
            for field in required:
                for word in words:
                    if book or picture or (word not in stopwords):
                        q = self.index.Q(**{field: word})
                        if required_filter is None:
                            required_filter = q
                        else:
                            required_filter |= q
            filters.append(required_filter)
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
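    # Usage sketch (illustrative):
    #
    #   s = Search()
    #   hits = s.search_words(['pan', 'tadeusz'], ['title', 'text'])
    #   hits[0].book  # -> catalogue.models.Book instance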
    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns text snippets for a search result.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1
        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips
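    # Usage sketch (illustrative; `result` comes from search_words() and
    # `q` is the scorched query object used for that search):
    #
    #   snips = Search().get_snippets(result, q, num=3)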
    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)

        return query
1011 if getattr(settings, 'SEARCH_MOCK', False):
1012 from .mock_search import Search