src/search/index.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from functools import reduce, total_ordering
   5 from itertools import chain
   6 import logging
   7 import operator
   8 import os
   9 import re
  10 from django.conf import settings
  11 from librarian import dcparser
  12 from librarian.parser import WLDocument
  13 from lxml import etree
  14 import scorched
  15 import catalogue.models
  16 import picture.models
  17 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  18 from wolnelektury.utils import makedirs
  19 from . import custom
  20
  21 log = logging.getLogger('search')
  22
  23
  24 if os.path.isfile(settings.SOLR_STOPWORDS):
  25     stopwords = set(
  26         line.strip()
  27         for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
  28 else:
  29     stopwords = set()
  30
  31
  32 class SolrIndex(object):
  33     def __init__(self, mode=None):
  34         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  35
  36
  37 class Snippets(object):
  38     """
  39     This class manages snippet files for indexed object (book)
  40     the snippets are concatenated together, and their positions and
  41     lengths are kept in lucene index fields.
  42     """
  43     SNIPPET_DIR = "snippets"
  44
  45     def __init__(self, book_id, revision=None):
  46         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  47         self.book_id = book_id
  48         self.revision = revision
  49         self.file = None
  50         self.position = None
  51
  52     @property
  53     def path(self):
  54         if self.revision:
  55             fn = "%d.%d" % (self.book_id, self.revision)
  56         else:
  57             fn = "%d" % self.book_id
  58
  59         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  60
  61     def open(self, mode='r'):
  62         """
  63         Open the snippet file. Call .close() afterwards.
  64         """
  65         if 'b' not in mode:
  66             mode += 'b'
  67
  68         if 'w' in mode:
  69             if os.path.exists(self.path):
  70                 self.revision = 1
  71                 while True:
  72                     if not os.path.exists(self.path):
  73                         break
  74                     self.revision += 1
  75
  76         self.file = open(self.path, mode)
  77         self.position = 0
  78         return self
  79
  80     def add(self, snippet):
  81         """
  82         Append a snippet (unicode) to the snippet file.
  83         Return a (position, length) tuple
  84         """
  85         txt = snippet.encode('utf-8')
  86         l = len(txt)
  87         self.file.write(txt)
  88         pos = (self.position, l)
  89         self.position += l
  90         return pos
  91
  92     def get(self, pos):
  93         """
  94         Given a tuple of (position, length) return an unicode
  95         of the snippet stored there.
  96         """
  97         self.file.seek(pos[0], 0)
  98         txt = self.file.read(pos[1]).decode('utf-8')
  99         return txt
 100
 101     def close(self):
 102         """Close snippet file"""
 103         if self.file:
 104             self.file.close()
 105
 106     def remove(self):
 107         self.revision = None
 108         try:
 109             os.unlink(self.path)
 110             self.revision = 0
 111             while True:
 112                 self.revision += 1
 113                 os.unlink(self.path)
 114         except OSError:
 115             pass
 116
 117
 118 class Index(SolrIndex):
 119     """
 120     Class indexing books.
 121     """
 122     def __init__(self):
 123         super(Index, self).__init__(mode='rw')
 124
 125     def delete_query(self, *queries):
 126         """
 127         index.delete(queries=...) doesn't work, so let's reimplement it
 128         using deletion of list of uids.
 129         """
 130         uids = set()
 131         for q in queries:
 132             if isinstance(q, scorched.search.LuceneQuery):
 133                 q = self.index.query(q)
 134             q.field_limiter.update(['uid'])
 135             st = 0
 136             rows = 100
 137             while True:
 138                 ids = q.paginate(start=st, rows=rows).execute()
 139                 if not len(ids):
 140                     break
 141                 for res in ids:
 142                     uids.add(res['uid'])
 143                 st += rows
 144         if uids:
 145             self.index.delete(uids)
 146             return True
 147         else:
 148             return False
 149
 150     def index_tags(self, *tags, **kw):
 151         """
 152         Re-index global tag list.
 153         Removes all tags from index, then index them again.
 154         Indexed fields include: id, name (with and without polish stems), category
 155         """
 156         log.debug("Indexing tags")
 157         remove_only = kw.get('remove_only', False)
 158         # first, remove tags from index.
 159         if tags:
 160             tag_qs = []
 161             for tag in tags:
 162                 q_id = self.index.Q(tag_id=tag.id)
 163
 164                 if isinstance(tag, PDCounterAuthor):
 165                     q_cat = self.index.Q(tag_category='pd_author')
 166                 elif isinstance(tag, PDCounterBook):
 167                     q_cat = self.index.Q(tag_category='pd_book')
 168                 else:
 169                     q_cat = self.index.Q(tag_category=tag.category)
 170
 171                 q_id_cat = self.index.Q(q_id & q_cat)
 172                 tag_qs.append(q_id_cat)
 173             self.delete_query(*tag_qs)
 174         else:  # all
 175             q = self.index.Q(tag_id__any=True)
 176             self.delete_query(q)
 177
 178         if not remove_only:
 179             # then add them [all or just one passed]
 180             if not tags:
 181                 tags = chain(
 182                     catalogue.models.Tag.objects.exclude(category='set'),
 183                     PDCounterAuthor.objects.all(),
 184                     PDCounterBook.objects.all())
 185
 186             for tag in tags:
 187                 if isinstance(tag, PDCounterAuthor):
 188                     doc = {
 189                         "tag_id": int(tag.id),
 190                         "tag_name": tag.name,
 191                         "tag_name_pl": tag.name,
 192                         "tag_category": 'pd_author',
 193                         "is_pdcounter": True,
 194                         "uid": "tag%d_pd_a" % tag.id
 195                         }
 196                 elif isinstance(tag, PDCounterBook):
 197                     doc = {
 198                         "tag_id": int(tag.id),
 199                         "tag_name": tag.title,
 200                         "tag_name_pl": tag.title,
 201                         "tag_category": 'pd_book',
 202                         "is_pdcounter": True,
 203                         "uid": "tag%d_pd_b" % tag.id
 204                         }
 205                 else:
 206                     doc = {
 207                         "tag_id": int(tag.id),
 208                         "tag_name": tag.name,
 209                         "tag_name_pl": tag.name,
 210                         "tag_category": tag.category,
 211                         "is_pdcounter": False,
 212                         "uid": "tag%d" % tag.id
 213                         }
 214                 self.index.add(doc)
 215
 216     def create_book_doc(self, book):
 217         """
 218         Create a lucene document referring book id.
 219         """
 220         doc = {'book_id': int(book.id)}
 221         if book.parent is not None:
 222             doc['parent_id'] = int(book.parent.id)
 223         return doc
 224
 225     def remove_book(self, book_or_id, remove_snippets=True):
 226         """Removes a book from search index.
 227         book - Book instance."""
 228         if isinstance(book_or_id, catalogue.models.Book):
 229             book_id = book_or_id.id
 230         else:
 231             book_id = book_or_id
 232
 233         self.delete_query(self.index.Q(book_id=book_id))
 234
 235         if remove_snippets:
 236             snippets = Snippets(book_id)
 237             snippets.remove()
 238
 239     def index_book(self, book, book_info=None, overwrite=True):
 240         """
 241         Indexes the book.
 242         Creates a lucene document for extracted metadata
 243         and calls self.index_content() to index the contents of the book.
 244         """
 245         if overwrite:
 246             # we don't remove snippets, since they might be still needed by
 247             # threads using not reopened index
 248             self.remove_book(book, remove_snippets=False)
 249
 250         book_doc = self.create_book_doc(book)
 251         meta_fields = self.extract_metadata(book, book_info, dc_only=[
 252             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
 253         # let's not index it - it's only used for extracting publish date
 254         if 'source_name' in meta_fields:
 255             del meta_fields['source_name']
 256
 257         for n, f in meta_fields.items():
 258             book_doc[n] = f
 259
 260         book_doc['uid'] = "book%s" % book_doc['book_id']
 261         self.index.add(book_doc)
 262         del book_doc
 263         book_fields = {
 264             'title': meta_fields['title'],
 265             'authors': meta_fields['authors'],
 266             'published_date': meta_fields['published_date']
 267             }
 268
 269         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
 270             if tag_name in meta_fields:
 271                 book_fields[tag_name] = meta_fields[tag_name]
 272
 273         self.index_content(book, book_fields=book_fields)
 274
 275     master_tags = [
 276         'opowiadanie',
 277         'powiesc',
 278         'dramat_wierszowany_l',
 279         'dramat_wierszowany_lp',
 280         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 281         'wywiad',
 282     ]
 283
 284     ignore_content_tags = [
 285         'uwaga', 'extra', 'nota_red', 'abstrakt',
 286         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 287         'didaskalia',
 288         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 289     ]
 290
 291     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 292
 293     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 294                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 295
 296     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 297
 298     def extract_metadata(self, book, book_info=None, dc_only=None):
 299         """
 300         Extract metadata from book and returns a map of fields keyed by fieldname
 301         """
 302         fields = {}
 303
 304         if book_info is None:
 305             book_info = dcparser.parse(open(book.xml_file.path))
 306
 307         fields['slug'] = book.slug
 308         fields['is_book'] = True
 309
 310         # validator, name
 311         for field in dcparser.BookInfo.FIELDS:
 312             if dc_only and field.name not in dc_only:
 313                 continue
 314             if hasattr(book_info, field.name):
 315                 if not getattr(book_info, field.name):
 316                     continue
 317                 # since no type information is available, we use validator
 318                 type_indicator = field.validator
 319                 if type_indicator == dcparser.as_unicode:
 320                     s = getattr(book_info, field.name)
 321                     if field.multiple:
 322                         s = ', '.join(s)
 323                     fields[field.name] = s
 324                 elif type_indicator == dcparser.as_person:
 325                     p = getattr(book_info, field.name)
 326                     if isinstance(p, dcparser.Person):
 327                         persons = str(p)
 328                     else:
 329                         persons = ', '.join(map(str, p))
 330                     fields[field.name] = persons
 331                 elif type_indicator == dcparser.as_date:
 332                     dt = getattr(book_info, field.name)
 333                     fields[field.name] = dt
 334
 335         # get published date
 336         pd = None
 337         if hasattr(book_info, 'source_name') and book_info.source_name:
 338             match = self.published_date_re.search(book_info.source_name)
 339             if match is not None:
 340                 pd = str(match.groups()[0])
 341         if not pd:
 342             pd = ""
 343         fields["published_date"] = pd
 344
 345         return fields
 346
 347     # def add_gaps(self, fields, fieldname):
 348     #     """
 349     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 350     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 351     #     """
 352     #     def gap():
 353     #         while True:
 354     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 355     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 356
 357     def get_master(self, root):
 358         """
 359         Returns the first master tag from an etree.
 360         """
 361         for master in root.iter():
 362             if master.tag in self.master_tags:
 363                 return master
 364
 365     def index_content(self, book, book_fields):
 366         """
 367         Walks the book XML and extract content from it.
 368         Adds parts for each header tag and for each fragment.
 369         """
 370         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 371         root = wld.edoc.getroot()
 372
 373         master = self.get_master(root)
 374         if master is None:
 375             return []
 376
 377         def walker(node):
 378             if node.tag not in self.ignore_content_tags:
 379                 yield node, None, None
 380                 if node.text is not None:
 381                     yield None, node.text, None
 382                 for child in list(node):
 383                     for b, t, e in walker(child):
 384                         yield b, t, e
 385                 yield None, None, node
 386
 387             if node.tail is not None:
 388                 yield None, node.tail, None
 389             return
 390
 391         def fix_format(text):
 392             # separator = [u" ", u"\t", u".", u";", u","]
 393             if isinstance(text, list):
 394                 # need to join it first
 395                 text = filter(lambda s: s is not None, content)
 396                 text = u' '.join(text)
 397                 # for i in range(len(text)):
 398                 #     if i > 0:
 399                 #         if text[i][0] not in separator\
 400                 #             and text[i - 1][-1] not in separator:
 401                 #          text.insert(i, u" ")
 402
 403             return re.sub("(?m)/$", "", text)
 404
 405         def add_part(snippets, **fields):
 406             doc = self.create_book_doc(book)
 407             for n, v in book_fields.items():
 408                 doc[n] = v
 409
 410             doc['header_index'] = fields["header_index"]
 411             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 412             doc['header_type'] = fields['header_type']
 413
 414             doc['text'] = fields['text']
 415
 416             # snippets
 417             snip_pos = snippets.add(fields["text"])
 418
 419             doc['snippets_position'] = snip_pos[0]
 420             doc['snippets_length'] = snip_pos[1]
 421             if snippets.revision:
 422                 doc["snippets_revision"] = snippets.revision
 423
 424             if 'fragment_anchor' in fields:
 425                 doc["fragment_anchor"] = fields['fragment_anchor']
 426
 427             if 'themes' in fields:
 428                 doc['themes'] = fields['themes']
 429             doc['uid'] = "part%s-%s-%s-%s" % (
 430                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
 431             return doc
 432
 433         fragments = {}
 434         snippets = Snippets(book.id).open('w')
 435         try:
 436             for header, position in zip(list(master), range(len(master))):
 437
 438                 if header.tag in self.skip_header_tags:
 439                     continue
 440                 if header.tag is etree.Comment:
 441                     continue
 442
 443                 # section content
 444                 content = []
 445                 footnote = []
 446
 447                 def all_content(text):
 448                     for frag in fragments.values():
 449                         frag['text'].append(text)
 450                     content.append(text)
 451                 handle_text = [all_content]
 452
 453                 for start, text, end in walker(header):
 454                     # handle footnotes
 455                     if start is not None and start.tag in self.footnote_tags:
 456                         footnote = []
 457
 458                         def collect_footnote(t):
 459                             footnote.append(t)
 460
 461                         handle_text.append(collect_footnote)
 462                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 463                         handle_text.pop()
 464                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 465                                        text=u''.join(footnote),
 466                                        is_footnote=True)
 467                         self.index.add(doc)
 468                         footnote = []
 469
 470                     # handle fragments and themes.
 471                     if start is not None and start.tag == 'begin':
 472                         fid = start.attrib['id'][1:]
 473                         fragments[fid] = {
 474                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 475
 476                     # themes for this fragment
 477                     elif start is not None and start.tag == 'motyw':
 478                         fid = start.attrib['id'][1:]
 479                         handle_text.append(lambda text: None)
 480                         if start.text is not None:
 481                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
 482                     elif end is not None and end.tag == 'motyw':
 483                         handle_text.pop()
 484
 485                     elif start is not None and start.tag == 'end':
 486                         fid = start.attrib['id'][1:]
 487                         if fid not in fragments:
 488                             continue  # a broken <end> node, skip it
 489                         frag = fragments[fid]
 490                         if not frag['themes']:
 491                             continue  # empty themes list.
 492                         del fragments[fid]
 493
 494                         doc = add_part(snippets,
 495                                        header_type=frag['start_header'],
 496                                        header_index=frag['start_section'],
 497                                        header_span=position - frag['start_section'] + 1,
 498                                        fragment_anchor=fid,
 499                                        text=fix_format(frag['text']),
 500                                        themes=frag['themes'])
 501                         self.index.add(doc)
 502
 503                         # Collect content.
 504
 505                     if text is not None and handle_text is not []:
 506                         hdl = handle_text[-1]
 507                         hdl(text)
 508
 509                         # in the end, add a section text.
 510                 doc = add_part(snippets, header_index=position,
 511                                header_type=header.tag, text=fix_format(content))
 512
 513                 self.index.add(doc)
 514
 515         finally:
 516             snippets.close()
 517
 518     def remove_picture(self, picture_or_id):
 519         """Removes a picture from search index."""
 520         if isinstance(picture_or_id, picture.models.Picture):
 521             picture_id = picture_or_id.id
 522         else:
 523             picture_id = picture_or_id
 524         self.delete_query(self.index.Q(picture_id=picture_id))
 525
 526     def index_picture(self, picture, picture_info=None, overwrite=True):
 527         """
 528         Indexes the picture.
 529         Creates a lucene document for extracted metadata
 530         and calls self.index_area() to index the contents of the picture.
 531         """
 532         if overwrite:
 533             # we don't remove snippets, since they might be still needed by
 534             # threads using not reopened index
 535             self.remove_picture(picture)
 536
 537         picture_doc = {'picture_id': int(picture.id)}
 538         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
 539             'authors', 'title', 'epochs', 'kinds', 'genres'])
 540
 541         picture_doc.update(meta_fields)
 542
 543         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
 544         self.index.add(picture_doc)
 545         del picture_doc['is_book']
 546         for area in picture.areas.all():
 547             self.index_area(area, picture_fields=picture_doc)
 548
 549     def index_area(self, area, picture_fields):
 550         """
 551         Indexes themes and objects on the area.
 552         """
 553         doc = dict(picture_fields)
 554         doc['area_id'] = area.id
 555         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
 556         doc['uid'] = 'area%s' % area.id
 557         self.index.add(doc)
 558
 559
 560 @total_ordering
 561 class SearchResult(object):
 562     def __init__(self, doc, how_found=None, query_terms=None):
 563         self.boost = 1.0
 564         self._hits = []
 565         self._processed_hits = None  # processed hits
 566         self.snippets = []
 567         self.query_terms = query_terms
 568         self._book = None
 569
 570         if 'score' in doc:
 571             self._score = doc['score']
 572         else:
 573             self._score = 0
 574
 575         self.book_id = int(doc["book_id"])
 576
 577         try:
 578             self.published_date = int(doc.get("published_date"))
 579         except ValueError:
 580             self.published_date = 0
 581
 582         # content hits
 583         header_type = doc.get("header_type", None)
 584         # we have a content hit in some header of fragment
 585         if header_type is not None:
 586             sec = (header_type, int(doc["header_index"]))
 587             header_span = doc['header_span']
 588             header_span = header_span is not None and int(header_span) or 1
 589             fragment = doc.get("fragment_anchor", None)
 590             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 591             snippets_rev = doc.get('snippets_revision', None)
 592
 593             hit = (sec + (header_span,), fragment, self._score, {
 594                 'how_found': how_found,
 595                 'snippets_pos': snippets_pos,
 596                 'snippets_revision': snippets_rev,
 597                 'themes': doc.get('themes', []),
 598                 'themes_pl': doc.get('themes_pl', [])
 599                 })
 600
 601             self._hits.append(hit)
 602
 603     @classmethod
 604     def from_book(cls, book, how_found=None, query_terms=None):
 605         doc = {
 606             'score': book.popularity.count,
 607             'book_id': book.id,
 608             'published_date': 0,
 609         }
 610         result = cls(doc, how_found=how_found, query_terms=query_terms)
 611         result._book = book
 612         return result
 613
 614     def __str__(self):
 615         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 616             (self.book_id, len(self._hits),
 617              len(self._processed_hits) if self._processed_hits else -1,
 618              self._score, len(self.snippets))
 619
 620     def __bytes__(self):
 621         return str(self).encode('utf-8')
 622
 623     @property
 624     def score(self):
 625         return self._score * self.boost
 626
 627     def merge(self, other):
 628         if self.book_id != other.book_id:
 629             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
 630         self._hits += other._hits
 631         self._score += max(other._score, 0)
 632         return self
 633
 634     def get_book(self):
 635         if self._book is not None:
 636             return self._book
 637         try:
 638             self._book = catalogue.models.Book.objects.get(id=self.book_id)
 639         except catalogue.models.Book.DoesNotExist:
 640             self._book = None
 641         return self._book
 642
    # Read-only attribute access to get_book().
    book = property(get_book)

    # Indices into the hit tuples kept in self._hits, which __init__ builds
    # as (position, fragment anchor, score, extra data dict).
    POSITION = 0
    FRAGMENT = 1
    # Indices into the position tuple itself:
    # (header_type, header_index, header_span).
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3
 649     SCORE = 2
 650     OTHER = 3
 651
 652     @property
 653     def hits(self):
 654         if self._processed_hits is not None:
 655             return self._processed_hits
 656
 657         # to sections and fragments
 658         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 659
 660         sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
 661
 662         # sections not covered by fragments
 663         sect = filter(lambda s: 0 == len(list(filter(
 664             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 665                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
 666
 667         def remove_duplicates(lst, keyfn, compare):
 668             els = {}
 669             for e in lst:
 670                 eif = keyfn(e)
 671                 if eif in els:
 672                     if compare(els[eif], e) >= 1:
 673                         continue
 674                 els[eif] = e
 675             return els.values()
 676
 677         # remove fragments with duplicated fid's and duplicated snippets
 678         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 679         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 680         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 681
 682         # remove duplicate sections
 683         sections = {}
 684
 685         for s in sect:
 686             si = s[self.POSITION][self.POSITION_INDEX]
 687             # skip existing
 688             if si in sections:
 689                 if sections[si]['score'] >= s[self.SCORE]:
 690                     continue
 691
 692             m = {'score': s[self.SCORE],
 693                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 694                  }
 695             m.update(s[self.OTHER])
 696             sections[si] = m
 697
 698         hits = list(sections.values())
 699
 700         for f in frags:
 701             try:
 702                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 703             except catalogue.models.Fragment.DoesNotExist:
 704                 # stale index
 705                 continue
 706             # Figure out if we were searching for a token matching some word in theme name.
 707             themes = frag.tags.filter(category='theme')
 708             themes_hit = set()
 709             if self.query_terms is not None:
 710                 for i in range(0, len(f[self.OTHER]['themes'])):
 711                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 712                     tms = map(str.lower, tms)
 713                     for qt in self.query_terms:
 714                         if qt in tms:
 715                             themes_hit.add(f[self.OTHER]['themes'][i])
 716                             break
 717
 718             def theme_by_name(n):
 719                 th = list(filter(lambda t: t.name == n, themes))
 720                 if th:
 721                     return th[0]
 722                 else:
 723                     return None
 724             themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
 725
 726             m = {'score': f[self.SCORE],
 727                  'fragment': frag,
 728                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 729                  'themes': themes,
 730                  'themes_hit': themes_hit
 731                  }
 732             m.update(f[self.OTHER])
 733             hits.append(m)
 734
 735         hits.sort(key=lambda h: h['score'], reverse=True)
 736
 737         self._processed_hits = hits
 738
 739         return hits
 740
 741     @staticmethod
 742     def aggregate(*result_lists):
 743         books = {}
 744         for rl in result_lists:
 745             for r in rl:
 746                 if r.book_id in books:
 747                     books[r.book_id].merge(r)
 748                 else:
 749                     books[r.book_id] = r
 750         return books.values()
 751
 752     def get_sort_key(self):
 753         return (-self.score,
 754                 self.published_date,
 755                 self.book.sort_key_author if self.book else '',
 756                 self.book.sort_key if self.book else '')
 757
 758     def __lt__(self, other):
 759         return self.get_sort_key() > other.get_sort_key()
 760
 761     def __eq__(self, other):
 762         return self.get_sort_key() == other.get_sort_key()
 763
 764     def __len__(self):
 765         return len(self.hits)
 766
 767     def snippet_pos(self, idx=0):
 768         return self.hits[idx]['snippets_pos']
 769
 770     def snippet_revision(self, idx=0):
 771         try:
 772             return self.hits[idx]['snippets_revision']
 773         except (IndexError, KeyError):
 774             return None
 775
 776
 777 @total_ordering
 778 class PictureResult(object):
 779     def __init__(self, doc, how_found=None, query_terms=None):
 780         self.boost = 1.0
 781         self.query_terms = query_terms
 782         self._picture = None
 783         self._hits = []
 784         self._processed_hits = None
 785
 786         if 'score' in doc:
 787             self._score = doc['score']
 788         else:
 789             self._score = 0
 790
 791         self.picture_id = int(doc["picture_id"])
 792
 793         if doc.get('area_id'):
 794             hit = (self._score, {
 795                 'how_found': how_found,
 796                 'area_id': doc['area_id'],
 797                 'themes': doc.get('themes', []),
 798                 'themes_pl': doc.get('themes_pl', []),
 799             })
 800
 801             self._hits.append(hit)
 802
 803     def __str__(self):
 804         return u"<PR id=%d score=%f >" % (self.picture_id, self._score)
 805
 806     def __repr__(self):
 807         return str(self)
 808
 809     @property
 810     def score(self):
 811         return self._score * self.boost
 812
 813     def merge(self, other):
 814         if self.picture_id != other.picture_id:
 815             raise ValueError(
 816                 "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
 817         self._hits += other._hits
 818         self._score += max(other._score, 0)
 819         return self
 820
 821     SCORE = 0
 822     OTHER = 1
 823
 824     @property
 825     def hits(self):
 826         if self._processed_hits is not None:
 827             return self._processed_hits
 828
 829         hits = []
 830         for hit in self._hits:
 831             try:
 832                 area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
 833             except picture.models.PictureArea.DoesNotExist:
 834                 # stale index
 835                 continue
 836             # Figure out if we were searching for a token matching some word in theme name.
 837             themes_hit = set()
 838             if self.query_terms is not None:
 839                 for i in range(0, len(hit[self.OTHER]['themes'])):
 840                     tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
 841                     tms = map(str.lower, tms)
 842                     for qt in self.query_terms:
 843                         if qt in tms:
 844                             themes_hit.add(hit[self.OTHER]['themes'][i])
 845                             break
 846
 847             m = {
 848                 'score': hit[self.SCORE],
 849                 'area': area,
 850                 'themes_hit': themes_hit,
 851             }
 852             m.update(hit[self.OTHER])
 853             hits.append(m)
 854
 855         hits.sort(key=lambda h: h['score'], reverse=True)
 856         hits = hits[:1]
 857         self._processed_hits = hits
 858         return hits
 859
 860     def get_picture(self):
 861         if self._picture is None:
 862             self._picture = picture.models.Picture.objects.get(id=self.picture_id)
 863         return self._picture
 864
 865     picture = property(get_picture)
 866
 867     @staticmethod
 868     def aggregate(*result_lists):
 869         books = {}
 870         for rl in result_lists:
 871             for r in rl:
 872                 if r.picture_id in books:
 873                     books[r.picture_id].merge(r)
 874                 else:
 875                     books[r.picture_id] = r
 876         return books.values()
 877
 878     def __lt__(self, other):
 879         return self.score < other.score
 880
 881     def __eq__(self, other):
 882         return self.score == other.score
 883
 884
 885 class Search(SolrIndex):
 886     """
 887     Search facilities.
 888     """
 889     def __init__(self, default_field="text"):
 890         super(Search, self).__init__(mode='r')
 891
 892     def make_term_query(self, query, field='text', modal=operator.or_):
 893         """
 894         Returns term queries joined by boolean query.
 895         modal - applies to boolean query
 896         fuzzy - should the query by fuzzy.
 897         """
 898         if query is None:
 899             query = ''
 900         q = self.index.Q()
 901         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 902
 903         return q
 904
 905     def search_by_author(self, words):
 906         from catalogue.models import Book
 907         books = Book.objects.filter(parent=None).order_by('-popularity__count')
 908         for word in words:
 909             books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
 910         return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
 911
 912     def search_words(self, words, fields, required=None, book=True, picture=False):
 913         if book and not picture and fields == ['authors']:
 914             return self.search_by_author(words)
 915         filters = []
 916         for word in words:
 917             if book or picture or (word not in stopwords):
 918                 word_filter = None
 919                 for field in fields:
 920                     q = self.index.Q(**{field: word})
 921                     if word_filter is None:
 922                         word_filter = q
 923                     else:
 924                         word_filter |= q
 925                 filters.append(word_filter)
 926         if required:
 927             required_filter = None
 928             for field in required:
 929                 for word in words:
 930                     if book or picture or (word not in stopwords):
 931                         q = self.index.Q(**{field: word})
 932                         if required_filter is None:
 933                             required_filter = q
 934                         else:
 935                             required_filter |= q
 936             filters.append(required_filter)
 937         if not filters:
 938             return []
 939         params = {}
 940         if book:
 941             params['is_book'] = True
 942         if picture:
 943             params['picture_id__gt'] = 0
 944         else:
 945             params['book_id__gt'] = 0
 946         query = self.index.query(**params)
 947         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 948         result_class = PictureResult if picture else SearchResult
 949         return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
 950
 951     def get_snippets(self, searchresult, query, field='text', num=1):
 952         """
 953         Returns a snippet for found scoreDoc.
 954         """
 955         maxnum = len(searchresult)
 956         if num is None or num < 0 or num > maxnum:
 957             num = maxnum
 958         book_id = searchresult.book_id
 959         revision = searchresult.snippet_revision()
 960         snippets = Snippets(book_id, revision=revision)
 961         snips = [None] * maxnum
 962         try:
 963             snippets.open()
 964             idx = 0
 965             while idx < maxnum and num > 0:
 966                 position, length = searchresult.snippet_pos(idx)
 967                 if position is None or length is None:
 968                     continue
 969                 text = snippets.get((int(position),
 970                                      int(length)))
 971                 snip = self.index.highlight(text=text, field=field, q=query)
 972                 if not snip and field == 'text':
 973                     snip = self.index.highlight(text=text, field='text_nonstem', q=query)
 974                 if snip not in snips:
 975                     snips[idx] = snip
 976                     if snip:
 977                         num -= 1
 978                 idx += 1
 979
 980         except IOError as e:
 981             book = catalogue.models.Book.objects.filter(id=book_id)
 982             if not book:
 983                 log.error("Book does not exist for book id = %d" % book_id)
 984             elif not book.get().children.exists():
 985                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 986             return []
 987         finally:
 988             snippets.close()
 989
 990         # remove verse end markers..
 991         snips = [s.replace("/\n", "\n") if s else s for s in snips]
 992
 993         searchresult.snippets = snips
 994
 995         return snips
 996
 997     @staticmethod
 998     def apply_filters(query, filters):
 999         """
1000         Apply filters to a query
1001         """
1002         if filters is None:
1003             filters = []
1004         filters = filter(lambda x: x is not None, filters)
1005         for f in filters:
1006             query = query.query(f)
1007         return query
1008
1009
# When the SEARCH_MOCK setting is present and truthy, the import below
# rebinds the module-level name `Search` to the one from .mock_search,
# replacing the class defined above.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search