src/search/index.py

   1 # -*- coding: utf-8 -*-
   2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   4 #
   5 from django.conf import settings
   6
   7 import os
   8 import re
   9 from librarian import dcparser
  10 from librarian.parser import WLDocument
  11 from lxml import etree
  12 import catalogue.models
  13 import picture.models
  14 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  15 from itertools import chain
  16 import sunburnt
  17 import custom
  18 import operator
  19 import logging
  20 from wolnelektury.utils import makedirs
  21
  22 log = logging.getLogger('search')
  23
  24 if os.path.isfile(settings.SOLR_STOPWORDS):
  25     stopwords = set(
  26         line.decode('utf-8').strip()
  27         for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
  28 else:
  29     stopwords = set()
  30
  31
  32 class SolrIndex(object):
  33     def __init__(self, mode=None):
  34         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  35
  36
  37 class Snippets(object):
  38     """
  39     This class manages snippet files for indexed object (book)
  40     the snippets are concatenated together, and their positions and
  41     lengths are kept in lucene index fields.
  42     """
  43     SNIPPET_DIR = "snippets"
  44
  45     def __init__(self, book_id, revision=None):
  46         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  47         self.book_id = book_id
  48         self.revision = revision
  49         self.file = None
  50         self.position = None
  51
  52     @property
  53     def path(self):
  54         if self.revision:
  55             fn = "%d.%d" % (self.book_id, self.revision)
  56         else:
  57             fn = "%d" % self.book_id
  58
  59         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  60
  61     def open(self, mode='r'):
  62         """
  63         Open the snippet file. Call .close() afterwards.
  64         """
  65         if 'b' not in mode:
  66             mode += 'b'
  67
  68         if 'w' in mode:
  69             if os.path.exists(self.path):
  70                 self.revision = 1
  71                 while True:
  72                     if not os.path.exists(self.path):
  73                         break
  74                     self.revision += 1
  75
  76         self.file = open(self.path, mode)
  77         self.position = 0
  78         return self
  79
  80     def add(self, snippet):
  81         """
  82         Append a snippet (unicode) to the snippet file.
  83         Return a (position, length) tuple
  84         """
  85         txt = snippet.encode('utf-8')
  86         l = len(txt)
  87         self.file.write(txt)
  88         pos = (self.position, l)
  89         self.position += l
  90         return pos
  91
  92     def get(self, pos):
  93         """
  94         Given a tuple of (position, length) return an unicode
  95         of the snippet stored there.
  96         """
  97         self.file.seek(pos[0], 0)
  98         txt = self.file.read(pos[1]).decode('utf-8')
  99         return txt
 100
 101     def close(self):
 102         """Close snippet file"""
 103         if self.file:
 104             self.file.close()
 105
 106     def remove(self):
 107         self.revision = None
 108         try:
 109             os.unlink(self.path)
 110             self.revision = 0
 111             while True:
 112                 self.revision += 1
 113                 os.unlink(self.path)
 114         except OSError:
 115             pass
 116
 117
 118 class Index(SolrIndex):
 119     """
 120     Class indexing books.
 121     """
 122     def __init__(self):
 123         super(Index, self).__init__(mode='rw')
 124
 125     def delete_query(self, *queries):
 126         """
 127         index.delete(queries=...) doesn't work, so let's reimplement it
 128         using deletion of list of uids.
 129         """
 130         uids = set()
 131         for q in queries:
 132             if isinstance(q, sunburnt.search.LuceneQuery):
 133                 q = self.index.query(q)
 134             q.field_limiter.update(['uid'])
 135             st = 0
 136             rows = 100
 137             while True:
 138                 ids = q.paginate(start=st, rows=rows).execute()
 139                 if not len(ids):
 140                     break
 141                 for res in ids:
 142                     uids.add(res['uid'])
 143                 st += rows
 144         if uids:
 145             self.index.delete(uids)
 146             return True
 147         else:
 148             return False
 149
 150     def index_tags(self, *tags, **kw):
 151         """
 152         Re-index global tag list.
 153         Removes all tags from index, then index them again.
 154         Indexed fields include: id, name (with and without polish stems), category
 155         """
 156         log.debug("Indexing tags")
 157         remove_only = kw.get('remove_only', False)
 158         # first, remove tags from index.
 159         if tags:
 160             tag_qs = []
 161             for tag in tags:
 162                 q_id = self.index.Q(tag_id=tag.id)
 163
 164                 if isinstance(tag, PDCounterAuthor):
 165                     q_cat = self.index.Q(tag_category='pd_author')
 166                 elif isinstance(tag, PDCounterBook):
 167                     q_cat = self.index.Q(tag_category='pd_book')
 168                 else:
 169                     q_cat = self.index.Q(tag_category=tag.category)
 170
 171                 q_id_cat = self.index.Q(q_id & q_cat)
 172                 tag_qs.append(q_id_cat)
 173             self.delete_query(*tag_qs)
 174         else:  # all
 175             q = self.index.Q(tag_id__any=True)
 176             self.delete_query(q)
 177
 178         if not remove_only:
 179             # then add them [all or just one passed]
 180             if not tags:
 181                 tags = chain(
 182                     catalogue.models.Tag.objects.exclude(category='set'),
 183                     PDCounterAuthor.objects.all(),
 184                     PDCounterBook.objects.all())
 185
 186             for tag in tags:
 187                 if isinstance(tag, PDCounterAuthor):
 188                     doc = {
 189                         "tag_id": int(tag.id),
 190                         "tag_name": tag.name,
 191                         "tag_name_pl": tag.name,
 192                         "tag_category": 'pd_author',
 193                         "is_pdcounter": True,
 194                         "uid": "tag%d_pd_a" % tag.id
 195                         }
 196                 elif isinstance(tag, PDCounterBook):
 197                     doc = {
 198                         "tag_id": int(tag.id),
 199                         "tag_name": tag.title,
 200                         "tag_name_pl": tag.title,
 201                         "tag_category": 'pd_book',
 202                         "is_pdcounter": True,
 203                         "uid": "tag%d_pd_b" % tag.id
 204                         }
 205                 else:
 206                     doc = {
 207                         "tag_id": int(tag.id),
 208                         "tag_name": tag.name,
 209                         "tag_name_pl": tag.name,
 210                         "tag_category": tag.category,
 211                         "is_pdcounter": False,
 212                         "uid": "tag%d" % tag.id
 213                         }
 214                 self.index.add(doc)
 215
 216     def create_book_doc(self, book):
 217         """
 218         Create a lucene document referring book id.
 219         """
 220         doc = {'book_id': int(book.id)}
 221         if book.parent is not None:
 222             doc['parent_id'] = int(book.parent.id)
 223         return doc
 224
 225     def remove_book(self, book_or_id, remove_snippets=True):
 226         """Removes a book from search index.
 227         book - Book instance."""
 228         if isinstance(book_or_id, catalogue.models.Book):
 229             book_id = book_or_id.id
 230         else:
 231             book_id = book_or_id
 232
 233         self.delete_query(self.index.Q(book_id=book_id))
 234
 235         if remove_snippets:
 236             snippets = Snippets(book_id)
 237             snippets.remove()
 238
 239     def index_book(self, book, book_info=None, overwrite=True):
 240         """
 241         Indexes the book.
 242         Creates a lucene document for extracted metadata
 243         and calls self.index_content() to index the contents of the book.
 244         """
 245         if overwrite:
 246             # we don't remove snippets, since they might be still needed by
 247             # threads using not reopened index
 248             self.remove_book(book, remove_snippets=False)
 249
 250         book_doc = self.create_book_doc(book)
 251         meta_fields = self.extract_metadata(book, book_info, dc_only=[
 252             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
 253         # let's not index it - it's only used for extracting publish date
 254         if 'source_name' in meta_fields:
 255             del meta_fields['source_name']
 256
 257         for n, f in meta_fields.items():
 258             book_doc[n] = f
 259
 260         book_doc['uid'] = "book%s" % book_doc['book_id']
 261         self.index.add(book_doc)
 262         del book_doc
 263         book_fields = {
 264             'title': meta_fields['title'],
 265             'authors': meta_fields['authors'],
 266             'published_date': meta_fields['published_date']
 267             }
 268
 269         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
 270             if tag_name in meta_fields:
 271                 book_fields[tag_name] = meta_fields[tag_name]
 272
 273         self.index_content(book, book_fields=book_fields)
 274
 275     master_tags = [
 276         'opowiadanie',
 277         'powiesc',
 278         'dramat_wierszowany_l',
 279         'dramat_wierszowany_lp',
 280         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 281         'wywiad',
 282     ]
 283
 284     ignore_content_tags = [
 285         'uwaga', 'extra', 'nota_red', 'abstrakt',
 286         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 287         'didaskalia',
 288         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 289     ]
 290
 291     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 292
 293     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 294                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 295
 296     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 297
 298     def extract_metadata(self, book, book_info=None, dc_only=None):
 299         """
 300         Extract metadata from book and returns a map of fields keyed by fieldname
 301         """
 302         fields = {}
 303
 304         if book_info is None:
 305             book_info = dcparser.parse(open(book.xml_file.path))
 306
 307         fields['slug'] = book.slug
 308         fields['is_book'] = True
 309
 310         # validator, name
 311         for field in dcparser.BookInfo.FIELDS:
 312             if dc_only and field.name not in dc_only:
 313                 continue
 314             if hasattr(book_info, field.name):
 315                 if not getattr(book_info, field.name):
 316                     continue
 317                 # since no type information is available, we use validator
 318                 type_indicator = field.validator
 319                 if type_indicator == dcparser.as_unicode:
 320                     s = getattr(book_info, field.name)
 321                     if field.multiple:
 322                         s = ', '.join(s)
 323                     fields[field.name] = s
 324                 elif type_indicator == dcparser.as_person:
 325                     p = getattr(book_info, field.name)
 326                     if isinstance(p, dcparser.Person):
 327                         persons = unicode(p)
 328                     else:
 329                         persons = ', '.join(map(unicode, p))
 330                     fields[field.name] = persons
 331                 elif type_indicator == dcparser.as_date:
 332                     dt = getattr(book_info, field.name)
 333                     fields[field.name] = dt
 334
 335         # get published date
 336         pd = None
 337         if hasattr(book_info, 'source_name') and book_info.source_name:
 338             match = self.published_date_re.search(book_info.source_name)
 339             if match is not None:
 340                 pd = str(match.groups()[0])
 341         if not pd:
 342             pd = ""
 343         fields["published_date"] = pd
 344
 345         return fields
 346
 347     # def add_gaps(self, fields, fieldname):
 348     #     """
 349     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 350     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 351     #     """
 352     #     def gap():
 353     #         while True:
 354     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 355     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 356
 357     def get_master(self, root):
 358         """
 359         Returns the first master tag from an etree.
 360         """
 361         for master in root.iter():
 362             if master.tag in self.master_tags:
 363                 return master
 364
 365     def index_content(self, book, book_fields):
 366         """
 367         Walks the book XML and extract content from it.
 368         Adds parts for each header tag and for each fragment.
 369         """
 370         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 371         root = wld.edoc.getroot()
 372
 373         master = self.get_master(root)
 374         if master is None:
 375             return []
 376
 377         def walker(node):
 378             if node.tag not in self.ignore_content_tags:
 379                 yield node, None, None
 380                 if node.text is not None:
 381                     yield None, node.text, None
 382                 for child in list(node):
 383                     for b, t, e in walker(child):
 384                         yield b, t, e
 385                 yield None, None, node
 386
 387             if node.tail is not None:
 388                 yield None, node.tail, None
 389             return
 390
 391         def fix_format(text):
 392             # separator = [u" ", u"\t", u".", u";", u","]
 393             if isinstance(text, list):
 394                 # need to join it first
 395                 text = filter(lambda s: s is not None, content)
 396                 text = u' '.join(text)
 397                 # for i in range(len(text)):
 398                 #     if i > 0:
 399                 #         if text[i][0] not in separator\
 400                 #             and text[i - 1][-1] not in separator:
 401                 #          text.insert(i, u" ")
 402
 403             return re.sub("(?m)/$", "", text)
 404
 405         def add_part(snippets, **fields):
 406             doc = self.create_book_doc(book)
 407             for n, v in book_fields.items():
 408                 doc[n] = v
 409
 410             doc['header_index'] = fields["header_index"]
 411             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 412             doc['header_type'] = fields['header_type']
 413
 414             doc['text'] = fields['text']
 415
 416             # snippets
 417             snip_pos = snippets.add(fields["text"])
 418
 419             doc['snippets_position'] = snip_pos[0]
 420             doc['snippets_length'] = snip_pos[1]
 421             if snippets.revision:
 422                 doc["snippets_revision"] = snippets.revision
 423
 424             if 'fragment_anchor' in fields:
 425                 doc["fragment_anchor"] = fields['fragment_anchor']
 426
 427             if 'themes' in fields:
 428                 doc['themes'] = fields['themes']
 429             doc['uid'] = "part%s-%s-%s-%s" % (
 430                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
 431             return doc
 432
 433         fragments = {}
 434         snippets = Snippets(book.id).open('w')
 435         try:
 436             for header, position in zip(list(master), range(len(master))):
 437
 438                 if header.tag in self.skip_header_tags:
 439                     continue
 440                 if header.tag is etree.Comment:
 441                     continue
 442
 443                 # section content
 444                 content = []
 445                 footnote = []
 446
 447                 def all_content(text):
 448                     for frag in fragments.values():
 449                         frag['text'].append(text)
 450                     content.append(text)
 451                 handle_text = [all_content]
 452
 453                 for start, text, end in walker(header):
 454                     # handle footnotes
 455                     if start is not None and start.tag in self.footnote_tags:
 456                         footnote = []
 457
 458                         def collect_footnote(t):
 459                             footnote.append(t)
 460
 461                         handle_text.append(collect_footnote)
 462                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 463                         handle_text.pop()
 464                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 465                                        text=u''.join(footnote),
 466                                        is_footnote=True)
 467                         self.index.add(doc)
 468                         footnote = []
 469
 470                     # handle fragments and themes.
 471                     if start is not None and start.tag == 'begin':
 472                         fid = start.attrib['id'][1:]
 473                         fragments[fid] = {
 474                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 475
 476                     # themes for this fragment
 477                     elif start is not None and start.tag == 'motyw':
 478                         fid = start.attrib['id'][1:]
 479                         handle_text.append(lambda text: None)
 480                         if start.text is not None:
 481                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
 482                     elif end is not None and end.tag == 'motyw':
 483                         handle_text.pop()
 484
 485                     elif start is not None and start.tag == 'end':
 486                         fid = start.attrib['id'][1:]
 487                         if fid not in fragments:
 488                             continue  # a broken <end> node, skip it
 489                         frag = fragments[fid]
 490                         if not frag['themes']:
 491                             continue  # empty themes list.
 492                         del fragments[fid]
 493
 494                         doc = add_part(snippets,
 495                                        header_type=frag['start_header'],
 496                                        header_index=frag['start_section'],
 497                                        header_span=position - frag['start_section'] + 1,
 498                                        fragment_anchor=fid,
 499                                        text=fix_format(frag['text']),
 500                                        themes=frag['themes'])
 501                         self.index.add(doc)
 502
 503                         # Collect content.
 504
 505                     if text is not None and handle_text is not []:
 506                         hdl = handle_text[-1]
 507                         hdl(text)
 508
 509                         # in the end, add a section text.
 510                 doc = add_part(snippets, header_index=position,
 511                                header_type=header.tag, text=fix_format(content))
 512
 513                 self.index.add(doc)
 514
 515         finally:
 516             snippets.close()
 517
 518     def remove_picture(self, picture_or_id):
 519         """Removes a picture from search index."""
 520         if isinstance(picture_or_id, picture.models.Picture):
 521             picture_id = picture_or_id.id
 522         else:
 523             picture_id = picture_or_id
 524         self.delete_query(self.index.Q(picture_id=picture_id))
 525
 526     def index_picture(self, picture, picture_info=None, overwrite=True):
 527         """
 528         Indexes the picture.
 529         Creates a lucene document for extracted metadata
 530         and calls self.index_area() to index the contents of the picture.
 531         """
 532         if overwrite:
 533             # we don't remove snippets, since they might be still needed by
 534             # threads using not reopened index
 535             self.remove_picture(picture)
 536
 537         picture_doc = {'picture_id': int(picture.id)}
 538         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
 539             'authors', 'title', 'epochs', 'kinds', 'genres'])
 540
 541         picture_doc.update(meta_fields)
 542
 543         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
 544         self.index.add(picture_doc)
 545         del picture_doc['is_book']
 546         for area in picture.areas.all():
 547             self.index_area(area, picture_fields=picture_doc)
 548
 549     def index_area(self, area, picture_fields):
 550         """
 551         Indexes themes and objects on the area.
 552         """
 553         doc = dict(picture_fields)
 554         doc['area_id'] = area.id
 555         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
 556         doc['uid'] = 'area%s' % area.id
 557         self.index.add(doc)
 558
 559
 560 class SearchResult(object):
 561     def __init__(self, doc, how_found=None, query_terms=None):
 562         self.boost = 1.0
 563         self._hits = []
 564         self._processed_hits = None  # processed hits
 565         self.snippets = []
 566         self.query_terms = query_terms
 567         self._book = None
 568
 569         if 'score' in doc:
 570             self._score = doc['score']
 571         else:
 572             self._score = 0
 573
 574         self.book_id = int(doc["book_id"])
 575
 576         try:
 577             self.published_date = int(doc.get("published_date"))
 578         except ValueError:
 579             self.published_date = 0
 580
 581         # content hits
 582         header_type = doc.get("header_type", None)
 583         # we have a content hit in some header of fragment
 584         if header_type is not None:
 585             sec = (header_type, int(doc["header_index"]))
 586             header_span = doc['header_span']
 587             header_span = header_span is not None and int(header_span) or 1
 588             fragment = doc.get("fragment_anchor", None)
 589             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 590             snippets_rev = doc.get('snippets_revision', None)
 591
 592             hit = (sec + (header_span,), fragment, self._score, {
 593                 'how_found': how_found,
 594                 'snippets_pos': snippets_pos,
 595                 'snippets_revision': snippets_rev,
 596                 'themes': doc.get('themes', []),
 597                 'themes_pl': doc.get('themes_pl', [])
 598                 })
 599
 600             self._hits.append(hit)
 601
 602     @classmethod
 603     def from_book(cls, book, how_found=None, query_terms=None):
 604         doc = {
 605             'score': book.popularity.count,
 606             'book_id': book.id,
 607             'published_date': 0,
 608         }
 609         result = cls(doc, how_found=how_found, query_terms=query_terms)
 610         result._book = book
 611         return result
 612
 613     def __unicode__(self):
 614         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 615             (self.book_id, len(self._hits),
 616              len(self._processed_hits) if self._processed_hits else -1,
 617              self._score, len(self.snippets))
 618
 619     def __str__(self):
 620         return unicode(self).encode('utf-8')
 621
 622     @property
 623     def score(self):
 624         return self._score * self.boost
 625
 626     def merge(self, other):
 627         if self.book_id != other.book_id:
 628             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
 629         self._hits += other._hits
 630         self._score += max(other._score, 0)
 631         return self
 632
 633     def get_book(self):
 634         if self._book is not None:
 635             return self._book
 636         self._book = catalogue.models.Book.objects.get(id=self.book_id)
 637         return self._book
 638
 639     book = property(get_book)
 640
 641     POSITION = 0
 642     FRAGMENT = 1
 643     POSITION_INDEX = 1
 644     POSITION_SPAN = 2
 645     SCORE = 2
 646     OTHER = 3
 647
 648     @property
 649     def hits(self):
 650         if self._processed_hits is not None:
 651             return self._processed_hits
 652
 653         # to sections and fragments
 654         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 655
 656         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
 657
 658         # sections not covered by fragments
 659         sect = filter(lambda s: 0 == len(filter(
 660             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 661                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
 662
 663         def remove_duplicates(lst, keyfn, compare):
 664             els = {}
 665             for e in lst:
 666                 eif = keyfn(e)
 667                 if eif in els:
 668                     if compare(els[eif], e) >= 1:
 669                         continue
 670                 els[eif] = e
 671             return els.values()
 672
 673         # remove fragments with duplicated fid's and duplicated snippets
 674         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
 675         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
 676         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
 677
 678         # remove duplicate sections
 679         sections = {}
 680
 681         for s in sect:
 682             si = s[self.POSITION][self.POSITION_INDEX]
 683             # skip existing
 684             if si in sections:
 685                 if sections[si]['score'] >= s[self.SCORE]:
 686                     continue
 687
 688             m = {'score': s[self.SCORE],
 689                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 690                  }
 691             m.update(s[self.OTHER])
 692             sections[si] = m
 693
 694         hits = sections.values()
 695
 696         for f in frags:
 697             try:
 698                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 699             except catalogue.models.Fragment.DoesNotExist:
 700                 # stale index
 701                 continue
 702             # Figure out if we were searching for a token matching some word in theme name.
 703             themes = frag.tags.filter(category='theme')
 704             themes_hit = set()
 705             if self.query_terms is not None:
 706                 for i in range(0, len(f[self.OTHER]['themes'])):
 707                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 708                     tms = map(unicode.lower, tms)
 709                     for qt in self.query_terms:
 710                         if qt in tms:
 711                             themes_hit.add(f[self.OTHER]['themes'][i])
 712                             break
 713
 714             def theme_by_name(n):
 715                 th = filter(lambda t: t.name == n, themes)
 716                 if th:
 717                     return th[0]
 718                 else:
 719                     return None
 720             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
 721
 722             m = {'score': f[self.SCORE],
 723                  'fragment': frag,
 724                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 725                  'themes': themes,
 726                  'themes_hit': themes_hit
 727                  }
 728             m.update(f[self.OTHER])
 729             hits.append(m)
 730
 731         hits.sort(key=lambda h: h['score'], reverse=True)
 732
 733         self._processed_hits = hits
 734
 735         return hits
 736
 737     @staticmethod
 738     def aggregate(*result_lists):
 739         books = {}
 740         for rl in result_lists:
 741             for r in rl:
 742                 if r.book_id in books:
 743                     books[r.book_id].merge(r)
 744                 else:
 745                     books[r.book_id] = r
 746         return books.values()
 747
 748     def __cmp__(self, other):
 749         c = cmp(self.score, other.score)
 750         if c == 0:
 751             # this is inverted, because earlier date is better
 752             return cmp(other.published_date, self.published_date)
 753         else:
 754             return c
 755
 756     def __len__(self):
 757         return len(self.hits)
 758
 759     def snippet_pos(self, idx=0):
 760         return self.hits[idx]['snippets_pos']
 761
 762     def snippet_revision(self, idx=0):
 763         try:
 764             return self.hits[idx]['snippets_revision']
 765         except (IndexError, KeyError):
 766             return None
 767
 768
 769 class PictureResult(object):
 770     def __init__(self, doc, how_found=None, query_terms=None):
 771         self.boost = 1.0
 772         self.query_terms = query_terms
 773         self._picture = None
 774         self._hits = []
 775         self._processed_hits = None
 776
 777         if 'score' in doc:
 778             self._score = doc['score']
 779         else:
 780             self._score = 0
 781
 782         self.picture_id = int(doc["picture_id"])
 783
 784         if doc.get('area_id'):
 785             hit = (self._score, {
 786                 'how_found': how_found,
 787                 'area_id': doc['area_id'],
 788                 'themes': doc.get('themes', []),
 789                 'themes_pl': doc.get('themes_pl', []),
 790             })
 791
 792             self._hits.append(hit)
 793
 794     def __unicode__(self):
 795         return u"<PR id=%d score=%f >" % (self.picture_id, self._score)
 796
 797     def __repr__(self):
 798         return unicode(self)
 799
 800     @property
 801     def score(self):
 802         return self._score * self.boost
 803
 804     def merge(self, other):
 805         if self.picture_id != other.picture_id:
 806             raise ValueError(
 807                 "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
 808         self._hits += other._hits
 809         self._score += max(other._score, 0)
 810         return self
 811
 812     SCORE = 0
 813     OTHER = 1
 814
 815     @property
 816     def hits(self):
 817         if self._processed_hits is not None:
 818             return self._processed_hits
 819
 820         hits = []
 821         for hit in self._hits:
 822             try:
 823                 area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
 824             except picture.models.PictureArea.DoesNotExist:
 825                 # stale index
 826                 continue
 827             # Figure out if we were searching for a token matching some word in theme name.
 828             themes_hit = set()
 829             if self.query_terms is not None:
 830                 for i in range(0, len(hit[self.OTHER]['themes'])):
 831                     tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
 832                     tms = map(unicode.lower, tms)
 833                     for qt in self.query_terms:
 834                         if qt in tms:
 835                             themes_hit.add(hit[self.OTHER]['themes'][i])
 836                             break
 837
 838             m = {
 839                 'score': hit[self.SCORE],
 840                 'area': area,
 841                 'themes_hit': themes_hit,
 842             }
 843             m.update(hit[self.OTHER])
 844             hits.append(m)
 845
 846         hits.sort(key=lambda h: h['score'], reverse=True)
 847         hits = hits[:1]
 848         self._processed_hits = hits
 849         return hits
 850
 851     def get_picture(self):
 852         if self._picture is None:
 853             self._picture = picture.models.Picture.objects.get(id=self.picture_id)
 854         return self._picture
 855
 856     picture = property(get_picture)
 857
 858     @staticmethod
 859     def aggregate(*result_lists):
 860         books = {}
 861         for rl in result_lists:
 862             for r in rl:
 863                 if r.picture_id in books:
 864                     books[r.picture_id].merge(r)
 865                 else:
 866                     books[r.picture_id] = r
 867         return books.values()
 868
 869     def __cmp__(self, other):
 870         return cmp(self.score, other.score)
 871
 872
 873 class Search(SolrIndex):
 874     """
 875     Search facilities.
 876     """
 877     def __init__(self, default_field="text"):
 878         super(Search, self).__init__(mode='r')
 879
 880     def make_term_query(self, query, field='text', modal=operator.or_):
 881         """
 882         Returns term queries joined by boolean query.
 883         modal - applies to boolean query
 884         fuzzy - should the query by fuzzy.
 885         """
 886         if query is None:
 887             query = ''
 888         q = self.index.Q()
 889         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 890
 891         return q
 892
 893     def search_by_author(self, words):
 894         from catalogue.models import Book
 895         books = Book.objects.filter(parent=None).order_by('-popularity__count')
 896         for word in words:
 897             books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
 898         return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
 899
 900     def search_words(self, words, fields, required=None, book=True, picture=False):
 901         if book and not picture and fields == ['authors']:
 902             return self.search_by_author(words)
 903         filters = []
 904         for word in words:
 905             if book or picture or (word not in stopwords):
 906                 word_filter = None
 907                 for field in fields:
 908                     q = self.index.Q(**{field: word})
 909                     if word_filter is None:
 910                         word_filter = q
 911                     else:
 912                         word_filter |= q
 913                 filters.append(word_filter)
 914         if required:
 915             required_filter = None
 916             for field in required:
 917                 for word in words:
 918                     if book or picture or (word not in stopwords):
 919                         q = self.index.Q(**{field: word})
 920                         if required_filter is None:
 921                             required_filter = q
 922                         else:
 923                             required_filter |= q
 924             filters.append(required_filter)
 925         if not filters:
 926             return []
 927         params = {}
 928         if book:
 929             params['is_book'] = True
 930         if picture:
 931             params['picture_id__gt'] = 0
 932         else:
 933             params['book_id__gt'] = 0
 934         query = self.index.query(**params)
 935         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 936         result_class = PictureResult if picture else SearchResult
 937         return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
 938
 939     def get_snippets(self, searchresult, query, field='text', num=1):
 940         """
 941         Returns a snippet for found scoreDoc.
 942         """
 943         maxnum = len(searchresult)
 944         if num is None or num < 0 or num > maxnum:
 945             num = maxnum
 946         book_id = searchresult.book_id
 947         revision = searchresult.snippet_revision()
 948         snippets = Snippets(book_id, revision=revision)
 949         snips = [None] * maxnum
 950         try:
 951             snippets.open()
 952             idx = 0
 953             while idx < maxnum and num > 0:
 954                 position, length = searchresult.snippet_pos(idx)
 955                 if position is None or length is None:
 956                     continue
 957                 text = snippets.get((int(position),
 958                                      int(length)))
 959                 snip = self.index.highlight(text=text, field=field, q=query)
 960                 if not snip and field == 'text':
 961                     snip = self.index.highlight(text=text, field='text_nonstem', q=query)
 962                 if snip not in snips:
 963                     snips[idx] = snip
 964                     if snip:
 965                         num -= 1
 966                 idx += 1
 967
 968         except IOError, e:
 969             book = catalogue.models.Book.objects.filter(id=book_id)
 970             if not book:
 971                 log.error("Book does not exist for book id = %d" % book_id)
 972             elif not book.get().children.exists():
 973                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 974             return []
 975         finally:
 976             snippets.close()
 977
 978             # remove verse end markers..
 979         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
 980
 981         searchresult.snippets = snips
 982
 983         return snips
 984
 985     @staticmethod
 986     def apply_filters(query, filters):
 987         """
 988         Apply filters to a query
 989         """
 990         if filters is None:
 991             filters = []
 992         filters = filter(lambda x: x is not None, filters)
 993         for f in filters:
 994             query = query.query(f)
 995         return query
 996
 997
 998 if getattr(settings, 'SEARCH_MOCK', False):
 999     from .mock_search import Search