# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from functools import reduce, total_ordering
from itertools import chain
import logging
import operator
import os
import re
from django.conf import settings
from librarian import dcparser
import librarian.meta.types.person
import librarian.meta.types.text
from librarian.parser import WLDocument
from lxml import etree
import scorched
import catalogue.models
import picture.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from wolnelektury.utils import makedirs
from . import custom

log = logging.getLogger('search')


if os.path.isfile(settings.SOLR_STOPWORDS):
    with open(settings.SOLR_STOPWORDS) as f:
        stopwords = set(
            line.strip()
            for line in f if not line.startswith('#'))
else:
    stopwords = set()

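# Judging by the parsing above, the stopwords file is expected to hold one
# word per line, with '#' starting comment lines, e.g.:
#
#     # common Polish stopwords
#     ale
#     czy
#     i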

class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    This class manages the snippet file for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # Don't overwrite an existing file: bump the revision until
            # we find an unused path.
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (str) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet stored
        there as a string.
        """
        self.file.seek(pos[0], 0)
        try:
            txt = self.file.read(pos[1]).decode('utf-8')
        except (IOError, UnicodeDecodeError):
            return ''
        return txt

    def close(self):
        """Close the snippet file."""
        if self.file:
            self.file.close()

    def remove(self):
        self.revision = None
        try:
            os.unlink(self.path)
            # Remove revision files too, until the first missing one.
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass

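# Usage sketch for Snippets (hypothetical book id; not executed on import):
#
#     snips = Snippets(book_id=1).open('w')
#     try:
#         pos = snips.add('Litwo! Ojczyzno moja!')  # -> (0, 21)
#     finally:
#         snips.close()
#     snips = Snippets(book_id=1, revision=snips.revision).open()
#     assert snips.get(pos) == 'Litwo! Ojczyzno moja!'
#     snips.close()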

class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def remove_snippets(self, book):
        book.snippet_set.all().delete()

    def add_snippet(self, book, doc):
        assert book.id == doc.pop('book_id')
        # Fragments already exist and can be indexed where they live.
        if 'fragment_anchor' in doc:
            return

        text = doc.pop('text')
        header_index = doc.pop('header_index')
        book.snippet_set.create(
            sec=header_index,
            text=text,
        )

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            # FIXME: With Solr API change, this doesn't work.
            #self.index.delete(uids)
            return True
        else:
            return False

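    # delete_query usage sketch (mirrors the scorched-style calls above;
    # not executed on import):
    #
    #     idx = Index()
    #     idx.delete_query(idx.index.Q(book_id=1))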
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # First, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # Then add them (all, or just the ones passed).
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book, remove_snippets=True, legacy=True):
        """Removes a book from the search index.
        book - Book instance."""
        if legacy:
            self.delete_query(self.index.Q(book_id=book.id))

            if remove_snippets:
                snippets = Snippets(book.id)
                snippets.remove()
        self.remove_snippets(book)

    def index_book(self, book, book_info=None, overwrite=True, legacy=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if not book.xml_file:
            return

        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_book(book, remove_snippets=False, legacy=legacy)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # Let's not index source_name - it's only used for extracting the publish date.
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        if legacy:
            self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields, legacy=legacy)

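    # index_book usage sketch (assumes a catalogue Book with an xml_file;
    # not executed on import):
    #
    #     idx = Index()
    #     idx.index_book(book)                   # remove old entries, then index
    #     idx.index_book(book, overwrite=False)  # index without removing first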
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
    ]

    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

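    # published_date_re grabs the final run of digits, ignoring trailing
    # ']', '.' and spaces; e.g. (illustrative input):
    #
    #     published_date_re.search('Czytelnik, Warszawa 1990.').group(1)
    #     # -> '1990'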
    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            with open(book.xml_file.path, 'rb') as f:
                book_info = dcparser.parse(f)

        fields['slug'] = book.slug
        fields['is_book'] = True

        # Map DC fields to index fields, depending on the value type.
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                type_indicator = field.value_type
                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif issubclass(type_indicator, librarian.meta.types.person.Person):
                    p = getattr(book_info, field.name)
                    if isinstance(p, librarian.meta.types.person.Person):
                        persons = str(p)
                    else:
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons

        # Get the published date.
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

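    # extract_metadata returns, illustratively, something like:
    #
    #     {'slug': 'pan-tadeusz', 'is_book': True, 'title': 'Pan Tadeusz',
    #      'authors': 'Adam Mickiewicz', 'published_date': '1834'}
    #
    # (the exact keys depend on dcparser.BookInfo.FIELDS and dc_only).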
    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields, legacy=True):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        # Yields (start-node, text, end-node) events; exactly one element
        # of each tuple is not None.
        def walker(node):
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

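        # For example (illustrative), for <strofa>Ala <begin id="b1"/>ma</strofa>,
        # walker yields roughly:
        #
        #     (<strofa>, None, None), (None, 'Ala ', None),
        #     (<begin>, None, None), (None, None, <begin>),
        #     (None, 'ma', None), (None, None, <strofa>)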
        def fix_format(text):
            # separator = [" ", "\t", ".", ";", ","]
            if isinstance(text, list):
                # Need to join it first.
                text = filter(lambda s: s is not None, text)
                text = ' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, " ")

            return re.sub("(?m)/$", "", text)

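        # fix_format example (illustrative): the verse-end marker is stripped
        # only at line ends, so
        # fix_format('Litwo! Ojczyzno moja!/\nTy jesteś jak zdrowie.')
        # returns 'Litwo! Ojczyzno moja!\nTy jesteś jak zdrowie.'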
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # Snippets: store the text and remember where it landed.
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # Section content.
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # Handle footnotes.
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=''.join(footnote))
                        self.add_snippet(book, doc)
                        if legacy:
                            self.index.add(doc)
                        footnote = []

                    # Handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # Themes for this fragment.
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        # Add a searchable fragment.
                        self.add_snippet(book, doc)
                        if legacy:
                            self.index.add(doc)

                    # Collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # In the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.add_snippet(book, doc)
                if legacy:
                    self.index.add(doc)

        finally:
            snippets.close()

    def remove_picture(self, picture_or_id):
        """Removes a picture from the search index."""
        if isinstance(picture_or_id, picture.models.Picture):
            picture_id = picture_or_id.id
        else:
            picture_id = picture_or_id
        self.delete_query(self.index.Q(picture_id=picture_id))

    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Indexes the picture.
        Creates a Lucene document for the extracted metadata
        and calls self.index_area() to index the contents of the picture.
        """
        if overwrite:
            self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])

        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        # extract_metadata sets is_book; don't copy that onto the area docs.
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)

    def index_area(self, area, picture_fields):
        """
        Indexes themes and objects on the area.
        """
        doc = dict(picture_fields)
        doc['area_id'] = area.id
        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
        doc['uid'] = 'area%s' % area.id
        self.index.add(doc)


@total_ordering
class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # Content hits.
        header_type = doc.get("header_type", None)
        # We have a content hit in some header or fragment.
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span) if header_span is not None else 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __str__(self):
        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __bytes__(self):
        return str(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)

    # Indices into the raw hit tuples gathered in self._hits.
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

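    # A raw hit in self._hits is a tuple, illustratively:
    #
    #     (('strofa', 12, 1),  # POSITION: (header_type, index, span)
    #      'f3',               # FRAGMENT anchor, or None for a plain section
    #      4.2,                # SCORE
    #      {...})              # OTHER: snippet position, themes, etc.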
    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # Split raw hits into fragment hits and section hits.
        frags = [r for r in self._hits if r[self.FRAGMENT] is not None]
        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # Leave only sections not covered by any fragment hit.
        def covered(s):
            return any(
                f[self.POSITION][self.POSITION_INDEX]
                <= s[self.POSITION][self.POSITION_INDEX]
                < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN]
                for f in frags)
        sect = [s for s in sect if not covered(s)]

        def remove_duplicates(lst, keyfn, larger):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if larger(els[eif], e):
                        continue
                els[eif] = e
            return els.values()

        # Remove fragments with duplicated fids, keeping the better score.
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

        # Remove duplicate sections, keeping the better score.
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # Skip if an existing entry scores at least as well.
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in a theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = f[self.OTHER]['themes'][i].split() + f[self.OTHER]['themes_pl'][i].split()
                    tms = [t.lower() for t in tms]
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = list(filter(lambda t: t.name == n, themes))
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def get_sort_key(self):
        return (-self.score,
                self.published_date,
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


@total_ordering
class PictureResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)

    def __str__(self):
        return "<PR id=%d score=%f >" % (self.picture_id, self._score)

    def __repr__(self):
        return str(self)

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    SCORE = 0
    OTHER = 1

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            try:
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in a theme name.
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    tms = hit[self.OTHER]['themes'][i].split() + hit[self.OTHER]['themes_pl'][i].split()
                    tms = [t.lower() for t in tms]
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(hit[self.OTHER]['themes'][i])
                            break

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(hit[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        # Keep only the single best area hit per picture.
        hits = hits[:1]
        self._processed_hits = hits
        return hits

    def get_picture(self):
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        pictures = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in pictures:
                    pictures[r.picture_id].merge(r)
                else:
                    pictures[r.picture_id] = r
        return pictures.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - operator to apply to the boolean query.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split()), q)

        return q

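    # make_term_query example (illustrative): make_term_query('dzika róża')
    # builds Q() | Q(text='dzika') | Q(text='róża'); pass
    # modal=operator.and_ to require all terms.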
    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        for word in words:
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, required=None, book=True, picture=False):
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)
        filters = []
        for word in words:
            if book or picture or (word not in stopwords):
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)
        if required:
            required_filter = None
            for field in required:
                for word in words:
                    if book or picture or (word not in stopwords):
                        q = self.index.Q(**{field: word})
                        if required_filter is None:
                            required_filter = q
                        else:
                            required_filter |= q
            filters.append(required_filter)
        if not filters:
            return []
        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

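    # search_words usage sketch (illustrative):
    #
    #     s = Search()
    #     for result in s.search_words(['pan', 'tadeusz'], ['title', 'text']):
    #         print(result.book_id, result.score)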
    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns snippets for a search result.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # Remove verse end markers.
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query


if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search