src/search/index.py

   1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
   2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
   3 #
   4 from functools import reduce, total_ordering
   5 from itertools import chain
   6 import logging
   7 import operator
   8 import os
   9 import re
  10 from django.conf import settings
  11 from librarian import dcparser
  12 from librarian.parser import WLDocument
  13 from lxml import etree
  14 import scorched
  15 import catalogue.models
  16 import picture.models
  17 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
  18 from wolnelektury.utils import makedirs
  19 from . import custom
  20
  21 log = logging.getLogger('search')
  22
  23
  24 if os.path.isfile(settings.SOLR_STOPWORDS):
  25     stopwords = set(
  26         line.strip()
  27         for line in open(settings.SOLR_STOPWORDS) if not line.startswith('#'))
  28 else:
  29     stopwords = set()
  30
  31
  32 class SolrIndex(object):
  33     def __init__(self, mode=None):
  34         self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
  35
  36
  37 class Snippets(object):
  38     """
  39     This class manages snippet files for indexed object (book)
  40     the snippets are concatenated together, and their positions and
  41     lengths are kept in lucene index fields.
  42     """
  43     SNIPPET_DIR = "snippets"
  44
  45     def __init__(self, book_id, revision=None):
  46         makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
  47         self.book_id = book_id
  48         self.revision = revision
  49         self.file = None
  50         self.position = None
  51
  52     @property
  53     def path(self):
  54         if self.revision:
  55             fn = "%d.%d" % (self.book_id, self.revision)
  56         else:
  57             fn = "%d" % self.book_id
  58
  59         return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
  60
  61     def open(self, mode='r'):
  62         """
  63         Open the snippet file. Call .close() afterwards.
  64         """
  65         if 'b' not in mode:
  66             mode += 'b'
  67
  68         if 'w' in mode:
  69             if os.path.exists(self.path):
  70                 self.revision = 1
  71                 while True:
  72                     if not os.path.exists(self.path):
  73                         break
  74                     self.revision += 1
  75
  76         self.file = open(self.path, mode)
  77         self.position = 0
  78         return self
  79
  80     def add(self, snippet):
  81         """
  82         Append a snippet (unicode) to the snippet file.
  83         Return a (position, length) tuple
  84         """
  85         txt = snippet.encode('utf-8')
  86         l = len(txt)
  87         self.file.write(txt)
  88         pos = (self.position, l)
  89         self.position += l
  90         return pos
  91
  92     def get(self, pos):
  93         """
  94         Given a tuple of (position, length) return an unicode
  95         of the snippet stored there.
  96         """
  97         self.file.seek(pos[0], 0)
  98         txt = self.file.read(pos[1]).decode('utf-8')
  99         return txt
 100
 101     def close(self):
 102         """Close snippet file"""
 103         if self.file:
 104             self.file.close()
 105
 106     def remove(self):
 107         self.revision = None
 108         try:
 109             os.unlink(self.path)
 110             self.revision = 0
 111             while True:
 112                 self.revision += 1
 113                 os.unlink(self.path)
 114         except OSError:
 115             pass
 116
 117
 118 class Index(SolrIndex):
 119     """
 120     Class indexing books.
 121     """
 122     def __init__(self):
 123         super(Index, self).__init__(mode='rw')
 124
 125     def delete_query(self, *queries):
 126         """
 127         index.delete(queries=...) doesn't work, so let's reimplement it
 128         using deletion of list of uids.
 129         """
 130         uids = set()
 131         for q in queries:
 132             if isinstance(q, scorched.search.LuceneQuery):
 133                 q = self.index.query(q)
 134             q.field_limiter.update(['uid'])
 135             st = 0
 136             rows = 100
 137             while True:
 138                 ids = q.paginate(start=st, rows=rows).execute()
 139                 if not len(ids):
 140                     break
 141                 for res in ids:
 142                     uids.add(res['uid'])
 143                 st += rows
 144         if uids:
 145             # FIXME: With Solr API change, this doesn't work.
 146             #self.index.delete(uids)
 147             return True
 148         else:
 149             return False
 150
 151     def index_tags(self, *tags, **kw):
 152         """
 153         Re-index global tag list.
 154         Removes all tags from index, then index them again.
 155         Indexed fields include: id, name (with and without polish stems), category
 156         """
 157         log.debug("Indexing tags")
 158         remove_only = kw.get('remove_only', False)
 159         # first, remove tags from index.
 160         if tags:
 161             tag_qs = []
 162             for tag in tags:
 163                 q_id = self.index.Q(tag_id=tag.id)
 164
 165                 if isinstance(tag, PDCounterAuthor):
 166                     q_cat = self.index.Q(tag_category='pd_author')
 167                 elif isinstance(tag, PDCounterBook):
 168                     q_cat = self.index.Q(tag_category='pd_book')
 169                 else:
 170                     q_cat = self.index.Q(tag_category=tag.category)
 171
 172                 q_id_cat = self.index.Q(q_id & q_cat)
 173                 tag_qs.append(q_id_cat)
 174             self.delete_query(*tag_qs)
 175         else:  # all
 176             q = self.index.Q(tag_id__any=True)
 177             self.delete_query(q)
 178
 179         if not remove_only:
 180             # then add them [all or just one passed]
 181             if not tags:
 182                 tags = chain(
 183                     catalogue.models.Tag.objects.exclude(category='set'),
 184                     PDCounterAuthor.objects.all(),
 185                     PDCounterBook.objects.all())
 186
 187             for tag in tags:
 188                 if isinstance(tag, PDCounterAuthor):
 189                     doc = {
 190                         "tag_id": int(tag.id),
 191                         "tag_name": tag.name,
 192                         "tag_name_pl": tag.name,
 193                         "tag_category": 'pd_author',
 194                         "is_pdcounter": True,
 195                         "uid": "tag%d_pd_a" % tag.id
 196                         }
 197                 elif isinstance(tag, PDCounterBook):
 198                     doc = {
 199                         "tag_id": int(tag.id),
 200                         "tag_name": tag.title,
 201                         "tag_name_pl": tag.title,
 202                         "tag_category": 'pd_book',
 203                         "is_pdcounter": True,
 204                         "uid": "tag%d_pd_b" % tag.id
 205                         }
 206                 else:
 207                     doc = {
 208                         "tag_id": int(tag.id),
 209                         "tag_name": tag.name,
 210                         "tag_name_pl": tag.name,
 211                         "tag_category": tag.category,
 212                         "is_pdcounter": False,
 213                         "uid": "tag%d" % tag.id
 214                         }
 215                 self.index.add(doc)
 216
 217     def create_book_doc(self, book):
 218         """
 219         Create a lucene document referring book id.
 220         """
 221         doc = {'book_id': int(book.id)}
 222         if book.parent is not None:
 223             doc['parent_id'] = int(book.parent.id)
 224         return doc
 225
 226     def remove_book(self, book_or_id, remove_snippets=True):
 227         """Removes a book from search index.
 228         book - Book instance."""
 229         if isinstance(book_or_id, catalogue.models.Book):
 230             book_id = book_or_id.id
 231         else:
 232             book_id = book_or_id
 233
 234         self.delete_query(self.index.Q(book_id=book_id))
 235
 236         if remove_snippets:
 237             snippets = Snippets(book_id)
 238             snippets.remove()
 239
 240     def index_book(self, book, book_info=None, overwrite=True):
 241         """
 242         Indexes the book.
 243         Creates a lucene document for extracted metadata
 244         and calls self.index_content() to index the contents of the book.
 245         """
 246         if overwrite:
 247             # we don't remove snippets, since they might be still needed by
 248             # threads using not reopened index
 249             self.remove_book(book, remove_snippets=False)
 250
 251         book_doc = self.create_book_doc(book)
 252         meta_fields = self.extract_metadata(book, book_info, dc_only=[
 253             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
 254         # let's not index it - it's only used for extracting publish date
 255         if 'source_name' in meta_fields:
 256             del meta_fields['source_name']
 257
 258         for n, f in meta_fields.items():
 259             book_doc[n] = f
 260
 261         book_doc['uid'] = "book%s" % book_doc['book_id']
 262         self.index.add(book_doc)
 263         del book_doc
 264         book_fields = {
 265             'title': meta_fields['title'],
 266             'authors': meta_fields['authors'],
 267             'published_date': meta_fields['published_date']
 268             }
 269
 270         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
 271             if tag_name in meta_fields:
 272                 book_fields[tag_name] = meta_fields[tag_name]
 273
 274         self.index_content(book, book_fields=book_fields)
 275
 276     master_tags = [
 277         'opowiadanie',
 278         'powiesc',
 279         'dramat_wierszowany_l',
 280         'dramat_wierszowany_lp',
 281         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
 282         'wywiad',
 283     ]
 284
 285     ignore_content_tags = [
 286         'uwaga', 'extra', 'nota_red', 'abstrakt',
 287         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
 288         'didaskalia',
 289         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
 290     ]
 291
 292     footnote_tags = ['pa', 'pt', 'pr', 'pe']
 293
 294     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
 295                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
 296
 297     published_date_re = re.compile("([0-9]+)[\]. ]*$")
 298
 299     def extract_metadata(self, book, book_info=None, dc_only=None):
 300         """
 301         Extract metadata from book and returns a map of fields keyed by fieldname
 302         """
 303         fields = {}
 304
 305         if book_info is None:
 306             book_info = dcparser.parse(open(book.xml_file.path))
 307
 308         fields['slug'] = book.slug
 309         fields['is_book'] = True
 310
 311         # validator, name
 312         for field in dcparser.BookInfo.FIELDS:
 313             if dc_only and field.name not in dc_only:
 314                 continue
 315             if hasattr(book_info, field.name):
 316                 if not getattr(book_info, field.name):
 317                     continue
 318                 # since no type information is available, we use validator
 319                 type_indicator = field.validator
 320                 if type_indicator == dcparser.as_unicode:
 321                     s = getattr(book_info, field.name)
 322                     if field.multiple:
 323                         s = ', '.join(s)
 324                     fields[field.name] = s
 325                 elif type_indicator == dcparser.as_person:
 326                     p = getattr(book_info, field.name)
 327                     if isinstance(p, dcparser.Person):
 328                         persons = str(p)
 329                     else:
 330                         persons = ', '.join(map(str, p))
 331                     fields[field.name] = persons
 332                 elif type_indicator == dcparser.as_date:
 333                     dt = getattr(book_info, field.name)
 334                     fields[field.name] = dt
 335
 336         # get published date
 337         pd = None
 338         if hasattr(book_info, 'source_name') and book_info.source_name:
 339             match = self.published_date_re.search(book_info.source_name)
 340             if match is not None:
 341                 pd = str(match.groups()[0])
 342         if not pd:
 343             pd = ""
 344         fields["published_date"] = pd
 345
 346         return fields
 347
 348     # def add_gaps(self, fields, fieldname):
 349     #     """
 350     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
 351     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
 352     #     """
 353     #     def gap():
 354     #         while True:
 355     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
 356     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
 357
 358     def get_master(self, root):
 359         """
 360         Returns the first master tag from an etree.
 361         """
 362         for master in root.iter():
 363             if master.tag in self.master_tags:
 364                 return master
 365
 366     def index_content(self, book, book_fields):
 367         """
 368         Walks the book XML and extract content from it.
 369         Adds parts for each header tag and for each fragment.
 370         """
 371         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
 372         root = wld.edoc.getroot()
 373
 374         master = self.get_master(root)
 375         if master is None:
 376             return []
 377
 378         def walker(node):
 379             if node.tag not in self.ignore_content_tags:
 380                 yield node, None, None
 381                 if node.text is not None:
 382                     yield None, node.text, None
 383                 for child in list(node):
 384                     for b, t, e in walker(child):
 385                         yield b, t, e
 386                 yield None, None, node
 387
 388             if node.tail is not None:
 389                 yield None, node.tail, None
 390             return
 391
 392         def fix_format(text):
 393             # separator = [" ", "\t", ".", ";", ","]
 394             if isinstance(text, list):
 395                 # need to join it first
 396                 text = filter(lambda s: s is not None, content)
 397                 text = ' '.join(text)
 398                 # for i in range(len(text)):
 399                 #     if i > 0:
 400                 #         if text[i][0] not in separator\
 401                 #             and text[i - 1][-1] not in separator:
 402                 #          text.insert(i, " ")
 403
 404             return re.sub("(?m)/$", "", text)
 405
 406         def add_part(snippets, **fields):
 407             doc = self.create_book_doc(book)
 408             for n, v in book_fields.items():
 409                 doc[n] = v
 410
 411             doc['header_index'] = fields["header_index"]
 412             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
 413             doc['header_type'] = fields['header_type']
 414
 415             doc['text'] = fields['text']
 416
 417             # snippets
 418             snip_pos = snippets.add(fields["text"])
 419
 420             doc['snippets_position'] = snip_pos[0]
 421             doc['snippets_length'] = snip_pos[1]
 422             if snippets.revision:
 423                 doc["snippets_revision"] = snippets.revision
 424
 425             if 'fragment_anchor' in fields:
 426                 doc["fragment_anchor"] = fields['fragment_anchor']
 427
 428             if 'themes' in fields:
 429                 doc['themes'] = fields['themes']
 430             doc['uid'] = "part%s-%s-%s-%s" % (
 431                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
 432             return doc
 433
 434         fragments = {}
 435         snippets = Snippets(book.id).open('w')
 436         try:
 437             for header, position in zip(list(master), range(len(master))):
 438
 439                 if header.tag in self.skip_header_tags:
 440                     continue
 441                 if header.tag is etree.Comment:
 442                     continue
 443
 444                 # section content
 445                 content = []
 446                 footnote = []
 447
 448                 def all_content(text):
 449                     for frag in fragments.values():
 450                         frag['text'].append(text)
 451                     content.append(text)
 452                 handle_text = [all_content]
 453
 454                 for start, text, end in walker(header):
 455                     # handle footnotes
 456                     if start is not None and start.tag in self.footnote_tags:
 457                         footnote = []
 458
 459                         def collect_footnote(t):
 460                             footnote.append(t)
 461
 462                         handle_text.append(collect_footnote)
 463                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
 464                         handle_text.pop()
 465                         doc = add_part(snippets, header_index=position, header_type=header.tag,
 466                                        text=''.join(footnote),
 467                                        is_footnote=True)
 468                         self.index.add(doc)
 469                         footnote = []
 470
 471                     # handle fragments and themes.
 472                     if start is not None and start.tag == 'begin':
 473                         fid = start.attrib['id'][1:]
 474                         fragments[fid] = {
 475                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
 476
 477                     # themes for this fragment
 478                     elif start is not None and start.tag == 'motyw':
 479                         fid = start.attrib['id'][1:]
 480                         handle_text.append(lambda text: None)
 481                         if start.text is not None:
 482                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
 483                     elif end is not None and end.tag == 'motyw':
 484                         handle_text.pop()
 485
 486                     elif start is not None and start.tag == 'end':
 487                         fid = start.attrib['id'][1:]
 488                         if fid not in fragments:
 489                             continue  # a broken <end> node, skip it
 490                         frag = fragments[fid]
 491                         if not frag['themes']:
 492                             continue  # empty themes list.
 493                         del fragments[fid]
 494
 495                         doc = add_part(snippets,
 496                                        header_type=frag['start_header'],
 497                                        header_index=frag['start_section'],
 498                                        header_span=position - frag['start_section'] + 1,
 499                                        fragment_anchor=fid,
 500                                        text=fix_format(frag['text']),
 501                                        themes=frag['themes'])
 502                         self.index.add(doc)
 503
 504                         # Collect content.
 505
 506                     if text is not None and handle_text is not []:
 507                         hdl = handle_text[-1]
 508                         hdl(text)
 509
 510                         # in the end, add a section text.
 511                 doc = add_part(snippets, header_index=position,
 512                                header_type=header.tag, text=fix_format(content))
 513
 514                 self.index.add(doc)
 515
 516         finally:
 517             snippets.close()
 518
 519     def remove_picture(self, picture_or_id):
 520         """Removes a picture from search index."""
 521         if isinstance(picture_or_id, picture.models.Picture):
 522             picture_id = picture_or_id.id
 523         else:
 524             picture_id = picture_or_id
 525         self.delete_query(self.index.Q(picture_id=picture_id))
 526
 527     def index_picture(self, picture, picture_info=None, overwrite=True):
 528         """
 529         Indexes the picture.
 530         Creates a lucene document for extracted metadata
 531         and calls self.index_area() to index the contents of the picture.
 532         """
 533         if overwrite:
 534             # we don't remove snippets, since they might be still needed by
 535             # threads using not reopened index
 536             self.remove_picture(picture)
 537
 538         picture_doc = {'picture_id': int(picture.id)}
 539         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
 540             'authors', 'title', 'epochs', 'kinds', 'genres'])
 541
 542         picture_doc.update(meta_fields)
 543
 544         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
 545         self.index.add(picture_doc)
 546         del picture_doc['is_book']
 547         for area in picture.areas.all():
 548             self.index_area(area, picture_fields=picture_doc)
 549
 550     def index_area(self, area, picture_fields):
 551         """
 552         Indexes themes and objects on the area.
 553         """
 554         doc = dict(picture_fields)
 555         doc['area_id'] = area.id
 556         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
 557         doc['uid'] = 'area%s' % area.id
 558         self.index.add(doc)
 559
 560
 561 @total_ordering
 562 class SearchResult(object):
 563     def __init__(self, doc, how_found=None, query_terms=None):
 564         self.boost = 1.0
 565         self._hits = []
 566         self._processed_hits = None  # processed hits
 567         self.snippets = []
 568         self.query_terms = query_terms
 569         self._book = None
 570
 571         if 'score' in doc:
 572             self._score = doc['score']
 573         else:
 574             self._score = 0
 575
 576         self.book_id = int(doc["book_id"])
 577
 578         try:
 579             self.published_date = int(doc.get("published_date"))
 580         except ValueError:
 581             self.published_date = 0
 582
 583         # content hits
 584         header_type = doc.get("header_type", None)
 585         # we have a content hit in some header of fragment
 586         if header_type is not None:
 587             sec = (header_type, int(doc["header_index"]))
 588             header_span = doc['header_span']
 589             header_span = header_span is not None and int(header_span) or 1
 590             fragment = doc.get("fragment_anchor", None)
 591             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
 592             snippets_rev = doc.get('snippets_revision', None)
 593
 594             hit = (sec + (header_span,), fragment, self._score, {
 595                 'how_found': how_found,
 596                 'snippets_pos': snippets_pos,
 597                 'snippets_revision': snippets_rev,
 598                 'themes': doc.get('themes', []),
 599                 'themes_pl': doc.get('themes_pl', [])
 600                 })
 601
 602             self._hits.append(hit)
 603
 604     @classmethod
 605     def from_book(cls, book, how_found=None, query_terms=None):
 606         doc = {
 607             'score': book.popularity.count,
 608             'book_id': book.id,
 609             'published_date': 0,
 610         }
 611         result = cls(doc, how_found=how_found, query_terms=query_terms)
 612         result._book = book
 613         return result
 614
 615     def __str__(self):
 616         return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
 617             (self.book_id, len(self._hits),
 618              len(self._processed_hits) if self._processed_hits else -1,
 619              self._score, len(self.snippets))
 620
 621     def __bytes__(self):
 622         return str(self).encode('utf-8')
 623
 624     @property
 625     def score(self):
 626         return self._score * self.boost
 627
 628     def merge(self, other):
 629         if self.book_id != other.book_id:
 630             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
 631         self._hits += other._hits
 632         self._score += max(other._score, 0)
 633         return self
 634
 635     def get_book(self):
 636         if self._book is not None:
 637             return self._book
 638         try:
 639             self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
 640         except catalogue.models.Book.DoesNotExist:
 641             self._book = None
 642         return self._book
 643
    # Read-only attribute access to get_book().
    book = property(get_book)

    # Indexes into a raw hit tuple as built in __init__:
    # (position, fragment anchor, score, details dict).
    POSITION = 0
    FRAGMENT = 1
    # Indexes into the position tuple (header_type, header_index, header_span).
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3
 651     OTHER = 3
 652
 653     @property
 654     def hits(self):
 655         if self._processed_hits is not None:
 656             return self._processed_hits
 657
 658         # to sections and fragments
 659         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
 660
 661         sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
 662
 663         # sections not covered by fragments
 664         sect = filter(lambda s: 0 == len(list(filter(
 665             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
 666                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
 667
 668         def remove_duplicates(lst, keyfn, larger):
 669             els = {}
 670             for e in lst:
 671                 eif = keyfn(e)
 672                 if eif in els:
 673                     if larger(els[eif], e):
 674                         continue
 675                 els[eif] = e
 676             return els.values()
 677
 678         # remove fragments with duplicated fid's and duplicated snippets
 679         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
 680
 681         # remove duplicate sections
 682         sections = {}
 683
 684         for s in sect:
 685             si = s[self.POSITION][self.POSITION_INDEX]
 686             # skip existing
 687             if si in sections:
 688                 if sections[si]['score'] >= s[self.SCORE]:
 689                     continue
 690
 691             m = {'score': s[self.SCORE],
 692                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
 693                  }
 694             m.update(s[self.OTHER])
 695             sections[si] = m
 696
 697         hits = list(sections.values())
 698
 699         for f in frags:
 700             try:
 701                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
 702             except catalogue.models.Fragment.DoesNotExist:
 703                 # stale index
 704                 continue
 705             # Figure out if we were searching for a token matching some word in theme name.
 706             themes = frag.tags.filter(category='theme')
 707             themes_hit = set()
 708             if self.query_terms is not None:
 709                 for i in range(0, len(f[self.OTHER]['themes'])):
 710                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
 711                     tms = map(str.lower, tms)
 712                     for qt in self.query_terms:
 713                         if qt in tms:
 714                             themes_hit.add(f[self.OTHER]['themes'][i])
 715                             break
 716
 717             def theme_by_name(n):
 718                 th = list(filter(lambda t: t.name == n, themes))
 719                 if th:
 720                     return th[0]
 721                 else:
 722                     return None
 723             themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
 724
 725             m = {'score': f[self.SCORE],
 726                  'fragment': frag,
 727                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
 728                  'themes': themes,
 729                  'themes_hit': themes_hit
 730                  }
 731             m.update(f[self.OTHER])
 732             hits.append(m)
 733
 734         hits.sort(key=lambda h: h['score'], reverse=True)
 735
 736         self._processed_hits = hits
 737
 738         return hits
 739
 740     @staticmethod
 741     def aggregate(*result_lists):
 742         books = {}
 743         for rl in result_lists:
 744             for r in rl:
 745                 if r.book_id in books:
 746                     books[r.book_id].merge(r)
 747                 else:
 748                     books[r.book_id] = r
 749         return books.values()
 750
 751     def get_sort_key(self):
 752         return (-self.score,
 753                 self.published_date,
 754                 self.book.sort_key_author if self.book else '',
 755                 self.book.sort_key if self.book else '')
 756
 757     def __lt__(self, other):
 758         return self.get_sort_key() > other.get_sort_key()
 759
 760     def __eq__(self, other):
 761         return self.get_sort_key() == other.get_sort_key()
 762
 763     def __len__(self):
 764         return len(self.hits)
 765
 766     def snippet_pos(self, idx=0):
 767         return self.hits[idx]['snippets_pos']
 768
 769     def snippet_revision(self, idx=0):
 770         try:
 771             return self.hits[idx]['snippets_revision']
 772         except (IndexError, KeyError):
 773             return None
 774
 775
 776 @total_ordering
 777 class PictureResult(object):
 778     def __init__(self, doc, how_found=None, query_terms=None):
 779         self.boost = 1.0
 780         self.query_terms = query_terms
 781         self._picture = None
 782         self._hits = []
 783         self._processed_hits = None
 784
 785         if 'score' in doc:
 786             self._score = doc['score']
 787         else:
 788             self._score = 0
 789
 790         self.picture_id = int(doc["picture_id"])
 791
 792         if doc.get('area_id'):
 793             hit = (self._score, {
 794                 'how_found': how_found,
 795                 'area_id': doc['area_id'],
 796                 'themes': doc.get('themes', []),
 797                 'themes_pl': doc.get('themes_pl', []),
 798             })
 799
 800             self._hits.append(hit)
 801
 802     def __str__(self):
 803         return "<PR id=%d score=%f >" % (self.picture_id, self._score)
 804
 805     def __repr__(self):
 806         return str(self)
 807
 808     @property
 809     def score(self):
 810         return self._score * self.boost
 811
 812     def merge(self, other):
 813         if self.picture_id != other.picture_id:
 814             raise ValueError(
 815                 "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
 816         self._hits += other._hits
 817         self._score += max(other._score, 0)
 818         return self
 819
 820     SCORE = 0
 821     OTHER = 1
 822
 823     @property
 824     def hits(self):
 825         if self._processed_hits is not None:
 826             return self._processed_hits
 827
 828         hits = []
 829         for hit in self._hits:
 830             try:
 831                 area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
 832             except picture.models.PictureArea.DoesNotExist:
 833                 # stale index
 834                 continue
 835             # Figure out if we were searching for a token matching some word in theme name.
 836             themes_hit = set()
 837             if self.query_terms is not None:
 838                 for i in range(0, len(hit[self.OTHER]['themes'])):
 839                     tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
 840                     tms = map(str.lower, tms)
 841                     for qt in self.query_terms:
 842                         if qt in tms:
 843                             themes_hit.add(hit[self.OTHER]['themes'][i])
 844                             break
 845
 846             m = {
 847                 'score': hit[self.SCORE],
 848                 'area': area,
 849                 'themes_hit': themes_hit,
 850             }
 851             m.update(hit[self.OTHER])
 852             hits.append(m)
 853
 854         hits.sort(key=lambda h: h['score'], reverse=True)
 855         hits = hits[:1]
 856         self._processed_hits = hits
 857         return hits
 858
 859     def get_picture(self):
 860         if self._picture is None:
 861             self._picture = picture.models.Picture.objects.get(id=self.picture_id)
 862         return self._picture
 863
 864     picture = property(get_picture)
 865
 866     @staticmethod
 867     def aggregate(*result_lists):
 868         books = {}
 869         for rl in result_lists:
 870             for r in rl:
 871                 if r.picture_id in books:
 872                     books[r.picture_id].merge(r)
 873                 else:
 874                     books[r.picture_id] = r
 875         return books.values()
 876
 877     def __lt__(self, other):
 878         return self.score < other.score
 879
 880     def __eq__(self, other):
 881         return self.score == other.score
 882
 883
 884 class Search(SolrIndex):
 885     """
 886     Search facilities.
 887     """
 888     def __init__(self, default_field="text"):
 889         super(Search, self).__init__(mode='r')
 890
 891     def make_term_query(self, query, field='text', modal=operator.or_):
 892         """
 893         Returns term queries joined by boolean query.
 894         modal - applies to boolean query
 895         fuzzy - should the query by fuzzy.
 896         """
 897         if query is None:
 898             query = ''
 899         q = self.index.Q()
 900         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
 901
 902         return q
 903
 904     def search_by_author(self, words):
 905         from catalogue.models import Book
 906         books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
 907         for word in words:
 908             books = books.filter(cached_author__iregex='\m%s\M' % word).select_related('popularity__count')
 909         return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]
 910
 911     def search_words(self, words, fields, required=None, book=True, picture=False):
 912         if book and not picture and fields == ['authors']:
 913             return self.search_by_author(words)
 914         filters = []
 915         for word in words:
 916             if book or picture or (word not in stopwords):
 917                 word_filter = None
 918                 for field in fields:
 919                     q = self.index.Q(**{field: word})
 920                     if word_filter is None:
 921                         word_filter = q
 922                     else:
 923                         word_filter |= q
 924                 filters.append(word_filter)
 925         if required:
 926             required_filter = None
 927             for field in required:
 928                 for word in words:
 929                     if book or picture or (word not in stopwords):
 930                         q = self.index.Q(**{field: word})
 931                         if required_filter is None:
 932                             required_filter = q
 933                         else:
 934                             required_filter |= q
 935             filters.append(required_filter)
 936         if not filters:
 937             return []
 938         params = {}
 939         if book:
 940             params['is_book'] = True
 941         if picture:
 942             params['picture_id__gt'] = 0
 943         else:
 944             params['book_id__gt'] = 0
 945         query = self.index.query(**params)
 946         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
 947         result_class = PictureResult if picture else SearchResult
 948         return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]
 949
 950     def get_snippets(self, searchresult, query, field='text', num=1):
 951         """
 952         Returns a snippet for found scoreDoc.
 953         """
 954         maxnum = len(searchresult)
 955         if num is None or num < 0 or num > maxnum:
 956             num = maxnum
 957         book_id = searchresult.book_id
 958         revision = searchresult.snippet_revision()
 959         snippets = Snippets(book_id, revision=revision)
 960         snips = [None] * maxnum
 961         try:
 962             snippets.open()
 963             idx = 0
 964             while idx < maxnum and num > 0:
 965                 position, length = searchresult.snippet_pos(idx)
 966                 if position is None or length is None:
 967                     continue
 968                 text = snippets.get((int(position),
 969                                      int(length)))
 970                 snip = self.index.highlight(text=text, field=field, q=query)
 971                 if not snip and field == 'text':
 972                     snip = self.index.highlight(text=text, field='text_nonstem', q=query)
 973                 if snip not in snips:
 974                     snips[idx] = snip
 975                     if snip:
 976                         num -= 1
 977                 idx += 1
 978
 979         except IOError as e:
 980             book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
 981             if not book:
 982                 log.error("Book does not exist for book id = %d" % book_id)
 983             elif not book.get().children.exists():
 984                 log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
 985             return []
 986         finally:
 987             snippets.close()
 988
 989         # remove verse end markers..
 990         snips = [s.replace("/\n", "\n") if s else s for s in snips]
 991
 992         searchresult.snippets = snips
 993
 994         return snips
 995
 996     @staticmethod
 997     def apply_filters(query, filters):
 998         """
 999         Apply filters to a query
1000         """
1001         if filters is None:
1002             filters = []
1003         filters = filter(lambda x: x is not None, filters)
1004         for f in filters:
1005             query = query.query(f)
1006         return query
1007
1008
# When the SEARCH_MOCK setting is present and truthy, the name `Search`
# in this module is rebound to the one imported from .mock_search.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search