68a2b3b1831d51f615f662153b0c879d8b55f328
[wolnelektury.git] / src / search / index.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from functools import reduce, total_ordering
5 from itertools import chain
6 import logging
7 import operator
8 import os
9 import re
10 from django.conf import settings
11 from librarian import dcparser
12 import librarian.meta.types.date
13 import librarian.meta.types.person
14 import librarian.meta.types.text
15 from librarian.parser import WLDocument
16 from lxml import etree
17 import scorched
18 import catalogue.models
19 import picture.models
20 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
21 from wolnelektury.utils import makedirs
22 from . import custom
23
24 log = logging.getLogger('search')
25
26
# Stop words are read once, at import time, from the file named by
# settings.SOLR_STOPWORDS; lines starting with '#' are skipped.
if os.path.isfile(settings.SOLR_STOPWORDS):
    # A context manager closes the file handle deterministically.
    with open(settings.SOLR_STOPWORDS) as _stopwords_file:
        stopwords = set(
            line.strip()
            for line in _stopwords_file if not line.startswith('#'))
else:
    stopwords = set()
33
34
class SolrIndex(object):
    """Base class that creates the Solr interface stored in `self.index`."""

    def __init__(self, mode=None):
        # `mode` is handed over unchanged to the custom Solr interface.
        solr_setting = settings.SOLR
        self.index = custom.CustomSolrInterface(solr_setting, mode=mode)
38
39
class Snippets(object):
    """
    Manage the snippet file of an indexed object (book).

    All snippets of a book are concatenated into one file; the position and
    length of each snippet are returned to the caller so they can be kept in
    index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        """Make sure the snippet directory exists and remember the book id."""
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """Path of the snippet file for the current book id and revision."""
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode). Call .close() afterwards.

        When opening for writing and the file already exists, the first
        unused revision number is picked so the existing file is not
        overwritten.  Returns self.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            # `path` depends on `revision`, so bump it until the path is free.
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (str) to the snippet file.
        Return a (position, length) tuple, both counted in UTF-8 bytes.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return the snippet stored there
        as str, or '' if it cannot be read or decoded.
        """
        self.file.seek(pos[0], 0)
        try:
            return self.file.read(pos[1]).decode('utf-8')
        except (OSError, ValueError):
            # ValueError covers UnicodeDecodeError and reads on a closed
            # file; unlike a bare except this does not swallow
            # KeyboardInterrupt or SystemExit.
            return ''

    def close(self):
        """Close snippet file"""
        if self.file:
            self.file.close()

    def remove(self):
        """Delete the unrevisioned snippet file and all consecutive revisions."""
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            # Keep deleting revisions 1, 2, ... until one is missing;
            # the resulting OSError ends the loop.
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass
122
123
124 class Index(SolrIndex):
125     """
126     Class indexing books.
127     """
128     def __init__(self):
129         super(Index, self).__init__(mode='rw')
130
131     def delete_query(self, *queries):
132         """
133         index.delete(queries=...) doesn't work, so let's reimplement it
134         using deletion of list of uids.
135         """
136         uids = set()
137         for q in queries:
138             if isinstance(q, scorched.search.LuceneQuery):
139                 q = self.index.query(q)
140             q.field_limiter.update(['uid'])
141             st = 0
142             rows = 100
143             while True:
144                 ids = q.paginate(start=st, rows=rows).execute()
145                 if not len(ids):
146                     break
147                 for res in ids:
148                     uids.add(res['uid'])
149                 st += rows
150         if uids:
151             # FIXME: With Solr API change, this doesn't work.
152             #self.index.delete(uids)
153             return True
154         else:
155             return False
156
157     def index_tags(self, *tags, **kw):
158         """
159         Re-index global tag list.
160         Removes all tags from index, then index them again.
161         Indexed fields include: id, name (with and without polish stems), category
162         """
163         log.debug("Indexing tags")
164         remove_only = kw.get('remove_only', False)
165         # first, remove tags from index.
166         if tags:
167             tag_qs = []
168             for tag in tags:
169                 q_id = self.index.Q(tag_id=tag.id)
170
171                 if isinstance(tag, PDCounterAuthor):
172                     q_cat = self.index.Q(tag_category='pd_author')
173                 elif isinstance(tag, PDCounterBook):
174                     q_cat = self.index.Q(tag_category='pd_book')
175                 else:
176                     q_cat = self.index.Q(tag_category=tag.category)
177
178                 q_id_cat = self.index.Q(q_id & q_cat)
179                 tag_qs.append(q_id_cat)
180             self.delete_query(*tag_qs)
181         else:  # all
182             q = self.index.Q(tag_id__any=True)
183             self.delete_query(q)
184
185         if not remove_only:
186             # then add them [all or just one passed]
187             if not tags:
188                 tags = chain(
189                     catalogue.models.Tag.objects.exclude(category='set'),
190                     PDCounterAuthor.objects.all(),
191                     PDCounterBook.objects.all())
192
193             for tag in tags:
194                 if isinstance(tag, PDCounterAuthor):
195                     doc = {
196                         "tag_id": int(tag.id),
197                         "tag_name": tag.name,
198                         "tag_name_pl": tag.name,
199                         "tag_category": 'pd_author',
200                         "is_pdcounter": True,
201                         "uid": "tag%d_pd_a" % tag.id
202                         }
203                 elif isinstance(tag, PDCounterBook):
204                     doc = {
205                         "tag_id": int(tag.id),
206                         "tag_name": tag.title,
207                         "tag_name_pl": tag.title,
208                         "tag_category": 'pd_book',
209                         "is_pdcounter": True,
210                         "uid": "tag%d_pd_b" % tag.id
211                         }
212                 else:
213                     doc = {
214                         "tag_id": int(tag.id),
215                         "tag_name": tag.name,
216                         "tag_name_pl": tag.name,
217                         "tag_category": tag.category,
218                         "is_pdcounter": False,
219                         "uid": "tag%d" % tag.id
220                         }
221                 self.index.add(doc)
222
223     def create_book_doc(self, book):
224         """
225         Create a lucene document referring book id.
226         """
227         doc = {'book_id': int(book.id)}
228         if book.parent is not None:
229             doc['parent_id'] = int(book.parent.id)
230         return doc
231
232     def remove_book(self, book_or_id, remove_snippets=True):
233         """Removes a book from search index.
234         book - Book instance."""
235         if isinstance(book_or_id, catalogue.models.Book):
236             book_id = book_or_id.id
237         else:
238             book_id = book_or_id
239
240         self.delete_query(self.index.Q(book_id=book_id))
241
242         if remove_snippets:
243             snippets = Snippets(book_id)
244             snippets.remove()
245
246     def index_book(self, book, book_info=None, overwrite=True):
247         """
248         Indexes the book.
249         Creates a lucene document for extracted metadata
250         and calls self.index_content() to index the contents of the book.
251         """
252         if overwrite:
253             # we don't remove snippets, since they might be still needed by
254             # threads using not reopened index
255             self.remove_book(book, remove_snippets=False)
256
257         book_doc = self.create_book_doc(book)
258         meta_fields = self.extract_metadata(book, book_info, dc_only=[
259             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
260         # let's not index it - it's only used for extracting publish date
261         if 'source_name' in meta_fields:
262             del meta_fields['source_name']
263
264         for n, f in meta_fields.items():
265             book_doc[n] = f
266
267         book_doc['uid'] = "book%s" % book_doc['book_id']
268         self.index.add(book_doc)
269         del book_doc
270         book_fields = {
271             'title': meta_fields['title'],
272             'authors': meta_fields['authors'],
273             'published_date': meta_fields['published_date']
274             }
275
276         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
277             if tag_name in meta_fields:
278                 book_fields[tag_name] = meta_fields[tag_name]
279
280         self.index_content(book, book_fields=book_fields)
281
282     master_tags = [
283         'opowiadanie',
284         'powiesc',
285         'dramat_wierszowany_l',
286         'dramat_wierszowany_lp',
287         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
288         'wywiad',
289     ]
290
291     ignore_content_tags = [
292         'uwaga', 'extra', 'nota_red', 'abstrakt',
293         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
294         'didaskalia',
295         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
296     ]
297
298     footnote_tags = ['pa', 'pt', 'pr', 'pe']
299
300     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
301                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
302
303     published_date_re = re.compile("([0-9]+)[\]. ]*$")
304
305     def extract_metadata(self, book, book_info=None, dc_only=None):
306         """
307         Extract metadata from book and returns a map of fields keyed by fieldname
308         """
309         fields = {}
310
311         if book_info is None:
312             book_info = dcparser.parse(open(book.xml_file.path))
313
314         fields['slug'] = book.slug
315         fields['is_book'] = True
316
317         # validator, name
318         for field in dcparser.BookInfo.FIELDS:
319             if dc_only and field.name not in dc_only:
320                 continue
321             if hasattr(book_info, field.name):
322                 if not getattr(book_info, field.name):
323                     continue
324                 type_indicator = field.value_type
325                 if issubclass(type_indicator, librarian.meta.types.text.TextValue):
326                     s = getattr(book_info, field.name)
327                     if field.multiple:
328                         s = ', '.join(s)
329                     fields[field.name] = s
330                 elif issubclass(type_indicator, librarian.meta.types.person.Person):
331                     p = getattr(book_info, field.name)
332                     if isinstance(p, librarian.meta.types.person.Person):
333                         persons = str(p)
334                     else:
335                         persons = ', '.join(map(str, p))
336                     fields[field.name] = persons
337                 elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
338                     dt = getattr(book_info, field.name)
339                     fields[field.name] = dt
340
341         # get published date
342         pd = None
343         if hasattr(book_info, 'source_name') and book_info.source_name:
344             match = self.published_date_re.search(book_info.source_name)
345             if match is not None:
346                 pd = str(match.groups()[0])
347         if not pd:
348             pd = ""
349         fields["published_date"] = pd
350
351         return fields
352
353     # def add_gaps(self, fields, fieldname):
354     #     """
355     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
356     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
357     #     """
358     #     def gap():
359     #         while True:
360     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
361     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
362
363     def get_master(self, root):
364         """
365         Returns the first master tag from an etree.
366         """
367         for master in root.iter():
368             if master.tag in self.master_tags:
369                 return master
370
371     def index_content(self, book, book_fields):
372         """
373         Walks the book XML and extract content from it.
374         Adds parts for each header tag and for each fragment.
375         """
376         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
377         root = wld.edoc.getroot()
378
379         master = self.get_master(root)
380         if master is None:
381             return []
382
383         def walker(node):
384             if node.tag not in self.ignore_content_tags:
385                 yield node, None, None
386                 if node.text is not None:
387                     yield None, node.text, None
388                 for child in list(node):
389                     for b, t, e in walker(child):
390                         yield b, t, e
391                 yield None, None, node
392
393             if node.tail is not None:
394                 yield None, node.tail, None
395             return
396
397         def fix_format(text):
398             # separator = [" ", "\t", ".", ";", ","]
399             if isinstance(text, list):
400                 # need to join it first
401                 text = filter(lambda s: s is not None, content)
402                 text = ' '.join(text)
403                 # for i in range(len(text)):
404                 #     if i > 0:
405                 #         if text[i][0] not in separator\
406                 #             and text[i - 1][-1] not in separator:
407                 #          text.insert(i, " ")
408
409             return re.sub("(?m)/$", "", text)
410
411         def add_part(snippets, **fields):
412             doc = self.create_book_doc(book)
413             for n, v in book_fields.items():
414                 doc[n] = v
415
416             doc['header_index'] = fields["header_index"]
417             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
418             doc['header_type'] = fields['header_type']
419
420             doc['text'] = fields['text']
421
422             # snippets
423             snip_pos = snippets.add(fields["text"])
424
425             doc['snippets_position'] = snip_pos[0]
426             doc['snippets_length'] = snip_pos[1]
427             if snippets.revision:
428                 doc["snippets_revision"] = snippets.revision
429
430             if 'fragment_anchor' in fields:
431                 doc["fragment_anchor"] = fields['fragment_anchor']
432
433             if 'themes' in fields:
434                 doc['themes'] = fields['themes']
435             doc['uid'] = "part%s-%s-%s-%s" % (
436                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
437             return doc
438
439         fragments = {}
440         snippets = Snippets(book.id).open('w')
441         try:
442             for header, position in zip(list(master), range(len(master))):
443
444                 if header.tag in self.skip_header_tags:
445                     continue
446                 if header.tag is etree.Comment:
447                     continue
448
449                 # section content
450                 content = []
451                 footnote = []
452
453                 def all_content(text):
454                     for frag in fragments.values():
455                         frag['text'].append(text)
456                     content.append(text)
457                 handle_text = [all_content]
458
459                 for start, text, end in walker(header):
460                     # handle footnotes
461                     if start is not None and start.tag in self.footnote_tags:
462                         footnote = []
463
464                         def collect_footnote(t):
465                             footnote.append(t)
466
467                         handle_text.append(collect_footnote)
468                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
469                         handle_text.pop()
470                         doc = add_part(snippets, header_index=position, header_type=header.tag,
471                                        text=''.join(footnote),
472                                        is_footnote=True)
473                         self.index.add(doc)
474                         footnote = []
475
476                     # handle fragments and themes.
477                     if start is not None and start.tag == 'begin':
478                         fid = start.attrib['id'][1:]
479                         fragments[fid] = {
480                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
481
482                     # themes for this fragment
483                     elif start is not None and start.tag == 'motyw':
484                         fid = start.attrib['id'][1:]
485                         handle_text.append(lambda text: None)
486                         if start.text is not None:
487                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
488                     elif end is not None and end.tag == 'motyw':
489                         handle_text.pop()
490
491                     elif start is not None and start.tag == 'end':
492                         fid = start.attrib['id'][1:]
493                         if fid not in fragments:
494                             continue  # a broken <end> node, skip it
495                         frag = fragments[fid]
496                         if not frag['themes']:
497                             continue  # empty themes list.
498                         del fragments[fid]
499
500                         doc = add_part(snippets,
501                                        header_type=frag['start_header'],
502                                        header_index=frag['start_section'],
503                                        header_span=position - frag['start_section'] + 1,
504                                        fragment_anchor=fid,
505                                        text=fix_format(frag['text']),
506                                        themes=frag['themes'])
507                         self.index.add(doc)
508
509                         # Collect content.
510
511                     if text is not None and handle_text is not []:
512                         hdl = handle_text[-1]
513                         hdl(text)
514
515                         # in the end, add a section text.
516                 doc = add_part(snippets, header_index=position,
517                                header_type=header.tag, text=fix_format(content))
518
519                 self.index.add(doc)
520
521         finally:
522             snippets.close()
523
524     def remove_picture(self, picture_or_id):
525         """Removes a picture from search index."""
526         if isinstance(picture_or_id, picture.models.Picture):
527             picture_id = picture_or_id.id
528         else:
529             picture_id = picture_or_id
530         self.delete_query(self.index.Q(picture_id=picture_id))
531
532     def index_picture(self, picture, picture_info=None, overwrite=True):
533         """
534         Indexes the picture.
535         Creates a lucene document for extracted metadata
536         and calls self.index_area() to index the contents of the picture.
537         """
538         if overwrite:
539             # we don't remove snippets, since they might be still needed by
540             # threads using not reopened index
541             self.remove_picture(picture)
542
543         picture_doc = {'picture_id': int(picture.id)}
544         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
545             'authors', 'title', 'epochs', 'kinds', 'genres'])
546
547         picture_doc.update(meta_fields)
548
549         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
550         self.index.add(picture_doc)
551         del picture_doc['is_book']
552         for area in picture.areas.all():
553             self.index_area(area, picture_fields=picture_doc)
554
555     def index_area(self, area, picture_fields):
556         """
557         Indexes themes and objects on the area.
558         """
559         doc = dict(picture_fields)
560         doc['area_id'] = area.id
561         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
562         doc['uid'] = 'area%s' % area.id
563         self.index.add(doc)
564
565
@total_ordering
class SearchResult(object):
    """
    Search result for a single book, built from an index document.

    Raw content hits are collected in `_hits` (and merged in from other
    results for the same book); the `hits` property turns them into a
    de-duplicated list of dicts sorted by descending score.
    """

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc - index document (dict-like); must contain 'book_id'.
        how_found - opaque marker stored with the hit.
        query_terms - optional iterable of terms matched against theme names.
        """
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # cache for the `hits` property
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        self._score = doc['score'] if 'score' in doc else 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (TypeError, ValueError):
            # TypeError: field missing (None); ValueError: not a number.
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            raw_span = doc['header_span']
            # A missing or zero span counts as 1.
            header_span = (int(raw_span) if raw_span is not None else 0) or 1
            position = (header_type, int(doc["header_index"]), header_span)
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (position, fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        """Build a result directly from a book, scored by its popularity count."""
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __str__(self):
        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __bytes__(self):
        return str(self).encode('utf-8')

    @property
    def score(self):
        """Raw score multiplied by `boost`."""
        return self._score * self.boost

    def merge(self, other):
        """
        Add the hits and the (non-negative) score of `other` to this result.

        Raises ValueError when `other` is a result for a different book.
        Returns self.
        """
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        """Return the findable Book with this result's id, or None."""
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)

    # Indexes into a raw hit tuple: (position, fragment, score, other) ...
    POSITION = 0
    FRAGMENT = 1
    # ... and into its position tuple: (header_type, index, span).
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @staticmethod
    def _matching_themes(themes, themes_pl, query_terms):
        """
        Return the set of names from `themes` having a word (of the name or
        of the entry at the same index in `themes_pl`) equal, case-folded to
        lower case, to one of `query_terms`.
        """
        matched = set()
        if query_terms is None:
            return matched
        for i, theme in enumerate(themes):
            words = theme.split()
            # themes_pl may be shorter than themes (or empty).
            if i < len(themes_pl):
                words += themes_pl[i].split()
            # A set, not a one-shot iterator: every term is really tested.
            words = {word.lower() for word in words}
            if any(term in words for term in query_terms):
                matched.add(theme)
        return matched

    @property
    def hits(self):
        """
        Processed hits: a list of dicts sorted by descending score.

        Fragment hits are de-duplicated by fragment anchor, section hits by
        section index (the higher score wins); sections covered by a
        fragment hit are dropped.  The result is cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = [hit for hit in self._hits if hit[self.FRAGMENT] is not None]
        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        def remove_duplicates(lst, keyfn, larger):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els and larger(els[eif], e):
                    continue
                els[eif] = e
            return list(els.values())

        # remove fragments with duplicated fid's and duplicated snippets
        frags = remove_duplicates(
            frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

        def covered_by_fragment(section):
            idx = section[self.POSITION][self.POSITION_INDEX]
            return any(
                f[self.POSITION][self.POSITION_INDEX] <= idx <
                f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN]
                for f in frags)

        # sections not covered by fragments
        sect = [s for s in sect if not covered_by_fragment(s)]

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections and sections[si]['score'] >= s[self.SCORE]:
                continue

            m = {'score': s[self.SCORE],
                 'section_number': si + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            other = f[self.OTHER]
            hit_names = self._matching_themes(
                other['themes'], other['themes_pl'], self.query_terms)

            themes_hit = []
            if hit_names:
                # First theme object for each name.
                by_name = {}
                for theme in themes:
                    by_name.setdefault(theme.name, theme)
                themes_hit = [by_name[name] for name in hit_names if name in by_name]

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            # NOTE: this overrides 'themes' with the names stored in the index.
            m.update(other)
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """Merge results for the same book across all lists; one result per book."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def get_sort_key(self):
        """Tuple used for ordering: negated score, date, author key, book key."""
        # Look the book up once; a missing book is not cached by get_book().
        book = self.book
        return (-self.score,
                self.published_date,
                book.sort_key_author if book else '',
                book.sort_key if book else '')

    def __lt__(self, other):
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """(position, length) of the snippet of the idx-th processed hit."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Snippet file revision of the idx-th processed hit, or None."""
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
779
780
@total_ordering
class PictureResult(object):
    """
    A picture found by a search, together with the picture areas that matched.

    Results for the same picture coming from several result lists can be
    combined with merge() / aggregate(). Instances are compared and ordered
    by their boosted score.
    """
    # Indices into the (score, data) tuples kept in self._hits.
    SCORE = 0
    OTHER = 1

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc - found index document (a mapping); must contain "picture_id",
              may contain "score", "area_id", "themes" and "themes_pl"
        how_found - label of the search method, stored with the hit
        query_terms - words of the query, used to find matching themes
        """
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.picture_id = int(doc["picture_id"])

        # Only documents describing a picture area are recorded as hits.
        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)

    def __str__(self):
        return "<PR id=%d score=%f >" % (self.picture_id, self._score)

    def __repr__(self):
        return str(self)

    @property
    def score(self):
        """Raw score multiplied by the boost."""
        return self._score * self.boost

    def merge(self, other):
        """
        Adds hits and (non-negative) score of another result for the same
        picture to this one. Returns self.

        Raises ValueError if the other result is for a different picture.
        """
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        # The set of raw hits has changed, so the cached processed hits
        # (if any) are no longer valid.
        self._processed_hits = None
        return self

    @staticmethod
    def _themes_matching(themes, themes_pl, query_terms):
        """
        Returns the set of names from `themes` for which one of the
        query terms equals a (lowercased) word of the theme name or of the
        entry of `themes_pl` at the same position.

        query_terms may be None, in which case nothing matches.
        """
        matched = set()
        if query_terms is None:
            return matched
        for i, theme in enumerate(themes):
            words = re.split(r' +', theme)
            # themes_pl may be shorter than themes (it defaults to []).
            if i < len(themes_pl):
                words += themes_pl[i].split(' ')
            # A real set: it is tested against every query term.
            words = {word.lower() for word in words}
            if any(qt in words for qt in query_terms):
                matched.add(theme)
        return matched

    @property
    def hits(self):
        """
        List with the best scoring hit (at most one element), as a dict
        with the hit data plus 'score', 'area' and 'themes_hit' keys.

        Hits whose picture area no longer exists in the database are
        skipped. The result is cached until the next merge().
        """
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            data = hit[self.OTHER]
            try:
                area = picture.models.PictureArea.objects.get(id=data['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes_hit = self._themes_matching(
                data['themes'], data['themes_pl'], self.query_terms)

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(data)
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        hits = hits[:1]
        self._processed_hits = hits
        return hits

    def get_picture(self):
        """Returns the Picture model instance, fetched on first use."""
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        """
        Merges results from all given lists so that there is one result
        per picture id. Returns the merged results (a dict values view).
        """
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in books:
                    books[r.picture_id].merge(r)
                else:
                    books[r.picture_id] = r
        return books.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score
887
888
class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        # default_field is not used here; it is kept in the signature for
        # backward compatibility. The index is opened in 'r' mode.
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.
        query - string of words separated by single spaces (None is
                treated as an empty string)
        field - index field each word is looked up in
        modal - binary operator used to join the term queries
        """
        if query is None:
            query = ''
        terms = (self.index.Q(**{field: word}) for word in query.split(" "))
        return reduce(modal, terms, self.index.Q())

    def search_by_author(self, words):
        """
        Returns up to 30 results for findable top-level books whose cached
        author contains every one of the given words as a whole word,
        ordered by descending popularity.
        """
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        for word in words:
            # \m and \M are word-boundary escapes of the database regex
            # dialect, hence the raw string. The word itself is escaped so
            # that it is matched literally and cannot inject regex syntax.
            pattern = r'\m%s\M' % re.escape(word)
            books = books.filter(cached_author__iregex=pattern).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, required=None, book=True, picture=False):
        """
        Searches the index for documents matching the given words.

        words - list of words; every word has to match in at least one
                of `fields`
        fields - names of index fields the words are looked up in
        required - optional names of fields of which at least one has to
                   match at least one of the words
        book - restrict the results to documents flagged as books
        picture - search pictures instead of books

        Returns a list of PictureResult (if picture) or SearchResult
        objects. An author-only book search is delegated to
        search_by_author(). Returns [] if there is nothing to search for.
        """
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        # Stopwords are only dropped when searching neither books nor pictures.
        if book or picture:
            active_words = list(words)
        else:
            active_words = [word for word in words if word not in stopwords]

        filters = []
        for word in active_words:
            word_filter = None
            for field in fields:
                q = self.index.Q(**{field: word})
                if word_filter is None:
                    word_filter = q
                else:
                    word_filter |= q
            # Do not collect empty filters: they would be ignored by
            # apply_filters() anyway, but would defeat the check below and
            # let an unrestricted query run.
            if word_filter is not None:
                filters.append(word_filter)
        if required:
            required_filter = None
            for field in required:
                for word in active_words:
                    q = self.index.Q(**{field: word})
                    if required_filter is None:
                        required_filter = q
                    else:
                        required_filter |= q
            if required_filter is not None:
                filters.append(required_filter)
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns snippets for the hits of a found search result.

        searchresult - result to get the snippets for
        query - query used for highlighting
        field - index field used for highlighting; for 'text' the
                'text_nonstem' field is tried as a fallback
        num - maximum number of non-empty snippets to collect; None,
              a negative number or a number greater than the number of
              hits means no limit

        The returned list has one entry per hit (None where no snippet
        was collected) and is also stored as searchresult.snippets.
        Returns [] if the snippet file cannot be read.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            # A for loop guarantees progress: hits without a position are
            # skipped instead of being retried forever.
            for idx in range(maxnum):
                if num <= 0:
                    break
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                # Identical snippets are kept only once.
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            if not book:
                log.error("Book does not exist for book id = %d", book_id)
            elif not book.get().children.exists():
                # Only logged for books without children.
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s", book_id, revision, e)
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query

        filters - iterable of filters or None; None entries are skipped.
        Returns the query narrowed by every filter.
        """
        if filters is None:
            filters = []
        for f in filters:
            if f is not None:
                query = query.query(f)
        return query
1012
1013
# When settings.SEARCH_MOCK is set to a true value, the name Search is rebound
# to the implementation imported from the mock_search module.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search