a712b07026046c0f4984210dbbc643889ab13162
[wolnelektury.git] / src / search / index.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from functools import total_ordering
5 from itertools import chain
6 import logging
7 import operator
8 import os
9 import re
10 from django.conf import settings
11 from librarian import dcparser
12 from librarian.parser import WLDocument
13 from lxml import etree
14 import scorched
15 import catalogue.models
16 import picture.models
17 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
18 from wolnelektury.utils import makedirs
19 from . import custom
20
# Module-wide logger for the search application.
log = logging.getLogger('search')


# Load the Solr stopword list once, at import time.  Lines starting with '#'
# are skipped; every other line is stripped of surrounding whitespace.
# When the configured file does not exist the stopword set is empty.
if os.path.isfile(settings.SOLR_STOPWORDS):
    # Read inside a `with` block so the file handle is closed right away
    # instead of being left for the garbage collector.
    with open(settings.SOLR_STOPWORDS) as stopwords_file:
        stopwords = {
            line.strip()
            for line in stopwords_file if not line.startswith('#')}
else:
    stopwords = set()
30
31
class SolrIndex(object):
    """
    Holds a `custom.CustomSolrInterface` built from `settings.SOLR`.

    The interface is available as `self.index`.
    """

    def __init__(self, mode=None):
        # `mode` is handed to the interface unchanged.
        solr_location = settings.SOLR
        self.index = custom.CustomSolrInterface(solr_location, mode=mode)
35
36
class Snippets(object):
    """
    Manages the snippet file of one indexed object (book).

    Snippets are written one after another into a single file; each of them
    is addressed by a (position, length) pair counted in bytes of its UTF-8
    encoding.  Those pairs are kept in index fields by the callers.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Make sure the snippet directory exists before anything is opened.
        snippet_root = os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR)
        makedirs(snippet_root)
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """Path of the snippet file for the current book id and revision."""
        if self.revision:
            file_name = "%d.%d" % (self.book_id, self.revision)
        else:
            file_name = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, file_name)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode) and return self.

        When opening for writing and the file already exists, the revision
        is bumped (starting from 1) until an unused file name is found, so
        an existing file is never overwritten.
        Call .close() afterwards.
        """
        if 'b' not in mode:
            mode = mode + 'b'

        if 'w' in mode and os.path.exists(self.path):
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (text) to the snippet file.
        Return a (position, length) tuple, both counted in encoded bytes.
        """
        encoded = snippet.encode('utf-8')
        size = len(encoded)
        self.file.write(encoded)
        start = self.position
        self.position = start + size
        return start, size

    def get(self, pos):
        """
        Given a (position, length) tuple, return the decoded text
        of the snippet stored there.
        """
        self.file.seek(pos[0], os.SEEK_SET)
        raw = self.file.read(pos[1])
        return raw.decode('utf-8')

    def close(self):
        """Close the snippet file, if one has been opened."""
        handle = self.file
        if not handle:
            return
        handle.close()

    def remove(self):
        """
        Delete the unrevisioned snippet file, then revisions 1, 2, ...
        until one of them cannot be unlinked.  Nothing more is attempted
        when the unrevisioned file itself cannot be unlinked.
        """
        self.revision = None
        try:
            os.unlink(self.path)
        except OSError:
            return

        self.revision = 0
        while True:
            self.revision += 1
            try:
                os.unlink(self.path)
            except OSError:
                break
116
117
118 class Index(SolrIndex):
119     """
120     Class indexing books.
121     """
122     def __init__(self):
123         super(Index, self).__init__(mode='rw')
124
125     def delete_query(self, *queries):
126         """
127         index.delete(queries=...) doesn't work, so let's reimplement it
128         using deletion of list of uids.
129         """
130         uids = set()
131         for q in queries:
132             if isinstance(q, scorched.search.LuceneQuery):
133                 q = self.index.query(q)
134             q.field_limiter.update(['uid'])
135             st = 0
136             rows = 100
137             while True:
138                 ids = q.paginate(start=st, rows=rows).execute()
139                 if not len(ids):
140                     break
141                 for res in ids:
142                     uids.add(res['uid'])
143                 st += rows
144         if uids:
145             self.index.delete(uids)
146             return True
147         else:
148             return False
149
150     def index_tags(self, *tags, **kw):
151         """
152         Re-index global tag list.
153         Removes all tags from index, then index them again.
154         Indexed fields include: id, name (with and without polish stems), category
155         """
156         log.debug("Indexing tags")
157         remove_only = kw.get('remove_only', False)
158         # first, remove tags from index.
159         if tags:
160             tag_qs = []
161             for tag in tags:
162                 q_id = self.index.Q(tag_id=tag.id)
163
164                 if isinstance(tag, PDCounterAuthor):
165                     q_cat = self.index.Q(tag_category='pd_author')
166                 elif isinstance(tag, PDCounterBook):
167                     q_cat = self.index.Q(tag_category='pd_book')
168                 else:
169                     q_cat = self.index.Q(tag_category=tag.category)
170
171                 q_id_cat = self.index.Q(q_id & q_cat)
172                 tag_qs.append(q_id_cat)
173             self.delete_query(*tag_qs)
174         else:  # all
175             q = self.index.Q(tag_id__any=True)
176             self.delete_query(q)
177
178         if not remove_only:
179             # then add them [all or just one passed]
180             if not tags:
181                 tags = chain(
182                     catalogue.models.Tag.objects.exclude(category='set'),
183                     PDCounterAuthor.objects.all(),
184                     PDCounterBook.objects.all())
185
186             for tag in tags:
187                 if isinstance(tag, PDCounterAuthor):
188                     doc = {
189                         "tag_id": int(tag.id),
190                         "tag_name": tag.name,
191                         "tag_name_pl": tag.name,
192                         "tag_category": 'pd_author',
193                         "is_pdcounter": True,
194                         "uid": "tag%d_pd_a" % tag.id
195                         }
196                 elif isinstance(tag, PDCounterBook):
197                     doc = {
198                         "tag_id": int(tag.id),
199                         "tag_name": tag.title,
200                         "tag_name_pl": tag.title,
201                         "tag_category": 'pd_book',
202                         "is_pdcounter": True,
203                         "uid": "tag%d_pd_b" % tag.id
204                         }
205                 else:
206                     doc = {
207                         "tag_id": int(tag.id),
208                         "tag_name": tag.name,
209                         "tag_name_pl": tag.name,
210                         "tag_category": tag.category,
211                         "is_pdcounter": False,
212                         "uid": "tag%d" % tag.id
213                         }
214                 self.index.add(doc)
215
216     def create_book_doc(self, book):
217         """
218         Create a lucene document referring book id.
219         """
220         doc = {'book_id': int(book.id)}
221         if book.parent is not None:
222             doc['parent_id'] = int(book.parent.id)
223         return doc
224
225     def remove_book(self, book_or_id, remove_snippets=True):
226         """Removes a book from search index.
227         book - Book instance."""
228         if isinstance(book_or_id, catalogue.models.Book):
229             book_id = book_or_id.id
230         else:
231             book_id = book_or_id
232
233         self.delete_query(self.index.Q(book_id=book_id))
234
235         if remove_snippets:
236             snippets = Snippets(book_id)
237             snippets.remove()
238
239     def index_book(self, book, book_info=None, overwrite=True):
240         """
241         Indexes the book.
242         Creates a lucene document for extracted metadata
243         and calls self.index_content() to index the contents of the book.
244         """
245         if overwrite:
246             # we don't remove snippets, since they might be still needed by
247             # threads using not reopened index
248             self.remove_book(book, remove_snippets=False)
249
250         book_doc = self.create_book_doc(book)
251         meta_fields = self.extract_metadata(book, book_info, dc_only=[
252             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
253         # let's not index it - it's only used for extracting publish date
254         if 'source_name' in meta_fields:
255             del meta_fields['source_name']
256
257         for n, f in meta_fields.items():
258             book_doc[n] = f
259
260         book_doc['uid'] = "book%s" % book_doc['book_id']
261         self.index.add(book_doc)
262         del book_doc
263         book_fields = {
264             'title': meta_fields['title'],
265             'authors': meta_fields['authors'],
266             'published_date': meta_fields['published_date']
267             }
268
269         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
270             if tag_name in meta_fields:
271                 book_fields[tag_name] = meta_fields[tag_name]
272
273         self.index_content(book, book_fields=book_fields)
274
275     master_tags = [
276         'opowiadanie',
277         'powiesc',
278         'dramat_wierszowany_l',
279         'dramat_wierszowany_lp',
280         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
281         'wywiad',
282     ]
283
284     ignore_content_tags = [
285         'uwaga', 'extra', 'nota_red', 'abstrakt',
286         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
287         'didaskalia',
288         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
289     ]
290
291     footnote_tags = ['pa', 'pt', 'pr', 'pe']
292
293     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
294                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
295
296     published_date_re = re.compile("([0-9]+)[\]. ]*$")
297
298     def extract_metadata(self, book, book_info=None, dc_only=None):
299         """
300         Extract metadata from book and returns a map of fields keyed by fieldname
301         """
302         fields = {}
303
304         if book_info is None:
305             book_info = dcparser.parse(open(book.xml_file.path))
306
307         fields['slug'] = book.slug
308         fields['is_book'] = True
309
310         # validator, name
311         for field in dcparser.BookInfo.FIELDS:
312             if dc_only and field.name not in dc_only:
313                 continue
314             if hasattr(book_info, field.name):
315                 if not getattr(book_info, field.name):
316                     continue
317                 # since no type information is available, we use validator
318                 type_indicator = field.validator
319                 if type_indicator == dcparser.as_unicode:
320                     s = getattr(book_info, field.name)
321                     if field.multiple:
322                         s = ', '.join(s)
323                     fields[field.name] = s
324                 elif type_indicator == dcparser.as_person:
325                     p = getattr(book_info, field.name)
326                     if isinstance(p, dcparser.Person):
327                         persons = str(p)
328                     else:
329                         persons = ', '.join(map(str, p))
330                     fields[field.name] = persons
331                 elif type_indicator == dcparser.as_date:
332                     dt = getattr(book_info, field.name)
333                     fields[field.name] = dt
334
335         # get published date
336         pd = None
337         if hasattr(book_info, 'source_name') and book_info.source_name:
338             match = self.published_date_re.search(book_info.source_name)
339             if match is not None:
340                 pd = str(match.groups()[0])
341         if not pd:
342             pd = ""
343         fields["published_date"] = pd
344
345         return fields
346
347     # def add_gaps(self, fields, fieldname):
348     #     """
349     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
350     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
351     #     """
352     #     def gap():
353     #         while True:
354     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
355     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
356
357     def get_master(self, root):
358         """
359         Returns the first master tag from an etree.
360         """
361         for master in root.iter():
362             if master.tag in self.master_tags:
363                 return master
364
365     def index_content(self, book, book_fields):
366         """
367         Walks the book XML and extract content from it.
368         Adds parts for each header tag and for each fragment.
369         """
370         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
371         root = wld.edoc.getroot()
372
373         master = self.get_master(root)
374         if master is None:
375             return []
376
377         def walker(node):
378             if node.tag not in self.ignore_content_tags:
379                 yield node, None, None
380                 if node.text is not None:
381                     yield None, node.text, None
382                 for child in list(node):
383                     for b, t, e in walker(child):
384                         yield b, t, e
385                 yield None, None, node
386
387             if node.tail is not None:
388                 yield None, node.tail, None
389             return
390
391         def fix_format(text):
392             # separator = [u" ", u"\t", u".", u";", u","]
393             if isinstance(text, list):
394                 # need to join it first
395                 text = filter(lambda s: s is not None, content)
396                 text = u' '.join(text)
397                 # for i in range(len(text)):
398                 #     if i > 0:
399                 #         if text[i][0] not in separator\
400                 #             and text[i - 1][-1] not in separator:
401                 #          text.insert(i, u" ")
402
403             return re.sub("(?m)/$", "", text)
404
405         def add_part(snippets, **fields):
406             doc = self.create_book_doc(book)
407             for n, v in book_fields.items():
408                 doc[n] = v
409
410             doc['header_index'] = fields["header_index"]
411             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
412             doc['header_type'] = fields['header_type']
413
414             doc['text'] = fields['text']
415
416             # snippets
417             snip_pos = snippets.add(fields["text"])
418
419             doc['snippets_position'] = snip_pos[0]
420             doc['snippets_length'] = snip_pos[1]
421             if snippets.revision:
422                 doc["snippets_revision"] = snippets.revision
423
424             if 'fragment_anchor' in fields:
425                 doc["fragment_anchor"] = fields['fragment_anchor']
426
427             if 'themes' in fields:
428                 doc['themes'] = fields['themes']
429             doc['uid'] = "part%s-%s-%s-%s" % (
430                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
431             return doc
432
433         fragments = {}
434         snippets = Snippets(book.id).open('w')
435         try:
436             for header, position in zip(list(master), range(len(master))):
437
438                 if header.tag in self.skip_header_tags:
439                     continue
440                 if header.tag is etree.Comment:
441                     continue
442
443                 # section content
444                 content = []
445                 footnote = []
446
447                 def all_content(text):
448                     for frag in fragments.values():
449                         frag['text'].append(text)
450                     content.append(text)
451                 handle_text = [all_content]
452
453                 for start, text, end in walker(header):
454                     # handle footnotes
455                     if start is not None and start.tag in self.footnote_tags:
456                         footnote = []
457
458                         def collect_footnote(t):
459                             footnote.append(t)
460
461                         handle_text.append(collect_footnote)
462                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
463                         handle_text.pop()
464                         doc = add_part(snippets, header_index=position, header_type=header.tag,
465                                        text=u''.join(footnote),
466                                        is_footnote=True)
467                         self.index.add(doc)
468                         footnote = []
469
470                     # handle fragments and themes.
471                     if start is not None and start.tag == 'begin':
472                         fid = start.attrib['id'][1:]
473                         fragments[fid] = {
474                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
475
476                     # themes for this fragment
477                     elif start is not None and start.tag == 'motyw':
478                         fid = start.attrib['id'][1:]
479                         handle_text.append(lambda text: None)
480                         if start.text is not None:
481                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
482                     elif end is not None and end.tag == 'motyw':
483                         handle_text.pop()
484
485                     elif start is not None and start.tag == 'end':
486                         fid = start.attrib['id'][1:]
487                         if fid not in fragments:
488                             continue  # a broken <end> node, skip it
489                         frag = fragments[fid]
490                         if not frag['themes']:
491                             continue  # empty themes list.
492                         del fragments[fid]
493
494                         doc = add_part(snippets,
495                                        header_type=frag['start_header'],
496                                        header_index=frag['start_section'],
497                                        header_span=position - frag['start_section'] + 1,
498                                        fragment_anchor=fid,
499                                        text=fix_format(frag['text']),
500                                        themes=frag['themes'])
501                         self.index.add(doc)
502
503                         # Collect content.
504
505                     if text is not None and handle_text is not []:
506                         hdl = handle_text[-1]
507                         hdl(text)
508
509                         # in the end, add a section text.
510                 doc = add_part(snippets, header_index=position,
511                                header_type=header.tag, text=fix_format(content))
512
513                 self.index.add(doc)
514
515         finally:
516             snippets.close()
517
518     def remove_picture(self, picture_or_id):
519         """Removes a picture from search index."""
520         if isinstance(picture_or_id, picture.models.Picture):
521             picture_id = picture_or_id.id
522         else:
523             picture_id = picture_or_id
524         self.delete_query(self.index.Q(picture_id=picture_id))
525
526     def index_picture(self, picture, picture_info=None, overwrite=True):
527         """
528         Indexes the picture.
529         Creates a lucene document for extracted metadata
530         and calls self.index_area() to index the contents of the picture.
531         """
532         if overwrite:
533             # we don't remove snippets, since they might be still needed by
534             # threads using not reopened index
535             self.remove_picture(picture)
536
537         picture_doc = {'picture_id': int(picture.id)}
538         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
539             'authors', 'title', 'epochs', 'kinds', 'genres'])
540
541         picture_doc.update(meta_fields)
542
543         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
544         self.index.add(picture_doc)
545         del picture_doc['is_book']
546         for area in picture.areas.all():
547             self.index_area(area, picture_fields=picture_doc)
548
549     def index_area(self, area, picture_fields):
550         """
551         Indexes themes and objects on the area.
552         """
553         doc = dict(picture_fields)
554         doc['area_id'] = area.id
555         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
556         doc['uid'] = 'area%s' % area.id
557         self.index.add(doc)
558
559
@total_ordering
class SearchResult(object):
    """
    A book found by a search, together with the raw content hits in it.

    A result is built from one index document; results for the same book
    are combined with merge() / aggregate().  The `hits` property turns
    the raw hits into a de-duplicated list of dicts sorted by score.
    """

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc - mapping with the fields of an index document; 'book_id' is
            required, 'score' and 'published_date' are optional.  When
            'header_type' is present the document is a content hit and
            'header_index', 'header_span', 'snippets_position' and
            'snippets_length' are required as well.
        how_found - stored with the hit as is.
        query_terms - optional terms compared with the words of theme
            names when `hits` is computed.
        """
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        self._score = doc.get('score', 0)
        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (TypeError, ValueError):
            # TypeError: the field is missing (None);
            # ValueError: it is not a number (e.g. an empty string).
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            raw_span = doc['header_span']
            # a missing or zero span counts as 1
            header_span = (int(raw_span) if raw_span is not None else 1) or 1
            position = (header_type, int(doc["header_index"]), header_span)
            fragment = doc.get("fragment_anchor", None)

            hit = (position, fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': (doc['snippets_position'], doc['snippets_length']),
                'snippets_revision': doc.get('snippets_revision', None),
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        """
        Build a result without content hits straight from a book object;
        the score is the book's popularity count.
        """
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __str__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __bytes__(self):
        return str(self).encode('utf-8')

    @property
    def score(self):
        """Raw score multiplied by the boost."""
        return self._score * self.boost

    def merge(self, other):
        """
        Add the hits and the (non-negative part of the) score of another
        result for the same book to this one and return self.
        Raises ValueError when the other result is for a different book.
        """
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        """Return the Book of this result, fetching it on first use."""
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # Indexes into a raw hit tuple: (position, fragment, score, other) ...
    POSITION = 0
    FRAGMENT = 1
    # ... and into its position tuple: (header_type, index, span).
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @staticmethod
    def _themes_hit(themes, themes_pl, query_terms):
        """
        Return the set of those names from `themes` for which some word of
        the name, or of the entry of `themes_pl` at the same index, equals
        (after lowercasing the word) one of `query_terms`.
        """
        matched = set()
        if query_terms is None:
            return matched
        for i, theme in enumerate(themes):
            words = re.split(r' +', theme)
            # themes_pl may be shorter than themes (it defaults to [])
            if i < len(themes_pl):
                words += themes_pl[i].split(' ')
            # A set, not a one-shot iterator: it is checked once per term.
            lowered = {word.lower() for word in words}
            if any(term in lowered for term in query_terms):
                matched.add(theme)
        return matched

    @property
    def hits(self):
        """
        Processed hits: a list of dicts sorted by descending score.

        Section hits covered by some fragment hit are dropped, only the
        best hit is kept for each section and for each fragment, and
        fragments no longer present in the database are skipped.
        The list is computed once and cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments; real lists, because both of them are
        # traversed more than once below
        frag_hits = [hit for hit in self._hits if hit[self.FRAGMENT] is not None]
        section_hits = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        def covered_by_fragment(section_hit):
            index = section_hit[self.POSITION][self.POSITION_INDEX]
            return any(
                f[self.POSITION][self.POSITION_INDEX] <= index <
                f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN]
                for f in frag_hits)

        # remove duplicate sections, keeping the best scored one
        sections = {}
        for s in section_hits:
            if covered_by_fragment(s):
                continue
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections and sections[si]['score'] >= s[self.SCORE]:
                continue

            m = {'score': s[self.SCORE],
                 'section_number': si + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        # remove fragments with duplicated fid's: an already seen hit is
        # kept only if its score is strictly greater than the new one's
        best_frags = {}
        for f in frag_hits:
            fid = f[self.FRAGMENT]
            kept = best_frags.get(fid)
            if kept is not None and kept[self.SCORE] > f[self.SCORE]:
                continue
            best_frags[fid] = f

        for f in best_frags.values():
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            matched_names = self._themes_hit(
                f[self.OTHER]['themes'], f[self.OTHER]['themes_pl'], self.query_terms)

            # Map the matched names to the theme tags of the fragment
            # (first tag wins for a repeated name); names without a tag
            # are dropped.
            themes_by_name = {}
            for theme in themes:
                themes_by_name.setdefault(theme.name, theme)
            themes_hit = [themes_by_name[name] for name in matched_names if name in themes_by_name]

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """
        Merge the results of all the given lists book by book and return
        the merged results (a dict values view).
        """
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __lt__(self, other):
        # Note the direction: the result with the lower score is the
        # smaller one; for equal scores, the one with the greater
        # (published_date, author key, title key).
        return (-self.score, self.published_date, self.book.sort_key_author, self.book.sort_key) > \
               (-other.score, other.published_date, other.book.sort_key_author, other.book.sort_key)

    def __eq__(self, other):
        return (self.score, self.published_date, self.book.sort_key_author, self.book.sort_key) == \
               (other.score, other.published_date, other.book.sort_key_author, other.book.sort_key)

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """Return the (position, length) of the snippet of the idx-th hit."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Return the snippet revision of the idx-th hit, None if unknown."""
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
768
769
@total_ordering
class PictureResult(object):
    """
    A picture found by a search, together with the raw area hits in it.

    A result is built from one index document; results for the same
    picture are combined with merge() / aggregate().
    """

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc - mapping with the fields of an index document; 'picture_id'
            is required, 'score' is optional.  A document with a truthy
            'area_id' is recorded as an area hit.
        how_found - stored with the hit as is.
        query_terms - optional terms compared with the words of theme
            names when `hits` is computed.
        """
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        self._score = doc.get('score', 0)
        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)

    def __str__(self):
        return u"<PR id=%d score=%f >" % (self.picture_id, self._score)

    def __repr__(self):
        return str(self)

    @property
    def score(self):
        """Raw score multiplied by the boost."""
        return self._score * self.boost

    def merge(self, other):
        """
        Add the hits and the (non-negative part of the) score of another
        result for the same picture to this one and return self.
        Raises ValueError when the other result is for a different picture.
        """
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    # Indexes into a raw hit tuple: (score, other).
    SCORE = 0
    OTHER = 1

    @staticmethod
    def _themes_hit(themes, themes_pl, query_terms):
        """
        Return the set of those names from `themes` for which some word of
        the name, or of the entry of `themes_pl` at the same index, equals
        (after lowercasing the word) one of `query_terms`.
        """
        matched = set()
        if query_terms is None:
            return matched
        for i, theme in enumerate(themes):
            words = re.split(r' +', theme)
            # themes_pl may be shorter than themes (it defaults to [])
            if i < len(themes_pl):
                words += themes_pl[i].split(' ')
            # A set, not a one-shot iterator: it is checked once per term.
            lowered = {word.lower() for word in words}
            if any(term in lowered for term in query_terms):
                matched.add(theme)
        return matched

    @property
    def hits(self):
        """
        Processed hits: a list holding at most one dict, the best scored
        area hit whose area still exists in the database.
        The list is computed once and cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            details = hit[self.OTHER]
            try:
                area = picture.models.PictureArea.objects.get(id=details['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes_hit = self._themes_hit(
                details['themes'], details['themes_pl'], self.query_terms)

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(details)
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        hits = hits[:1]
        self._processed_hits = hits
        return hits

    def get_picture(self):
        """Return the Picture of this result, fetching it on first use."""
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        """
        Merge the results of all the given lists picture by picture and
        return the merged results (a dict values view).
        """
        pictures = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in pictures:
                    pictures[r.picture_id].merge(r)
                else:
                    pictures[r.picture_id] = r
        return pictures.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score
876
877
class Search(SolrIndex):
    """
    Search facilities.

    Opens the index in read mode and offers helpers for building term
    queries, searching for words in books and pictures, and producing
    highlighted snippets for search results.
    """
    def __init__(self, default_field="text"):
        # default_field is kept in the signature for existing callers;
        # it is not used here.
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.
        query - string of terms separated by single spaces (None is
                treated as an empty string)
        field - index field the terms are matched against
        modal - binary operator used to join the term queries
        """
        # reduce() is not a builtin on Python 3.
        from functools import reduce

        if query is None:
            query = ''
        terms = [self.index.Q(**{field: s}) for s in query.split(" ")]
        return reduce(modal, terms, self.index.Q())

    def search_by_author(self, words):
        """
        Returns SearchResults for up to 30 top-level books (ordered by
        descending popularity count) whose cached author matches every
        one of `words` as a whole word, case-insensitively.
        """
        from catalogue.models import Book
        books = Book.objects.filter(parent=None).order_by('-popularity__count')
        for word in words:
            # Raw string: \m and \M are meant for the database regex engine
            # (word-boundary escapes in PostgreSQL syntax), not for Python.
            # NOTE(review): `word` is interpolated into the pattern
            # unescaped, so regex metacharacters typed by a user change
            # (or break) the pattern - consider escaping it.
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    @staticmethod
    def _join_or(queries):
        """OR-join the given queries; returns None if there are none."""
        joined = None
        for q in queries:
            if joined is None:
                joined = q
            else:
                joined |= q
        return joined

    def search_words(self, words, fields, required=None, book=True, picture=False):
        """
        Search the index for documents matching `words`.

        Every word has to be found in at least one of `fields`; if
        `required` is given, additionally at least one word has to be found
        in one of the `required` fields. Stopwords are skipped only when
        searching for neither books nor pictures.

        Returns a list of PictureResult (if `picture`) or SearchResult
        objects. A book-only search limited to the 'authors' field is
        delegated to search_by_author().
        """
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        if book or picture:
            used_words = list(words)
        else:
            used_words = [word for word in words if word not in stopwords]

        # A None filter (no fields for a word, or no usable word for the
        # required fields) is left out, so that an effectively empty search
        # hits the early return below instead of running an unfiltered
        # query.
        filters = []
        for word in used_words:
            word_filter = self._join_or(
                self.index.Q(**{field: word}) for field in fields)
            if word_filter is not None:
                filters.append(word_filter)
        if required:
            required_filter = self._join_or(
                self.index.Q(**{field: word})
                for field in required for word in used_words)
            if required_filter is not None:
                filters.append(required_filter)
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Builds highlighted snippets for the hits of `searchresult`.

        Returns a list with one slot per hit: a highlighted snippet or
        None. At most `num` non-empty snippets are produced (all of them
        if `num` is None, negative or larger than the number of hits).
        The list is also stored as searchresult.snippets.

        If the snippet file cannot be read, the problem is logged and an
        empty list is returned.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            for idx in range(maxnum):
                if num <= 0:
                    break
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    # No snippet data for this hit: move on to the next
                    # one (the loop must still advance).
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                # Skip duplicates of snippets that were already collected.
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d", book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s", book_id, revision, e)
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query

        `filters` may be None; None entries in it are skipped.
        """
        for f in filters or []:
            if f is not None:
                query = query.query(f)
        return query
1001
1002
# When settings.SEARCH_MOCK is set to a true value, rebind the module-level
# name `Search` to the implementation from .mock_search.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search