bd31a2acfb1aaeda373fc9837f6ddf35a43db955
[wolnelektury.git] / src / search / index.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from functools import reduce, total_ordering
5 from itertools import chain
6 import logging
7 import operator
8 import os
9 import re
10 from django.conf import settings
11 from librarian import dcparser
12 from librarian.parser import WLDocument
13 from lxml import etree
14 import scorched
15 import catalogue.models
16 import picture.models
17 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
18 from wolnelektury.utils import makedirs
19 from . import custom
20
log = logging.getLogger('search')


# Stopwords are loaded once, at import time, from the file configured as
# settings.SOLR_STOPWORDS. Lines starting with '#' are treated as comments.
# When the file does not exist the stopword set is simply empty.
if os.path.isfile(settings.SOLR_STOPWORDS):
    # Use a context manager so the handle is closed deterministically, and
    # decode explicitly as UTF-8 (the encoding Solr itself uses for word
    # lists) instead of depending on the process locale.
    with open(settings.SOLR_STOPWORDS, encoding='utf-8') as stopwords_file:
        stopwords = set(
            line.strip()
            for line in stopwords_file if not line.startswith('#'))
else:
    stopwords = set()
30
31
class SolrIndex(object):
    """Holds the Solr interface object shared by the indexing code."""

    def __init__(self, mode=None):
        """
        Create the Solr interface for the server configured in settings.SOLR.

        `mode` is passed through unchanged to custom.CustomSolrInterface.
        """
        solr_url = settings.SOLR
        self.index = custom.CustomSolrInterface(solr_url, mode=mode)
35
36
class Snippets(object):
    """
    Manages the snippet file of an indexed object (book).

    All snippets of one book are concatenated in a single file; the position
    and length of each of them are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Make sure the snippet directory exists before any file is touched.
        snippet_root = os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR)
        makedirs(snippet_root)
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """Path of the snippet file for the current book id and revision."""
        if self.revision:
            filename = "%d.%d" % (self.book_id, self.revision)
        else:
            filename = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, filename)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode) and return self.

        When opening for writing and the file already exists, the first free
        revision number (starting at 1) is picked, so an existing file is
        never overwritten. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode = mode + 'b'

        if 'w' in mode and os.path.exists(self.path):
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.

        Return a (position, length) tuple; both values are counted in bytes
        of the UTF-8 encoded text.
        """
        encoded = snippet.encode('utf-8')
        start = self.position
        size = len(encoded)
        self.file.write(encoded)
        self.position = start + size
        return (start, size)

    def get(self, pos):
        """
        Return the snippet stored at `pos`, a (position, length) tuple
        as returned by add().
        """
        offset = pos[0]
        length = pos[1]
        self.file.seek(offset, 0)
        raw = self.file.read(length)
        return raw.decode('utf-8')

    def close(self):
        """Close the snippet file, if one has been opened."""
        if not self.file:
            return
        self.file.close()

    def remove(self):
        """
        Delete the base snippet file and then its numbered revisions
        (1, 2, ...), stopping at the first file that cannot be removed.
        """
        self.revision = None
        try:
            os.unlink(self.path)
        except OSError:
            return

        self.revision = 1
        while True:
            try:
                os.unlink(self.path)
            except OSError:
                return
            self.revision += 1
116
117
118 class Index(SolrIndex):
119     """
120     Class indexing books.
121     """
122     def __init__(self):
123         super(Index, self).__init__(mode='rw')
124
125     def delete_query(self, *queries):
126         """
127         index.delete(queries=...) doesn't work, so let's reimplement it
128         using deletion of list of uids.
129         """
130         uids = set()
131         for q in queries:
132             if isinstance(q, scorched.search.LuceneQuery):
133                 q = self.index.query(q)
134             q.field_limiter.update(['uid'])
135             st = 0
136             rows = 100
137             while True:
138                 ids = q.paginate(start=st, rows=rows).execute()
139                 if not len(ids):
140                     break
141                 for res in ids:
142                     uids.add(res['uid'])
143                 st += rows
144         if uids:
145             # FIXME: With Solr API change, this doesn't work.
146             #self.index.delete(uids)
147             return True
148         else:
149             return False
150
151     def index_tags(self, *tags, **kw):
152         """
153         Re-index global tag list.
154         Removes all tags from index, then index them again.
155         Indexed fields include: id, name (with and without polish stems), category
156         """
157         log.debug("Indexing tags")
158         remove_only = kw.get('remove_only', False)
159         # first, remove tags from index.
160         if tags:
161             tag_qs = []
162             for tag in tags:
163                 q_id = self.index.Q(tag_id=tag.id)
164
165                 if isinstance(tag, PDCounterAuthor):
166                     q_cat = self.index.Q(tag_category='pd_author')
167                 elif isinstance(tag, PDCounterBook):
168                     q_cat = self.index.Q(tag_category='pd_book')
169                 else:
170                     q_cat = self.index.Q(tag_category=tag.category)
171
172                 q_id_cat = self.index.Q(q_id & q_cat)
173                 tag_qs.append(q_id_cat)
174             self.delete_query(*tag_qs)
175         else:  # all
176             q = self.index.Q(tag_id__any=True)
177             self.delete_query(q)
178
179         if not remove_only:
180             # then add them [all or just one passed]
181             if not tags:
182                 tags = chain(
183                     catalogue.models.Tag.objects.exclude(category='set'),
184                     PDCounterAuthor.objects.all(),
185                     PDCounterBook.objects.all())
186
187             for tag in tags:
188                 if isinstance(tag, PDCounterAuthor):
189                     doc = {
190                         "tag_id": int(tag.id),
191                         "tag_name": tag.name,
192                         "tag_name_pl": tag.name,
193                         "tag_category": 'pd_author',
194                         "is_pdcounter": True,
195                         "uid": "tag%d_pd_a" % tag.id
196                         }
197                 elif isinstance(tag, PDCounterBook):
198                     doc = {
199                         "tag_id": int(tag.id),
200                         "tag_name": tag.title,
201                         "tag_name_pl": tag.title,
202                         "tag_category": 'pd_book',
203                         "is_pdcounter": True,
204                         "uid": "tag%d_pd_b" % tag.id
205                         }
206                 else:
207                     doc = {
208                         "tag_id": int(tag.id),
209                         "tag_name": tag.name,
210                         "tag_name_pl": tag.name,
211                         "tag_category": tag.category,
212                         "is_pdcounter": False,
213                         "uid": "tag%d" % tag.id
214                         }
215                 self.index.add(doc)
216
217     def create_book_doc(self, book):
218         """
219         Create a lucene document referring book id.
220         """
221         doc = {'book_id': int(book.id)}
222         if book.parent is not None:
223             doc['parent_id'] = int(book.parent.id)
224         return doc
225
226     def remove_book(self, book_or_id, remove_snippets=True):
227         """Removes a book from search index.
228         book - Book instance."""
229         if isinstance(book_or_id, catalogue.models.Book):
230             book_id = book_or_id.id
231         else:
232             book_id = book_or_id
233
234         self.delete_query(self.index.Q(book_id=book_id))
235
236         if remove_snippets:
237             snippets = Snippets(book_id)
238             snippets.remove()
239
240     def index_book(self, book, book_info=None, overwrite=True):
241         """
242         Indexes the book.
243         Creates a lucene document for extracted metadata
244         and calls self.index_content() to index the contents of the book.
245         """
246         if overwrite:
247             # we don't remove snippets, since they might be still needed by
248             # threads using not reopened index
249             self.remove_book(book, remove_snippets=False)
250
251         book_doc = self.create_book_doc(book)
252         meta_fields = self.extract_metadata(book, book_info, dc_only=[
253             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
254         # let's not index it - it's only used for extracting publish date
255         if 'source_name' in meta_fields:
256             del meta_fields['source_name']
257
258         for n, f in meta_fields.items():
259             book_doc[n] = f
260
261         book_doc['uid'] = "book%s" % book_doc['book_id']
262         self.index.add(book_doc)
263         del book_doc
264         book_fields = {
265             'title': meta_fields['title'],
266             'authors': meta_fields['authors'],
267             'published_date': meta_fields['published_date']
268             }
269
270         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
271             if tag_name in meta_fields:
272                 book_fields[tag_name] = meta_fields[tag_name]
273
274         self.index_content(book, book_fields=book_fields)
275
276     master_tags = [
277         'opowiadanie',
278         'powiesc',
279         'dramat_wierszowany_l',
280         'dramat_wierszowany_lp',
281         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
282         'wywiad',
283     ]
284
285     ignore_content_tags = [
286         'uwaga', 'extra', 'nota_red', 'abstrakt',
287         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
288         'didaskalia',
289         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
290     ]
291
292     footnote_tags = ['pa', 'pt', 'pr', 'pe']
293
294     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
295                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
296
297     published_date_re = re.compile("([0-9]+)[\]. ]*$")
298
299     def extract_metadata(self, book, book_info=None, dc_only=None):
300         """
301         Extract metadata from book and returns a map of fields keyed by fieldname
302         """
303         fields = {}
304
305         if book_info is None:
306             book_info = dcparser.parse(open(book.xml_file.path))
307
308         fields['slug'] = book.slug
309         fields['is_book'] = True
310
311         # validator, name
312         for field in dcparser.BookInfo.FIELDS:
313             if dc_only and field.name not in dc_only:
314                 continue
315             if hasattr(book_info, field.name):
316                 if not getattr(book_info, field.name):
317                     continue
318                 # since no type information is available, we use validator
319                 type_indicator = field.validator
320                 if type_indicator == dcparser.as_unicode:
321                     s = getattr(book_info, field.name)
322                     if field.multiple:
323                         s = ', '.join(s)
324                     fields[field.name] = s
325                 elif type_indicator == dcparser.as_person:
326                     p = getattr(book_info, field.name)
327                     if isinstance(p, dcparser.Person):
328                         persons = str(p)
329                     else:
330                         persons = ', '.join(map(str, p))
331                     fields[field.name] = persons
332                 elif type_indicator == dcparser.as_date:
333                     dt = getattr(book_info, field.name)
334                     fields[field.name] = dt
335
336         # get published date
337         pd = None
338         if hasattr(book_info, 'source_name') and book_info.source_name:
339             match = self.published_date_re.search(book_info.source_name)
340             if match is not None:
341                 pd = str(match.groups()[0])
342         if not pd:
343             pd = ""
344         fields["published_date"] = pd
345
346         return fields
347
348     # def add_gaps(self, fields, fieldname):
349     #     """
350     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
351     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
352     #     """
353     #     def gap():
354     #         while True:
355     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
356     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
357
358     def get_master(self, root):
359         """
360         Returns the first master tag from an etree.
361         """
362         for master in root.iter():
363             if master.tag in self.master_tags:
364                 return master
365
366     def index_content(self, book, book_fields):
367         """
368         Walks the book XML and extract content from it.
369         Adds parts for each header tag and for each fragment.
370         """
371         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
372         root = wld.edoc.getroot()
373
374         master = self.get_master(root)
375         if master is None:
376             return []
377
378         def walker(node):
379             if node.tag not in self.ignore_content_tags:
380                 yield node, None, None
381                 if node.text is not None:
382                     yield None, node.text, None
383                 for child in list(node):
384                     for b, t, e in walker(child):
385                         yield b, t, e
386                 yield None, None, node
387
388             if node.tail is not None:
389                 yield None, node.tail, None
390             return
391
392         def fix_format(text):
393             # separator = [u" ", u"\t", u".", u";", u","]
394             if isinstance(text, list):
395                 # need to join it first
396                 text = filter(lambda s: s is not None, content)
397                 text = u' '.join(text)
398                 # for i in range(len(text)):
399                 #     if i > 0:
400                 #         if text[i][0] not in separator\
401                 #             and text[i - 1][-1] not in separator:
402                 #          text.insert(i, u" ")
403
404             return re.sub("(?m)/$", "", text)
405
406         def add_part(snippets, **fields):
407             doc = self.create_book_doc(book)
408             for n, v in book_fields.items():
409                 doc[n] = v
410
411             doc['header_index'] = fields["header_index"]
412             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
413             doc['header_type'] = fields['header_type']
414
415             doc['text'] = fields['text']
416
417             # snippets
418             snip_pos = snippets.add(fields["text"])
419
420             doc['snippets_position'] = snip_pos[0]
421             doc['snippets_length'] = snip_pos[1]
422             if snippets.revision:
423                 doc["snippets_revision"] = snippets.revision
424
425             if 'fragment_anchor' in fields:
426                 doc["fragment_anchor"] = fields['fragment_anchor']
427
428             if 'themes' in fields:
429                 doc['themes'] = fields['themes']
430             doc['uid'] = "part%s-%s-%s-%s" % (
431                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
432             return doc
433
434         fragments = {}
435         snippets = Snippets(book.id).open('w')
436         try:
437             for header, position in zip(list(master), range(len(master))):
438
439                 if header.tag in self.skip_header_tags:
440                     continue
441                 if header.tag is etree.Comment:
442                     continue
443
444                 # section content
445                 content = []
446                 footnote = []
447
448                 def all_content(text):
449                     for frag in fragments.values():
450                         frag['text'].append(text)
451                     content.append(text)
452                 handle_text = [all_content]
453
454                 for start, text, end in walker(header):
455                     # handle footnotes
456                     if start is not None and start.tag in self.footnote_tags:
457                         footnote = []
458
459                         def collect_footnote(t):
460                             footnote.append(t)
461
462                         handle_text.append(collect_footnote)
463                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
464                         handle_text.pop()
465                         doc = add_part(snippets, header_index=position, header_type=header.tag,
466                                        text=u''.join(footnote),
467                                        is_footnote=True)
468                         self.index.add(doc)
469                         footnote = []
470
471                     # handle fragments and themes.
472                     if start is not None and start.tag == 'begin':
473                         fid = start.attrib['id'][1:]
474                         fragments[fid] = {
475                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
476
477                     # themes for this fragment
478                     elif start is not None and start.tag == 'motyw':
479                         fid = start.attrib['id'][1:]
480                         handle_text.append(lambda text: None)
481                         if start.text is not None:
482                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
483                     elif end is not None and end.tag == 'motyw':
484                         handle_text.pop()
485
486                     elif start is not None and start.tag == 'end':
487                         fid = start.attrib['id'][1:]
488                         if fid not in fragments:
489                             continue  # a broken <end> node, skip it
490                         frag = fragments[fid]
491                         if not frag['themes']:
492                             continue  # empty themes list.
493                         del fragments[fid]
494
495                         doc = add_part(snippets,
496                                        header_type=frag['start_header'],
497                                        header_index=frag['start_section'],
498                                        header_span=position - frag['start_section'] + 1,
499                                        fragment_anchor=fid,
500                                        text=fix_format(frag['text']),
501                                        themes=frag['themes'])
502                         self.index.add(doc)
503
504                         # Collect content.
505
506                     if text is not None and handle_text is not []:
507                         hdl = handle_text[-1]
508                         hdl(text)
509
510                         # in the end, add a section text.
511                 doc = add_part(snippets, header_index=position,
512                                header_type=header.tag, text=fix_format(content))
513
514                 self.index.add(doc)
515
516         finally:
517             snippets.close()
518
519     def remove_picture(self, picture_or_id):
520         """Removes a picture from search index."""
521         if isinstance(picture_or_id, picture.models.Picture):
522             picture_id = picture_or_id.id
523         else:
524             picture_id = picture_or_id
525         self.delete_query(self.index.Q(picture_id=picture_id))
526
527     def index_picture(self, picture, picture_info=None, overwrite=True):
528         """
529         Indexes the picture.
530         Creates a lucene document for extracted metadata
531         and calls self.index_area() to index the contents of the picture.
532         """
533         if overwrite:
534             # we don't remove snippets, since they might be still needed by
535             # threads using not reopened index
536             self.remove_picture(picture)
537
538         picture_doc = {'picture_id': int(picture.id)}
539         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
540             'authors', 'title', 'epochs', 'kinds', 'genres'])
541
542         picture_doc.update(meta_fields)
543
544         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
545         self.index.add(picture_doc)
546         del picture_doc['is_book']
547         for area in picture.areas.all():
548             self.index_area(area, picture_fields=picture_doc)
549
550     def index_area(self, area, picture_fields):
551         """
552         Indexes themes and objects on the area.
553         """
554         doc = dict(picture_fields)
555         doc['area_id'] = area.id
556         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
557         doc['uid'] = 'area%s' % area.id
558         self.index.add(doc)
559
560
@total_ordering
class SearchResult(object):
    """
    Search result for a single book, built from one index document.

    Results for the same book can be combined with merge() / aggregate();
    the `hits` property turns the collected raw hits into a list of dicts
    describing the matching sections and fragments.
    """

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc - mapping with the index document's fields; `book_id` is
            required, and the header_index / snippets_position /
            snippets_length fields are required when header_type is present.
        how_found - stored unchanged on the hit created from this document.
        query_terms - terms used to find out which themes were matched;
            None disables theme matching.
        """
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # cache for the `hits` property
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        self._score = doc['score'] if 'score' in doc else 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (TypeError, ValueError):
            # TypeError: the field is missing (None);
            # ValueError: it is not a number (e.g. an empty string).
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            header_span = doc.get('header_span')
            # A missing or zero span counts as 1.
            header_span = (int(header_span) if header_span is not None else 0) or 1
            position = (header_type, int(doc["header_index"]), header_span)
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (position, fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        """Build a result directly from a book, scored by its popularity count."""
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __str__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __bytes__(self):
        return str(self).encode('utf-8')

    @property
    def score(self):
        """Raw score multiplied by the boost."""
        return self._score * self.boost

    def merge(self, other):
        """
        Add the hits and the (non-negative) score of another result for the
        same book to this one. Returns self.

        Raises ValueError when the results are for different books.
        """
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        # The processed hits (if already computed) no longer reflect _hits.
        self._processed_hits = None
        return self

    def get_book(self):
        """Return the Book for this result, or None when it no longer exists."""
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)

    # Indexes into a raw hit tuple: (position, fragment, score, other) ...
    POSITION = 0
    FRAGMENT = 1
    # ... and into its position tuple: (header_type, index, span).
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @staticmethod
    def _matching_themes(themes, themes_pl, query_terms):
        """
        Return the set of theme names for which some query term equals one
        of the lowercased words of the name or of its `themes_pl`
        counterpart (same list position, when present).
        """
        matched = set()
        if query_terms is None:
            return matched
        for i, theme in enumerate(themes):
            words = theme.split()
            if i < len(themes_pl):
                words += themes_pl[i].split()
            # A real list, so every query term is checked against all words.
            tokens = [word.lower() for word in words]
            if any(term in tokens for term in query_terms):
                matched.add(theme)
        return matched

    @property
    def hits(self):
        """
        List of hit dicts, best score first; computed once and cached.

        Section hits covered by a fragment hit are dropped, and duplicates
        (same fragment anchor, same section index) are reduced to the
        best-scored one. Fragment hits whose Fragment no longer exists in
        the database are skipped.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        # split into fragment hits and section hits
        frags = [hit for hit in self._hits if hit[self.FRAGMENT] is not None]
        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # remove fragments with duplicated fid's, keeping the one already
        # seen only when its score is strictly higher
        best_frags = {}
        for f in frags:
            fid = f[self.FRAGMENT]
            if fid in best_frags and best_frags[fid][self.SCORE] > f[self.SCORE]:
                continue
            best_frags[fid] = f
        frags = list(best_frags.values())

        def covered_by_fragment(s):
            index = s[self.POSITION][self.POSITION_INDEX]
            for f in frags:
                first = f[self.POSITION][self.POSITION_INDEX]
                if first <= index < first + f[self.POSITION][self.POSITION_SPAN]:
                    return True
            return False

        # sections not covered by fragments
        sect = [s for s in sect if not covered_by_fragment(s)]

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            matched_names = self._matching_themes(
                f[self.OTHER]['themes'], f[self.OTHER]['themes_pl'], self.query_terms)

            # Map the matched names to the fragment's theme tags (the first
            # tag wins when several share a name).
            themes_by_name = {}
            for theme in themes:
                themes_by_name.setdefault(theme.name, theme)
            themes_hit = [themes_by_name[name] for name in matched_names if name in themes_by_name]

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            # Keys coming from the index document (including 'themes')
            # take precedence over the ones set above.
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """
        Merge results from several lists so that there is one result per
        book; returns the values view of the per-book dict.
        """
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def get_sort_key(self):
        return (-self.score,
                self.published_date,
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        # Deliberately reversed: a result is "less" when its sort key is
        # greater.
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """(position, length) of the snippet of the idx-th hit."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Snippet file revision of the idx-th hit, or None when unavailable."""
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
776
777
@total_ordering
class PictureResult(object):
    """
    A picture found by a search, with the picture areas that matched.

    Results for the same picture coming from different queries can be
    combined with merge() / aggregate(). Instances are ordered and
    compared by their (boosted) score.
    """
    # Positions inside the (score, details) tuples stored in self._hits.
    SCORE = 0
    OTHER = 1

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc - index document; must contain 'picture_id', may contain
              'score', 'area_id', 'themes' and 'themes_pl'
        how_found - label stored with the hit describing the search used
        query_terms - searched words, used to detect matching themes
        """
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.picture_id = int(doc["picture_id"])

        # Only documents describing a picture area produce a hit.
        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)

    def __str__(self):
        return u"<PR id=%d score=%f >" % (self.picture_id, self._score)

    def __repr__(self):
        return str(self)

    @property
    def score(self):
        """Raw score multiplied by the boost factor."""
        return self._score * self.boost

    def merge(self, other):
        """
        Add hits and (non-negative) score of another result for the same picture.

        Raises ValueError if other refers to a different picture.
        Returns self.
        """
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        # The hit list has changed, so a previously computed `hits` value is stale.
        self._processed_hits = None
        return self

    @staticmethod
    def _matching_themes(themes, themes_pl, query_terms):
        """
        Return the set of entries of themes for which some query term equals
        a lowercased word of the theme name or of its themes_pl counterpart.

        An entry without a counterpart in themes_pl is matched on its own
        words only. Returns an empty set when query_terms is None.
        """
        matched = set()
        if query_terms is None:
            return matched
        for i, theme in enumerate(themes):
            theme_pl = themes_pl[i] if i < len(themes_pl) else ''
            # A set (not a one-shot map iterator) so every query term is
            # checked against the full word list.
            words = {word.lower() for word in theme.split() + theme_pl.split()}
            if any(term in words for term in query_terms):
                matched.add(theme)
        return matched

    @property
    def hits(self):
        """
        List with the best-scoring hit (at most one element), as a dict with
        'score', 'area', 'themes_hit' and the stored hit details.

        Hits whose PictureArea no longer exists are dropped. The result is
        cached until the next merge().
        """
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            details = hit[self.OTHER]
            try:
                area = picture.models.PictureArea.objects.get(id=details['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes_hit = self._matching_themes(
                details.get('themes', []), details.get('themes_pl', []), self.query_terms)

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(details)
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        hits = hits[:1]
        self._processed_hits = hits
        return hits

    def get_picture(self):
        """Return the Picture model instance, fetched on first use."""
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        """
        Merge results referring to the same picture across all given lists.

        Returns a list with one result per picture_id, in order of first
        appearance; the first result seen for a picture absorbs the others.
        """
        pictures = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in pictures:
                    pictures[r.picture_id].merge(r)
                else:
                    pictures[r.picture_id] = r
        return list(pictures.values())

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score
884
885
class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        """
        Open the index in read mode.

        default_field is accepted but not used by this constructor.
        """
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.

        query - string split on single spaces into terms; None is treated as ''
        field - index field each term is matched against
        modal - binary operator used to join the term queries
        """
        if query is None:
            query = ''
        terms = (self.index.Q(**{field: term}) for term in query.split(" "))
        return reduce(modal, terms, self.index.Q())

    def search_by_author(self, words):
        """
        Find top-level books whose cached author matches every word.

        Goes through the database rather than the index. Returns up to 30
        SearchResult objects ordered by descending popularity count.
        """
        from catalogue.models import Book
        books = Book.objects.filter(parent=None).order_by('-popularity__count')
        for word in words:
            # \m and \M are word-boundary anchors; the word itself is escaped
            # so that regex metacharacters in it are matched literally.
            pattern = r'\m%s\M' % re.escape(word)
            books = books.filter(cached_author__iregex=pattern).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    @staticmethod
    def _or_join(queries):
        """Join queries with |; returns None when there are no queries."""
        joined = None
        for q in queries:
            if joined is None:
                joined = q
            else:
                joined |= q
        return joined

    def search_words(self, words, fields, required=None, book=True, picture=False):
        """
        Search the index for documents matching the given words.

        words - searched words; every word must match at least one of fields
        fields - names of index fields each word is looked up in
        required - optional field names; at least one word must match one of them
        book - restrict the query with is_book=True
        picture - search pictures (PictureResult) instead of books (SearchResult)

        A book-only search limited to the 'authors' field is delegated to
        search_by_author(). Returns a list of result objects.
        """
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        # Stopwords are skipped only when neither book nor picture is set.
        active_words = [
            word for word in words
            if book or picture or (word not in stopwords)
        ]

        filters = [
            self._or_join(self.index.Q(**{field: word}) for field in fields)
            for word in active_words
        ]
        if required:
            filters.append(self._or_join(
                self.index.Q(**{field: word})
                for field in required
                for word in active_words
            ))
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns a snippet for found scoreDoc.

        Returns a list with one slot per hit of searchresult; a slot holds the
        highlighted snippet, or None if the hit was skipped, had no position
        data or duplicated an earlier snippet. At most num non-empty snippets
        are collected (all hits when num is None, negative or too large).
        The list is also stored as searchresult.snippets.

        Returns [] when the snippet file cannot be read.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            # A for loop guarantees progress even when a hit is skipped.
            for idx in range(maxnum):
                if num <= 0:
                    break
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    # Retry on the 'text_nonstem' field when 'text' gave nothing.
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query

        None entries in filters are ignored; filters itself may be None.
        Returns the narrowed query.
        """
        if filters is None:
            filters = []
        for f in filters:
            if f is not None:
                query = query.query(f)
        return query
1009
1010
# When settings.SEARCH_MOCK is set to a true value, rebind the name Search to
# the implementation imported from .mock_search, replacing the class above.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search