06d2c4a6eab01955c26bac9034fc12609a242df9
[wolnelektury.git] / src / search / index.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from functools import reduce, total_ordering
5 from itertools import chain
6 import logging
7 import operator
8 import os
9 import re
10 from django.conf import settings
11 from librarian import dcparser
12 from librarian.parser import WLDocument
13 from lxml import etree
14 import scorched
15 import catalogue.models
16 import picture.models
17 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
18 from wolnelektury.utils import makedirs
19 from . import custom
20
21 log = logging.getLogger('search')
22
23
# Load the Solr stopword list once, at import time. Lines starting with '#'
# are comments in the stopwords file and are skipped.
if os.path.isfile(settings.SOLR_STOPWORDS):
    # The context manager closes the file deterministically instead of
    # leaving the handle open until garbage collection.
    with open(settings.SOLR_STOPWORDS) as _stopwords_file:
        stopwords = set(
            line.strip()
            for line in _stopwords_file if not line.startswith('#'))
else:
    stopwords = set()
30
31
class SolrIndex(object):
    """Base class holding a Solr interface built from ``settings.SOLR``."""

    def __init__(self, mode=None):
        # ``mode`` is passed to the interface unchanged.
        solr_interface = custom.CustomSolrInterface(settings.SOLR, mode=mode)
        self.index = solr_interface
35
36
class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).

    The snippets are concatenated together in a single file, and their
    positions and lengths are kept in lucene index fields.

    An instance can be used as a context manager, which closes the file
    on exit::

        with Snippets(book_id).open('w') as snippets:
            position, length = snippets.add(text)
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Make sure the snippet directory exists before any file is opened.
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` body.
        return False

    @property
    def path(self):
        """Path of the snippet file: "<book_id>" or "<book_id>.<revision>"."""
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.

        The file is always opened in binary mode. When opening for writing
        and the file already exists, the first unused revision number is
        selected instead of overwriting the existing file.
        Returns self.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            # self.path depends on self.revision, so bump the revision
            # until the path points at a file that does not exist yet.
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple, both counted in UTF-8 bytes.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close snippet file (no-op if it was never opened)."""
        if self.file:
            self.file.close()

    def remove(self):
        """
        Delete the base snippet file and then revisions 1, 2, ... in turn.

        Stops silently at the first file that cannot be removed.
        """
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            # Best effort: the first missing (or undeletable) file ends the loop.
            pass
116
117
118 class Index(SolrIndex):
119     """
120     Class indexing books.
121     """
122     def __init__(self):
123         super(Index, self).__init__(mode='rw')
124
125     def delete_query(self, *queries):
126         """
127         index.delete(queries=...) doesn't work, so let's reimplement it
128         using deletion of list of uids.
129         """
130         uids = set()
131         for q in queries:
132             if isinstance(q, scorched.search.LuceneQuery):
133                 q = self.index.query(q)
134             q.field_limiter.update(['uid'])
135             st = 0
136             rows = 100
137             while True:
138                 ids = q.paginate(start=st, rows=rows).execute()
139                 if not len(ids):
140                     break
141                 for res in ids:
142                     uids.add(res['uid'])
143                 st += rows
144         if uids:
145             # FIXME: With Solr API change, this doesn't work.
146             #self.index.delete(uids)
147             return True
148         else:
149             return False
150
151     def index_tags(self, *tags, **kw):
152         """
153         Re-index global tag list.
154         Removes all tags from index, then index them again.
155         Indexed fields include: id, name (with and without polish stems), category
156         """
157         log.debug("Indexing tags")
158         remove_only = kw.get('remove_only', False)
159         # first, remove tags from index.
160         if tags:
161             tag_qs = []
162             for tag in tags:
163                 q_id = self.index.Q(tag_id=tag.id)
164
165                 if isinstance(tag, PDCounterAuthor):
166                     q_cat = self.index.Q(tag_category='pd_author')
167                 elif isinstance(tag, PDCounterBook):
168                     q_cat = self.index.Q(tag_category='pd_book')
169                 else:
170                     q_cat = self.index.Q(tag_category=tag.category)
171
172                 q_id_cat = self.index.Q(q_id & q_cat)
173                 tag_qs.append(q_id_cat)
174             self.delete_query(*tag_qs)
175         else:  # all
176             q = self.index.Q(tag_id__any=True)
177             self.delete_query(q)
178
179         if not remove_only:
180             # then add them [all or just one passed]
181             if not tags:
182                 tags = chain(
183                     catalogue.models.Tag.objects.exclude(category='set'),
184                     PDCounterAuthor.objects.all(),
185                     PDCounterBook.objects.all())
186
187             for tag in tags:
188                 if isinstance(tag, PDCounterAuthor):
189                     doc = {
190                         "tag_id": int(tag.id),
191                         "tag_name": tag.name,
192                         "tag_name_pl": tag.name,
193                         "tag_category": 'pd_author',
194                         "is_pdcounter": True,
195                         "uid": "tag%d_pd_a" % tag.id
196                         }
197                 elif isinstance(tag, PDCounterBook):
198                     doc = {
199                         "tag_id": int(tag.id),
200                         "tag_name": tag.title,
201                         "tag_name_pl": tag.title,
202                         "tag_category": 'pd_book',
203                         "is_pdcounter": True,
204                         "uid": "tag%d_pd_b" % tag.id
205                         }
206                 else:
207                     doc = {
208                         "tag_id": int(tag.id),
209                         "tag_name": tag.name,
210                         "tag_name_pl": tag.name,
211                         "tag_category": tag.category,
212                         "is_pdcounter": False,
213                         "uid": "tag%d" % tag.id
214                         }
215                 self.index.add(doc)
216
217     def create_book_doc(self, book):
218         """
219         Create a lucene document referring book id.
220         """
221         doc = {'book_id': int(book.id)}
222         if book.parent is not None:
223             doc['parent_id'] = int(book.parent.id)
224         return doc
225
226     def remove_book(self, book_or_id, remove_snippets=True):
227         """Removes a book from search index.
228         book - Book instance."""
229         if isinstance(book_or_id, catalogue.models.Book):
230             book_id = book_or_id.id
231         else:
232             book_id = book_or_id
233
234         self.delete_query(self.index.Q(book_id=book_id))
235
236         if remove_snippets:
237             snippets = Snippets(book_id)
238             snippets.remove()
239
240     def index_book(self, book, book_info=None, overwrite=True):
241         """
242         Indexes the book.
243         Creates a lucene document for extracted metadata
244         and calls self.index_content() to index the contents of the book.
245         """
246         if overwrite:
247             # we don't remove snippets, since they might be still needed by
248             # threads using not reopened index
249             self.remove_book(book, remove_snippets=False)
250
251         book_doc = self.create_book_doc(book)
252         meta_fields = self.extract_metadata(book, book_info, dc_only=[
253             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
254         # let's not index it - it's only used for extracting publish date
255         if 'source_name' in meta_fields:
256             del meta_fields['source_name']
257
258         for n, f in meta_fields.items():
259             book_doc[n] = f
260
261         book_doc['uid'] = "book%s" % book_doc['book_id']
262         self.index.add(book_doc)
263         del book_doc
264         book_fields = {
265             'title': meta_fields['title'],
266             'authors': meta_fields['authors'],
267             'published_date': meta_fields['published_date']
268             }
269
270         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
271             if tag_name in meta_fields:
272                 book_fields[tag_name] = meta_fields[tag_name]
273
274         self.index_content(book, book_fields=book_fields)
275
276     master_tags = [
277         'opowiadanie',
278         'powiesc',
279         'dramat_wierszowany_l',
280         'dramat_wierszowany_lp',
281         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
282         'wywiad',
283     ]
284
285     ignore_content_tags = [
286         'uwaga', 'extra', 'nota_red', 'abstrakt',
287         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
288         'didaskalia',
289         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
290     ]
291
292     footnote_tags = ['pa', 'pt', 'pr', 'pe']
293
294     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
295                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
296
297     published_date_re = re.compile("([0-9]+)[\]. ]*$")
298
299     def extract_metadata(self, book, book_info=None, dc_only=None):
300         """
301         Extract metadata from book and returns a map of fields keyed by fieldname
302         """
303         fields = {}
304
305         if book_info is None:
306             book_info = dcparser.parse(open(book.xml_file.path))
307
308         fields['slug'] = book.slug
309         fields['is_book'] = True
310
311         # validator, name
312         for field in dcparser.BookInfo.FIELDS:
313             if dc_only and field.name not in dc_only:
314                 continue
315             if hasattr(book_info, field.name):
316                 if not getattr(book_info, field.name):
317                     continue
318                 # since no type information is available, we use validator
319                 type_indicator = field.validator
320                 if type_indicator == dcparser.as_unicode:
321                     s = getattr(book_info, field.name)
322                     if field.multiple:
323                         s = ', '.join(s)
324                     fields[field.name] = s
325                 elif type_indicator == dcparser.as_person:
326                     p = getattr(book_info, field.name)
327                     if isinstance(p, dcparser.Person):
328                         persons = str(p)
329                     else:
330                         persons = ', '.join(map(str, p))
331                     fields[field.name] = persons
332                 elif type_indicator == dcparser.as_date:
333                     dt = getattr(book_info, field.name)
334                     fields[field.name] = dt
335
336         # get published date
337         pd = None
338         if hasattr(book_info, 'source_name') and book_info.source_name:
339             match = self.published_date_re.search(book_info.source_name)
340             if match is not None:
341                 pd = str(match.groups()[0])
342         if not pd:
343             pd = ""
344         fields["published_date"] = pd
345
346         return fields
347
348     # def add_gaps(self, fields, fieldname):
349     #     """
350     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
351     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
352     #     """
353     #     def gap():
354     #         while True:
355     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
356     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
357
358     def get_master(self, root):
359         """
360         Returns the first master tag from an etree.
361         """
362         for master in root.iter():
363             if master.tag in self.master_tags:
364                 return master
365
366     def index_content(self, book, book_fields):
367         """
368         Walks the book XML and extract content from it.
369         Adds parts for each header tag and for each fragment.
370         """
371         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
372         root = wld.edoc.getroot()
373
374         master = self.get_master(root)
375         if master is None:
376             return []
377
378         def walker(node):
379             if node.tag not in self.ignore_content_tags:
380                 yield node, None, None
381                 if node.text is not None:
382                     yield None, node.text, None
383                 for child in list(node):
384                     for b, t, e in walker(child):
385                         yield b, t, e
386                 yield None, None, node
387
388             if node.tail is not None:
389                 yield None, node.tail, None
390             return
391
392         def fix_format(text):
393             # separator = [" ", "\t", ".", ";", ","]
394             if isinstance(text, list):
395                 # need to join it first
396                 text = filter(lambda s: s is not None, content)
397                 text = ' '.join(text)
398                 # for i in range(len(text)):
399                 #     if i > 0:
400                 #         if text[i][0] not in separator\
401                 #             and text[i - 1][-1] not in separator:
402                 #          text.insert(i, " ")
403
404             return re.sub("(?m)/$", "", text)
405
406         def add_part(snippets, **fields):
407             doc = self.create_book_doc(book)
408             for n, v in book_fields.items():
409                 doc[n] = v
410
411             doc['header_index'] = fields["header_index"]
412             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
413             doc['header_type'] = fields['header_type']
414
415             doc['text'] = fields['text']
416
417             # snippets
418             snip_pos = snippets.add(fields["text"])
419
420             doc['snippets_position'] = snip_pos[0]
421             doc['snippets_length'] = snip_pos[1]
422             if snippets.revision:
423                 doc["snippets_revision"] = snippets.revision
424
425             if 'fragment_anchor' in fields:
426                 doc["fragment_anchor"] = fields['fragment_anchor']
427
428             if 'themes' in fields:
429                 doc['themes'] = fields['themes']
430             doc['uid'] = "part%s-%s-%s-%s" % (
431                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
432             return doc
433
434         fragments = {}
435         snippets = Snippets(book.id).open('w')
436         try:
437             for header, position in zip(list(master), range(len(master))):
438
439                 if header.tag in self.skip_header_tags:
440                     continue
441                 if header.tag is etree.Comment:
442                     continue
443
444                 # section content
445                 content = []
446                 footnote = []
447
448                 def all_content(text):
449                     for frag in fragments.values():
450                         frag['text'].append(text)
451                     content.append(text)
452                 handle_text = [all_content]
453
454                 for start, text, end in walker(header):
455                     # handle footnotes
456                     if start is not None and start.tag in self.footnote_tags:
457                         footnote = []
458
459                         def collect_footnote(t):
460                             footnote.append(t)
461
462                         handle_text.append(collect_footnote)
463                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
464                         handle_text.pop()
465                         doc = add_part(snippets, header_index=position, header_type=header.tag,
466                                        text=''.join(footnote),
467                                        is_footnote=True)
468                         self.index.add(doc)
469                         footnote = []
470
471                     # handle fragments and themes.
472                     if start is not None and start.tag == 'begin':
473                         fid = start.attrib['id'][1:]
474                         fragments[fid] = {
475                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
476
477                     # themes for this fragment
478                     elif start is not None and start.tag == 'motyw':
479                         fid = start.attrib['id'][1:]
480                         handle_text.append(lambda text: None)
481                         if start.text is not None:
482                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
483                     elif end is not None and end.tag == 'motyw':
484                         handle_text.pop()
485
486                     elif start is not None and start.tag == 'end':
487                         fid = start.attrib['id'][1:]
488                         if fid not in fragments:
489                             continue  # a broken <end> node, skip it
490                         frag = fragments[fid]
491                         if not frag['themes']:
492                             continue  # empty themes list.
493                         del fragments[fid]
494
495                         doc = add_part(snippets,
496                                        header_type=frag['start_header'],
497                                        header_index=frag['start_section'],
498                                        header_span=position - frag['start_section'] + 1,
499                                        fragment_anchor=fid,
500                                        text=fix_format(frag['text']),
501                                        themes=frag['themes'])
502                         self.index.add(doc)
503
504                         # Collect content.
505
506                     if text is not None and handle_text is not []:
507                         hdl = handle_text[-1]
508                         hdl(text)
509
510                         # in the end, add a section text.
511                 doc = add_part(snippets, header_index=position,
512                                header_type=header.tag, text=fix_format(content))
513
514                 self.index.add(doc)
515
516         finally:
517             snippets.close()
518
519     def remove_picture(self, picture_or_id):
520         """Removes a picture from search index."""
521         if isinstance(picture_or_id, picture.models.Picture):
522             picture_id = picture_or_id.id
523         else:
524             picture_id = picture_or_id
525         self.delete_query(self.index.Q(picture_id=picture_id))
526
527     def index_picture(self, picture, picture_info=None, overwrite=True):
528         """
529         Indexes the picture.
530         Creates a lucene document for extracted metadata
531         and calls self.index_area() to index the contents of the picture.
532         """
533         if overwrite:
534             # we don't remove snippets, since they might be still needed by
535             # threads using not reopened index
536             self.remove_picture(picture)
537
538         picture_doc = {'picture_id': int(picture.id)}
539         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
540             'authors', 'title', 'epochs', 'kinds', 'genres'])
541
542         picture_doc.update(meta_fields)
543
544         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
545         self.index.add(picture_doc)
546         del picture_doc['is_book']
547         for area in picture.areas.all():
548             self.index_area(area, picture_fields=picture_doc)
549
550     def index_area(self, area, picture_fields):
551         """
552         Indexes themes and objects on the area.
553         """
554         doc = dict(picture_fields)
555         doc['area_id'] = area.id
556         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
557         doc['uid'] = 'area%s' % area.id
558         self.index.add(doc)
559
560
561 @total_ordering
562 class SearchResult(object):
563     def __init__(self, doc, how_found=None, query_terms=None):
564         self.boost = 1.0
565         self._hits = []
566         self._processed_hits = None  # processed hits
567         self.snippets = []
568         self.query_terms = query_terms
569         self._book = None
570
571         if 'score' in doc:
572             self._score = doc['score']
573         else:
574             self._score = 0
575
576         self.book_id = int(doc["book_id"])
577
578         try:
579             self.published_date = int(doc.get("published_date"))
580         except ValueError:
581             self.published_date = 0
582
583         # content hits
584         header_type = doc.get("header_type", None)
585         # we have a content hit in some header of fragment
586         if header_type is not None:
587             sec = (header_type, int(doc["header_index"]))
588             header_span = doc['header_span']
589             header_span = header_span is not None and int(header_span) or 1
590             fragment = doc.get("fragment_anchor", None)
591             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
592             snippets_rev = doc.get('snippets_revision', None)
593
594             hit = (sec + (header_span,), fragment, self._score, {
595                 'how_found': how_found,
596                 'snippets_pos': snippets_pos,
597                 'snippets_revision': snippets_rev,
598                 'themes': doc.get('themes', []),
599                 'themes_pl': doc.get('themes_pl', [])
600                 })
601
602             self._hits.append(hit)
603
604     @classmethod
605     def from_book(cls, book, how_found=None, query_terms=None):
606         doc = {
607             'score': book.popularity.count,
608             'book_id': book.id,
609             'published_date': 0,
610         }
611         result = cls(doc, how_found=how_found, query_terms=query_terms)
612         result._book = book
613         return result
614
615     def __str__(self):
616         return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
617             (self.book_id, len(self._hits),
618              len(self._processed_hits) if self._processed_hits else -1,
619              self._score, len(self.snippets))
620
621     def __bytes__(self):
622         return str(self).encode('utf-8')
623
624     @property
625     def score(self):
626         return self._score * self.boost
627
628     def merge(self, other):
629         if self.book_id != other.book_id:
630             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
631         self._hits += other._hits
632         self._score += max(other._score, 0)
633         return self
634
635     def get_book(self):
636         if self._book is not None:
637             return self._book
638         try:
639             self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
640         except catalogue.models.Book.DoesNotExist:
641             self._book = None
642         return self._book
643
644     book = property(get_book)
645
    # Indices into a raw hit tuple as built in __init__:
    #   (position, fragment_anchor, score, other)
    # where position is itself a tuple (header_type, header_index, header_span).
    POSITION = 0  # hit[POSITION] -> position tuple
    FRAGMENT = 1  # hit[FRAGMENT] -> fragment anchor, None for section hits
    POSITION_INDEX = 1  # position[POSITION_INDEX] -> header index
    POSITION_SPAN = 2  # position[POSITION_SPAN] -> header span
    SCORE = 2  # hit[SCORE] -> score of the hit
    OTHER = 3  # hit[OTHER] -> dict with the remaining hit details
652
653     @property
654     def hits(self):
655         if self._processed_hits is not None:
656             return self._processed_hits
657
658         # to sections and fragments
659         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
660
661         sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
662
663         # sections not covered by fragments
664         sect = filter(lambda s: 0 == len(list(filter(
665             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
666                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
667
668         def remove_duplicates(lst, keyfn, larger):
669             els = {}
670             for e in lst:
671                 eif = keyfn(e)
672                 if eif in els:
673                     if larger(els[eif], e):
674                         continue
675                 els[eif] = e
676             return els.values()
677
678         # remove fragments with duplicated fid's and duplicated snippets
679         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
680
681         # remove duplicate sections
682         sections = {}
683
684         for s in sect:
685             si = s[self.POSITION][self.POSITION_INDEX]
686             # skip existing
687             if si in sections:
688                 if sections[si]['score'] >= s[self.SCORE]:
689                     continue
690
691             m = {'score': s[self.SCORE],
692                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
693                  }
694             m.update(s[self.OTHER])
695             sections[si] = m
696
697         hits = list(sections.values())
698
699         for f in frags:
700             try:
701                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
702             except catalogue.models.Fragment.DoesNotExist:
703                 # stale index
704                 continue
705             # Figure out if we were searching for a token matching some word in theme name.
706             themes = frag.tags.filter(category='theme')
707             themes_hit = set()
708             if self.query_terms is not None:
709                 for i in range(0, len(f[self.OTHER]['themes'])):
710                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
711                     tms = map(str.lower, tms)
712                     for qt in self.query_terms:
713                         if qt in tms:
714                             themes_hit.add(f[self.OTHER]['themes'][i])
715                             break
716
717             def theme_by_name(n):
718                 th = list(filter(lambda t: t.name == n, themes))
719                 if th:
720                     return th[0]
721                 else:
722                     return None
723             themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
724
725             m = {'score': f[self.SCORE],
726                  'fragment': frag,
727                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
728                  'themes': themes,
729                  'themes_hit': themes_hit
730                  }
731             m.update(f[self.OTHER])
732             hits.append(m)
733
734         hits.sort(key=lambda h: h['score'], reverse=True)
735
736         self._processed_hits = hits
737
738         return hits
739
740     @staticmethod
741     def aggregate(*result_lists):
742         books = {}
743         for rl in result_lists:
744             for r in rl:
745                 if r.book_id in books:
746                     books[r.book_id].merge(r)
747                 else:
748                     books[r.book_id] = r
749         return books.values()
750
751     def get_sort_key(self):
752         return (-self.score,
753                 self.published_date,
754                 self.book.sort_key_author if self.book else '',
755                 self.book.sort_key if self.book else '')
756
757     def __lt__(self, other):
758         return self.get_sort_key() > other.get_sort_key()
759
760     def __eq__(self, other):
761         return self.get_sort_key() == other.get_sort_key()
762
763     def __len__(self):
764         return len(self.hits)
765
766     def snippet_pos(self, idx=0):
767         return self.hits[idx]['snippets_pos']
768
769     def snippet_revision(self, idx=0):
770         try:
771             return self.hits[idx]['snippets_revision']
772         except (IndexError, KeyError):
773             return None
774
775
776 @total_ordering
777 class PictureResult(object):
778     def __init__(self, doc, how_found=None, query_terms=None):
779         self.boost = 1.0
780         self.query_terms = query_terms
781         self._picture = None
782         self._hits = []
783         self._processed_hits = None
784
785         if 'score' in doc:
786             self._score = doc['score']
787         else:
788             self._score = 0
789
790         self.picture_id = int(doc["picture_id"])
791
792         if doc.get('area_id'):
793             hit = (self._score, {
794                 'how_found': how_found,
795                 'area_id': doc['area_id'],
796                 'themes': doc.get('themes', []),
797                 'themes_pl': doc.get('themes_pl', []),
798             })
799
800             self._hits.append(hit)
801
802     def __str__(self):
803         return "<PR id=%d score=%f >" % (self.picture_id, self._score)
804
805     def __repr__(self):
806         return str(self)
807
808     @property
809     def score(self):
810         return self._score * self.boost
811
812     def merge(self, other):
813         if self.picture_id != other.picture_id:
814             raise ValueError(
815                 "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
816         self._hits += other._hits
817         self._score += max(other._score, 0)
818         return self
819
    # Indices into a raw hit tuple as built in __init__: (score, details).
    SCORE = 0  # hit[SCORE] -> score of the hit
    OTHER = 1  # hit[OTHER] -> dict with area_id, themes and other details
822
823     @property
824     def hits(self):
825         if self._processed_hits is not None:
826             return self._processed_hits
827
828         hits = []
829         for hit in self._hits:
830             try:
831                 area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
832             except picture.models.PictureArea.DoesNotExist:
833                 # stale index
834                 continue
835             # Figure out if we were searching for a token matching some word in theme name.
836             themes_hit = set()
837             if self.query_terms is not None:
838                 for i in range(0, len(hit[self.OTHER]['themes'])):
839                     tms = hit[self.OTHER]['themes'][i].split(r' +') + hit[self.OTHER]['themes_pl'][i].split(' ')
840                     tms = map(str.lower, tms)
841                     for qt in self.query_terms:
842                         if qt in tms:
843                             themes_hit.add(hit[self.OTHER]['themes'][i])
844                             break
845
846             m = {
847                 'score': hit[self.SCORE],
848                 'area': area,
849                 'themes_hit': themes_hit,
850             }
851             m.update(hit[self.OTHER])
852             hits.append(m)
853
854         hits.sort(key=lambda h: h['score'], reverse=True)
855         hits = hits[:1]
856         self._processed_hits = hits
857         return hits
858
859     def get_picture(self):
860         if self._picture is None:
861             self._picture = picture.models.Picture.objects.get(id=self.picture_id)
862         return self._picture
863
864     picture = property(get_picture)
865
866     @staticmethod
867     def aggregate(*result_lists):
868         books = {}
869         for rl in result_lists:
870             for r in rl:
871                 if r.picture_id in books:
872                     books[r.picture_id].merge(r)
873                 else:
874                     books[r.picture_id] = r
875         return books.values()
876
877     def __lt__(self, other):
878         return self.score < other.score
879
880     def __eq__(self, other):
881         return self.score == other.score
882
883
class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        # default_field is not used here; it is kept so that existing
        # callers passing it keep working.
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.

        query - string of space-separated terms; None is treated as ''
        field - index field every term is matched against
        modal - binary operator joining the term queries
        """
        if query is None:
            query = ''
        terms = (self.index.Q(**{field: term}) for term in query.split(" "))
        return reduce(modal, terms, self.index.Q())

    def search_by_author(self, words):
        """
        Returns up to 30 results for findable, parentless books whose
        cached_author matches every one of `words`, ordered by descending
        popularity count.
        """
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        for word in words:
            # Backslash-escape punctuation so the word is matched literally
            # instead of being interpreted as a regular expression.
            literal = re.sub(r'([^\w\s])', r'\\\1', word)
            # \m and \M are presumably PostgreSQL word-boundary escapes;
            # the raw string keeps the backslashes without relying on
            # Python's handling of unknown escape sequences.
            # NOTE(review): 'popularity__count' looks like a field path, not
            # a relation - confirm select_related() accepts it on the Django
            # version in use.
            books = books.filter(
                cached_author__iregex=r'\m%s\M' % literal
            ).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    @staticmethod
    def _any_of(queries):
        """Joins the given queries with OR; returns None when there are none."""
        combined = None
        for q in queries:
            if combined is None:
                combined = q
            else:
                combined |= q
        return combined

    def search_words(self, words, fields, required=None, book=True, picture=False):
        """
        Searches the index for documents matching every word in at least
        one of `fields`.

        words - list of terms to look for
        fields - index fields searched for every word
        required - optional fields; at least one of the words must match
                   in one of them
        book - restrict to documents flagged is_book
        picture - search pictures (PictureResult) instead of books
                  (SearchResult)

        Returns a list of result objects; an empty list when there is
        nothing to search for.
        """
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        # Stopwords are dropped only when searching neither books nor pictures.
        terms = [word for word in words if book or picture or (word not in stopwords)]

        filters = []
        for word in terms:
            word_filter = self._any_of(self.index.Q(**{field: word}) for field in fields)
            if word_filter is not None:
                filters.append(word_filter)
        if required:
            required_filter = self._any_of(
                self.index.Q(**{field: word}) for field in required for word in terms)
            if required_filter is not None:
                filters.append(required_filter)
        # Empty (None) filters are never collected, so a search without any
        # usable term ends here instead of running an unfiltered query.
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns highlighted snippets for the hits of a search result.

        searchresult - result whose hits are looked up in the snippet file
        query - query used for highlighting
        field - field used for highlighting; for 'text' the 'text_nonstem'
                field is tried as a fallback
        num - maximum number of non-empty snippets to collect; None, a
              negative value or a value above the hit count means all

        Returns a list with one slot per hit (None where no snippet was
        stored), which is also saved as searchresult.snippets. Returns []
        when the snippet file cannot be read.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            for idx in range(maxnum):
                if num <= 0:
                    break
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    # Nothing stored for this hit; the for loop moves on to
                    # the next one (a bare `continue` in a while loop would
                    # retry the same index forever).
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                # Identical snippets are stored only once.
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            if not book:
                log.error("Book does not exist for book id = %d", book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s", book_id, revision, e)
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query

        None entries are skipped; `filters` itself may be None.
        """
        if filters is None:
            filters = []
        for f in filters:
            if f is not None:
                query = query.query(f)
        return query
1007
1008
# When settings.SEARCH_MOCK is set to a true value, rebind the module-level
# name Search to the implementation from .mock_search. A missing setting
# counts as False.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search