missing file
[wolnelektury.git] / src / search / index.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from functools import reduce, total_ordering
5 from itertools import chain
6 import logging
7 import operator
8 import os
9 import re
10 from django.conf import settings
11 from librarian import dcparser
12 import librarian.meta.types.date
13 import librarian.meta.types.person
14 import librarian.meta.types.text
15 from librarian.parser import WLDocument
16 from lxml import etree
17 import scorched
18 import catalogue.models
19 import picture.models
20 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
21 from wolnelektury.utils import makedirs
22 from . import custom
23
24 log = logging.getLogger('search')
25
26
# Stopwords are loaded once, at import time, from the file named by
# settings.SOLR_STOPWORDS.  Lines starting with '#' are skipped; when the
# file does not exist the set stays empty.
stopwords = set()
if os.path.isfile(settings.SOLR_STOPWORDS):
    # The context manager closes the handle as soon as the set is built
    # (the previous generator expression left the file open).
    with open(settings.SOLR_STOPWORDS) as stopwords_file:
        stopwords = {
            line.strip()
            for line in stopwords_file if not line.startswith('#')}
33
34
class SolrIndex:
    """Holds a ``custom.CustomSolrInterface`` built from ``settings.SOLR``."""

    def __init__(self, mode=None):
        # ``mode`` is handed to the interface unchanged.
        solr_interface = custom.CustomSolrInterface(settings.SOLR, mode=mode)
        self.index = solr_interface
38
39
class Snippets(object):
    """
    Manage the snippet file of an indexed object (book).

    All snippets of one book are concatenated in a single file; the
    (position, length) pairs returned by add() are what callers keep in
    order to read a snippet back with get().
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        """Make sure the snippet directory exists and remember the file identity."""
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """Path of the snippet file: "<book_id>" or "<book_id>.<revision>"."""
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode) and return self.
        Call .close() afterwards.

        When opened for writing and the file already exists, the first
        unused revision number (starting from 1) is selected, so existing
        snippet files are never overwritten.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            # self.path depends on self.revision, so bump until it is free.
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (str) to the snippet file, encoded as UTF-8.
        Return a (position, length) tuple, both counted in bytes.
        """
        encoded = snippet.encode('utf-8')
        length = len(encoded)
        self.file.write(encoded)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return the snippet stored
        there as str, or '' when it cannot be read or decoded.
        """
        self.file.seek(pos[0], 0)
        try:
            return self.file.read(pos[1]).decode('utf-8')
        except Exception:
            # Best effort by design: a damaged snippet yields an empty
            # string.  Unlike a bare ``except:`` this no longer swallows
            # KeyboardInterrupt / SystemExit.
            return ''

    def close(self):
        """Close snippet file"""
        if self.file:
            self.file.close()

    def remove(self):
        """
        Delete the base snippet file and then revisions 1, 2, ... until
        an unlink fails.
        """
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            # The first failing unlink (normally: no such file) ends the sweep.
            pass
122
123
124 class Index(SolrIndex):
125     """
126     Class indexing books.
127     """
128     def __init__(self):
129         super(Index, self).__init__(mode='rw')
130
131     def remove_snippets(self, book):
132         book.snippet_set.all().delete()
133
134     def add_snippet(self, book, doc):
135         assert book.id == doc.pop('book_id')
136         # Fragments already exist and can be indexed where they live.
137         if 'fragment_anchor' in doc:
138             return
139
140         text = doc.pop('text')
141         header_index = doc.pop('header_index')
142         book.snippet_set.create(
143             sec=header_index,
144             text=text,
145         )
146
147     def delete_query(self, *queries):
148         """
149         index.delete(queries=...) doesn't work, so let's reimplement it
150         using deletion of list of uids.
151         """
152         uids = set()
153         for q in queries:
154             if isinstance(q, scorched.search.LuceneQuery):
155                 q = self.index.query(q)
156             q.field_limiter.update(['uid'])
157             st = 0
158             rows = 100
159             while True:
160                 ids = q.paginate(start=st, rows=rows).execute()
161                 if not len(ids):
162                     break
163                 for res in ids:
164                     uids.add(res['uid'])
165                 st += rows
166         if uids:
167             # FIXME: With Solr API change, this doesn't work.
168             #self.index.delete(uids)
169             return True
170         else:
171             return False
172
173     def index_tags(self, *tags, **kw):
174         """
175         Re-index global tag list.
176         Removes all tags from index, then index them again.
177         Indexed fields include: id, name (with and without polish stems), category
178         """
179         log.debug("Indexing tags")
180         remove_only = kw.get('remove_only', False)
181         # first, remove tags from index.
182         if tags:
183             tag_qs = []
184             for tag in tags:
185                 q_id = self.index.Q(tag_id=tag.id)
186
187                 if isinstance(tag, PDCounterAuthor):
188                     q_cat = self.index.Q(tag_category='pd_author')
189                 elif isinstance(tag, PDCounterBook):
190                     q_cat = self.index.Q(tag_category='pd_book')
191                 else:
192                     q_cat = self.index.Q(tag_category=tag.category)
193
194                 q_id_cat = self.index.Q(q_id & q_cat)
195                 tag_qs.append(q_id_cat)
196             self.delete_query(*tag_qs)
197         else:  # all
198             q = self.index.Q(tag_id__any=True)
199             self.delete_query(q)
200
201         if not remove_only:
202             # then add them [all or just one passed]
203             if not tags:
204                 tags = chain(
205                     catalogue.models.Tag.objects.exclude(category='set'),
206                     PDCounterAuthor.objects.all(),
207                     PDCounterBook.objects.all())
208
209             for tag in tags:
210                 if isinstance(tag, PDCounterAuthor):
211                     doc = {
212                         "tag_id": int(tag.id),
213                         "tag_name": tag.name,
214                         "tag_name_pl": tag.name,
215                         "tag_category": 'pd_author',
216                         "is_pdcounter": True,
217                         "uid": "tag%d_pd_a" % tag.id
218                         }
219                 elif isinstance(tag, PDCounterBook):
220                     doc = {
221                         "tag_id": int(tag.id),
222                         "tag_name": tag.title,
223                         "tag_name_pl": tag.title,
224                         "tag_category": 'pd_book',
225                         "is_pdcounter": True,
226                         "uid": "tag%d_pd_b" % tag.id
227                         }
228                 else:
229                     doc = {
230                         "tag_id": int(tag.id),
231                         "tag_name": tag.name,
232                         "tag_name_pl": tag.name,
233                         "tag_category": tag.category,
234                         "is_pdcounter": False,
235                         "uid": "tag%d" % tag.id
236                         }
237                 self.index.add(doc)
238
239     def create_book_doc(self, book):
240         """
241         Create a lucene document referring book id.
242         """
243         doc = {'book_id': int(book.id)}
244         if book.parent is not None:
245             doc['parent_id'] = int(book.parent.id)
246         return doc
247
248     def remove_book(self, book, remove_snippets=True):
249         """Removes a book from search index.
250         book - Book instance."""
251         self.delete_query(self.index.Q(book_id=book.id))
252
253         if remove_snippets:
254             snippets = Snippets(book.id)
255             snippets.remove()
256         self.remove_snippets(book)
257
258     def index_book(self, book, book_info=None, overwrite=True):
259         """
260         Indexes the book.
261         Creates a lucene document for extracted metadata
262         and calls self.index_content() to index the contents of the book.
263         """
264         if not book.xml_file: return
265
266         if overwrite:
267             # we don't remove snippets, since they might be still needed by
268             # threads using not reopened index
269             self.remove_book(book, remove_snippets=False)
270
271         book_doc = self.create_book_doc(book)
272         meta_fields = self.extract_metadata(book, book_info, dc_only=[
273             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
274         # let's not index it - it's only used for extracting publish date
275         if 'source_name' in meta_fields:
276             del meta_fields['source_name']
277
278         for n, f in meta_fields.items():
279             book_doc[n] = f
280
281         book_doc['uid'] = "book%s" % book_doc['book_id']
282         self.index.add(book_doc)
283         del book_doc
284         book_fields = {
285             'title': meta_fields['title'],
286             'authors': meta_fields['authors'],
287             'published_date': meta_fields['published_date']
288             }
289
290         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
291             if tag_name in meta_fields:
292                 book_fields[tag_name] = meta_fields[tag_name]
293
294         self.index_content(book, book_fields=book_fields)
295
296     master_tags = [
297         'opowiadanie',
298         'powiesc',
299         'dramat_wierszowany_l',
300         'dramat_wierszowany_lp',
301         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
302         'wywiad',
303     ]
304
305     ignore_content_tags = [
306         'uwaga', 'extra', 'nota_red', 'abstrakt',
307         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
308         'didaskalia',
309         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
310     ]
311
312     footnote_tags = ['pa', 'pt', 'pr', 'pe']
313
314     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
315                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
316
317     published_date_re = re.compile("([0-9]+)[\]. ]*$")
318
319     def extract_metadata(self, book, book_info=None, dc_only=None):
320         """
321         Extract metadata from book and returns a map of fields keyed by fieldname
322         """
323         fields = {}
324
325         if book_info is None:
326             book_info = dcparser.parse(open(book.xml_file.path, 'rb'))
327
328         fields['slug'] = book.slug
329         fields['is_book'] = True
330
331         # validator, name
332         for field in dcparser.BookInfo.FIELDS:
333             if dc_only and field.name not in dc_only:
334                 continue
335             if hasattr(book_info, field.name):
336                 if not getattr(book_info, field.name):
337                     continue
338                 type_indicator = field.value_type
339                 if issubclass(type_indicator, librarian.meta.types.text.TextValue):
340                     s = getattr(book_info, field.name)
341                     if field.multiple:
342                         s = ', '.join(s)
343                     fields[field.name] = s
344                 elif issubclass(type_indicator, librarian.meta.types.person.Person):
345                     p = getattr(book_info, field.name)
346                     if isinstance(p, librarian.meta.types.person.Person):
347                         persons = str(p)
348                     else:
349                         persons = ', '.join(map(str, p))
350                     fields[field.name] = persons
351                 elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
352                     dt = getattr(book_info, field.name)
353                     fields[field.name] = dt
354
355         # get published date
356         pd = None
357         if hasattr(book_info, 'source_name') and book_info.source_name:
358             match = self.published_date_re.search(book_info.source_name)
359             if match is not None:
360                 pd = str(match.groups()[0])
361         if not pd:
362             pd = ""
363         fields["published_date"] = pd
364
365         return fields
366
367     # def add_gaps(self, fields, fieldname):
368     #     """
369     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
370     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
371     #     """
372     #     def gap():
373     #         while True:
374     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
375     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
376
377     def get_master(self, root):
378         """
379         Returns the first master tag from an etree.
380         """
381         for master in root.iter():
382             if master.tag in self.master_tags:
383                 return master
384
385     def index_content(self, book, book_fields):
386         """
387         Walks the book XML and extract content from it.
388         Adds parts for each header tag and for each fragment.
389         """
390         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
391         root = wld.edoc.getroot()
392
393         master = self.get_master(root)
394         if master is None:
395             return []
396
397         def walker(node):
398             if node.tag not in self.ignore_content_tags:
399                 yield node, None, None
400                 if node.text is not None:
401                     yield None, node.text, None
402                 for child in list(node):
403                     for b, t, e in walker(child):
404                         yield b, t, e
405                 yield None, None, node
406
407             if node.tail is not None:
408                 yield None, node.tail, None
409             return
410
411         def fix_format(text):
412             # separator = [" ", "\t", ".", ";", ","]
413             if isinstance(text, list):
414                 # need to join it first
415                 text = filter(lambda s: s is not None, content)
416                 text = ' '.join(text)
417                 # for i in range(len(text)):
418                 #     if i > 0:
419                 #         if text[i][0] not in separator\
420                 #             and text[i - 1][-1] not in separator:
421                 #          text.insert(i, " ")
422
423             return re.sub("(?m)/$", "", text)
424
425         def add_part(snippets, **fields):
426             doc = self.create_book_doc(book)
427             for n, v in book_fields.items():
428                 doc[n] = v
429
430             doc['header_index'] = fields["header_index"]
431             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
432             doc['header_type'] = fields['header_type']
433
434             doc['text'] = fields['text']
435
436             # snippets
437             snip_pos = snippets.add(fields["text"])
438
439             doc['snippets_position'] = snip_pos[0]
440             doc['snippets_length'] = snip_pos[1]
441             if snippets.revision:
442                 doc["snippets_revision"] = snippets.revision
443
444             if 'fragment_anchor' in fields:
445                 doc["fragment_anchor"] = fields['fragment_anchor']
446
447             if 'themes' in fields:
448                 doc['themes'] = fields['themes']
449             doc['uid'] = "part%s-%s-%s-%s" % (
450                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
451             return doc
452
453         fragments = {}
454         snippets = Snippets(book.id).open('w')
455         try:
456             for header, position in zip(list(master), range(len(master))):
457
458                 if header.tag in self.skip_header_tags:
459                     continue
460                 if header.tag is etree.Comment:
461                     continue
462
463                 # section content
464                 content = []
465                 footnote = []
466
467                 def all_content(text):
468                     for frag in fragments.values():
469                         frag['text'].append(text)
470                     content.append(text)
471                 handle_text = [all_content]
472
473                 for start, text, end in walker(header):
474                     # handle footnotes
475                     if start is not None and start.tag in self.footnote_tags:
476                         footnote = []
477
478                         def collect_footnote(t):
479                             footnote.append(t)
480
481                         handle_text.append(collect_footnote)
482                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
483                         handle_text.pop()
484                         doc = add_part(snippets, header_index=position, header_type=header.tag,
485                                        text=''.join(footnote))
486                         self.add_snippet(book, doc)
487                         self.index.add(doc)
488                         footnote = []
489
490                     # handle fragments and themes.
491                     if start is not None and start.tag == 'begin':
492                         fid = start.attrib['id'][1:]
493                         fragments[fid] = {
494                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
495
496                     # themes for this fragment
497                     elif start is not None and start.tag == 'motyw':
498                         fid = start.attrib['id'][1:]
499                         handle_text.append(lambda text: None)
500                         if start.text is not None:
501                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
502                     elif end is not None and end.tag == 'motyw':
503                         handle_text.pop()
504
505                     elif start is not None and start.tag == 'end':
506                         fid = start.attrib['id'][1:]
507                         if fid not in fragments:
508                             continue  # a broken <end> node, skip it
509                         frag = fragments[fid]
510                         if not frag['themes']:
511                             continue  # empty themes list.
512                         del fragments[fid]
513
514                         doc = add_part(snippets,
515                                        header_type=frag['start_header'],
516                                        header_index=frag['start_section'],
517                                        header_span=position - frag['start_section'] + 1,
518                                        fragment_anchor=fid,
519                                        text=fix_format(frag['text']),
520                                        themes=frag['themes'])
521                         # Add searchable fragment
522                         self.add_snippet(book, doc)
523                         self.index.add(doc)
524
525                         # Collect content.
526
527                     if text is not None and handle_text is not []:
528                         hdl = handle_text[-1]
529                         hdl(text)
530
531                         # in the end, add a section text.
532                 doc = add_part(snippets, header_index=position,
533                                header_type=header.tag, text=fix_format(content))
534
535                 self.add_snippet(book, doc)
536                 self.index.add(doc)
537
538         finally:
539             snippets.close()
540
541     def remove_picture(self, picture_or_id):
542         """Removes a picture from search index."""
543         if isinstance(picture_or_id, picture.models.Picture):
544             picture_id = picture_or_id.id
545         else:
546             picture_id = picture_or_id
547         self.delete_query(self.index.Q(picture_id=picture_id))
548
549     def index_picture(self, picture, picture_info=None, overwrite=True):
550         """
551         Indexes the picture.
552         Creates a lucene document for extracted metadata
553         and calls self.index_area() to index the contents of the picture.
554         """
555         if overwrite:
556             # we don't remove snippets, since they might be still needed by
557             # threads using not reopened index
558             self.remove_picture(picture)
559
560         picture_doc = {'picture_id': int(picture.id)}
561         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
562             'authors', 'title', 'epochs', 'kinds', 'genres'])
563
564         picture_doc.update(meta_fields)
565
566         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
567         self.index.add(picture_doc)
568         del picture_doc['is_book']
569         for area in picture.areas.all():
570             self.index_area(area, picture_fields=picture_doc)
571
572     def index_area(self, area, picture_fields):
573         """
574         Indexes themes and objects on the area.
575         """
576         doc = dict(picture_fields)
577         doc['area_id'] = area.id
578         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
579         doc['uid'] = 'area%s' % area.id
580         self.index.add(doc)
581
582
583 @total_ordering
584 class SearchResult(object):
585     def __init__(self, doc, how_found=None, query_terms=None):
586         self.boost = 1.0
587         self._hits = []
588         self._processed_hits = None  # processed hits
589         self.snippets = []
590         self.query_terms = query_terms
591         self._book = None
592
593         if 'score' in doc:
594             self._score = doc['score']
595         else:
596             self._score = 0
597
598         self.book_id = int(doc["book_id"])
599
600         try:
601             self.published_date = int(doc.get("published_date"))
602         except ValueError:
603             self.published_date = 0
604
605         # content hits
606         header_type = doc.get("header_type", None)
607         # we have a content hit in some header of fragment
608         if header_type is not None:
609             sec = (header_type, int(doc["header_index"]))
610             header_span = doc['header_span']
611             header_span = header_span is not None and int(header_span) or 1
612             fragment = doc.get("fragment_anchor", None)
613             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
614             snippets_rev = doc.get('snippets_revision', None)
615
616             hit = (sec + (header_span,), fragment, self._score, {
617                 'how_found': how_found,
618                 'snippets_pos': snippets_pos,
619                 'snippets_revision': snippets_rev,
620                 'themes': doc.get('themes', []),
621                 'themes_pl': doc.get('themes_pl', [])
622                 })
623
624             self._hits.append(hit)
625
626     @classmethod
627     def from_book(cls, book, how_found=None, query_terms=None):
628         doc = {
629             'score': book.popularity.count,
630             'book_id': book.id,
631             'published_date': 0,
632         }
633         result = cls(doc, how_found=how_found, query_terms=query_terms)
634         result._book = book
635         return result
636
637     def __str__(self):
638         return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
639             (self.book_id, len(self._hits),
640              len(self._processed_hits) if self._processed_hits else -1,
641              self._score, len(self.snippets))
642
643     def __bytes__(self):
644         return str(self).encode('utf-8')
645
646     @property
647     def score(self):
648         return self._score * self.boost
649
650     def merge(self, other):
651         if self.book_id != other.book_id:
652             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
653         self._hits += other._hits
654         self._score += max(other._score, 0)
655         return self
656
657     def get_book(self):
658         if self._book is not None:
659             return self._book
660         try:
661             self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
662         except catalogue.models.Book.DoesNotExist:
663             self._book = None
664         return self._book
665
666     book = property(get_book)
667
    # Indices into a hit tuple as built in __init__:
    #   (position, fragment_anchor, score, other)
    # where position is itself (header_type, header_index, header_span),
    # hence the two POSITION_* indices into that inner tuple.
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3
674
675     @property
676     def hits(self):
677         if self._processed_hits is not None:
678             return self._processed_hits
679
680         # to sections and fragments
681         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
682
683         sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]
684
685         # sections not covered by fragments
686         sect = filter(lambda s: 0 == len(list(filter(
687             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
688                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags))), sect)
689
690         def remove_duplicates(lst, keyfn, larger):
691             els = {}
692             for e in lst:
693                 eif = keyfn(e)
694                 if eif in els:
695                     if larger(els[eif], e):
696                         continue
697                 els[eif] = e
698             return els.values()
699
700         # remove fragments with duplicated fid's and duplicated snippets
701         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])
702
703         # remove duplicate sections
704         sections = {}
705
706         for s in sect:
707             si = s[self.POSITION][self.POSITION_INDEX]
708             # skip existing
709             if si in sections:
710                 if sections[si]['score'] >= s[self.SCORE]:
711                     continue
712
713             m = {'score': s[self.SCORE],
714                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
715                  }
716             m.update(s[self.OTHER])
717             sections[si] = m
718
719         hits = list(sections.values())
720
721         for f in frags:
722             try:
723                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
724             except catalogue.models.Fragment.DoesNotExist:
725                 # stale index
726                 continue
727             # Figure out if we were searching for a token matching some word in theme name.
728             themes = frag.tags.filter(category='theme')
729             themes_hit = set()
730             if self.query_terms is not None:
731                 for i in range(0, len(f[self.OTHER]['themes'])):
732                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
733                     tms = map(str.lower, tms)
734                     for qt in self.query_terms:
735                         if qt in tms:
736                             themes_hit.add(f[self.OTHER]['themes'][i])
737                             break
738
739             def theme_by_name(n):
740                 th = list(filter(lambda t: t.name == n, themes))
741                 if th:
742                     return th[0]
743                 else:
744                     return None
745             themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))
746
747             m = {'score': f[self.SCORE],
748                  'fragment': frag,
749                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
750                  'themes': themes,
751                  'themes_hit': themes_hit
752                  }
753             m.update(f[self.OTHER])
754             hits.append(m)
755
756         hits.sort(key=lambda h: h['score'], reverse=True)
757
758         self._processed_hits = hits
759
760         return hits
761
762     @staticmethod
763     def aggregate(*result_lists):
764         books = {}
765         for rl in result_lists:
766             for r in rl:
767                 if r.book_id in books:
768                     books[r.book_id].merge(r)
769                 else:
770                     books[r.book_id] = r
771         return books.values()
772
773     def get_sort_key(self):
774         return (-self.score,
775                 self.published_date,
776                 self.book.sort_key_author if self.book else '',
777                 self.book.sort_key if self.book else '')
778
779     def __lt__(self, other):
780         return self.get_sort_key() > other.get_sort_key()
781
782     def __eq__(self, other):
783         return self.get_sort_key() == other.get_sort_key()
784
785     def __len__(self):
786         return len(self.hits)
787
788     def snippet_pos(self, idx=0):
789         return self.hits[idx]['snippets_pos']
790
791     def snippet_revision(self, idx=0):
792         try:
793             return self.hits[idx]['snippets_revision']
794         except (IndexError, KeyError):
795             return None
796
797
@total_ordering
class PictureResult(object):
    """
    Search result for a single picture, built from one index document.

    Raw matches are kept in ``_hits`` as ``(score, details)`` tuples; the
    ``hits`` property resolves them against the database on first access
    and caches the outcome. Results compare and order by ``score`` only.
    """
    # Indices into the ``(score, details)`` tuples stored in ``_hits``.
    SCORE = 0
    OTHER = 1

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc - index document (mapping); must contain 'picture_id', may
              contain 'score', 'area_id', 'themes' and 'themes_pl'
        how_found - label stored with the hit
        query_terms - searched words, used to detect matching themes
        """
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        self._score = doc['score'] if 'score' in doc else 0
        self.picture_id = int(doc["picture_id"])

        # Only documents pointing at a picture area produce a hit.
        if doc.get('area_id'):
            self._hits.append((self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            }))

    def __str__(self):
        return "<PR id=%d score=%f >" % (self.picture_id, self._score)

    def __repr__(self):
        return str(self)

    @property
    def score(self):
        """Raw score multiplied by the current boost."""
        return self._score * self.boost

    def merge(self, other):
        """
        Fold another result for the same picture into this one.

        Hits are concatenated and the other's score is added (negative
        scores count as 0). Returns self.
        Raises ValueError when the results are for different pictures.
        """
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        # The cached processed hits no longer reflect ``_hits``.
        self._processed_hits = None
        return self

    @staticmethod
    def _matching_themes(themes, themes_pl, query_terms):
        """
        Return the set of entries from `themes` for which some query term
        equals (case-insensitively on the theme side) one of the words of
        the theme or of the `themes_pl` entry at the same position.
        """
        if query_terms is None:
            return set()
        matched = set()
        for i, theme in enumerate(themes):
            # themes_pl may be shorter than themes (it defaults to []).
            localized = themes_pl[i] if i < len(themes_pl) else ''
            # A real set, not a one-shot map() iterator, so that every
            # query term is checked against all the words.
            words = {w.lower() for w in theme.split() + localized.split()}
            if any(qt in words for qt in query_terms):
                matched.add(theme)
        return matched

    @property
    def hits(self):
        """
        Processed hits: at most one dict (the best scoring one) with the
        hit details plus 'score', 'area' and 'themes_hit'. Hits whose
        PictureArea no longer exists are dropped. The result is cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            details = hit[self.OTHER]
            try:
                area = picture.models.PictureArea.objects.get(id=details['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes_hit = self._matching_themes(
                details['themes'], details['themes_pl'], self.query_terms)

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(details)
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        hits = hits[:1]
        self._processed_hits = hits
        return hits

    def get_picture(self):
        """Fetch the Picture object on first use and cache it."""
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        """
        Merge results from several lists so that each picture_id appears
        once; returns the values view of the merged mapping.
        """
        merged = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in merged:
                    merged[r.picture_id].merge(r)
                else:
                    merged[r.picture_id] = r
        return merged.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score
904
905
class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        # default_field is not used here; it is kept in the signature so
        # existing callers keep working.
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.
        query - string of space separated terms (None is treated as '')
        field - index field the terms are looked up in
        modal - binary operator used to join the term queries
        """
        if query is None:
            query = ''
        term_queries = (self.index.Q(**{field: s}) for s in query.split(" "))
        return reduce(modal, term_queries, self.index.Q())

    def search_by_author(self, words):
        """
        Look up books in the database whose cached author string contains
        every one of `words` as a whole word (case-insensitive).

        Returns SearchResult objects for at most 30 books.
        """
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        for word in words:
            # \m and \M are word-boundary escapes of the database regex
            # dialect; the word itself is escaped so that characters such as
            # '(' or '+' in user input cannot break or alter the pattern.
            pattern = r'\m%s\M' % re.escape(word)
            books = books.filter(cached_author__iregex=pattern).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, required=None, book=True, picture=False):
        """
        Search the index for documents matching `words`.

        words - list of words; each word must match in at least one of `fields`
        fields - index fields searched for every word
        required - optional fields; at least one word must match in one of them
        book - restrict the query with is_book=True
        picture - search pictures (PictureResult) instead of books (SearchResult)

        A books-only search limited to the 'authors' field is delegated to
        search_by_author(). Returns a list of result objects.
        """
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        # Stopwords are only dropped when neither books nor pictures are
        # requested; otherwise every word is used.
        usable_words = [w for w in words if book or picture or (w not in stopwords)]

        filters = []
        for word in usable_words:
            word_filter = None
            for field in fields:
                q = self.index.Q(**{field: word})
                if word_filter is None:
                    word_filter = q
                else:
                    word_filter |= q
            filters.append(word_filter)
        if required:
            required_filter = None
            for field in required:
                for word in usable_words:
                    q = self.index.Q(**{field: word})
                    if required_filter is None:
                        required_filter = q
                    else:
                        required_filter |= q
            # May be None (no usable words); apply_filters() skips None.
            filters.append(required_filter)
        if not filters:
            return []
        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns highlighted snippets for the hits of `searchresult`.

        searchresult - result providing book_id, snippet_pos(),
                       snippet_revision() and len()
        query - query used for highlighting
        field - field to highlight; for 'text' a fallback to 'text_nonstem'
                is tried when nothing was highlighted
        num - maximum number of non-empty snippets (None or out of range
              means one per hit)

        The returned list has one slot per hit (None where no snippet was
        produced) and is also stored as searchresult.snippets. Returns []
        when the snippet file cannot be read.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            # A for loop guarantees progress: a hit without a position used
            # to skip the index increment and loop forever.
            for idx in range(maxnum):
                if num <= 0:
                    break
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                # Skip duplicates of snippets already collected.
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            if not book:
                log.error("Book does not exist for book id = %d", book_id)
            elif not book.get().children.exists():
                # Only reported for books without children.
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s", book_id, revision, e)
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query

        filters - iterable of filter queries or None; None entries are skipped
        """
        for f in filters or []:
            if f is not None:
                query = query.query(f)
        return query
1029
1030
# When settings.SEARCH_MOCK is set to a truthy value, rebind the name Search
# to the implementation from the sibling mock_search module; the setting
# defaults to False when absent.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search