4606f57db86ed20571f10c30d827350af7ce563d
[wolnelektury.git] / src / search / index.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from functools import reduce, total_ordering
5 from itertools import chain
6 import logging
7 import operator
8 import os
9 import re
10 from django.conf import settings
11 from librarian import dcparser
12 import librarian.meta.types.person
13 import librarian.meta.types.text
14 from librarian.parser import WLDocument
15 from lxml import etree
16 import scorched
17 import catalogue.models
18 import picture.models
19 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
20 from wolnelektury.utils import makedirs
21 from . import custom
22
23 log = logging.getLogger('search')
24
25
# Load the Solr stopword list once, at import time.  Lines starting with
# '#' are skipped; every other line is stripped and stored as one entry.
if os.path.isfile(settings.SOLR_STOPWORDS):
    # Use a context manager so the file handle is closed as soon as the
    # set has been built instead of being left to the garbage collector.
    with open(settings.SOLR_STOPWORDS) as stopwords_file:
        stopwords = set(
            line.strip()
            for line in stopwords_file if not line.startswith('#'))
else:
    # No stopwords file configured/available: nothing is a stopword.
    stopwords = set()
32
33
class SolrIndex(object):
    """Holder of the Solr interface, exposed as ``self.index``."""

    def __init__(self, mode=None):
        """
        Build the interface for ``settings.SOLR``.

        mode - passed through unchanged to ``custom.CustomSolrInterface``.
        """
        solr_interface = custom.CustomSolrInterface(settings.SOLR, mode=mode)
        self.index = solr_interface
37
38
class Snippets(object):
    """
    This class manages the snippet file of an indexed object (book).

    The snippets are concatenated together in one file, and their positions
    and lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        """
        book_id - numeric id used as the snippet file name.
        revision - optional numeric revision appended to the file name.

        Makes sure the snippet directory exists; no file is opened yet.
        """
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """Path of the snippet file for the current book id and revision."""
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.

        The file is always opened in binary mode.  When opening for writing
        and the file already exists, the first unused revision number
        (starting from 1) is selected, so existing files are not overwritten.

        Returns self, so the call can be chained.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            # self.path depends on self.revision, so bump the revision
            # until the resulting path is free.
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple, both counted in UTF-8 bytes.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.

        Returns an empty string when the bytes cannot be read or are not
        valid UTF-8 (e.g. the range cuts a multi-byte character).
        """
        self.file.seek(pos[0], 0)
        try:
            # UnicodeDecodeError is a ValueError; OSError covers read
            # failures.  A bare except would also hide KeyboardInterrupt
            # and SystemExit.
            return self.file.read(pos[1]).decode('utf-8')
        except (OSError, ValueError):
            return ''

    def close(self):
        """Close snippet file"""
        if self.file:
            self.file.close()

    def remove(self):
        """
        Delete the snippet file and its consecutive numbered revisions.

        Unlinks the unrevisioned file first, then revisions 1, 2, ... and
        stops at the first path that cannot be unlinked.
        """
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            # First missing (or undeletable) file ends the cleanup.
            pass
121
122
123 class Index(SolrIndex):
124     """
125     Class indexing books.
126     """
127     def __init__(self):
128         super(Index, self).__init__(mode='rw')
129
130     def remove_snippets(self, book):
131         book.snippet_set.all().delete()
132
133     def add_snippet(self, book, doc):
134         assert book.id == doc.pop('book_id')
135         # Fragments already exist and can be indexed where they live.
136         if 'fragment_anchor' in doc:
137             return
138
139         text = doc.pop('text')
140         header_index = doc.pop('header_index')
141         book.snippet_set.create(
142             sec=header_index,
143             text=text,
144         )
145
146     def delete_query(self, *queries):
147         """
148         index.delete(queries=...) doesn't work, so let's reimplement it
149         using deletion of list of uids.
150         """
151         uids = set()
152         for q in queries:
153             if isinstance(q, scorched.search.LuceneQuery):
154                 q = self.index.query(q)
155             q.field_limiter.update(['uid'])
156             st = 0
157             rows = 100
158             while True:
159                 ids = q.paginate(start=st, rows=rows).execute()
160                 if not len(ids):
161                     break
162                 for res in ids:
163                     uids.add(res['uid'])
164                 st += rows
165         if uids:
166             # FIXME: With Solr API change, this doesn't work.
167             #self.index.delete(uids)
168             return True
169         else:
170             return False
171
172     def index_tags(self, *tags, **kw):
173         """
174         Re-index global tag list.
175         Removes all tags from index, then index them again.
176         Indexed fields include: id, name (with and without polish stems), category
177         """
178         log.debug("Indexing tags")
179         remove_only = kw.get('remove_only', False)
180         # first, remove tags from index.
181         if tags:
182             tag_qs = []
183             for tag in tags:
184                 q_id = self.index.Q(tag_id=tag.id)
185
186                 if isinstance(tag, PDCounterAuthor):
187                     q_cat = self.index.Q(tag_category='pd_author')
188                 elif isinstance(tag, PDCounterBook):
189                     q_cat = self.index.Q(tag_category='pd_book')
190                 else:
191                     q_cat = self.index.Q(tag_category=tag.category)
192
193                 q_id_cat = self.index.Q(q_id & q_cat)
194                 tag_qs.append(q_id_cat)
195             self.delete_query(*tag_qs)
196         else:  # all
197             q = self.index.Q(tag_id__any=True)
198             self.delete_query(q)
199
200         if not remove_only:
201             # then add them [all or just one passed]
202             if not tags:
203                 tags = chain(
204                     catalogue.models.Tag.objects.exclude(category='set'),
205                     PDCounterAuthor.objects.all(),
206                     PDCounterBook.objects.all())
207
208             for tag in tags:
209                 if isinstance(tag, PDCounterAuthor):
210                     doc = {
211                         "tag_id": int(tag.id),
212                         "tag_name": tag.name,
213                         "tag_name_pl": tag.name,
214                         "tag_category": 'pd_author',
215                         "is_pdcounter": True,
216                         "uid": "tag%d_pd_a" % tag.id
217                         }
218                 elif isinstance(tag, PDCounterBook):
219                     doc = {
220                         "tag_id": int(tag.id),
221                         "tag_name": tag.title,
222                         "tag_name_pl": tag.title,
223                         "tag_category": 'pd_book',
224                         "is_pdcounter": True,
225                         "uid": "tag%d_pd_b" % tag.id
226                         }
227                 else:
228                     doc = {
229                         "tag_id": int(tag.id),
230                         "tag_name": tag.name,
231                         "tag_name_pl": tag.name,
232                         "tag_category": tag.category,
233                         "is_pdcounter": False,
234                         "uid": "tag%d" % tag.id
235                         }
236                 self.index.add(doc)
237
238     def create_book_doc(self, book):
239         """
240         Create a lucene document referring book id.
241         """
242         doc = {'book_id': int(book.id)}
243         if book.parent is not None:
244             doc['parent_id'] = int(book.parent.id)
245         return doc
246
247     def remove_book(self, book, remove_snippets=True, legacy=True):
248         """Removes a book from search index.
249         book - Book instance."""
250         if legacy:
251           self.delete_query(self.index.Q(book_id=book.id))
252
253           if remove_snippets:
254             snippets = Snippets(book.id)
255             snippets.remove()
256         self.remove_snippets(book)
257
258     def index_book(self, book, book_info=None, overwrite=True, legacy=True):
259         """
260         Indexes the book.
261         Creates a lucene document for extracted metadata
262         and calls self.index_content() to index the contents of the book.
263         """
264         if not book.xml_file: return
265
266         if overwrite:
267             # we don't remove snippets, since they might be still needed by
268             # threads using not reopened index
269             self.remove_book(book, remove_snippets=False, legacy=legacy)
270
271         book_doc = self.create_book_doc(book)
272         meta_fields = self.extract_metadata(book, book_info, dc_only=[
273             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
274         # let's not index it - it's only used for extracting publish date
275         if 'source_name' in meta_fields:
276             del meta_fields['source_name']
277
278         for n, f in meta_fields.items():
279             book_doc[n] = f
280
281         book_doc['uid'] = "book%s" % book_doc['book_id']
282         if legacy:
283             self.index.add(book_doc)
284         del book_doc
285         book_fields = {
286             'title': meta_fields['title'],
287             'authors': meta_fields['authors'],
288             'published_date': meta_fields['published_date']
289             }
290
291         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
292             if tag_name in meta_fields:
293                 book_fields[tag_name] = meta_fields[tag_name]
294
295         self.index_content(book, book_fields=book_fields, legacy=legacy)
296
297     master_tags = [
298         'opowiadanie',
299         'powiesc',
300         'dramat_wierszowany_l',
301         'dramat_wierszowany_lp',
302         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
303         'wywiad',
304     ]
305
306     ignore_content_tags = [
307         'uwaga', 'extra', 'nota_red', 'abstrakt',
308         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
309         'didaskalia',
310         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
311     ]
312
313     footnote_tags = ['pa', 'pt', 'pr', 'pe']
314
315     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
316                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
317
318     published_date_re = re.compile("([0-9]+)[\]. ]*$")
319
320     def extract_metadata(self, book, book_info=None, dc_only=None):
321         """
322         Extract metadata from book and returns a map of fields keyed by fieldname
323         """
324         fields = {}
325
326         if book_info is None:
327             book_info = dcparser.parse(open(book.xml_file.path, 'rb'))
328
329         fields['slug'] = book.slug
330         fields['is_book'] = True
331
332         # validator, name
333         for field in dcparser.BookInfo.FIELDS:
334             if dc_only and field.name not in dc_only:
335                 continue
336             if hasattr(book_info, field.name):
337                 if not getattr(book_info, field.name):
338                     continue
339                 type_indicator = field.value_type
340                 if issubclass(type_indicator, librarian.meta.types.text.TextValue):
341                     s = getattr(book_info, field.name)
342                     if field.multiple:
343                         s = ', '.join(s)
344                     fields[field.name] = s
345                 elif issubclass(type_indicator, librarian.meta.types.person.Person):
346                     p = getattr(book_info, field.name)
347                     if isinstance(p, librarian.meta.types.person.Person):
348                         persons = str(p)
349                     else:
350                         persons = ', '.join(map(str, p))
351                     fields[field.name] = persons
352
353         # get published date
354         pd = None
355         if hasattr(book_info, 'source_name') and book_info.source_name:
356             match = self.published_date_re.search(book_info.source_name)
357             if match is not None:
358                 pd = str(match.groups()[0])
359         if not pd:
360             pd = ""
361         fields["published_date"] = pd
362
363         return fields
364
365     # def add_gaps(self, fields, fieldname):
366     #     """
367     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
368     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
369     #     """
370     #     def gap():
371     #         while True:
372     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
373     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
374
375     def get_master(self, root):
376         """
377         Returns the first master tag from an etree.
378         """
379         for master in root.iter():
380             if master.tag in self.master_tags:
381                 return master
382
383     def index_content(self, book, book_fields, legacy=True):
384         """
385         Walks the book XML and extract content from it.
386         Adds parts for each header tag and for each fragment.
387         """
388         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
389         root = wld.edoc.getroot()
390
391         master = self.get_master(root)
392         if master is None:
393             return []
394
395         def walker(node):
396             if node.tag not in self.ignore_content_tags:
397                 yield node, None, None
398                 if node.text is not None:
399                     yield None, node.text, None
400                 for child in list(node):
401                     for b, t, e in walker(child):
402                         yield b, t, e
403                 yield None, None, node
404
405             if node.tail is not None:
406                 yield None, node.tail, None
407             return
408
409         def fix_format(text):
410             # separator = [" ", "\t", ".", ";", ","]
411             if isinstance(text, list):
412                 # need to join it first
413                 text = filter(lambda s: s is not None, content)
414                 text = ' '.join(text)
415                 # for i in range(len(text)):
416                 #     if i > 0:
417                 #         if text[i][0] not in separator\
418                 #             and text[i - 1][-1] not in separator:
419                 #          text.insert(i, " ")
420
421             return re.sub("(?m)/$", "", text)
422
423         def add_part(snippets, **fields):
424             doc = self.create_book_doc(book)
425             for n, v in book_fields.items():
426                 doc[n] = v
427
428             doc['header_index'] = fields["header_index"]
429             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
430             doc['header_type'] = fields['header_type']
431
432             doc['text'] = fields['text']
433
434             # snippets
435             snip_pos = snippets.add(fields["text"])
436
437             doc['snippets_position'] = snip_pos[0]
438             doc['snippets_length'] = snip_pos[1]
439             if snippets.revision:
440                 doc["snippets_revision"] = snippets.revision
441
442             if 'fragment_anchor' in fields:
443                 doc["fragment_anchor"] = fields['fragment_anchor']
444
445             if 'themes' in fields:
446                 doc['themes'] = fields['themes']
447             doc['uid'] = "part%s-%s-%s-%s" % (
448                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
449             return doc
450
451         fragments = {}
452         snippets = Snippets(book.id).open('w')
453         try:
454             for header, position in zip(list(master), range(len(master))):
455
456                 if header.tag in self.skip_header_tags:
457                     continue
458                 if header.tag is etree.Comment:
459                     continue
460
461                 # section content
462                 content = []
463                 footnote = []
464
465                 def all_content(text):
466                     for frag in fragments.values():
467                         frag['text'].append(text)
468                     content.append(text)
469                 handle_text = [all_content]
470
471                 for start, text, end in walker(header):
472                     # handle footnotes
473                     if start is not None and start.tag in self.footnote_tags:
474                         footnote = []
475
476                         def collect_footnote(t):
477                             footnote.append(t)
478
479                         handle_text.append(collect_footnote)
480                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
481                         handle_text.pop()
482                         doc = add_part(snippets, header_index=position, header_type=header.tag,
483                                        text=''.join(footnote))
484                         self.add_snippet(book, doc)
485                         if legacy:
486                             self.index.add(doc)
487                         footnote = []
488
489                     # handle fragments and themes.
490                     if start is not None and start.tag == 'begin':
491                         fid = start.attrib['id'][1:]
492                         fragments[fid] = {
493                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
494
495                     # themes for this fragment
496                     elif start is not None and start.tag == 'motyw':
497                         fid = start.attrib['id'][1:]
498                         handle_text.append(lambda text: None)
499                         if start.text is not None:
500                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
501                     elif end is not None and end.tag == 'motyw':
502                         handle_text.pop()
503
504                     elif start is not None and start.tag == 'end':
505                         fid = start.attrib['id'][1:]
506                         if fid not in fragments:
507                             continue  # a broken <end> node, skip it
508                         frag = fragments[fid]
509                         if not frag['themes']:
510                             continue  # empty themes list.
511                         del fragments[fid]
512
513                         doc = add_part(snippets,
514                                        header_type=frag['start_header'],
515                                        header_index=frag['start_section'],
516                                        header_span=position - frag['start_section'] + 1,
517                                        fragment_anchor=fid,
518                                        text=fix_format(frag['text']),
519                                        themes=frag['themes'])
520                         # Add searchable fragment
521                         self.add_snippet(book, doc)
522                         if legacy:
523                             self.index.add(doc)
524
525                         # Collect content.
526
527                     if text is not None and handle_text is not []:
528                         hdl = handle_text[-1]
529                         hdl(text)
530
531                         # in the end, add a section text.
532                 doc = add_part(snippets, header_index=position,
533                                header_type=header.tag, text=fix_format(content))
534
535                 self.add_snippet(book, doc)
536                 if legacy:
537                     self.index.add(doc)
538
539         finally:
540             snippets.close()
541
542     def remove_picture(self, picture_or_id):
543         """Removes a picture from search index."""
544         if isinstance(picture_or_id, picture.models.Picture):
545             picture_id = picture_or_id.id
546         else:
547             picture_id = picture_or_id
548         self.delete_query(self.index.Q(picture_id=picture_id))
549
550     def index_picture(self, picture, picture_info=None, overwrite=True):
551         """
552         Indexes the picture.
553         Creates a lucene document for extracted metadata
554         and calls self.index_area() to index the contents of the picture.
555         """
556         if overwrite:
557             # we don't remove snippets, since they might be still needed by
558             # threads using not reopened index
559             self.remove_picture(picture)
560
561         picture_doc = {'picture_id': int(picture.id)}
562         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
563             'authors', 'title', 'epochs', 'kinds', 'genres'])
564
565         picture_doc.update(meta_fields)
566
567         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
568         self.index.add(picture_doc)
569         del picture_doc['is_book']
570         for area in picture.areas.all():
571             self.index_area(area, picture_fields=picture_doc)
572
573     def index_area(self, area, picture_fields):
574         """
575         Indexes themes and objects on the area.
576         """
577         doc = dict(picture_fields)
578         doc['area_id'] = area.id
579         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
580         doc['uid'] = 'area%s' % area.id
581         self.index.add(doc)
582
583
@total_ordering
class SearchResult(object):
    """
    Search result for a single book, built from one index document.

    Results for the same book can be combined with merge() / aggregate().
    Content hits are kept as raw tuples in ``_hits`` and turned into
    dictionaries lazily by the ``hits`` property.
    """

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc - mapping with at least 'book_id'; 'score', 'published_date' and
              the header/snippet fields of a content part are optional.
        how_found - stored with the hit, not interpreted here.
        query_terms - terms compared with theme words in ``hits``.
        """
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (TypeError, ValueError):
            # TypeError: the field is missing (None);
            # ValueError: the field is not a number (e.g. '').
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            if header_span is not None:
                header_span = int(header_span)
            # A missing or zero span counts as a single header.
            header_span = header_span or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        """Build a result directly from a book, scored by its popularity count."""
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __str__(self):
        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __bytes__(self):
        return str(self).encode('utf-8')

    @property
    def score(self):
        """Raw score multiplied by the boost."""
        return self._score * self.boost

    def merge(self, other):
        """
        Add the hits and the (non-negative) score of another result for the
        same book to this one.  Returns self.

        Raises ValueError when the results are for different books.
        """
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        """
        Return the findable Book with this result's id, or None when there
        is no such book.  A book that was found is cached.
        """
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)

    # Indexes into a raw hit tuple: (position, fragment, score, other) ...
    POSITION = 0
    FRAGMENT = 1
    # ... and into its position tuple: (header_type, index, span).
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        """
        Processed hits: a list of dicts sorted by descending score.

        Section hits covered by a fragment hit are dropped, duplicates are
        reduced to the best scoring one, and fragment hits are resolved to
        Fragment objects (hits whose fragment no longer exists are skipped).
        The result is computed once and cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        # Split raw hits into fragments and sections.  Real lists are
        # used: a lazy filter() could only be iterated once.
        frags = [hit for hit in self._hits if hit[self.FRAGMENT] is not None]
        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        def covered_by_fragment(section):
            index = section[self.POSITION][self.POSITION_INDEX]
            for frag in frags:
                first = frag[self.POSITION][self.POSITION_INDEX]
                if first <= index < first + frag[self.POSITION][self.POSITION_SPAN]:
                    return True
            return False

        # sections not covered by fragments
        sect = [s for s in sect if not covered_by_fragment(s)]

        # remove fragments with duplicated fid's and duplicated snippets;
        # a stored hit is kept only while its score is strictly higher.
        best_frags = {}
        for frag in frags:
            anchor = frag[self.FRAGMENT]
            current = best_frags.get(anchor)
            if current is not None and current[self.SCORE] > frag[self.SCORE]:
                continue
            best_frags[anchor] = frag

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections and sections[si]['score'] >= s[self.SCORE]:
                continue

            m = {'score': s[self.SCORE],
                 'section_number': si + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in best_frags.values():
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                theme_names = f[self.OTHER]['themes']
                theme_names_pl = f[self.OTHER]['themes_pl']
                for i, theme_name in enumerate(theme_names):
                    words = theme_name.split()
                    # themes_pl may be shorter than themes (or missing).
                    if i < len(theme_names_pl):
                        words += theme_names_pl[i].split()
                    # A set can be tested against every query term; the
                    # former map() iterator was exhausted by the first one.
                    words = {word.lower() for word in words}
                    if any(qt in words for qt in self.query_terms):
                        themes_hit.add(theme_name)

            # Replace the matched names with the fragment's theme tags.
            themes_by_name = {}
            if themes_hit:
                for theme in themes:
                    themes_by_name.setdefault(theme.name, theme)
            themes_hit = [themes_by_name[name] for name in themes_hit
                          if name in themes_by_name]

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """
        Merge results for the same book across all given lists; returns
        the merged results (one per book id).
        """
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def get_sort_key(self):
        return (-self.score,
                self.published_date,
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        # Reversed on purpose: the key starts with the negated score, so
        # a result with a higher score compares as greater.
        if not isinstance(other, SearchResult):
            return NotImplemented
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        if not isinstance(other, SearchResult):
            return NotImplemented
        return self.get_sort_key() == other.get_sort_key()

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """(position, length) of the snippet for the hit number idx."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Snippet file revision for the hit number idx, None if unknown."""
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
797
798
@total_ordering
class PictureResult(object):
    """
    A picture found by a search, together with its matching picture areas.

    Results for the same picture can be combined with merge() or
    aggregate(). Instances are ordered and compared by their boosted score.
    Because __eq__ is defined without __hash__, instances are not hashable.
    """
    # Indexes into the (score, details) tuples stored in self._hits.
    SCORE = 0
    OTHER = 1

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        Builds a result from a single found document.

        doc - mapping of document fields; "picture_id" is required,
              "score", "area_id", "themes" and "themes_pl" are optional
        how_found - stored with the area hit, not interpreted here
        query_terms - words of the query, used to find themes matching them
        """
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        self._score = doc.get('score', 0)
        self.picture_id = int(doc["picture_id"])

        # Only documents describing a picture area contribute a hit.
        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)

    def __str__(self):
        return "<PR id=%d score=%f >" % (self.picture_id, self._score)

    def __repr__(self):
        return str(self)

    @property
    def score(self):
        """
        The raw score multiplied by the boost.
        """
        return self._score * self.boost

    def merge(self, other):
        """
        Adds hits and (non-negative) score of another result for the same
        picture to this one and returns self.

        Raises ValueError when the other result is for a different picture.
        """
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    @staticmethod
    def _themes_matching(themes, themes_pl, query_terms):
        """
        Returns the set of names from themes which contain a word equal to
        one of query_terms.

        The words of a theme are taken from its name in themes and from the
        entry at the same position in themes_pl (if there is one), split on
        whitespace and lowercased. Query terms are compared as given.
        """
        matched = set()
        for i, theme in enumerate(themes):
            # themes_pl may be shorter than themes; match on what there is
            theme_pl = themes_pl[i] if i < len(themes_pl) else ''
            # A set (not a one-shot iterator) so every query term is checked
            # against all the words.
            words = {word.lower() for word in theme.split() + theme_pl.split()}
            if any(term in words for term in query_terms):
                matched.add(theme)
        return matched

    @property
    def hits(self):
        """
        The best matching picture area, as a list of at most one dict.

        The dict has the keys 'score', 'area' (the PictureArea object),
        'themes_hit' (themes matching the query terms) and all the details
        stored with the hit. Hits whose area no longer exists are skipped.
        The list is computed once and then cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        # Only the best-scoring hit is kept, so go through the raw hits from
        # the highest score down and stop at the first one whose area still
        # exists. sorted() is stable, so equally scored hits keep their order.
        for hit in sorted(self._hits, key=lambda h: h[self.SCORE], reverse=True):
            details = hit[self.OTHER]
            try:
                area = picture.models.PictureArea.objects.get(id=details['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            if self.query_terms is not None:
                themes_hit = self._themes_matching(
                    details['themes'], details['themes_pl'], self.query_terms)
            else:
                themes_hit = set()

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(details)
            hits.append(m)
            break

        self._processed_hits = hits
        return hits

    def get_picture(self):
        """
        Returns the Picture object for picture_id, fetching it on first use.
        """
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        """
        Merges results for the same picture from all the given lists.

        The first result seen for a picture absorbs the later ones.
        Returns the values view of a dict keyed by picture id.
        """
        by_picture = {}
        for result_list in result_lists:
            for result in result_list:
                if result.picture_id in by_picture:
                    by_picture[result.picture_id].merge(result)
                else:
                    by_picture[result.picture_id] = result
        return by_picture.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score
905
906
class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        """
        Sets up the index with mode='r'.

        default_field - accepted, but not used by this class
        """
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.
        query - words separated by single spaces; None is treated as ''
        field - index field every word is looked up in
        modal - binary operator joining the term queries
        """
        if query is None:
            query = ''
        terms = (self.index.Q(**{field: word}) for word in query.split(" "))
        return reduce(modal, terms, self.index.Q())

    def search_by_author(self, words):
        """
        Finds top-level, findable books whose cached author contains every
        one of words as a whole word (case-insensitively).

        Returns at most 30 SearchResult objects, most popular books first.
        """
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        for word in words:
            # \m and \M are used as word-boundary anchors; this looks like
            # PostgreSQL regex syntax - confirm against the database in use.
            # NOTE(review): word is put into the regex unescaped - confirm
            # that callers only pass plain words.
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    @staticmethod
    def _any_of(queries):
        """
        Joins queries with OR; returns None when there are no queries.
        """
        combined = None
        for q in queries:
            if combined is None:
                combined = q
            else:
                combined |= q
        return combined

    def search_words(self, words, fields, required=None, book=True, picture=False):
        """
        Searches the index for documents containing the words.

        words - list of words; each of them has to be found in any of fields
        fields - names of the index fields to look in
        required - optional field names; any of the words has to be found
                   in any of these fields
        book - limit the results to documents marked as books
        picture - search for pictures instead of books

        Returns a list of PictureResult (when picture is set) or
        SearchResult objects. A books-only search in the 'authors' field
        alone is delegated to search_by_author().
        """
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        # Stopwords are dropped only when neither books nor pictures are searched.
        used_words = [word for word in words
                      if book or picture or (word not in stopwords)]

        filters = [
            self._any_of(self.index.Q(**{field: word}) for field in fields)
            for word in used_words
        ]
        if required:
            filters.append(self._any_of(
                self.index.Q(**{field: word})
                for field in required
                for word in used_words))
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns highlighted snippets for the hits of a search result.

        searchresult - result providing book_id, snippet_revision(),
                       snippet_pos(idx) and len()
        query - the query to highlight in the snippet texts
        field - index field used for highlighting; for 'text' the
                'text_nonstem' field is tried when nothing gets highlighted
        num - maximum number of non-empty snippets; None, a negative number
              or a number above the count of hits means no limit

        The returned list has one position per hit (None where no snippet
        was produced) and is also stored as searchresult.snippets.
        When the snippet file cannot be read an empty list is returned.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            # A for loop, so that skipping a hit always moves on to the next one.
            for idx in range(maxnum):
                if num <= 0:
                    break
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                # do not repeat a snippet already found for another hit
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            if not book:
                log.error("Book does not exist for book id = %d", book_id)
            elif not book.get().children.exists():
                # the error is not logged for books which have children
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s", book_id, revision, e)
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query

        Filters equal to None are skipped; filters may itself be None.
        """
        if filters is None:
            filters = []
        for f in filters:
            if f is not None:
                query = query.query(f)
        return query
1030
1031
# When settings.SEARCH_MOCK is set to a true value, the name Search is rebound
# to the class imported from .mock_search, shadowing the Search class above.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search