# This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
# Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
#
from functools import reduce, total_ordering
from itertools import chain
import logging
import operator
import os
import re
from django.conf import settings
from librarian import dcparser
import librarian.meta.types.date
import librarian.meta.types.person
import librarian.meta.types.text
from librarian.parser import WLDocument
from lxml import etree
import scorched
import catalogue.models
import picture.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from wolnelektury.utils import makedirs
from . import custom

log = logging.getLogger('search')


if os.path.isfile(settings.SOLR_STOPWORDS):
    with open(settings.SOLR_STOPWORDS) as f:
        stopwords = set(
            line.strip()
            for line in f if not line.startswith('#'))
else:
    stopwords = set()

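# The stopwords file is expected to hold one word per line, with '#' starting
# comment lines; a hypothetical excerpt:
#
#   # Polish stopwords
#   albo
#   ale
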

class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # When writing, pick the first unused revision number, so that
            # readers of older revisions are not disturbed.
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (str) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet
        stored there as str.
        """
        self.file.seek(pos[0], 0)
        try:
            txt = self.file.read(pos[1]).decode('utf-8')
        except (OSError, UnicodeDecodeError):
            return ''
        return txt

    def close(self):
        """Close snippet file"""
        if self.file:
            self.file.close()

    def remove(self):
        # Remove the base snippet file, then all numbered revisions,
        # stopping at the first missing one.
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass

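
# A minimal usage sketch (assumes a configured Django environment and a
# writable SEARCH_INDEX directory; the book id and text are made up):
def _snippets_example():
    writer = Snippets(123).open('w')
    try:
        # add() returns the (position, length) pair needed to read the text back.
        pos = writer.add('Litwo! Ojczyzno moja!')
    finally:
        writer.close()
    reader = Snippets(123, revision=writer.revision).open()
    try:
        return reader.get(pos)
    finally:
        reader.close()
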

class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def remove_snippets(self, book):
        book.snippet_set.all().delete()

    def add_snippet(self, book, doc):
        assert book.id == doc.pop('book_id')
        # Fragments already exist and can be indexed where they live.
        if 'fragment_anchor' in doc:
            return

        text = doc.pop('text')
        header_index = doc.pop('header_index')
        book.snippet_set.create(
            sec=header_index,
            text=text,
        )

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, scorched.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            # FIXME: With Solr API change, this doesn't work.
            # self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all, or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book, remove_snippets=True, legacy=True):
        """Removes a book from the search index.
        book - Book instance."""
        if legacy:
            self.delete_query(self.index.Q(book_id=book.id))

            if remove_snippets:
                snippets = Snippets(book.id)
                snippets.remove()
        self.remove_snippets(book)

    def index_book(self, book, book_info=None, overwrite=True, legacy=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if not book.xml_file:
            return

        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using the not-yet-reopened index
            self.remove_book(book, remove_snippets=False, legacy=legacy)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=[
            'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        if legacy:
            self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }

        for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
            if tag_name in meta_fields:
                book_fields[tag_name] = meta_fields[tag_name]

        self.index_content(book, book_fields=book_fields, legacy=legacy)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
    ]

    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Matches a trailing year in source_name, e.g. "Czytelnik, Warszawa 1990."
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            with open(book.xml_file.path, 'rb') as f:
                book_info = dcparser.parse(f)

        fields['slug'] = book.slug
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                type_indicator = field.value_type
                if issubclass(type_indicator, librarian.meta.types.text.TextValue):
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif issubclass(type_indicator, librarian.meta.types.person.Person):
                    p = getattr(book_info, field.name)
                    if isinstance(p, librarian.meta.types.person.Person):
                        persons = str(p)
                    else:
                        persons = ', '.join(map(str, p))
                    fields[field.name] = persons
                elif issubclass(type_indicator, librarian.meta.types.date.DateValue):
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields, legacy=True):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            if node.tag not in self.ignore_content_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # separator = [" ", "\t", ".", ";", ","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = ' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, " ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s-%s-%s-%s" % (
                book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=''.join(footnote))
                        self.add_snippet(book, doc)
                        if legacy:
                            self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += [s.strip() for s in start.text.split(',')]
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        # Add searchable fragment
                        self.add_snippet(book, doc)
                        if legacy:
                            self.index.add(doc)

                    # Collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.add_snippet(book, doc)
                if legacy:
                    self.index.add(doc)

        finally:
            snippets.close()

    def remove_picture(self, picture_or_id):
        """Removes a picture from the search index."""
        if isinstance(picture_or_id, picture.models.Picture):
            picture_id = picture_or_id.id
        else:
            picture_id = picture_or_id
        self.delete_query(self.index.Q(picture_id=picture_id))

    def index_picture(self, picture, picture_info=None, overwrite=True):
        """
        Indexes the picture.
        Creates a Lucene document for the extracted metadata
        and calls self.index_area() to index the contents of the picture.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using the not-yet-reopened index
            self.remove_picture(picture)

        picture_doc = {'picture_id': int(picture.id)}
        meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
            'authors', 'title', 'epochs', 'kinds', 'genres'])

        picture_doc.update(meta_fields)

        picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
        self.index.add(picture_doc)
        del picture_doc['is_book']
        for area in picture.areas.all():
            self.index_area(area, picture_fields=picture_doc)

    def index_area(self, area, picture_fields):
        """
        Indexes themes and objects on the area.
        """
        doc = dict(picture_fields)
        doc['area_id'] = area.id
        doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
        doc['uid'] = 'area%s' % area.id
        self.index.add(doc)

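
# A minimal usage sketch (assumes a configured Django environment and a
# running Solr backend; the slug is made up, and commit() is assumed to be
# exposed by the underlying scorched interface):
def _index_example():
    book = catalogue.models.Book.objects.get(slug='pan-tadeusz')
    index = Index()
    # Replaces the book's Solr documents and rebuilds its snippet file.
    index.index_book(book)
    index.index.commit()
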

@total_ordering
class SearchResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # Missing or malformed published_date.
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span) if header_span is not None else 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __str__(self):
        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __bytes__(self):
        return str(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d"
                % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)

    # Tuple indices for the raw hits stored in self._hits:
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = [hit for hit in self._hits if hit[self.FRAGMENT] is not None]

        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        # sections not covered by fragments
        sect = [s for s in sect if not any(
            f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
            f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN]
            for f in frags)]

        def remove_duplicates(lst, keyfn, larger):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if larger(els[eif], e):
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = [t.lower() for t in tms]
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = list(filter(lambda t: t.name == n, themes))
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = list(filter(lambda a: a is not None, map(theme_by_name, themes_hit)))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def get_sort_key(self):
        return (-self.score,
                self.published_date,
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None

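
# A minimal sketch of combining result lists (hypothetical inputs): hits for
# the same book are merged into one SearchResult, which can then be ranked.
def _aggregate_example(results_by_title, results_by_author):
    merged = SearchResult.aggregate(results_by_title, results_by_author)
    # Sorting by get_sort_key puts higher-scored books first.
    return sorted(merged, key=SearchResult.get_sort_key)
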

@total_ordering
class PictureResult(object):
    def __init__(self, doc, how_found=None, query_terms=None):
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)

    def __str__(self):
        return "<PR id=%d score=%f >" % (self.picture_id, self._score)

    def __repr__(self):
        return str(self)

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d"
                % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    # Tuple indices for the raw hits stored in self._hits:
    SCORE = 0
    OTHER = 1

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            try:
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in the theme name.
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(hit[self.OTHER]['themes'])):
                    tms = re.split(r' +', hit[self.OTHER]['themes'][i]) + hit[self.OTHER]['themes_pl'][i].split(' ')
                    tms = [t.lower() for t in tms]
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(hit[self.OTHER]['themes'][i])
                            break

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(hit[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        # Keep only the best-scored area.
        hits = hits[:1]
        self._processed_hits = hits
        return hits

    def get_picture(self):
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        pictures = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in pictures:
                    pictures[r.picture_id].merge(r)
                else:
                    pictures[r.picture_id] = r
        return pictures.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - operator used to join the term queries.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split()), q)

        return q

    def search_by_author(self, words):
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        for word in words:
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def search_words(self, words, fields, required=None, book=True, picture=False):
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)
        filters = []
        for word in words:
            if book or picture or (word not in stopwords):
                word_filter = None
                for field in fields:
                    q = self.index.Q(**{field: word})
                    if word_filter is None:
                        word_filter = q
                    else:
                        word_filter |= q
                filters.append(word_filter)
        if required:
            required_filter = None
            for field in required:
                for word in words:
                    if book or picture or (word not in stopwords):
                        q = self.index.Q(**{field: word})
                        if required_filter is None:
                            required_filter = q
                        else:
                            required_filter |= q
            filters.append(required_filter)
        if not filters:
            return []
        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns snippets for the found document.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1
                idx += 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            if not book:
                log.error("Book does not exist for book id = %d" % book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query

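
# A minimal usage sketch (assumes a configured Django environment and a
# running Solr backend; the query word is made up, and highlight() is assumed
# to accept a plain-string query):
def _search_example():
    search = Search()
    results = search.search_words(['lipa'], fields=['text', 'themes_pl'])
    merged = SearchResult.aggregate(results)
    for result in merged:
        # Fetch one highlighted snippet per matched book, where available.
        search.get_snippets(result, 'lipa', num=1)
    return merged
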

if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search