Temporary view to avoid redirection.
[wolnelektury.git] / src / search / index.py
1 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
2 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
3 #
4 from functools import reduce, total_ordering
5 from itertools import chain
6 import logging
7 import operator
8 import os
9 import re
10 from django.conf import settings
11 from librarian import dcparser
12 from librarian.parser import WLDocument
13 from lxml import etree
14 import scorched
15 import catalogue.models
16 import picture.models
17 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
18 from wolnelektury.utils import makedirs
19 from . import custom
20
21 log = logging.getLogger('search')
22
23
# Load the Solr stopword list once, at import time.  Lines starting with '#'
# are skipped; every other line is stripped of surrounding whitespace.
if os.path.isfile(settings.SOLR_STOPWORDS):
    # Read inside a context manager so the file handle is closed right away
    # instead of being left open until garbage collection.
    with open(settings.SOLR_STOPWORDS) as stopwords_file:
        stopwords = set(
            line.strip()
            for line in stopwords_file if not line.startswith('#'))
else:
    stopwords = set()
30
31
class SolrIndex(object):
    """Base class owning a Solr interface configured from ``settings.SOLR``."""

    def __init__(self, mode=None):
        # ``mode`` is passed through unchanged to the custom Solr interface.
        solr_interface = custom.CustomSolrInterface(settings.SOLR, mode=mode)
        self.index = solr_interface
35
36
class Snippets(object):
    """
    This class manages snippet files for indexed object (book).

    The snippets are concatenated together in one binary file, and their
    positions and lengths are kept in lucene index fields.

    Typical use::

        snippets = Snippets(book_id).open('w')
        try:
            position, length = snippets.add(text)
        finally:
            snippets.close()

    An opened instance can also be used as a context manager; the file is
    closed when the ``with`` block exits.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        # Make sure the directory holding snippet files exists.
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self.close()
        # Never suppress an exception raised inside the ``with`` block.
        return False

    @property
    def path(self):
        """Path of the snippet file: "<book_id>" or "<book_id>.<revision>"."""
        # A revision of None or 0 means the unsuffixed file.
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.

        The file is always opened in binary mode.  When opening for writing
        and the current path already exists, the first unused revision
        number (starting from 1) is picked, so an existing snippet file is
        never overwritten.

        Returns self.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple, both counted in UTF-8 bytes.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.

        Returns '' when the bytes cannot be read or decoded.
        """
        self.file.seek(pos[0], 0)
        try:
            return self.file.read(pos[1]).decode('utf-8')
        except Exception:
            # Best effort: an unreadable snippet yields an empty string.
            # ``Exception`` (not a bare except) keeps KeyboardInterrupt and
            # SystemExit propagating.
            return ''

    def close(self):
        """Close snippet file"""
        if self.file:
            self.file.close()

    def remove(self):
        """
        Delete the unsuffixed snippet file and then revisions 1, 2, ...
        until the first one that cannot be removed.
        """
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            # The first missing (or undeletable) file ends the sweep.
            pass
119
120
121 class Index(SolrIndex):
122     """
123     Class indexing books.
124     """
125     def __init__(self):
126         super(Index, self).__init__(mode='rw')
127
128     def delete_query(self, *queries):
129         """
130         index.delete(queries=...) doesn't work, so let's reimplement it
131         using deletion of list of uids.
132         """
133         uids = set()
134         for q in queries:
135             if isinstance(q, scorched.search.LuceneQuery):
136                 q = self.index.query(q)
137             q.field_limiter.update(['uid'])
138             st = 0
139             rows = 100
140             while True:
141                 ids = q.paginate(start=st, rows=rows).execute()
142                 if not len(ids):
143                     break
144                 for res in ids:
145                     uids.add(res['uid'])
146                 st += rows
147         if uids:
148             # FIXME: With Solr API change, this doesn't work.
149             #self.index.delete(uids)
150             return True
151         else:
152             return False
153
154     def index_tags(self, *tags, **kw):
155         """
156         Re-index global tag list.
157         Removes all tags from index, then index them again.
158         Indexed fields include: id, name (with and without polish stems), category
159         """
160         log.debug("Indexing tags")
161         remove_only = kw.get('remove_only', False)
162         # first, remove tags from index.
163         if tags:
164             tag_qs = []
165             for tag in tags:
166                 q_id = self.index.Q(tag_id=tag.id)
167
168                 if isinstance(tag, PDCounterAuthor):
169                     q_cat = self.index.Q(tag_category='pd_author')
170                 elif isinstance(tag, PDCounterBook):
171                     q_cat = self.index.Q(tag_category='pd_book')
172                 else:
173                     q_cat = self.index.Q(tag_category=tag.category)
174
175                 q_id_cat = self.index.Q(q_id & q_cat)
176                 tag_qs.append(q_id_cat)
177             self.delete_query(*tag_qs)
178         else:  # all
179             q = self.index.Q(tag_id__any=True)
180             self.delete_query(q)
181
182         if not remove_only:
183             # then add them [all or just one passed]
184             if not tags:
185                 tags = chain(
186                     catalogue.models.Tag.objects.exclude(category='set'),
187                     PDCounterAuthor.objects.all(),
188                     PDCounterBook.objects.all())
189
190             for tag in tags:
191                 if isinstance(tag, PDCounterAuthor):
192                     doc = {
193                         "tag_id": int(tag.id),
194                         "tag_name": tag.name,
195                         "tag_name_pl": tag.name,
196                         "tag_category": 'pd_author',
197                         "is_pdcounter": True,
198                         "uid": "tag%d_pd_a" % tag.id
199                         }
200                 elif isinstance(tag, PDCounterBook):
201                     doc = {
202                         "tag_id": int(tag.id),
203                         "tag_name": tag.title,
204                         "tag_name_pl": tag.title,
205                         "tag_category": 'pd_book',
206                         "is_pdcounter": True,
207                         "uid": "tag%d_pd_b" % tag.id
208                         }
209                 else:
210                     doc = {
211                         "tag_id": int(tag.id),
212                         "tag_name": tag.name,
213                         "tag_name_pl": tag.name,
214                         "tag_category": tag.category,
215                         "is_pdcounter": False,
216                         "uid": "tag%d" % tag.id
217                         }
218                 self.index.add(doc)
219
220     def create_book_doc(self, book):
221         """
222         Create a lucene document referring book id.
223         """
224         doc = {'book_id': int(book.id)}
225         if book.parent is not None:
226             doc['parent_id'] = int(book.parent.id)
227         return doc
228
229     def remove_book(self, book_or_id, remove_snippets=True):
230         """Removes a book from search index.
231         book - Book instance."""
232         if isinstance(book_or_id, catalogue.models.Book):
233             book_id = book_or_id.id
234         else:
235             book_id = book_or_id
236
237         self.delete_query(self.index.Q(book_id=book_id))
238
239         if remove_snippets:
240             snippets = Snippets(book_id)
241             snippets.remove()
242
243     def index_book(self, book, book_info=None, overwrite=True):
244         """
245         Indexes the book.
246         Creates a lucene document for extracted metadata
247         and calls self.index_content() to index the contents of the book.
248         """
249         if overwrite:
250             # we don't remove snippets, since they might be still needed by
251             # threads using not reopened index
252             self.remove_book(book, remove_snippets=False)
253
254         book_doc = self.create_book_doc(book)
255         meta_fields = self.extract_metadata(book, book_info, dc_only=[
256             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
257         # let's not index it - it's only used for extracting publish date
258         if 'source_name' in meta_fields:
259             del meta_fields['source_name']
260
261         for n, f in meta_fields.items():
262             book_doc[n] = f
263
264         book_doc['uid'] = "book%s" % book_doc['book_id']
265         self.index.add(book_doc)
266         del book_doc
267         book_fields = {
268             'title': meta_fields['title'],
269             'authors': meta_fields['authors'],
270             'published_date': meta_fields['published_date']
271             }
272
273         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
274             if tag_name in meta_fields:
275                 book_fields[tag_name] = meta_fields[tag_name]
276
277         self.index_content(book, book_fields=book_fields)
278
    # Tags that get_master() accepts as the root of a book's main text;
    # the first element with one of these tags is used.
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
    ]

    # Elements with these tags are skipped (together with their subtree) by
    # the content walker in index_content(); only their tail text is kept.
    ignore_content_tags = [
        'uwaga', 'extra', 'nota_red', 'abstrakt',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    ]

    # Tags treated as footnotes: index_content() indexes each as its own part.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Direct children of the master tag that index_content() does not index.
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
299
300     published_date_re = re.compile("([0-9]+)[\]. ]*$")
301
302     def extract_metadata(self, book, book_info=None, dc_only=None):
303         """
304         Extract metadata from book and returns a map of fields keyed by fieldname
305         """
306         fields = {}
307
308         if book_info is None:
309             book_info = dcparser.parse(open(book.xml_file.path))
310
311         fields['slug'] = book.slug
312         fields['is_book'] = True
313
314         # validator, name
315         for field in dcparser.BookInfo.FIELDS:
316             if dc_only and field.name not in dc_only:
317                 continue
318             if hasattr(book_info, field.name):
319                 if not getattr(book_info, field.name):
320                     continue
321                 # since no type information is available, we use validator
322                 type_indicator = field.validator
323                 if type_indicator == dcparser.as_unicode:
324                     s = getattr(book_info, field.name)
325                     if field.multiple:
326                         s = ', '.join(s)
327                     fields[field.name] = s
328                 elif type_indicator == dcparser.as_person:
329                     p = getattr(book_info, field.name)
330                     if isinstance(p, dcparser.Person):
331                         persons = str(p)
332                     else:
333                         persons = ', '.join(map(str, p))
334                     fields[field.name] = persons
335                 elif type_indicator == dcparser.as_date:
336                     dt = getattr(book_info, field.name)
337                     fields[field.name] = dt
338
339         # get published date
340         pd = None
341         if hasattr(book_info, 'source_name') and book_info.source_name:
342             match = self.published_date_re.search(book_info.source_name)
343             if match is not None:
344                 pd = str(match.groups()[0])
345         if not pd:
346             pd = ""
347         fields["published_date"] = pd
348
349         return fields
350
351     # def add_gaps(self, fields, fieldname):
352     #     """
353     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
354     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
355     #     """
356     #     def gap():
357     #         while True:
358     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
359     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
360
361     def get_master(self, root):
362         """
363         Returns the first master tag from an etree.
364         """
365         for master in root.iter():
366             if master.tag in self.master_tags:
367                 return master
368
369     def index_content(self, book, book_fields):
370         """
371         Walks the book XML and extract content from it.
372         Adds parts for each header tag and for each fragment.
373         """
374         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
375         root = wld.edoc.getroot()
376
377         master = self.get_master(root)
378         if master is None:
379             return []
380
381         def walker(node):
382             if node.tag not in self.ignore_content_tags:
383                 yield node, None, None
384                 if node.text is not None:
385                     yield None, node.text, None
386                 for child in list(node):
387                     for b, t, e in walker(child):
388                         yield b, t, e
389                 yield None, None, node
390
391             if node.tail is not None:
392                 yield None, node.tail, None
393             return
394
395         def fix_format(text):
396             # separator = [" ", "\t", ".", ";", ","]
397             if isinstance(text, list):
398                 # need to join it first
399                 text = filter(lambda s: s is not None, content)
400                 text = ' '.join(text)
401                 # for i in range(len(text)):
402                 #     if i > 0:
403                 #         if text[i][0] not in separator\
404                 #             and text[i - 1][-1] not in separator:
405                 #          text.insert(i, " ")
406
407             return re.sub("(?m)/$", "", text)
408
409         def add_part(snippets, **fields):
410             doc = self.create_book_doc(book)
411             for n, v in book_fields.items():
412                 doc[n] = v
413
414             doc['header_index'] = fields["header_index"]
415             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
416             doc['header_type'] = fields['header_type']
417
418             doc['text'] = fields['text']
419
420             # snippets
421             snip_pos = snippets.add(fields["text"])
422
423             doc['snippets_position'] = snip_pos[0]
424             doc['snippets_length'] = snip_pos[1]
425             if snippets.revision:
426                 doc["snippets_revision"] = snippets.revision
427
428             if 'fragment_anchor' in fields:
429                 doc["fragment_anchor"] = fields['fragment_anchor']
430
431             if 'themes' in fields:
432                 doc['themes'] = fields['themes']
433             doc['uid'] = "part%s-%s-%s-%s" % (
434                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
435             return doc
436
437         fragments = {}
438         snippets = Snippets(book.id).open('w')
439         try:
440             for header, position in zip(list(master), range(len(master))):
441
442                 if header.tag in self.skip_header_tags:
443                     continue
444                 if header.tag is etree.Comment:
445                     continue
446
447                 # section content
448                 content = []
449                 footnote = []
450
451                 def all_content(text):
452                     for frag in fragments.values():
453                         frag['text'].append(text)
454                     content.append(text)
455                 handle_text = [all_content]
456
457                 for start, text, end in walker(header):
458                     # handle footnotes
459                     if start is not None and start.tag in self.footnote_tags:
460                         footnote = []
461
462                         def collect_footnote(t):
463                             footnote.append(t)
464
465                         handle_text.append(collect_footnote)
466                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
467                         handle_text.pop()
468                         doc = add_part(snippets, header_index=position, header_type=header.tag,
469                                        text=''.join(footnote),
470                                        is_footnote=True)
471                         self.index.add(doc)
472                         footnote = []
473
474                     # handle fragments and themes.
475                     if start is not None and start.tag == 'begin':
476                         fid = start.attrib['id'][1:]
477                         fragments[fid] = {
478                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
479
480                     # themes for this fragment
481                     elif start is not None and start.tag == 'motyw':
482                         fid = start.attrib['id'][1:]
483                         handle_text.append(lambda text: None)
484                         if start.text is not None:
485                             fragments[fid]['themes'] += map(str.strip, map(str, (start.text.split(','))))
486                     elif end is not None and end.tag == 'motyw':
487                         handle_text.pop()
488
489                     elif start is not None and start.tag == 'end':
490                         fid = start.attrib['id'][1:]
491                         if fid not in fragments:
492                             continue  # a broken <end> node, skip it
493                         frag = fragments[fid]
494                         if not frag['themes']:
495                             continue  # empty themes list.
496                         del fragments[fid]
497
498                         doc = add_part(snippets,
499                                        header_type=frag['start_header'],
500                                        header_index=frag['start_section'],
501                                        header_span=position - frag['start_section'] + 1,
502                                        fragment_anchor=fid,
503                                        text=fix_format(frag['text']),
504                                        themes=frag['themes'])
505                         self.index.add(doc)
506
507                         # Collect content.
508
509                     if text is not None and handle_text is not []:
510                         hdl = handle_text[-1]
511                         hdl(text)
512
513                         # in the end, add a section text.
514                 doc = add_part(snippets, header_index=position,
515                                header_type=header.tag, text=fix_format(content))
516
517                 self.index.add(doc)
518
519         finally:
520             snippets.close()
521
522     def remove_picture(self, picture_or_id):
523         """Removes a picture from search index."""
524         if isinstance(picture_or_id, picture.models.Picture):
525             picture_id = picture_or_id.id
526         else:
527             picture_id = picture_or_id
528         self.delete_query(self.index.Q(picture_id=picture_id))
529
530     def index_picture(self, picture, picture_info=None, overwrite=True):
531         """
532         Indexes the picture.
533         Creates a lucene document for extracted metadata
534         and calls self.index_area() to index the contents of the picture.
535         """
536         if overwrite:
537             # we don't remove snippets, since they might be still needed by
538             # threads using not reopened index
539             self.remove_picture(picture)
540
541         picture_doc = {'picture_id': int(picture.id)}
542         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
543             'authors', 'title', 'epochs', 'kinds', 'genres'])
544
545         picture_doc.update(meta_fields)
546
547         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
548         self.index.add(picture_doc)
549         del picture_doc['is_book']
550         for area in picture.areas.all():
551             self.index_area(area, picture_fields=picture_doc)
552
553     def index_area(self, area, picture_fields):
554         """
555         Indexes themes and objects on the area.
556         """
557         doc = dict(picture_fields)
558         doc['area_id'] = area.id
559         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
560         doc['uid'] = 'area%s' % area.id
561         self.index.add(doc)
562
563
@total_ordering
class SearchResult(object):
    """
    Search result for a single book, built from one index document.

    Results for the same book can be combined with merge() / aggregate().
    Raw hits are kept as tuples (see the POSITION/FRAGMENT/SCORE/OTHER
    constants) and turned into dicts lazily by the ``hits`` property.
    """

    # Layout of a raw hit tuple stored in ``_hits``:
    #   hit[POSITION] - (header_type, header_index, header_span)
    #   hit[FRAGMENT] - fragment anchor, or None for a plain section hit
    #   hit[SCORE]    - score of the document
    #   hit[OTHER]    - dict with how_found, snippets_pos, snippets_revision,
    #                   themes and themes_pl
    POSITION = 0
    FRAGMENT = 1
    # indexes inside the hit[POSITION] tuple
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc - mapping with at least 'book_id'; 'score', 'published_date' and
            the content-hit fields ('header_type', ...) are optional.
        how_found - stored with the hit, not interpreted here.
        query_terms - terms used to detect which themes were matched.
        """
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # cache for the ``hits`` property
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        self._score = doc['score'] if 'score' in doc else 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (TypeError, ValueError):
            # TypeError: field missing (None); ValueError: not a number.
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            # a missing or zero span counts as 1
            header_span = (int(header_span) if header_span is not None else 0) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    @classmethod
    def from_book(cls, book, how_found=None, query_terms=None):
        """Build a result straight from a book, scored by its popularity count."""
        doc = {
            'score': book.popularity.count,
            'book_id': book.id,
            'published_date': 0,
        }
        result = cls(doc, how_found=how_found, query_terms=query_terms)
        result._book = book
        return result

    def __str__(self):
        return "<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __bytes__(self):
        return str(self).encode('utf-8')

    @property
    def score(self):
        """Raw score multiplied by ``boost``."""
        return self._score * self.boost

    def merge(self, other):
        """
        Absorb the hits and the (non-negative part of the) score of another
        result for the same book.  Raises ValueError for a different book.
        Returns self.
        """
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    def get_book(self):
        """Return the findable Book for ``book_id``, or None if there is none."""
        if self._book is not None:
            return self._book
        try:
            self._book = catalogue.models.Book.objects.get(id=self.book_id, findable=True)
        except catalogue.models.Book.DoesNotExist:
            self._book = None
        return self._book

    book = property(get_book)

    @staticmethod
    def _matching_themes(themes, themes_pl, query_terms):
        """
        Return the set of names from ``themes`` in which some word equals one
        of ``query_terms``.  Words come from the theme name and from the
        entry at the same index in ``themes_pl`` (if there is one), lowercased.
        """
        matched = set()
        for i, name in enumerate(themes):
            # split on runs of spaces (str.split(r' +') would look for the
            # literal two-character separator " +")
            words = re.split(r' +', name)
            if i < len(themes_pl):
                words += themes_pl[i].split(' ')
            # a real list: a map() iterator would be exhausted by the first
            # membership test
            words = [word.lower() for word in words]
            if any(term in words for term in query_terms):
                matched.add(name)
        return matched

    @property
    def hits(self):
        """
        Processed hits: a list of dicts sorted by descending score.

        Fragment hits are deduplicated by anchor and section hits by section
        index (the higher score wins); sections covered by a fragment hit are
        dropped.  Fragment hits whose Fragment no longer exists are skipped.
        The result is computed once and cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = [hit for hit in self._hits if hit[self.FRAGMENT] is not None]
        sect = [hit for hit in self._hits if hit[self.FRAGMENT] is None]

        def remove_duplicates(lst, keyfn, larger):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els and larger(els[eif], e):
                    continue
                els[eif] = e
            return list(els.values())

        # remove fragments with duplicated fid's and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: a[self.SCORE] > b[self.SCORE])

        def covered_by_fragment(section):
            index = section[self.POSITION][self.POSITION_INDEX]
            return any(
                f[self.POSITION][self.POSITION_INDEX] <= index <
                f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN]
                for f in frags)

        # sections not covered by fragments
        sect = [s for s in sect if not covered_by_fragment(s)]

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections and sections[si]['score'] >= s[self.SCORE]:
                continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            matched_names = set()
            if self.query_terms is not None:
                matched_names = self._matching_themes(
                    f[self.OTHER]['themes'], f[self.OTHER]['themes_pl'], self.query_terms)

            def theme_by_name(n):
                return next((t for t in themes if t.name == n), None)

            themes_hit = [t for t in map(theme_by_name, matched_names) if t is not None]

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """Merge results for the same book across all lists; one result per book."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def get_sort_key(self):
        return (-self.score,
                self.published_date,
                self.book.sort_key_author if self.book else '',
                self.book.sort_key if self.book else '')

    def __lt__(self, other):
        # Deliberately inverted: the result with the greater sort key
        # compares as the lesser one.
        return self.get_sort_key() > other.get_sort_key()

    def __eq__(self, other):
        return self.get_sort_key() == other.get_sort_key()

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
777
778
@total_ordering
class PictureResult(object):
    """
    Search result for a single picture, built from one index document.

    Results for the same picture can be combined with merge() / aggregate().
    A document carrying an 'area_id' contributes one raw hit, stored as a
    (score, details-dict) tuple.
    """

    # indexes inside a raw hit tuple stored in ``_hits``
    SCORE = 0
    OTHER = 1

    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc - mapping with at least 'picture_id'; 'score', 'area_id',
            'themes' and 'themes_pl' are optional.
        how_found - stored with the hit, not interpreted here.
        query_terms - terms used to detect which themes were matched.
        """
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None  # cache for the ``hits`` property

        self._score = doc['score'] if 'score' in doc else 0

        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)

    def __str__(self):
        return "<PR id=%d score=%f >" % (self.picture_id, self._score)

    def __repr__(self):
        return str(self)

    @property
    def score(self):
        """Raw score multiplied by ``boost``."""
        return self._score * self.boost

    def merge(self, other):
        """
        Absorb the hits and the (non-negative part of the) score of another
        result for the same picture.  Raises ValueError for a different
        picture.  Returns self.
        """
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    @staticmethod
    def _matching_themes(themes, themes_pl, query_terms):
        """
        Return the set of names from ``themes`` in which some word equals one
        of ``query_terms``.  Words come from the theme name and from the
        entry at the same index in ``themes_pl`` (if there is one), lowercased.
        """
        matched = set()
        for i, name in enumerate(themes):
            # split on runs of spaces (str.split(r' +') would look for the
            # literal two-character separator " +")
            words = re.split(r' +', name)
            if i < len(themes_pl):
                words += themes_pl[i].split(' ')
            # a real list: a map() iterator would be exhausted by the first
            # membership test
            words = [word.lower() for word in words]
            if any(term in words for term in query_terms):
                matched.add(name)
        return matched

    @property
    def hits(self):
        """
        Processed hits: a list holding at most one dict, the best-scored area
        hit whose PictureArea still exists.  Computed once and cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            try:
                area = picture.models.PictureArea.objects.get(id=hit[self.OTHER]['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes_hit = set()
            if self.query_terms is not None:
                themes_hit = self._matching_themes(
                    hit[self.OTHER]['themes'], hit[self.OTHER]['themes_pl'], self.query_terms)

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(hit[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        # only the best hit is kept
        hits = hits[:1]
        self._processed_hits = hits
        return hits

    def get_picture(self):
        """Return the Picture for ``picture_id``, fetched once and cached."""
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        """Merge results for the same picture across all lists; one result per picture."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in books:
                    books[r.picture_id].merge(r)
                else:
                    books[r.picture_id] = r
        return books.values()

    def __lt__(self, other):
        return self.score < other.score

    def __eq__(self, other):
        return self.score == other.score
885
886
class Search(SolrIndex):
    """
    Search facilities.
    """
    # Regular expression metacharacters; escaped so that a user-supplied
    # word is matched literally when embedded in a regex lookup.
    _REGEX_SPECIAL = re.compile(r'([\\.^$*+?()\[\]{}|])')

    def __init__(self, default_field="text"):
        # default_field is accepted but not used; the index is opened with mode='r'.
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.
        query - string of terms separated by single spaces (None means '')
        field - index field the terms are matched against
        modal - binary operator used to join the term queries
        """
        if query is None:
            query = ''
        terms = (self.index.Q(**{field: s}) for s in query.split(" "))
        return reduce(modal, terms, self.index.Q())

    def search_by_author(self, words):
        """
        Find top-level, findable books whose cached author matches every
        given word as a whole word; returns at most 30 results, ordered by
        descending popularity count.
        """
        from catalogue.models import Book
        books = Book.objects.filter(parent=None, findable=True).order_by('-popularity__count')
        for word in words:
            # \m and \M look like PostgreSQL word-boundary escapes - TODO confirm
            # the database backend. The word itself is escaped so that regex
            # metacharacters typed by a user are matched literally.
            literal = self._REGEX_SPECIAL.sub(r'\\\1', str(word))
            books = books.filter(
                cached_author__iregex=r'\m%s\M' % literal
            ).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    def _any_field_query(self, pairs):
        """OR together Q(field=word) for all (field, word) pairs; None if there are none."""
        combined = None
        for field, word in pairs:
            q = self.index.Q(**{field: word})
            if combined is None:
                combined = q
            else:
                combined |= q
        return combined

    def search_words(self, words, fields, required=None, book=True, picture=False):
        """
        Search for documents in which every word occurs in at least one of
        `fields` and, if `required` is given, at least one word occurs in
        one of the `required` fields.

        Stopwords are skipped only when both `book` and `picture` are false.
        Returns a list of PictureResult (if `picture`) or SearchResult.
        """
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        usable_words = [
            word for word in words
            if book or picture or (word not in stopwords)
        ]
        filters = [
            self._any_field_query((field, word) for field in fields)
            for word in usable_words
        ]
        if required:
            filters.append(self._any_field_query(
                (field, word) for field in required for word in usable_words))
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns highlighted snippets for the hits of a search result.

        The returned list has one slot per hit (None where no snippet was
        taken); at most `num` non-empty snippets are collected (all hits if
        `num` is None, negative or too large). The list is also stored in
        searchresult.snippets. Returns [] if the snippet file cannot be read.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            for idx in range(maxnum):
                if num <= 0:
                    break
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    # Skip this hit; the for loop still advances idx, so a
                    # hit without a stored position cannot stall the loop.
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    # Fall back to the text_nonstem field.
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                # Do not repeat a snippet which has already been collected.
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id, findable=True)
            if not book:
                log.error("Book does not exist for book id = %d", book_id)
            elif not book.get().children.exists():
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s", book_id, revision, e)
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = [s.replace("/\n", "\n") if s else s for s in snips]

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query; None entries (and a None list) are ignored.
        """
        for f in filters or []:
            if f is not None:
                query = query.query(f)
        return query
1010
1011
# When settings.SEARCH_MOCK is set to a true value, the name Search is
# rebound at import time to the implementation from .mock_search.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search