Merge commit 'afb3cc28'
[wolnelektury.git] / src / search / index.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6
7 import os
8 import re
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from lxml import etree
12 import catalogue.models
13 import picture.models
14 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
15 from itertools import chain
16 import sunburnt
17 import custom
18 import operator
19 import logging
20 from wolnelektury.utils import makedirs
21
22 log = logging.getLogger('search')
23
# Load the Solr stopword list once at import time.  Lines starting with '#'
# are skipped; when the file does not exist no word is treated as a stopword.
if os.path.isfile(settings.SOLR_STOPWORDS):
    # The context manager guarantees the file handle is closed; the bare
    # open() inside a generator expression left that to the garbage collector.
    with open(settings.SOLR_STOPWORDS) as stopwords_file:
        stopwords = set(
            line.decode('utf-8').strip()
            for line in stopwords_file if not line.startswith('#'))
else:
    stopwords = set()
30
31
class SolrIndex(object):
    """Base class that owns a ``custom.CustomSolrInterface`` as ``self.index``."""

    def __init__(self, mode=None):
        """Build the interface from ``settings.SOLR``, passing *mode* through."""
        solr_setting = settings.SOLR
        self.index = custom.CustomSolrInterface(solr_setting, mode=mode)
35
36
class Snippets(object):
    """
    Manages the snippet file of an indexed object (book).

    Snippets are written one after another into a single file; add()
    reports the (position, length) of each of them so the pair can be kept
    in index fields and later handed to get() to read the snippet back.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        """Remember the book id / revision and make sure the snippet directory exists."""
        snippet_root = os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR)
        makedirs(snippet_root)
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        """
        Path of the snippet file: the book id, followed by a dot and the
        revision when a (non-zero) revision is set.
        """
        if not self.revision:
            file_name = "%d" % self.book_id
        else:
            file_name = "%d.%d" % (self.book_id, self.revision)

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, file_name)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode) and return self.
        Call .close() afterwards.

        When opening for writing and the file already exists, the revision
        is bumped until an unused file name is found.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            # self.path depends on self.revision, so each increment
            # probes the next candidate file name.
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file as UTF-8.
        Return a (position, length) tuple, both counted in bytes.
        """
        encoded = snippet.encode('utf-8')
        size = len(encoded)
        self.file.write(encoded)
        start = self.position
        self.position = start + size
        return (start, size)

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet stored there
        decoded from UTF-8.
        """
        offset, size = pos[0], pos[1]
        self.file.seek(offset, 0)
        raw = self.file.read(size)
        return raw.decode('utf-8')

    def close(self):
        """Close the snippet file if one has been opened."""
        if self.file:
            self.file.close()

    def remove(self):
        """
        Delete the unrevisioned snippet file and then revisions 1, 2, ...
        stopping at the first file that cannot be removed.
        """
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 1
            while True:
                os.unlink(self.path)
                self.revision += 1
        except OSError:
            # The first missing (or undeletable) file ends the clean-up.
            pass
116
117
118 class Index(SolrIndex):
119     """
120     Class indexing books.
121     """
122     def __init__(self):
123         super(Index, self).__init__(mode='rw')
124
125     def delete_query(self, *queries):
126         """
127         index.delete(queries=...) doesn't work, so let's reimplement it
128         using deletion of list of uids.
129         """
130         uids = set()
131         for q in queries:
132             if isinstance(q, sunburnt.search.LuceneQuery):
133                 q = self.index.query(q)
134             q.field_limiter.update(['uid'])
135             st = 0
136             rows = 100
137             while True:
138                 ids = q.paginate(start=st, rows=rows).execute()
139                 if not len(ids):
140                     break
141                 for res in ids:
142                     uids.add(res['uid'])
143                 st += rows
144         if uids:
145             self.index.delete(uids)
146             return True
147         else:
148             return False
149
150     def index_tags(self, *tags, **kw):
151         """
152         Re-index global tag list.
153         Removes all tags from index, then index them again.
154         Indexed fields include: id, name (with and without polish stems), category
155         """
156         log.debug("Indexing tags")
157         remove_only = kw.get('remove_only', False)
158         # first, remove tags from index.
159         if tags:
160             tag_qs = []
161             for tag in tags:
162                 q_id = self.index.Q(tag_id=tag.id)
163
164                 if isinstance(tag, PDCounterAuthor):
165                     q_cat = self.index.Q(tag_category='pd_author')
166                 elif isinstance(tag, PDCounterBook):
167                     q_cat = self.index.Q(tag_category='pd_book')
168                 else:
169                     q_cat = self.index.Q(tag_category=tag.category)
170
171                 q_id_cat = self.index.Q(q_id & q_cat)
172                 tag_qs.append(q_id_cat)
173             self.delete_query(*tag_qs)
174         else:  # all
175             q = self.index.Q(tag_id__any=True)
176             self.delete_query(q)
177
178         if not remove_only:
179             # then add them [all or just one passed]
180             if not tags:
181                 tags = chain(
182                     catalogue.models.Tag.objects.exclude(category='set'),
183                     PDCounterAuthor.objects.all(),
184                     PDCounterBook.objects.all())
185
186             for tag in tags:
187                 if isinstance(tag, PDCounterAuthor):
188                     doc = {
189                         "tag_id": int(tag.id),
190                         "tag_name": tag.name,
191                         "tag_name_pl": tag.name,
192                         "tag_category": 'pd_author',
193                         "is_pdcounter": True,
194                         "uid": "tag%d_pd_a" % tag.id
195                         }
196                 elif isinstance(tag, PDCounterBook):
197                     doc = {
198                         "tag_id": int(tag.id),
199                         "tag_name": tag.title,
200                         "tag_name_pl": tag.title,
201                         "tag_category": 'pd_book',
202                         "is_pdcounter": True,
203                         "uid": "tag%d_pd_b" % tag.id
204                         }
205                 else:
206                     doc = {
207                         "tag_id": int(tag.id),
208                         "tag_name": tag.name,
209                         "tag_name_pl": tag.name,
210                         "tag_category": tag.category,
211                         "is_pdcounter": False,
212                         "uid": "tag%d" % tag.id
213                         }
214                 self.index.add(doc)
215
216     def create_book_doc(self, book):
217         """
218         Create a lucene document referring book id.
219         """
220         doc = {'book_id': int(book.id)}
221         if book.parent is not None:
222             doc['parent_id'] = int(book.parent.id)
223         return doc
224
225     def remove_book(self, book_or_id, remove_snippets=True):
226         """Removes a book from search index.
227         book - Book instance."""
228         if isinstance(book_or_id, catalogue.models.Book):
229             book_id = book_or_id.id
230         else:
231             book_id = book_or_id
232
233         self.delete_query(self.index.Q(book_id=book_id))
234
235         if remove_snippets:
236             snippets = Snippets(book_id)
237             snippets.remove()
238
239     def index_book(self, book, book_info=None, overwrite=True):
240         """
241         Indexes the book.
242         Creates a lucene document for extracted metadata
243         and calls self.index_content() to index the contents of the book.
244         """
245         if overwrite:
246             # we don't remove snippets, since they might be still needed by
247             # threads using not reopened index
248             self.remove_book(book, remove_snippets=False)
249
250         book_doc = self.create_book_doc(book)
251         meta_fields = self.extract_metadata(book, book_info, dc_only=[
252             'source_name', 'authors', 'translators', 'title', 'epochs', 'kinds', 'genres'])
253         # let's not index it - it's only used for extracting publish date
254         if 'source_name' in meta_fields:
255             del meta_fields['source_name']
256
257         for n, f in meta_fields.items():
258             book_doc[n] = f
259
260         book_doc['uid'] = "book%s" % book_doc['book_id']
261         self.index.add(book_doc)
262         del book_doc
263         book_fields = {
264             'title': meta_fields['title'],
265             'authors': meta_fields['authors'],
266             'published_date': meta_fields['published_date']
267             }
268
269         for tag_name in ('translators', 'epochs', 'kinds', 'genres'):
270             if tag_name in meta_fields:
271                 book_fields[tag_name] = meta_fields[tag_name]
272
273         self.index_content(book, book_fields=book_fields)
274
275     master_tags = [
276         'opowiadanie',
277         'powiesc',
278         'dramat_wierszowany_l',
279         'dramat_wierszowany_lp',
280         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
281         'wywiad',
282     ]
283
284     ignore_content_tags = [
285         'uwaga', 'extra', 'nota_red', 'abstrakt',
286         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
287         'didaskalia',
288         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
289     ]
290
291     footnote_tags = ['pa', 'pt', 'pr', 'pe']
292
293     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
294                         '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
295
296     published_date_re = re.compile("([0-9]+)[\]. ]*$")
297
298     def extract_metadata(self, book, book_info=None, dc_only=None):
299         """
300         Extract metadata from book and returns a map of fields keyed by fieldname
301         """
302         fields = {}
303
304         if book_info is None:
305             book_info = dcparser.parse(open(book.xml_file.path))
306
307         fields['slug'] = book.slug
308         fields['is_book'] = True
309
310         # validator, name
311         for field in dcparser.BookInfo.FIELDS:
312             if dc_only and field.name not in dc_only:
313                 continue
314             if hasattr(book_info, field.name):
315                 if not getattr(book_info, field.name):
316                     continue
317                 # since no type information is available, we use validator
318                 type_indicator = field.validator
319                 if type_indicator == dcparser.as_unicode:
320                     s = getattr(book_info, field.name)
321                     if field.multiple:
322                         s = ', '.join(s)
323                     fields[field.name] = s
324                 elif type_indicator == dcparser.as_person:
325                     p = getattr(book_info, field.name)
326                     if isinstance(p, dcparser.Person):
327                         persons = unicode(p)
328                     else:
329                         persons = ', '.join(map(unicode, p))
330                     fields[field.name] = persons
331                 elif type_indicator == dcparser.as_date:
332                     dt = getattr(book_info, field.name)
333                     fields[field.name] = dt
334
335         # get published date
336         pd = None
337         if hasattr(book_info, 'source_name') and book_info.source_name:
338             match = self.published_date_re.search(book_info.source_name)
339             if match is not None:
340                 pd = str(match.groups()[0])
341         if not pd:
342             pd = ""
343         fields["published_date"] = pd
344
345         return fields
346
347     # def add_gaps(self, fields, fieldname):
348     #     """
349     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
350     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
351     #     """
352     #     def gap():
353     #         while True:
354     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
355     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
356
357     def get_master(self, root):
358         """
359         Returns the first master tag from an etree.
360         """
361         for master in root.iter():
362             if master.tag in self.master_tags:
363                 return master
364
365     def index_content(self, book, book_fields):
366         """
367         Walks the book XML and extract content from it.
368         Adds parts for each header tag and for each fragment.
369         """
370         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
371         root = wld.edoc.getroot()
372
373         master = self.get_master(root)
374         if master is None:
375             return []
376
377         def walker(node):
378             if node.tag not in self.ignore_content_tags:
379                 yield node, None, None
380                 if node.text is not None:
381                     yield None, node.text, None
382                 for child in list(node):
383                     for b, t, e in walker(child):
384                         yield b, t, e
385                 yield None, None, node
386
387             if node.tail is not None:
388                 yield None, node.tail, None
389             return
390
391         def fix_format(text):
392             # separator = [u" ", u"\t", u".", u";", u","]
393             if isinstance(text, list):
394                 # need to join it first
395                 text = filter(lambda s: s is not None, content)
396                 text = u' '.join(text)
397                 # for i in range(len(text)):
398                 #     if i > 0:
399                 #         if text[i][0] not in separator\
400                 #             and text[i - 1][-1] not in separator:
401                 #          text.insert(i, u" ")
402
403             return re.sub("(?m)/$", "", text)
404
405         def add_part(snippets, **fields):
406             doc = self.create_book_doc(book)
407             for n, v in book_fields.items():
408                 doc[n] = v
409
410             doc['header_index'] = fields["header_index"]
411             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
412             doc['header_type'] = fields['header_type']
413
414             doc['text'] = fields['text']
415
416             # snippets
417             snip_pos = snippets.add(fields["text"])
418
419             doc['snippets_position'] = snip_pos[0]
420             doc['snippets_length'] = snip_pos[1]
421             if snippets.revision:
422                 doc["snippets_revision"] = snippets.revision
423
424             if 'fragment_anchor' in fields:
425                 doc["fragment_anchor"] = fields['fragment_anchor']
426
427             if 'themes' in fields:
428                 doc['themes'] = fields['themes']
429             doc['uid'] = "part%s-%s-%s-%s" % (
430                 book.id, doc['header_index'], doc['header_span'], doc.get('fragment_anchor', ''))
431             return doc
432
433         fragments = {}
434         snippets = Snippets(book.id).open('w')
435         try:
436             for header, position in zip(list(master), range(len(master))):
437
438                 if header.tag in self.skip_header_tags:
439                     continue
440                 if header.tag is etree.Comment:
441                     continue
442
443                 # section content
444                 content = []
445                 footnote = []
446
447                 def all_content(text):
448                     for frag in fragments.values():
449                         frag['text'].append(text)
450                     content.append(text)
451                 handle_text = [all_content]
452
453                 for start, text, end in walker(header):
454                     # handle footnotes
455                     if start is not None and start.tag in self.footnote_tags:
456                         footnote = []
457
458                         def collect_footnote(t):
459                             footnote.append(t)
460
461                         handle_text.append(collect_footnote)
462                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
463                         handle_text.pop()
464                         doc = add_part(snippets, header_index=position, header_type=header.tag,
465                                        text=u''.join(footnote),
466                                        is_footnote=True)
467                         self.index.add(doc)
468                         footnote = []
469
470                     # handle fragments and themes.
471                     if start is not None and start.tag == 'begin':
472                         fid = start.attrib['id'][1:]
473                         fragments[fid] = {
474                             'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
475
476                     # themes for this fragment
477                     elif start is not None and start.tag == 'motyw':
478                         fid = start.attrib['id'][1:]
479                         handle_text.append(lambda text: None)
480                         if start.text is not None:
481                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
482                     elif end is not None and end.tag == 'motyw':
483                         handle_text.pop()
484
485                     elif start is not None and start.tag == 'end':
486                         fid = start.attrib['id'][1:]
487                         if fid not in fragments:
488                             continue  # a broken <end> node, skip it
489                         frag = fragments[fid]
490                         if not frag['themes']:
491                             continue  # empty themes list.
492                         del fragments[fid]
493
494                         doc = add_part(snippets,
495                                        header_type=frag['start_header'],
496                                        header_index=frag['start_section'],
497                                        header_span=position - frag['start_section'] + 1,
498                                        fragment_anchor=fid,
499                                        text=fix_format(frag['text']),
500                                        themes=frag['themes'])
501                         self.index.add(doc)
502
503                         # Collect content.
504
505                     if text is not None and handle_text is not []:
506                         hdl = handle_text[-1]
507                         hdl(text)
508
509                         # in the end, add a section text.
510                 doc = add_part(snippets, header_index=position,
511                                header_type=header.tag, text=fix_format(content))
512
513                 self.index.add(doc)
514
515         finally:
516             snippets.close()
517
518     def remove_picture(self, picture_or_id):
519         """Removes a picture from search index."""
520         if isinstance(picture_or_id, picture.models.Picture):
521             picture_id = picture_or_id.id
522         else:
523             picture_id = picture_or_id
524         self.delete_query(self.index.Q(picture_id=picture_id))
525
526     def index_picture(self, picture, picture_info=None, overwrite=True):
527         """
528         Indexes the picture.
529         Creates a lucene document for extracted metadata
530         and calls self.index_area() to index the contents of the picture.
531         """
532         if overwrite:
533             # we don't remove snippets, since they might be still needed by
534             # threads using not reopened index
535             self.remove_picture(picture)
536
537         picture_doc = {'picture_id': int(picture.id)}
538         meta_fields = self.extract_metadata(picture, picture_info, dc_only=[
539             'authors', 'title', 'epochs', 'kinds', 'genres'])
540
541         picture_doc.update(meta_fields)
542
543         picture_doc['uid'] = "picture%s" % picture_doc['picture_id']
544         self.index.add(picture_doc)
545         del picture_doc['is_book']
546         for area in picture.areas.all():
547             self.index_area(area, picture_fields=picture_doc)
548
549     def index_area(self, area, picture_fields):
550         """
551         Indexes themes and objects on the area.
552         """
553         doc = dict(picture_fields)
554         doc['area_id'] = area.id
555         doc['themes'] = list(area.tags.filter(category__in=('theme', 'thing')).values_list('name', flat=True))
556         doc['uid'] = 'area%s' % area.id
557         self.index.add(doc)
558
559
560 class SearchResult(object):
561     def __init__(self, doc, how_found=None, query_terms=None):
562         self.boost = 1.0
563         self._hits = []
564         self._processed_hits = None  # processed hits
565         self.snippets = []
566         self.query_terms = query_terms
567         self._book = None
568
569         if 'score' in doc:
570             self._score = doc['score']
571         else:
572             self._score = 0
573
574         self.book_id = int(doc["book_id"])
575
576         try:
577             self.published_date = int(doc.get("published_date"))
578         except ValueError:
579             self.published_date = 0
580
581         # content hits
582         header_type = doc.get("header_type", None)
583         # we have a content hit in some header of fragment
584         if header_type is not None:
585             sec = (header_type, int(doc["header_index"]))
586             header_span = doc['header_span']
587             header_span = header_span is not None and int(header_span) or 1
588             fragment = doc.get("fragment_anchor", None)
589             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
590             snippets_rev = doc.get('snippets_revision', None)
591
592             hit = (sec + (header_span,), fragment, self._score, {
593                 'how_found': how_found,
594                 'snippets_pos': snippets_pos,
595                 'snippets_revision': snippets_rev,
596                 'themes': doc.get('themes', []),
597                 'themes_pl': doc.get('themes_pl', [])
598                 })
599
600             self._hits.append(hit)
601
602     @classmethod
603     def from_book(cls, book, how_found=None, query_terms=None):
604         doc = {
605             'score': book.popularity.count,
606             'book_id': book.id,
607             'published_date': 0,
608         }
609         result = cls(doc, how_found=how_found, query_terms=query_terms)
610         result._book = book
611         return result
612
613     def __unicode__(self):
614         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
615             (self.book_id, len(self._hits),
616              len(self._processed_hits) if self._processed_hits else -1,
617              self._score, len(self.snippets))
618
619     def __str__(self):
620         return unicode(self).encode('utf-8')
621
622     @property
623     def score(self):
624         return self._score * self.boost
625
626     def merge(self, other):
627         if self.book_id != other.book_id:
628             raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
629         self._hits += other._hits
630         self._score += max(other._score, 0)
631         return self
632
633     def get_book(self):
634         if self._book is not None:
635             return self._book
636         self._book = catalogue.models.Book.objects.get(id=self.book_id)
637         return self._book
638
    # Read-only ``book`` attribute backed by get_book().
    book = property(get_book)

    # Indices into a hit tuple as built in __init__:
    # (position, fragment_anchor, score, other_fields_dict).
    POSITION = 0
    FRAGMENT = 1
    # Indices into the position tuple itself:
    # (header_type, header_index, header_span).
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3
647
648     @property
649     def hits(self):
650         if self._processed_hits is not None:
651             return self._processed_hits
652
653         # to sections and fragments
654         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
655
656         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
657
658         # sections not covered by fragments
659         sect = filter(lambda s: 0 == len(filter(
660             lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
661                       f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)
662
663         def remove_duplicates(lst, keyfn, compare):
664             els = {}
665             for e in lst:
666                 eif = keyfn(e)
667                 if eif in els:
668                     if compare(els[eif], e) >= 1:
669                         continue
670                 els[eif] = e
671             return els.values()
672
673         # remove fragments with duplicated fid's and duplicated snippets
674         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
675         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
676         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
677
678         # remove duplicate sections
679         sections = {}
680
681         for s in sect:
682             si = s[self.POSITION][self.POSITION_INDEX]
683             # skip existing
684             if si in sections:
685                 if sections[si]['score'] >= s[self.SCORE]:
686                     continue
687
688             m = {'score': s[self.SCORE],
689                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
690                  }
691             m.update(s[self.OTHER])
692             sections[si] = m
693
694         hits = sections.values()
695
696         for f in frags:
697             try:
698                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
699             except catalogue.models.Fragment.DoesNotExist:
700                 # stale index
701                 continue
702             # Figure out if we were searching for a token matching some word in theme name.
703             themes = frag.tags.filter(category='theme')
704             themes_hit = set()
705             if self.query_terms is not None:
706                 for i in range(0, len(f[self.OTHER]['themes'])):
707                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
708                     tms = map(unicode.lower, tms)
709                     for qt in self.query_terms:
710                         if qt in tms:
711                             themes_hit.add(f[self.OTHER]['themes'][i])
712                             break
713
714             def theme_by_name(n):
715                 th = filter(lambda t: t.name == n, themes)
716                 if th:
717                     return th[0]
718                 else:
719                     return None
720             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
721
722             m = {'score': f[self.SCORE],
723                  'fragment': frag,
724                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
725                  'themes': themes,
726                  'themes_hit': themes_hit
727                  }
728             m.update(f[self.OTHER])
729             hits.append(m)
730
731         hits.sort(key=lambda h: h['score'], reverse=True)
732
733         self._processed_hits = hits
734
735         return hits
736
737     @staticmethod
738     def aggregate(*result_lists):
739         books = {}
740         for rl in result_lists:
741             for r in rl:
742                 if r.book_id in books:
743                     books[r.book_id].merge(r)
744                 else:
745                     books[r.book_id] = r
746         return books.values()
747
748     def __cmp__(self, other):
749         c = cmp(self.score, other.score)
750         if c == 0:
751             # this is inverted, because earlier date is better
752             return cmp(other.published_date, self.published_date)
753         else:
754             return c
755
756     def __len__(self):
757         return len(self.hits)
758
759     def snippet_pos(self, idx=0):
760         return self.hits[idx]['snippets_pos']
761
762     def snippet_revision(self, idx=0):
763         try:
764             return self.hits[idx]['snippets_revision']
765         except (IndexError, KeyError):
766             return None
767
768
class PictureResult(object):
    """
    Search result for one picture, built from index documents.

    Documents carrying an 'area_id' contribute a hit; results for the same
    picture can be combined with merge() / aggregate().
    """
    def __init__(self, doc, how_found=None, query_terms=None):
        """
        doc -- mapping with at least 'picture_id'; 'score', 'area_id',
               'themes' and 'themes_pl' are optional
        how_found -- stored with the area hit, if any
        query_terms -- terms used later to find out which themes were hit
        """
        self.boost = 1.0
        self.query_terms = query_terms
        self._picture = None
        self._hits = []
        self._processed_hits = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.picture_id = int(doc["picture_id"])

        if doc.get('area_id'):
            hit = (self._score, {
                'how_found': how_found,
                'area_id': doc['area_id'],
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', []),
            })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<PR id=%d score=%f >" % (self.picture_id, self._score)

    def __repr__(self):
        return unicode(self)

    @property
    def score(self):
        """Raw score multiplied by this result's boost."""
        return self._score * self.boost

    def merge(self, other):
        """
        Fold another result for the same picture into this one and return
        self; raises ValueError when the picture ids differ.
        """
        if self.picture_id != other.picture_id:
            raise ValueError(
                "this search result is for picture %d; tried to merge with %d" % (self.picture_id, other.picture_id))
        self._hits += other._hits
        self._score += max(other._score, 0)
        return self

    # Indices into a hit tuple as built in __init__: (score, other_fields_dict).
    SCORE = 0
    OTHER = 1

    @property
    def hits(self):
        """
        Processed hits (computed once, then cached): at most one dict, for
        the best-scoring area that still exists, with 'score', 'area',
        'themes_hit' and the fields stored with the raw hit.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        hits = []
        for hit in self._hits:
            other = hit[self.OTHER]
            try:
                area = picture.models.PictureArea.objects.get(id=other['area_id'])
            except picture.models.PictureArea.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes_hit = set()
            if self.query_terms is not None:
                themes_pl = other['themes_pl']
                for i, name in enumerate(other['themes']):
                    # re.split: str.split(r' +') would look for the literal
                    # two-character separator " +"
                    words = re.split(r' +', name)
                    # themes_pl may be shorter than themes (it defaults to [])
                    if i < len(themes_pl):
                        words += themes_pl[i].split(' ')
                    words = [word.lower() for word in words]
                    if any(qt in words for qt in self.query_terms):
                        themes_hit.add(name)

            m = {
                'score': hit[self.SCORE],
                'area': area,
                'themes_hit': themes_hit,
            }
            m.update(other)
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)
        # only the best hit is kept
        hits = hits[:1]
        self._processed_hits = hits
        return hits

    def get_picture(self):
        """Return the Picture of this result, loading it by id on first use."""
        if self._picture is None:
            self._picture = picture.models.Picture.objects.get(id=self.picture_id)
        return self._picture

    picture = property(get_picture)

    @staticmethod
    def aggregate(*result_lists):
        """
        Merge results referring to the same picture, across all given
        lists, and return one result per picture.
        """
        pictures = {}
        for rl in result_lists:
            for r in rl:
                if r.picture_id in pictures:
                    pictures[r.picture_id].merge(r)
                else:
                    pictures[r.picture_id] = r
        return pictures.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)
871
872
class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        """
        Open the index in read mode.

        default_field - currently unused by this constructor
        """
        super(Search, self).__init__(mode='r')

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by boolean query.

        query - string of terms separated by single spaces; None is
                treated as an empty string
        field - index field every term is matched against
        modal - binary operator used to join the term queries
        """
        if query is None:
            query = ''
        terms = (self.index.Q(**{field: term}) for term in query.split(" "))
        return reduce(modal, terms, self.index.Q())

    def search_by_author(self, words):
        """
        Find top-level books whose cached author contains every given word.

        Works on the database, not on the search index. Returns at most 30
        SearchResult objects, ordered by descending popularity count.
        """
        from catalogue.models import Book
        books = Book.objects.filter(parent=None).order_by('-popularity__count')
        for word in words:
            # \m and \M are word-boundary escapes understood by the database
            # regex engine (presumably PostgreSQL - TODO confirm).
            # NOTE(review): `word` is interpolated into the pattern unescaped,
            # so regex metacharacters typed by the user are interpreted by the
            # database; confirm that callers strip them.
            books = books.filter(cached_author__iregex=r'\m%s\M' % word).select_related('popularity__count')
        return [SearchResult.from_book(book, how_found='search_by_author', query_terms=words) for book in books[:30]]

    @staticmethod
    def _any_of(queries):
        """OR together the given query objects; None when there are none."""
        combined = None
        for q in queries:
            if combined is None:
                combined = q
            else:
                combined |= q
        return combined

    def search_words(self, words, fields, required=None, book=True, picture=False):
        """
        Search the index for documents matching all the given words.

        words - list of words; each one must match in at least one of `fields`
        fields - names of the index fields to look in
        required - optional field names; at least one word must match in
                   at least one of them
        book - restrict the search to documents flagged as books
        picture - search pictures instead of books

        Stopwords are dropped only when both `book` and `picture` are false.
        Returns a list of SearchResult (or PictureResult) objects.
        """
        if book and not picture and fields == ['authors']:
            return self.search_by_author(words)

        # Filter once, so every loop below sees the same words.
        searchable = [word for word in words
                      if book or picture or (word not in stopwords)]

        filters = [
            self._any_of(self.index.Q(**{field: word}) for field in fields)
            for word in searchable]
        if required:
            filters.append(self._any_of(
                self.index.Q(**{field: word})
                for field in required
                for word in searchable))
        if not filters:
            return []

        params = {}
        if book:
            params['is_book'] = True
        if picture:
            params['picture_id__gt'] = 0
        else:
            params['book_id__gt'] = 0
        query = self.index.query(**params)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        result_class = PictureResult if picture else SearchResult
        return [result_class(found, how_found='search_words', query_terms=words) for found in query.execute()]

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Collect highlighted snippets for the hits of a search result.

        searchresult - result object; its length is the number of hits
        query - query passed to the highlighter
        field - field used for highlighting; for 'text' the highlighter is
                retried on 'text_nonstem' when nothing was highlighted
        num - maximum number of non-empty snippets wanted; None, a negative
              number or a number larger than the hit count means all hits

        Returns a list with one entry per hit (a snippet or None) and stores
        it as searchresult.snippets. Returns [] when the snippet file cannot
        be read.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            # A for loop always advances to the next hit, including hits
            # skipped for lack of a position or length.
            for idx in range(maxnum):
                if num <= 0:
                    break
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                if not snip and field == 'text':
                    snip = self.index.highlight(text=text, field='text_nonstem', q=query)
                # Do not report the same snippet twice.
                if snip not in snips:
                    snips[idx] = snip
                    if snip:
                        num -= 1

        except IOError as e:
            book = catalogue.models.Book.objects.filter(id=book_id)
            if not book:
                log.error("Book does not exist for book id = %d", book_id)
            elif not book.get().children.exists():
                # Only logged for books that have no children.
                log.error("Cannot open snippet file for book id = %d [rev=%s], %s", book_id, revision, e)
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = [s and s.replace("/\n", "\n") for s in snips]

        searchresult.snippets = snips

        return snips

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query

        query - query object to narrow down
        filters - iterable of filter queries; None entries are ignored and
                  None itself means no filters

        Returns the query with every filter applied in order.
        """
        if filters is None:
            filters = []
        for f in filters:
            if f is not None:
                query = query.query(f)
        return query
996
997
# When settings.SEARCH_MOCK is set to a true value, rebind the module-level
# name Search to the one imported from .mock_search, replacing the class above.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search