66a7b34203b003e6ee39562dcd829a2705836a62
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4
5 import os
6 import re
7 import errno
8 from librarian import dcparser
9 from librarian.parser import WLDocument
10 from lxml import etree
11 import catalogue.models
12 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
13 from itertools import chain
14 import traceback
15 import logging
16 log = logging.getLogger('search')
17 import sunburnt
18 import custom
19 import operator
20
21 log = logging.getLogger('search')
22
class SolrIndex(object):
    """Thin base class owning a connection to the Solr backend."""

    def __init__(self, mode=None):
        # The mode flag (e.g. 'r' / 'rw') is forwarded to the interface untouched.
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
26
27
class Snippets(object):
    """
    This class manages snippet files for indexed objects (books).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        """
        book_id -- id of the book the snippets belong to.
        revision -- optional snippet-file revision (see .open('w')).
        """
        # Ensure the snippet directory exists; tolerate the (possibly
        # concurrent) "already exists" case, re-raise anything else.
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        """Filesystem path of the snippet file for this book/revision."""
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.

        Files are always opened in binary mode. When writing, an unused
        revision number is chosen so an existing file is never clobbered.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # Find the first free revision instead of overwriting.
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple (in bytes, UTF-8 encoded).
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close snippet file"""
        self.file.close()

    def remove(self):
        """Remove the base snippet file and every numbered revision."""
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            # Keep deleting successive revisions until one is missing.
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass
108
109
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.

        Returns True if any documents were deleted.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            # Page through all matching documents, collecting their uids.
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category

        remove_only (keyword) -- only delete the tags, do not re-add them.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # first, remove tags from index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                tag_qs.append(self.index.Q(q_id & q_cat))
            # BUG FIX: the list itself used to be passed (delete_query(tag_qs));
            # delete_query takes each query as a separate positional argument.
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all or just one passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from search index.
        book_or_id -- Book instance or a plain book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

    # Tags which mark the master (top-level content) element of a book.
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    # Tags whose content should not be indexed as body text.
    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Matches a trailing year, e.g. "... [1884]" or "... 1884.", in source_name.
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname

        dc_only -- if given, restrict Dublin Core extraction to those field names.
        """
        fields = {}

        if book_info is None:
            # NOTE(review): the file handle passed to dcparser.parse is never
            # closed explicitly; assumed to be read eagerly — confirm.
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date (from a trailing year in the source description)
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        fields["published_date"] = pd or ""

        return fields

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=None):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.

        book_fields -- extra fields copied into every part document.
        """
        if book_fields is None:
            book_fields = {}

        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            # Depth-first walk yielding (start_node, text, end_node) triples;
            # text events carry .text/.tail payloads, start/end mark elements.
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    # BUG FIX: the recursion used to drop ignore_tags, so
                    # ignored tags were only filtered at the top level.
                    for b, t, e in walker(child, ignore_tags=ignore_tags):
                        yield b, t, e
                yield None, None, node

            # An ignored element's tail still belongs to the parent's text.
            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # Join a list of text pieces and strip verse-end "/" markers.
            if isinstance(text, list):
                # need to join it first
                # BUG FIX: the original filtered the enclosing `content` list
                # here instead of the argument, garbling fragment text.
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Build one index document for a section or fragment, writing its
            # text into the snippet file and recording position/length.
            # NOTE(review): extra keywords such as is_footnote are accepted
            # but not stored in the document — confirm whether intended.
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    # (the original also checked `footnote is not []`, which is
                    # always true — an identity test on a fresh list literal)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # Collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()
519
520
class SearchResult(object):
    """
    One book's worth of search hits; aggregates all matched sections and
    fragments of that book, possibly merged from several queries.
    """
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        # NOTE(review): the `query` parameter is unused here — kept for
        # caller compatibility.
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        # BUG FIX: when the field is missing, int(None) raises TypeError
        # (not ValueError); catch both and fall back to 0.
        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        """Raw solr score multiplied by the (mutable) boost factor."""
        return self._score * self.boost

    def merge(self, other):
        """Merge hits of `other` (must be the same book) into this result."""
        if self.book_id != other.book_id:
            # BUG FIX: message typo "is or book" -> "is for book".
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        """Lazily fetch and cache the Book object for this result."""
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # Indices into the hit tuple: (position, fragment, score, other).
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        """Processed hits: deduplicated section/fragment dicts, best first."""
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        hits = []

        def remove_duplicates(lst, keyfn, compare):
            # Keep, per key, the element winning the pairwise comparison.
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fid's, keeping the best score
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing if its score is not better
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # BUG FIX: the original split on the literal string r' +'
                    # (a stray regex pattern); split on whitespace instead.
                    tms = f[self.OTHER]['themes'][i].split() + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """Merge per-query result lists into one list, one entry per book."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """(position, length) of the snippet for hit `idx`."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Snippet-file revision for hit `idx`, or None when unavailable."""
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
719
720
721 class Search(SolrIndex):
722     """
723     Search facilities.
724     """
    def __init__(self, default_field="text"):
        # NOTE(review): `default_field` is accepted but never stored or used
        # in this method — presumably a leftover; confirm against callers.
        super(Search, self).__init__(mode='r')
727
728
729     def make_term_query(self, query, field='text', modal=operator.or_):
730         """
731         Returns term queries joined by boolean query.
732         modal - applies to boolean query
733         fuzzy - should the query by fuzzy.
734         """
735         if query is None: query = ''
736         q = self.index.Q()
737         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
738                         query.split(r" ")), q)
739
740         return q
741
742     def search_phrase(self, searched, field='text', book=False,
743                       filters=None,
744                       snippets=False):
745         if filters is None: filters = []
746         if book: filters.append(self.index.Q(is_book=True))
747
748         q = self.index.query(**{field: searched})
749         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
750         res = q.execute()
751         return [SearchResult(found, how_found=u'search_phrase') for found in res]
752
753     def search_some(self, searched, fields, book=True,
754                     filters=None, snippets=True, query_terms=None):
755         assert isinstance(fields, list)
756         if filters is None: filters = []
757         if book: filters.append(self.index.Q(is_book=True))
758
759         query = self.index.Q()
760
761         for fld in fields:
762             query = self.index.Q(query | self.make_term_query(searched, fld))
763
764         query = self.index.query(query)
765         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
766         res = query.execute()
767         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
768
769
770     def search_everywhere(self, searched, query_terms=None):
771         """
772         Tries to use search terms to match different fields of book (or its parts).
773         E.g. one word can be an author survey, another be a part of the title, and the rest
774         are some words from third chapter.
775         """
776         books = []
777         # content only query : themes x content
778         q = self.make_term_query(searched, 'text')
779         q_themes = self.make_term_query(searched, 'themes_pl')
780
781         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
782         res = query.execute()
783
784         for found in res:
785             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
786
787         # query themes/content x author/title/tags
788         in_content = self.index.Q()
789         in_meta = self.index.Q()
790
791         for fld in ['themes_pl', 'text']:
792             in_content |= self.make_term_query(searched, field=fld)
793
794         for fld in ['tags', 'authors', 'title']:
795             in_meta |= self.make_term_query(searched, field=fld)
796
797         q = in_content & in_meta
798         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
799
800         for found in res:
801             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
802
803         return books
804
805     def get_snippets(self, searchresult, query, field='text', num=1):
806         """
807         Returns a snippet for found scoreDoc.
808         """
809         maxnum = len(searchresult)
810         if num is None or num < 0 or num > maxnum:
811             num = maxnum
812         book_id = searchresult.book_id
813         revision = searchresult.snippet_revision()
814         snippets = Snippets(book_id, revision=revision)
815         snips = [None] * maxnum
816         try:
817             snippets.open()
818             idx = 0
819             while idx < maxnum and num > 0:
820                 position, length = searchresult.snippet_pos(idx)
821                 if position is None or length is None:
822                     continue
823                 text = snippets.get((int(position),
824                                      int(length)))
825                 snip = self.index.highlight(text=text, field=field, q=query)
826                 snips[idx] = snip
827                 if snip:
828                     num -= 1
829                 idx += 1
830
831         except IOError, e:
832             log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
833             return []
834         finally:
835             if snippets:
836                 snippets.close()
837
838             # remove verse end markers..
839         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
840
841         searchresult.snippets = snips
842
843         return snips
844
845     def hint_tags(self, query, pdcounter=True, prefix=True):
846         """
847         Return auto-complete hints for tags
848         using prefix search.
849         """
850         q = self.index.Q()
851         query = query.strip()
852         for field in ['tag_name', 'tag_name_pl']:
853             if prefix:
854                 q |= self.index.Q(**{field: query + "*"})
855             else:
856                 q |= self.make_term_query(query, field=field)
857         qu = self.index.query(q).exclude(tag_category="book")
858
859         return self.search_tags(qu, pdcounter=pdcounter)
860
861     def search_tags(self, query, filters=None, pdcounter=False):
862         """
863         Search for Tag objects using query.
864         """
865         if not filters: filters = []
866         if not pdcounter:
867             filters.append(~self.index.Q(is_pdcounter=True))
868         res = self.apply_filters(query, filters).execute()
869
870         tags = []
871         pd_tags = []
872
873         for doc in res:
874             is_pdcounter = doc.get('is_pdcounter', False)
875             category = doc.get('tag_category')
876             try:
877                 if is_pdcounter == True:
878                     if category == 'pd_author':
879                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
880                     elif category == 'pd_book':
881                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
882                         tag.category = 'pd_book'  # make it look more lik a tag.
883                     else:
884                         print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
885                     pd_tags.append(tag)
886                 else:
887                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
888                     tags.append(tag)
889
890             except catalogue.models.Tag.DoesNotExist: pass
891             except PDCounterAuthor.DoesNotExist: pass
892             except PDCounterBook.DoesNotExist: pass
893
894         tags_slugs = set(map(lambda t: t.slug, tags))
895         tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
896
897         log.debug('search_tags: %s' % tags)
898
899         return tags
900
901     def hint_books(self, query, prefix=True):
902         """
903         Returns auto-complete hints for book titles
904         Because we do not index 'pseudo' title-tags.
905         Prefix search.
906         """
907         q = self.index.Q()
908         query = query.strip()
909         if prefix:
910             q |= self.index.Q(title=query + "*")
911         else:
912             q |= self.make_term_query(query, field='title')
913         qu = self.index.query(q)
914         only_books = self.index.Q(is_book=True)
915         return self.search_books(qu, [only_books])
916
917     def search_books(self, query, filters=None, max_results=10):
918         """
919         Searches for Book objects using query
920         """
921         bks = []
922         bks_found = set()
923         query = query.query(is_book=True)
924         res = self.apply_filters(query, filters).field_limit(['book_id'])
925         for r in res:
926             try:
927                 bid = r['book_id']
928                 if not bid in bks_found:
929                     bks.append(catalogue.models.Book.objects.get(id=bid))
930                     bks_found.add(bid)
931             except catalogue.models.Book.DoesNotExist: pass
932         return bks
933  
934
935     @staticmethod
936     def apply_filters(query, filters):
937         """
938         Apply filters to a query
939         """
940         if filters is None: filters = []
941         filters = filter(lambda x: x is not None, filters)
942         for f in filters:
943             query = query.query(f)
944         return query