Add 1% ad to funding.
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6
7 import os
8 import re
9 import errno
10 from librarian import dcparser
11 from librarian.parser import WLDocument
12 from lxml import etree
13 import catalogue.models
14 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
15 from itertools import chain
16 import traceback
17 import logging
18 log = logging.getLogger('search')
19 import sunburnt
20 import custom
21 import operator
22
23 log = logging.getLogger('search')
24
class SolrIndex(object):
    """
    Base class of Index and Search: holds the Solr connection.

    The connection is a custom.CustomSolrInterface built from settings.SOLR
    and is stored as self.index.
    """
    def __init__(self, mode=None):
        # mode is handed to CustomSolrInterface unchanged
        # (subclasses in this file pass 'rw' and 'r').
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
28
29
class Snippets(object):
    """
    Manages the snippet file of an indexed object (book).

    All snippets of a book are concatenated into a single file; the
    position and length of each one is what gets stored in the index.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        """Make sure the snippet directory exists and remember which file to use."""
        snippet_dir = os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR)
        try:
            os.makedirs(snippet_dir)
        except OSError as exc:
            # an already existing directory is fine, anything else is not
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        """Path of the snippet file: "<book_id>" or "<book_id>.<revision>"."""
        if self.revision:
            file_name = "%d.%d" % (self.book_id, self.revision)
        else:
            file_name = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, file_name)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode) and return self.
        Call .close() afterwards.

        When opened for writing and the file already exists, the first
        unused revision number is picked so the old file stays untouched.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple, both counted in UTF-8 bytes.
        """
        encoded = snippet.encode('utf-8')
        size = len(encoded)
        self.file.write(encoded)
        start = self.position
        self.position = start + size
        return (start, size)

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet stored
        there as unicode.
        """
        offset, size = pos[0], pos[1]
        self.file.seek(offset, 0)
        return self.file.read(size).decode('utf-8')

    def close(self):
        """Close the snippet file, if one was opened."""
        if self.file:
            self.file.close()

    def remove(self):
        """
        Delete the base snippet file and then revisions 1, 2, ... in turn,
        stopping at the first file that cannot be removed.
        """
        self.revision = None
        try:
            while True:
                os.unlink(self.path)
                # None -> 1 after the base file, then 2, 3, ...
                self.revision = (self.revision or 0) + 1
        except OSError:
            pass
111
112
113 class Index(SolrIndex):
114     """
115     Class indexing books.
116     """
    def __init__(self):
        """Create the Solr connection in 'rw' mode (see SolrIndex.__init__)."""
        super(Index, self).__init__(mode='rw')
119
120     def delete_query(self, *queries):
121         """
122         index.delete(queries=...) doesn't work, so let's reimplement it
123         using deletion of list of uids.
124         """
125         uids = set()
126         for q in queries:
127             if isinstance(q, sunburnt.search.LuceneQuery):
128                 q = self.index.query(q)
129             q.field_limiter.update(['uid'])
130             st = 0
131             rows = 100
132             while True:
133                 ids = q.paginate(start=st, rows=rows).execute()
134                 if not len(ids):
135                     break
136                 for res in ids:
137                     uids.add(res['uid'])
138                 st += rows
139         if uids:
140             self.index.delete(uids)
141             return True
142         else:
143             return False
144
145     def index_tags(self, *tags, **kw):
146         """
147         Re-index global tag list.
148         Removes all tags from index, then index them again.
149         Indexed fields include: id, name (with and without polish stems), category
150         """
151         log.debug("Indexing tags")
152         remove_only = kw.get('remove_only', False)
153         # first, remove tags from index.
154         if tags:
155             tag_qs = []
156             for tag in tags:
157                 q_id = self.index.Q(tag_id=tag.id)
158
159                 if isinstance(tag, PDCounterAuthor):
160                     q_cat = self.index.Q(tag_category='pd_author')
161                 elif isinstance(tag, PDCounterBook):
162                     q_cat = self.index.Q(tag_category='pd_book')
163                 else:
164                     q_cat = self.index.Q(tag_category=tag.category)
165
166                 q_id_cat = self.index.Q(q_id & q_cat)
167                 tag_qs.append(q_id_cat)
168             self.delete_query(*tag_qs)
169         else:  # all
170             q = self.index.Q(tag_id__any=True)
171             self.delete_query(q)
172
173         if not remove_only:
174             # then add them [all or just one passed]
175             if not tags:
176                 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
177                     PDCounterAuthor.objects.all(), \
178                     PDCounterBook.objects.all())
179
180             for tag in tags:
181                 if isinstance(tag, PDCounterAuthor):
182                     doc = {
183                         "tag_id": int(tag.id),
184                         "tag_name": tag.name,
185                         "tag_name_pl": tag.name,
186                         "tag_category": 'pd_author',
187                         "is_pdcounter": True,
188                         "uid": "tag%d_pd_a" % tag.id
189                         }
190                 elif isinstance(tag, PDCounterBook):
191                     doc = {
192                         "tag_id": int(tag.id),
193                         "tag_name": tag.title,
194                         "tag_name_pl": tag.title,
195                         "tag_category": 'pd_book',
196                         "is_pdcounter": True,
197                         "uid": "tag%d_pd_b" % tag.id
198                         }
199                 else:
200                     doc = {
201                         "tag_id": int(tag.id),
202                         "tag_name": tag.name,
203                         "tag_name_pl": tag.name,
204                         "tag_category": tag.category,
205                         "is_pdcounter": False,
206                         "uid": "tag%d" % tag.id
207                         }
208                 self.index.add(doc)
209
210     def create_book_doc(self, book):
211         """
212         Create a lucene document referring book id.
213         """
214         doc = {
215             'book_id': int(book.id),
216             }
217         if book.parent is not None:
218             doc["parent_id"] = int(book.parent.id)
219         return doc
220
221     def remove_book(self, book_or_id, remove_snippets=True):
222         """Removes a book from search index.
223         book - Book instance."""
224         if isinstance(book_or_id, catalogue.models.Book):
225             book_id = book_or_id.id
226         else:
227             book_id = book_or_id
228
229         self.delete_query(self.index.Q(book_id=book_id))
230
231         if remove_snippets:
232             snippets = Snippets(book_id)
233             snippets.remove()
234
235     def index_book(self, book, book_info=None, overwrite=True):
236         """
237         Indexes the book.
238         Creates a lucene document for extracted metadata
239         and calls self.index_content() to index the contents of the book.
240         """
241         if overwrite:
242             # we don't remove snippets, since they might be still needed by
243             # threads using not reopened index
244             self.remove_book(book, remove_snippets=False)
245
246         book_doc = self.create_book_doc(book)
247         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
248         # let's not index it - it's only used for extracting publish date
249         if 'source_name' in meta_fields:
250             del meta_fields['source_name']
251
252         for n, f in meta_fields.items():
253             book_doc[n] = f
254
255         book_doc['uid'] = "book%s" % book_doc['book_id']
256         self.index.add(book_doc)
257         del book_doc
258         book_fields = {
259             'title': meta_fields['title'],
260             'authors': meta_fields['authors'],
261             'published_date': meta_fields['published_date']
262             }
263
264         if 'translators' in meta_fields:
265             book_fields['translators'] = meta_fields['translators']
266
267         self.index_content(book, book_fields=book_fields)
268
    # Tags that get_master() looks for: the first element with one of
    # these tags is the element whose children index_content() walks.
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    # Tags handed to the content walker in index_content() as tags to ignore.
    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    # Tags whose text index_content() collects and indexes as a footnote.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Children of the master element that index_content() does not index.
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Applied to book_info.source_name in extract_metadata(): captures the
    # last run of digits, optionally followed by ']', '.' or spaces.
    published_date_re = re.compile("([0-9]+)[\]. ]*$")
290
291     def extract_metadata(self, book, book_info=None, dc_only=None):
292         """
293         Extract metadata from book and returns a map of fields keyed by fieldname
294         """
295         fields = {}
296
297         if book_info is None:
298             book_info = dcparser.parse(open(book.xml_file.path))
299
300         fields['slug'] = book.slug
301         fields['tags'] = [t.name  for t in book.tags]
302         fields['is_book'] = True
303
304         # validator, name
305         for field in dcparser.BookInfo.FIELDS:
306             if dc_only and field.name not in dc_only:
307                 continue
308             if hasattr(book_info, field.name):
309                 if not getattr(book_info, field.name):
310                     continue
311                 # since no type information is available, we use validator
312                 type_indicator = field.validator
313                 if type_indicator == dcparser.as_unicode:
314                     s = getattr(book_info, field.name)
315                     if field.multiple:
316                         s = ', '.join(s)
317                     fields[field.name] = s
318                 elif type_indicator == dcparser.as_person:
319                     p = getattr(book_info, field.name)
320                     if isinstance(p, dcparser.Person):
321                         persons = unicode(p)
322                     else:
323                         persons = ', '.join(map(unicode, p))
324                     fields[field.name] = persons
325                 elif type_indicator == dcparser.as_date:
326                     dt = getattr(book_info, field.name)
327                     fields[field.name] = dt
328
329         # get published date
330         pd = None
331         if hasattr(book_info, 'source_name') and book_info.source_name:
332             match = self.published_date_re.search(book_info.source_name)
333             if match is not None:
334                 pd = str(match.groups()[0])
335         if not pd: pd = ""
336         fields["published_date"] = pd
337
338         return fields
339
340     # def add_gaps(self, fields, fieldname):
341     #     """
342     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
343     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
344     #     """
345     #     def gap():
346     #         while True:
347     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
348     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
349
350     def get_master(self, root):
351         """
352         Returns the first master tag from an etree.
353         """
354         for master in root.iter():
355             if master.tag in self.master_tags:
356                 return master
357
358     def index_content(self, book, book_fields={}):
359         """
360         Walks the book XML and extract content from it.
361         Adds parts for each header tag and for each fragment.
362         """
363         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
364         root = wld.edoc.getroot()
365
366         master = self.get_master(root)
367         if master is None:
368             return []
369
370         def walker(node, ignore_tags=[]):
371
372             if node.tag not in ignore_tags:
373                 yield node, None, None
374                 if node.text is not None:
375                     yield None, node.text, None
376                 for child in list(node):
377                     for b, t, e in walker(child):
378                         yield b, t, e
379                 yield None, None, node
380
381             if node.tail is not None:
382                 yield None, node.tail, None
383             return
384
385         def fix_format(text):
386             #            separator = [u" ", u"\t", u".", u";", u","]
387             if isinstance(text, list):
388                 # need to join it first
389                 text = filter(lambda s: s is not None, content)
390                 text = u' '.join(text)
391                 # for i in range(len(text)):
392                 #     if i > 0:
393                 #         if text[i][0] not in separator\
394                 #             and text[i - 1][-1] not in separator:
395                 #          text.insert(i, u" ")
396
397             return re.sub("(?m)/$", "", text)
398
399         def add_part(snippets, **fields):
400             doc = self.create_book_doc(book)
401             for n, v in book_fields.items():
402                 doc[n] = v
403
404             doc['header_index'] = fields["header_index"]
405             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
406             doc['header_type'] = fields['header_type']
407
408             doc['text'] = fields['text']
409
410             # snippets
411             snip_pos = snippets.add(fields["text"])
412
413             doc['snippets_position'] = snip_pos[0]
414             doc['snippets_length'] = snip_pos[1]
415             if snippets.revision:
416                 doc["snippets_revision"] = snippets.revision
417
418             if 'fragment_anchor' in fields:
419                 doc["fragment_anchor"] = fields['fragment_anchor']
420
421             if 'themes' in fields:
422                 doc['themes'] = fields['themes']
423             doc['uid'] = "part%s%s%s" % (doc['header_index'],
424                                          doc['header_span'],
425                                          doc.get('fragment_anchor', ''))
426             return doc
427
428         def give_me_utf8(s):
429             if isinstance(s, unicode):
430                 return s.encode('utf-8')
431             else:
432                 return s
433
434         fragments = {}
435         snippets = Snippets(book.id).open('w')
436         try:
437             for header, position in zip(list(master), range(len(master))):
438
439                 if header.tag in self.skip_header_tags:
440                     continue
441                 if header.tag is etree.Comment:
442                     continue
443
444                 # section content
445                 content = []
446                 footnote = []
447
448                 def all_content(text):
449                     for frag in fragments.values():
450                         frag['text'].append(text)
451                     content.append(text)
452                 handle_text = [all_content]
453
454                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
455                     # handle footnotes
456                     if start is not None and start.tag in self.footnote_tags:
457                         footnote = []
458
459                         def collect_footnote(t):
460                             footnote.append(t)
461
462                         handle_text.append(collect_footnote)
463                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
464                         handle_text.pop()
465                         doc = add_part(snippets, header_index=position, header_type=header.tag,
466                                        text=u''.join(footnote),
467                                        is_footnote=True)
468                         self.index.add(doc)
469                         footnote = []
470
471                     # handle fragments and themes.
472                     if start is not None and start.tag == 'begin':
473                         fid = start.attrib['id'][1:]
474                         fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
475
476                     # themes for this fragment
477                     elif start is not None and start.tag == 'motyw':
478                         fid = start.attrib['id'][1:]
479                         handle_text.append(None)
480                         if start.text is not None:
481                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
482                     elif end is not None and end.tag == 'motyw':
483                         handle_text.pop()
484
485                     elif start is not None and start.tag == 'end':
486                         fid = start.attrib['id'][1:]
487                         if fid not in fragments:
488                             continue  # a broken <end> node, skip it
489                         frag = fragments[fid]
490                         if frag['themes'] == []:
491                             continue  # empty themes list.
492                         del fragments[fid]
493
494                         doc = add_part(snippets,
495                                        header_type=frag['start_header'],
496                                        header_index=frag['start_section'],
497                                        header_span=position - frag['start_section'] + 1,
498                                        fragment_anchor=fid,
499                                        text=fix_format(frag['text']),
500                                        themes=frag['themes'])
501                         self.index.add(doc)
502
503                         # Collect content.
504
505                     if text is not None and handle_text is not []:
506                         hdl = handle_text[-1]
507                         if hdl is not None:
508                             hdl(text)
509
510                         # in the end, add a section text.
511                 doc = add_part(snippets, header_index=position,
512                                header_type=header.tag, text=fix_format(content))
513
514                 self.index.add(doc)
515
516         finally:
517             snippets.close()
518
519
520 class SearchResult(object):
521     def __init__(self, doc, how_found=None, query=None, query_terms=None):
522         #        self.search = search
523         self.boost = 1.0
524         self._hits = []
525         self._processed_hits = None  # processed hits
526         self.snippets = []
527         self.query_terms = query_terms
528
529         if 'score' in doc:
530             self._score = doc['score']
531         else:
532             self._score = 0
533
534         self.book_id = int(doc["book_id"])
535
536         try:
537             self.published_date = int(doc.get("published_date"))
538         except ValueError:
539             self.published_date = 0
540
541         # content hits
542         header_type = doc.get("header_type", None)
543         # we have a content hit in some header of fragment
544         if header_type is not None:
545             sec = (header_type, int(doc["header_index"]))
546             header_span = doc['header_span']
547             header_span = header_span is not None and int(header_span) or 1
548             fragment = doc.get("fragment_anchor", None)
549             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
550             snippets_rev = doc.get('snippets_revision', None)
551
552             hit = (sec + (header_span,), fragment, self._score, {
553                 'how_found': how_found,
554                 'snippets_pos': snippets_pos,
555                 'snippets_revision': snippets_rev,
556                 'themes': doc.get('themes', []),
557                 'themes_pl': doc.get('themes_pl', [])
558                 })
559
560             self._hits.append(hit)
561
562     def __unicode__(self):
563         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
564             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
565
    def __str__(self):
        """Return the __unicode__ description encoded as UTF-8 bytes."""
        return unicode(self).encode('utf-8')
568
    @property
    def score(self):
        """The stored score multiplied by self.boost."""
        return self._score * self.boost
572
573     def merge(self, other):
574         if self.book_id != other.book_id:
575             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
576         self._hits += other._hits
577         if other.score > self.score:
578             self._score = other._score
579         return self
580
581     def get_book(self):
582         if hasattr(self, '_book'):
583             return self._book
584         self._book = catalogue.models.Book.objects.get(id=self.book_id)
585         return self._book
586
587     book = property(get_book)
588
    # Indexes into a hit tuple as built in __init__:
    # (position, fragment anchor, score, dict with the remaining data)
    POSITION = 0
    FRAGMENT = 1
    # Indexes into the position tuple (header_type, header_index, header_span)
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3
595
596     @property
597     def hits(self):
598         if self._processed_hits is not None:
599             return self._processed_hits
600
601         # to sections and fragments
602         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
603
604         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
605
606         # sections not covered by fragments
607         sect = filter(lambda s: 0 == len(filter(
608             lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
609             and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
610             frags)), sect)
611
612         hits = []
613
614         def remove_duplicates(lst, keyfn, compare):
615             els = {}
616             for e in lst:
617                 eif = keyfn(e)
618                 if eif in els:
619                     if compare(els[eif], e) >= 1:
620                         continue
621                 els[eif] = e
622             return els.values()
623
624         # remove fragments with duplicated fid's and duplicated snippets
625         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
626         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
627         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
628
629         # remove duplicate sections
630         sections = {}
631
632         for s in sect:
633             si = s[self.POSITION][self.POSITION_INDEX]
634             # skip existing
635             if si in sections:
636                 if sections[si]['score'] >= s[self.SCORE]:
637                     continue
638
639             m = {'score': s[self.SCORE],
640                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
641                  }
642             m.update(s[self.OTHER])
643             sections[si] = m
644
645         hits = sections.values()
646
647         for f in frags:
648             try:
649                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
650             except catalogue.models.Fragment.DoesNotExist:
651                 # stale index
652                 continue
653             # Figure out if we were searching for a token matching some word in theme name.
654             themes = frag.tags.filter(category='theme')
655             themes_hit = set()
656             if self.query_terms is not None:
657                 for i in range(0, len(f[self.OTHER]['themes'])):
658                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
659                     tms = map(unicode.lower, tms)
660                     for qt in self.query_terms:
661                         if qt in tms:
662                             themes_hit.add(f[self.OTHER]['themes'][i])
663                             break
664
665             def theme_by_name(n):
666                 th = filter(lambda t: t.name == n, themes)
667                 if th:
668                     return th[0]
669                 else:
670                     return None
671             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
672
673             m = {'score': f[self.SCORE],
674                  'fragment': frag,
675                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
676                  'themes': themes,
677                  'themes_hit': themes_hit
678                  }
679             m.update(f[self.OTHER])
680             hits.append(m)
681
682         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
683
684         self._processed_hits = hits
685
686         return hits
687
688     @staticmethod
689     def aggregate(*result_lists):
690         books = {}
691         for rl in result_lists:
692             for r in rl:
693                 if r.book_id in books:
694                     books[r.book_id].merge(r)
695                 else:
696                     books[r.book_id] = r
697         return books.values()
698
699     def __cmp__(self, other):
700         c = cmp(self.score, other.score)
701         if c == 0:
702             # this is inverted, because earlier date is better
703             return cmp(other.published_date, self.published_date)
704         else:
705             return c
706
    def __len__(self):
        """Number of processed hits (see the hits property)."""
        return len(self.hits)
709
    def snippet_pos(self, idx=0):
        """Return the 'snippets_pos' entry of the processed hit number idx."""
        return self.hits[idx]['snippets_pos']
712
713     def snippet_revision(self, idx=0):
714         try:
715             return self.hits[idx]['snippets_revision']
716         except:
717             return None
718
719
720 class Search(SolrIndex):
721     """
722     Search facilities.
723     """
    def __init__(self, default_field="text"):
        """
        Create the Solr connection in 'r' mode (see SolrIndex.__init__).

        default_field is accepted but not used by this method.
        """
        super(Search, self).__init__(mode='r')
726
727
728     def make_term_query(self, query, field='text', modal=operator.or_):
729         """
730         Returns term queries joined by boolean query.
731         modal - applies to boolean query
732         fuzzy - should the query by fuzzy.
733         """
734         if query is None: query = ''
735         q = self.index.Q()
736         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
737                         query.split(r" ")), q)
738
739         return q
740
741     def search_phrase(self, searched, field='text', book=False,
742                       filters=None,
743                       snippets=False):
744         if filters is None: filters = []
745         if book: filters.append(self.index.Q(is_book=True))
746
747         q = self.index.query(**{field: searched})
748         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
749         res = q.execute()
750         return [SearchResult(found, how_found=u'search_phrase') for found in res]
751
752     def search_some(self, searched, fields, book=True,
753                     filters=None, snippets=True, query_terms=None):
754         assert isinstance(fields, list)
755         if filters is None: filters = []
756         if book: filters.append(self.index.Q(is_book=True))
757
758         query = self.index.Q()
759
760         for fld in fields:
761             query = self.index.Q(query | self.make_term_query(searched, fld))
762
763         query = self.index.query(query)
764         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
765         res = query.execute()
766         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
767
768
769     def search_everywhere(self, searched, query_terms=None):
770         """
771         Tries to use search terms to match different fields of book (or its parts).
772         E.g. one word can be an author survey, another be a part of the title, and the rest
773         are some words from third chapter.
774         """
775         books = []
776         # content only query : themes x content
777         q = self.make_term_query(searched, 'text')
778         q_themes = self.make_term_query(searched, 'themes_pl')
779
780         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
781         res = query.execute()
782
783         for found in res:
784             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
785
786         # query themes/content x author/title/tags
787         in_content = self.index.Q()
788         in_meta = self.index.Q()
789
790         for fld in ['themes_pl', 'text']:
791             in_content |= self.make_term_query(searched, field=fld)
792
793         for fld in ['tags', 'authors', 'title']:
794             in_meta |= self.make_term_query(searched, field=fld)
795
796         q = in_content & in_meta
797         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
798
799         for found in res:
800             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
801
802         return books
803
804     def get_snippets(self, searchresult, query, field='text', num=1):
805         """
806         Returns a snippet for found scoreDoc.
807         """
808         maxnum = len(searchresult)
809         if num is None or num < 0 or num > maxnum:
810             num = maxnum
811         book_id = searchresult.book_id
812         revision = searchresult.snippet_revision()
813         snippets = Snippets(book_id, revision=revision)
814         snips = [None] * maxnum
815         try:
816             snippets.open()
817             idx = 0
818             while idx < maxnum and num > 0:
819                 position, length = searchresult.snippet_pos(idx)
820                 if position is None or length is None:
821                     continue
822                 text = snippets.get((int(position),
823                                      int(length)))
824                 snip = self.index.highlight(text=text, field=field, q=query)
825                 snips[idx] = snip
826                 if snip:
827                     num -= 1
828                 idx += 1
829
830         except IOError, e:
831             log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
832             return []
833         finally:
834             snippets.close()
835
836             # remove verse end markers..
837         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
838
839         searchresult.snippets = snips
840
841         return snips
842
843     def hint_tags(self, query, pdcounter=True, prefix=True):
844         """
845         Return auto-complete hints for tags
846         using prefix search.
847         """
848         q = self.index.Q()
849         query = query.strip()
850         for field in ['tag_name', 'tag_name_pl']:
851             if prefix:
852                 q |= self.index.Q(**{field: query + "*"})
853             else:
854                 q |= self.make_term_query(query, field=field)
855         qu = self.index.query(q).exclude(tag_category="book")
856
857         return self.search_tags(qu, pdcounter=pdcounter)
858
859     def search_tags(self, query, filters=None, pdcounter=False):
860         """
861         Search for Tag objects using query.
862         """
863         if not filters: filters = []
864         if not pdcounter:
865             filters.append(~self.index.Q(is_pdcounter=True))
866         res = self.apply_filters(query, filters).execute()
867
868         tags = []
869         pd_tags = []
870
871         for doc in res:
872             is_pdcounter = doc.get('is_pdcounter', False)
873             category = doc.get('tag_category')
874             try:
875                 if is_pdcounter == True:
876                     if category == 'pd_author':
877                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
878                     elif category == 'pd_book':
879                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
880                         tag.category = 'pd_book'  # make it look more lik a tag.
881                     else:
882                         print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)).encode('utf-8')
883                     pd_tags.append(tag)
884                 else:
885                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
886                     tags.append(tag)
887
888             except catalogue.models.Tag.DoesNotExist: pass
889             except PDCounterAuthor.DoesNotExist: pass
890             except PDCounterBook.DoesNotExist: pass
891
892         tags_slugs = set(map(lambda t: t.slug, tags))
893         tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
894
895         log.debug('search_tags: %s' % tags)
896
897         return tags
898
899     def hint_books(self, query, prefix=True):
900         """
901         Returns auto-complete hints for book titles
902         Because we do not index 'pseudo' title-tags.
903         Prefix search.
904         """
905         q = self.index.Q()
906         query = query.strip()
907         if prefix:
908             q |= self.index.Q(title=query + "*")
909         else:
910             q |= self.make_term_query(query, field='title')
911         qu = self.index.query(q)
912         only_books = self.index.Q(is_book=True)
913         return self.search_books(qu, [only_books])
914
915     def search_books(self, query, filters=None, max_results=10):
916         """
917         Searches for Book objects using query
918         """
919         bks = []
920         bks_found = set()
921         query = query.query(is_book=True)
922         res = self.apply_filters(query, filters).field_limit(['book_id'])
923         for r in res:
924             try:
925                 bid = r['book_id']
926                 if not bid in bks_found:
927                     bks.append(catalogue.models.Book.objects.get(id=bid))
928                     bks_found.add(bid)
929             except catalogue.models.Book.DoesNotExist: pass
930         return bks
931  
932
933     @staticmethod
934     def apply_filters(query, filters):
935         """
936         Apply filters to a query
937         """
938         if filters is None: filters = []
939         filters = filter(lambda x: x is not None, filters)
940         for f in filters:
941             query = query.query(f)
942         return query