remove jvm leftovers, gaa!
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4
5 import os
6 import re
7 import errno
8 from librarian import dcparser
9 from librarian.parser import WLDocument
10 from lxml import etree
11 import catalogue.models
12 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
13 from itertools import chain
14 import traceback
15 import logging
16 log = logging.getLogger('search')
17 import sunburnt
18 import custom
19 import operator
20
21 log = logging.getLogger('search')
22
class SolrIndex(object):
    """
    Common base for objects that talk to the Solr index.

    The Solr interface object is exposed as ``self.index``.
    """

    def __init__(self, mode=None):
        """
        Create the interface from ``settings.SOLR``.

        ``mode`` is handed over unchanged to ``custom.CustomSolrInterface``.
        """
        solr_config = settings.SOLR
        self.index = custom.CustomSolrInterface(solr_config, mode=mode)
26
27
class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together in one file per book (and
    revision); their (position, length) pairs are returned to the caller
    so they can be stored next to the indexed documents.

    An opened instance can be used as a context manager, which closes the
    file on exit:

        with Snippets(book_id).open('w') as snippets:
            pos = snippets.add(u"some text")
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        """
        Make sure the snippet directory exists and remember which
        book/revision this object refers to. No file is opened here.
        """
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            # an already existing directory is the normal case
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None
        # write offset (in bytes) within the snippet file; reset by open()
        self.position = 0

    @property
    def path(self):
        """Path of the snippet file for the current book id and revision."""
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode) and return self.
        Call .close() afterwards, or use the object in a `with` statement.

        When opening for writing and the file already exists, the first
        unused revision number is picked, so existing files are never
        overwritten.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()
        # never swallow exceptions raised inside the with-block
        return False

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple, both counted in UTF-8 bytes.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close the snippet file, if one is open. Safe to call repeatedly."""
        if self.file:
            self.file.close()
            self.file = None

    def remove(self):
        """
        Remove the base snippet file and then revisions 1, 2, ... until
        the first one that cannot be removed.
        """
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            # best effort: the first file that cannot be unlinked
            # (normally: does not exist) ends the cleanup
            pass
109
110
111 class Index(SolrIndex):
112     """
113     Class indexing books.
114     """
115     def __init__(self):
116         super(Index, self).__init__(mode='rw')
117
118     def delete_query(self, *queries):
119         """
120         index.delete(queries=...) doesn't work, so let's reimplement it
121         using deletion of list of uids.
122         """
123         uids = set()
124         for q in queries:
125             if isinstance(q, sunburnt.search.LuceneQuery):
126                 q = self.index.query(q)
127             q.field_limiter.update(['uid'])
128             st = 0
129             rows = 100
130             while True:
131                 ids = q.paginate(start=st, rows=rows).execute()
132                 if not len(ids):
133                     break
134                 for res in ids:
135                     uids.add(res['uid'])
136                 st += rows
137                 #        print "Will delete %s" % ','.join([x for x in uids])
138         if uids:
139             self.index.delete(uids)
140             return True
141         else:
142             return False
143
144     def index_tags(self, *tags, **kw):
145         """
146         Re-index global tag list.
147         Removes all tags from index, then index them again.
148         Indexed fields include: id, name (with and without polish stems), category
149         """
150         log.debug("Indexing tags")
151         remove_only = kw.get('remove_only', False)
152         # first, remove tags from index.
153         if tags:
154             tag_qs = []
155             for tag in tags:
156                 q_id = self.index.Q(tag_id=tag.id)
157
158                 if isinstance(tag, PDCounterAuthor):
159                     q_cat = self.index.Q(tag_category='pd_author')
160                 elif isinstance(tag, PDCounterBook):
161                     q_cat = self.index.Q(tag_category='pd_book')
162                 else:
163                     q_cat = self.index.Q(tag_category=tag.category)
164
165                 q_id_cat = self.index.Q(q_id & q_cat)
166                 tag_qs.append(q_id_cat)
167             self.delete_query(tag_qs)
168         else:  # all
169             q = self.index.Q(tag_id__any=True)
170             self.delete_query(q)
171
172         if not remove_only:
173             # then add them [all or just one passed]
174             if not tags:
175                 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
176                     PDCounterAuthor.objects.all(), \
177                     PDCounterBook.objects.all())
178
179             for tag in tags:
180                 if isinstance(tag, PDCounterAuthor):
181                     doc = {
182                         "tag_id": int(tag.id),
183                         "tag_name": tag.name,
184                         "tag_name_pl": tag.name,
185                         "tag_category": 'pd_author',
186                         "is_pdcounter": True,
187                         "uid": "tag%d_pd_a" % tag.id
188                         }
189                 elif isinstance(tag, PDCounterBook):
190                     doc = {
191                         "tag_id": int(tag.id),
192                         "tag_name": tag.title,
193                         "tag_name_pl": tag.title,
194                         "tag_category": 'pd_book',
195                         "is_pdcounter": True,
196                         "uid": "tag%d_pd_b" % tag.id
197                         }
198                 else:
199                     doc = {
200                         "tag_id": int(tag.id),
201                         "tag_name": tag.name,
202                         "tag_name_pl": tag.name,
203                         "tag_category": tag.category,
204                         "is_pdcounter": False,
205                         "uid": "tag%d" % tag.id
206                         }
207                 self.index.add(doc)
208
209     def create_book_doc(self, book):
210         """
211         Create a lucene document referring book id.
212         """
213         doc = {
214             'book_id': int(book.id),
215             }
216         if book.parent is not None:
217             doc["parent_id"] = int(book.parent.id)
218         return doc
219
220     def remove_book(self, book_or_id, remove_snippets=True):
221         """Removes a book from search index.
222         book - Book instance."""
223         if isinstance(book_or_id, catalogue.models.Book):
224             book_id = book_or_id.id
225         else:
226             book_id = book_or_id
227
228         self.delete_query(self.index.Q(book_id=book_id))
229
230         if remove_snippets:
231             snippets = Snippets(book_id)
232             snippets.remove()
233
234     def index_book(self, book, book_info=None, overwrite=True):
235         """
236         Indexes the book.
237         Creates a lucene document for extracted metadata
238         and calls self.index_content() to index the contents of the book.
239         """
240         if overwrite:
241             # we don't remove snippets, since they might be still needed by
242             # threads using not reopened index
243             self.remove_book(book, remove_snippets=False)
244
245         book_doc = self.create_book_doc(book)
246         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
247         # let's not index it - it's only used for extracting publish date
248         if 'source_name' in meta_fields:
249             del meta_fields['source_name']
250
251         for n, f in meta_fields.items():
252             book_doc[n] = f
253
254         book_doc['uid'] = "book%s" % book_doc['book_id']
255         self.index.add(book_doc)
256         del book_doc
257         book_fields = {
258             'title': meta_fields['title'],
259             'authors': meta_fields['authors'],
260             'published_date': meta_fields['published_date']
261             }
262
263         if 'translators' in meta_fields:
264             book_fields['translators'] = meta_fields['translators']
265
266         self.index_content(book, book_fields=book_fields)
267
268     master_tags = [
269         'opowiadanie',
270         'powiesc',
271         'dramat_wierszowany_l',
272         'dramat_wierszowany_lp',
273         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
274         'wywiad',
275         ]
276
277     ignore_content_tags = [
278         'uwaga', 'extra',
279         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
280         'didaskalia',
281         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
282         ]
283
284     footnote_tags = ['pa', 'pt', 'pr', 'pe']
285
286     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
287
288     published_date_re = re.compile("([0-9]+)[\]. ]*$")
289
290     def extract_metadata(self, book, book_info=None, dc_only=None):
291         """
292         Extract metadata from book and returns a map of fields keyed by fieldname
293         """
294         fields = {}
295
296         if book_info is None:
297             book_info = dcparser.parse(open(book.xml_file.path))
298
299         fields['slug'] = book.slug
300         fields['tags'] = [t.name  for t in book.tags]
301         fields['is_book'] = True
302
303         # validator, name
304         for field in dcparser.BookInfo.FIELDS:
305             if dc_only and field.name not in dc_only:
306                 continue
307             if hasattr(book_info, field.name):
308                 if not getattr(book_info, field.name):
309                     continue
310                 # since no type information is available, we use validator
311                 type_indicator = field.validator
312                 if type_indicator == dcparser.as_unicode:
313                     s = getattr(book_info, field.name)
314                     if field.multiple:
315                         s = ', '.join(s)
316                     fields[field.name] = s
317                 elif type_indicator == dcparser.as_person:
318                     p = getattr(book_info, field.name)
319                     if isinstance(p, dcparser.Person):
320                         persons = unicode(p)
321                     else:
322                         persons = ', '.join(map(unicode, p))
323                     fields[field.name] = persons
324                 elif type_indicator == dcparser.as_date:
325                     dt = getattr(book_info, field.name)
326                     fields[field.name] = dt
327
328         # get published date
329         pd = None
330         if hasattr(book_info, 'source_name') and book_info.source_name:
331             match = self.published_date_re.search(book_info.source_name)
332             if match is not None:
333                 pd = str(match.groups()[0])
334         if not pd: pd = ""
335         fields["published_date"] = pd
336
337         return fields
338
339     # def add_gaps(self, fields, fieldname):
340     #     """
341     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
342     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
343     #     """
344     #     def gap():
345     #         while True:
346     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
347     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
348
349     def get_master(self, root):
350         """
351         Returns the first master tag from an etree.
352         """
353         for master in root.iter():
354             if master.tag in self.master_tags:
355                 return master
356
357     def index_content(self, book, book_fields={}):
358         """
359         Walks the book XML and extract content from it.
360         Adds parts for each header tag and for each fragment.
361         """
362         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
363         root = wld.edoc.getroot()
364
365         master = self.get_master(root)
366         if master is None:
367             return []
368
369         def walker(node, ignore_tags=[]):
370
371             if node.tag not in ignore_tags:
372                 yield node, None, None
373                 if node.text is not None:
374                     yield None, node.text, None
375                 for child in list(node):
376                     for b, t, e in walker(child):
377                         yield b, t, e
378                 yield None, None, node
379
380             if node.tail is not None:
381                 yield None, node.tail, None
382             return
383
384         def fix_format(text):
385             #            separator = [u" ", u"\t", u".", u";", u","]
386             if isinstance(text, list):
387                 # need to join it first
388                 text = filter(lambda s: s is not None, content)
389                 text = u' '.join(text)
390                 # for i in range(len(text)):
391                 #     if i > 0:
392                 #         if text[i][0] not in separator\
393                 #             and text[i - 1][-1] not in separator:
394                 #          text.insert(i, u" ")
395
396             return re.sub("(?m)/$", "", text)
397
398         def add_part(snippets, **fields):
399             doc = self.create_book_doc(book)
400             for n, v in book_fields.items():
401                 doc[n] = v
402
403             doc['header_index'] = fields["header_index"]
404             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
405             doc['header_type'] = fields['header_type']
406
407             doc['text'] = fields['text']
408
409             # snippets
410             snip_pos = snippets.add(fields["text"])
411
412             doc['snippets_position'] = snip_pos[0]
413             doc['snippets_length'] = snip_pos[1]
414             if snippets.revision:
415                 doc["snippets_revision"] = snippets.revision
416
417             if 'fragment_anchor' in fields:
418                 doc["fragment_anchor"] = fields['fragment_anchor']
419
420             if 'themes' in fields:
421                 doc['themes'] = fields['themes']
422             doc['uid'] = "part%s%s%s" % (doc['header_index'],
423                                          doc['header_span'],
424                                          doc.get('fragment_anchor', ''))
425             return doc
426
427         def give_me_utf8(s):
428             if isinstance(s, unicode):
429                 return s.encode('utf-8')
430             else:
431                 return s
432
433         fragments = {}
434         snippets = Snippets(book.id).open('w')
435         try:
436             for header, position in zip(list(master), range(len(master))):
437
438                 if header.tag in self.skip_header_tags:
439                     continue
440                 if header.tag is etree.Comment:
441                     continue
442
443                 # section content
444                 content = []
445                 footnote = []
446
447                 def all_content(text):
448                     for frag in fragments.values():
449                         frag['text'].append(text)
450                     content.append(text)
451                 handle_text = [all_content]
452
453                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
454                     # handle footnotes
455                     if start is not None and start.tag in self.footnote_tags:
456                         footnote = []
457
458                         def collect_footnote(t):
459                             footnote.append(t)
460
461                         handle_text.append(collect_footnote)
462                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
463                         handle_text.pop()
464                         doc = add_part(snippets, header_index=position, header_type=header.tag,
465                                        text=u''.join(footnote),
466                                        is_footnote=True)
467                         self.index.add(doc)
468                         #print "@ footnote text: %s" % footnote
469                         footnote = []
470
471                     # handle fragments and themes.
472                     if start is not None and start.tag == 'begin':
473                         fid = start.attrib['id'][1:]
474                         fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
475
476                     # themes for this fragment
477                     elif start is not None and start.tag == 'motyw':
478                         fid = start.attrib['id'][1:]
479                         handle_text.append(None)
480                         if start.text is not None:
481                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
482                     elif end is not None and end.tag == 'motyw':
483                         handle_text.pop()
484
485                     elif start is not None and start.tag == 'end':
486                         fid = start.attrib['id'][1:]
487                         if fid not in fragments:
488                             continue  # a broken <end> node, skip it
489                         frag = fragments[fid]
490                         if frag['themes'] == []:
491                             continue  # empty themes list.
492                         del fragments[fid]
493
494                         doc = add_part(snippets,
495                                        header_type=frag['start_header'],
496                                        header_index=frag['start_section'],
497                                        header_span=position - frag['start_section'] + 1,
498                                        fragment_anchor=fid,
499                                        text=fix_format(frag['text']),
500                                        themes=frag['themes'])
501                         #print '@ FRAG %s' % frag['content']
502                         self.index.add(doc)
503
504                         # Collect content.
505
506                     if text is not None and handle_text is not []:
507                         hdl = handle_text[-1]
508                         if hdl is not None:
509                             hdl(text)
510
511                         # in the end, add a section text.
512                 doc = add_part(snippets, header_index=position,
513                                header_type=header.tag, text=fix_format(content))
514                 #print '@ CONTENT: %s' % fix_format(content)
515
516                 self.index.add(doc)
517
518         finally:
519             snippets.close()
520
521
522 class SearchResult(object):
523     def __init__(self, doc, how_found=None, query=None, query_terms=None):
524         #        self.search = search
525         self.boost = 1.0
526         self._hits = []
527         self._processed_hits = None  # processed hits
528         self.snippets = []
529         self.query_terms = query_terms
530
531         if 'score' in doc:
532             self._score = doc['score']
533         else:
534             self._score = 0
535
536         self.book_id = int(doc["book_id"])
537
538         try:
539             self.published_date = int(doc.get("published_date"))
540         except ValueError:
541             self.published_date = 0
542
543         # content hits
544         header_type = doc.get("header_type", None)
545         # we have a content hit in some header of fragment
546         if header_type is not None:
547             sec = (header_type, int(doc["header_index"]))
548             header_span = doc['header_span']
549             header_span = header_span is not None and int(header_span) or 1
550             fragment = doc.get("fragment_anchor", None)
551             snippets_pos = (doc['snippets_position'], doc['snippets_length'])
552             snippets_rev = doc.get('snippets_revision', None)
553
554             hit = (sec + (header_span,), fragment, self._score, {
555                 'how_found': how_found,
556                 'snippets_pos': snippets_pos,
557                 'snippets_revision': snippets_rev,
558                 'themes': doc.get('themes', []),
559                 'themes_pl': doc.get('themes_pl', [])
560                 })
561
562             self._hits.append(hit)
563
564     def __unicode__(self):
565         return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
566             (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))
567
568     def __str__(self):
569         return unicode(self).encode('utf-8')
570
571     @property
572     def score(self):
573         return self._score * self.boost
574
575     def merge(self, other):
576         if self.book_id != other.book_id:
577             raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
578         self._hits += other._hits
579         if other.score > self.score:
580             self._score = other._score
581         return self
582
583     def get_book(self):
584         if hasattr(self, '_book'):
585             return self._book
586         self._book = catalogue.models.Book.objects.get(id=self.book_id)
587         return self._book
588
589     book = property(get_book)
590
591     POSITION = 0
592     FRAGMENT = 1
593     POSITION_INDEX = 1
594     POSITION_SPAN = 2
595     SCORE = 2
596     OTHER = 3
597
598     @property
599     def hits(self):
600         if self._processed_hits is not None:
601             return self._processed_hits
602
603         # to sections and fragments
604         frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
605
606         sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)
607
608         # sections not covered by fragments
609         sect = filter(lambda s: 0 == len(filter(
610             lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
611             and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
612             frags)), sect)
613
614         hits = []
615
616         def remove_duplicates(lst, keyfn, compare):
617             els = {}
618             for e in lst:
619                 eif = keyfn(e)
620                 if eif in els:
621                     if compare(els[eif], e) >= 1:
622                         continue
623                 els[eif] = e
624             return els.values()
625
626         # remove fragments with duplicated fid's and duplicated snippets
627         frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
628         # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
629         #                           lambda a, b: cmp(a[SCORE], b[SCORE]))
630
631         # remove duplicate sections
632         sections = {}
633
634         for s in sect:
635             si = s[self.POSITION][self.POSITION_INDEX]
636             # skip existing
637             if si in sections:
638                 if sections[si]['score'] >= s[self.SCORE]:
639                     continue
640
641             m = {'score': s[self.SCORE],
642                  'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
643                  }
644             m.update(s[self.OTHER])
645             sections[si] = m
646
647         hits = sections.values()
648
649         for f in frags:
650             try:
651                 frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
652             except catalogue.models.Fragment.DoesNotExist:
653                 # stale index
654                 continue
655             # Figure out if we were searching for a token matching some word in theme name.
656             themes = frag.tags.filter(category='theme')
657             themes_hit = set()
658             if self.query_terms is not None:
659                 for i in range(0, len(f[self.OTHER]['themes'])):
660                     tms = f[self.OTHER]['themes'][i].split(r' +') + f[self.OTHER]['themes_pl'][i].split(' ')
661                     tms = map(unicode.lower, tms)
662                     for qt in self.query_terms:
663                         if qt in tms:
664                             themes_hit.add(f[self.OTHER]['themes'][i])
665                             break
666
667             def theme_by_name(n):
668                 th = filter(lambda t: t.name == n, themes)
669                 if th:
670                     return th[0]
671                 else:
672                     return None
673             themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))
674
675             m = {'score': f[self.SCORE],
676                  'fragment': frag,
677                  'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
678                  'themes': themes,
679                  'themes_hit': themes_hit
680                  }
681             m.update(f[self.OTHER])
682             hits.append(m)
683
684         hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
685
686         self._processed_hits = hits
687
688         return hits
689
690     @staticmethod
691     def aggregate(*result_lists):
692         books = {}
693         for rl in result_lists:
694             for r in rl:
695                 if r.book_id in books:
696                     books[r.book_id].merge(r)
697                 else:
698                     books[r.book_id] = r
699         return books.values()
700
701     def __cmp__(self, other):
702         c = cmp(self.score, other.score)
703         if c == 0:
704             # this is inverted, because earlier date is better
705             return cmp(other.published_date, self.published_date)
706         else:
707             return c
708
709     def __len__(self):
710         return len(self.hits)
711
712     def snippet_pos(self, idx=0):
713         return self.hits[idx]['snippets_pos']
714
715     def snippet_revision(self, idx=0):
716         try:
717             return self.hits[idx]['snippets_revision']
718         except:
719             return None
720
721
722 class Search(SolrIndex):
723     """
724     Search facilities.
725     """
726     def __init__(self, default_field="text"):
727         super(Search, self).__init__(mode='r')
728
729
730     def make_term_query(self, query, field='text', modal=operator.or_):
731         """
732         Returns term queries joined by boolean query.
733         modal - applies to boolean query
734         fuzzy - should the query by fuzzy.
735         """
736         if query is None: query = ''
737         q = self.index.Q()
738         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
739                         query.split(r" ")), q)
740
741         return q
742
743     def search_phrase(self, searched, field='text', book=False,
744                       filters=None,
745                       snippets=False):
746         if filters is None: filters = []
747         if book: filters.append(self.index.Q(is_book=True))
748
749         q = self.index.query(**{field: searched})
750         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
751         res = q.execute()
752         return [SearchResult(found, how_found=u'search_phrase') for found in res]
753
754     def search_some(self, searched, fields, book=True,
755                     filters=None, snippets=True, query_terms=None):
756         assert isinstance(fields, list)
757         if filters is None: filters = []
758         if book: filters.append(self.index.Q(is_book=True))
759
760         query = self.index.Q()
761
762         for fld in fields:
763             query = self.index.Q(query | self.make_term_query(searched, fld))
764
765         query = self.index.query(query)
766         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
767         res = query.execute()
768         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
769
770
771     def search_everywhere(self, searched, query_terms=None):
772         """
773         Tries to use search terms to match different fields of book (or its parts).
774         E.g. one word can be an author survey, another be a part of the title, and the rest
775         are some words from third chapter.
776         """
777         books = []
778         # content only query : themes x content
779         q = self.make_term_query(searched, 'text')
780         q_themes = self.make_term_query(searched, 'themes_pl')
781
782         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
783         res = query.execute()
784
785         for found in res:
786             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
787
788         # query themes/content x author/title/tags
789         in_content = self.index.Q()
790         in_meta = self.index.Q()
791
792         for fld in ['themes_pl', 'text']:
793             in_content |= self.make_term_query(searched, field=fld)
794
795         for fld in ['tags', 'authors', 'title']:
796             in_meta |= self.make_term_query(searched, field=fld)
797
798         q = in_content & in_meta
799         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
800
801         for found in res:
802             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
803
804         return books
805
806     def get_snippets(self, searchresult, query, field='text', num=1):
807         """
808         Returns a snippet for found scoreDoc.
809         """
810         maxnum = len(searchresult)
811         if num is None or num < 0 or num > maxnum:
812             num = maxnum
813         book_id = searchresult.book_id
814         revision = searchresult.snippet_revision()
815         snippets = Snippets(book_id, revision=revision)
816         snips = [None] * maxnum
817         try:
818             snippets.open()
819             idx = 0
820             while idx < maxnum and num > 0:
821                 position, length = searchresult.snippet_pos(idx)
822                 if position is None or length is None:
823                     continue
824                 text = snippets.get((int(position),
825                                      int(length)))
826                 snip = self.index.highlight(text=text, field=field, q=query)
827                 snips[idx] = snip
828                 if snip:
829                     num -= 1
830                 idx += 1
831
832         except IOError, e:
833             log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
834             return []
835         finally:
836             snippets.close()
837
838             # remove verse end markers..
839         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
840
841         searchresult.snippets = snips
842
843         return snips
844
845     def hint_tags(self, query, pdcounter=True, prefix=True):
846         """
847         Return auto-complete hints for tags
848         using prefix search.
849         """
850         q = self.index.Q()
851         query = query.strip()
852         for field in ['tag_name', 'tag_name_pl']:
853             if prefix:
854                 q |= self.index.Q(**{field: query + "*"})
855             else:
856                 q |= self.make_term_query(query, field=field)
857         qu = self.index.query(q).exclude(tag_category="book")
858
859         return self.search_tags(qu, pdcounter=pdcounter)
860
861     def search_tags(self, query, filters=None, pdcounter=False):
862         """
863         Search for Tag objects using query.
864         """
865         if not filters: filters = []
866         if not pdcounter:
867             filters.append(~self.index.Q(is_pdcounter=True))
868         res = self.apply_filters(query, filters).execute()
869
870         tags = []
871         pd_tags = []
872
873         for doc in res:
874             is_pdcounter = doc.get('is_pdcounter', False)
875             category = doc.get('tag_category')
876             try:
877                 if is_pdcounter == True:
878                     if category == 'pd_author':
879                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
880                     elif category == 'pd_book':
881                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
882                         tag.category = 'pd_book'  # make it look more lik a tag.
883                     else:
884                         print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
885                     pd_tags.append(tag)
886                 else:
887                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
888                     tags.append(tag)
889
890             except catalogue.models.Tag.DoesNotExist: pass
891             except PDCounterAuthor.DoesNotExist: pass
892             except PDCounterBook.DoesNotExist: pass
893
894         tags_slugs = set(map(lambda t: t.slug, tags))
895         tags = tags + filter(lambda t: not t.slug in tags_slugs, pd_tags)
896
897         log.debug('search_tags: %s' % tags)
898
899         return tags
900
901     def hint_books(self, query, prefix=True):
902         """
903         Returns auto-complete hints for book titles
904         Because we do not index 'pseudo' title-tags.
905         Prefix search.
906         """
907         q = self.index.Q()
908         query = query.strip()
909         if prefix:
910             q |= self.index.Q(title=query + "*")
911         else:
912             q |= self.make_term_query(query, field='title')
913         qu = self.index.query(q)
914         only_books = self.index.Q(is_book=True)
915         return self.search_books(qu, [only_books])
916
917     def search_books(self, query, filters=None, max_results=10):
918         """
919         Searches for Book objects using query
920         """
921         bks = []
922         bks_found = set()
923         query = query.query(is_book=True)
924         res = self.apply_filters(query, filters).field_limit(['book_id'])
925         for r in res:
926             try:
927                 bid = r['book_id']
928                 if not bid in bks_found:
929                     bks.append(catalogue.models.Book.objects.get(id=bid))
930                     bks_found.add(bid)
931             except catalogue.models.Book.DoesNotExist: pass
932         return bks
933  
934
935     @staticmethod
936     def apply_filters(query, filters):
937         """
938         Apply filters to a query
939         """
940         if filters is None: filters = []
941         filters = filter(lambda x: x is not None, filters)
942         for f in filters:
943             query = query.query(f)
944         return query