apps/search/index.py

# -*- coding: utf-8 -*-

from django.conf import settings

import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
import traceback
import logging
import sunburnt
import custom
import operator

log = logging.getLogger('search')

class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together; their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # When writing, pick the first unused revision number.
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Returns a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        self.revision = None
        try:
            os.unlink(self.path)
            # Remove all revisioned snippet files for this book as well.
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass

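# A minimal usage sketch for Snippets (hypothetical book id; assumes
# settings.SEARCH_INDEX points at a writable directory):
#
#     snips = Snippets(123).open('w')
#     try:
#         pos = snips.add(u"First snippet text")  # -> (position, length)
#     finally:
#         snips.close()
#
#     snips = Snippets(123, revision=snips.revision).open()
#     try:
#         assert snips.get(pos) == u"First snippet text"
#     finally:
#         snips.close()
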
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

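    # A hypothetical sketch of calling delete_query(); it accepts any
    # number of sunburnt queries as separate arguments:
    #
    #     index = Index()
    #     index.delete_query(index.index.Q(book_id=1), index.index.Q(book_id=2))
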
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems) and category.
        """
        remove_only = kw.get('remove_only', False)
        # First, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                tag_qs.append(self.index.Q(q_id & q_cat))
            # delete_query() takes each query as a separate argument,
            # so the list has to be unpacked here.
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # Then add them (all of them, or just the ones passed).
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                    }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                    }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                    }
                log.debug("Adding tag document: %s" % doc)
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book_or_id - a Book instance or a book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # We don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index.
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
        # Let's not index source_name - it's only used for extracting the publish date.
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        log.debug("Adding book document: %s" % book_doc)
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
        }
        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

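    # A hypothetical full-reindex sketch (e.g. from a management command);
    # assumes the Solr index is reachable via settings.SOLR:
    #
    #     index = Index()
    #     for book in catalogue.models.Book.objects.all():
    #         index.index_book(book)
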
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # Since no type information is available, we use the validator
                # to tell how the value should be converted.
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # Get the published date: the trailing number in source_name.
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

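    # published_date_re picks the trailing number out of source_name;
    # an illustrative example:
    #
    #     >>> Index.published_date_re.search(u"Czytelnik, Warszawa 1990").group(1)
    #     u'1990'
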
    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=None):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        if book_fields is None:
            book_fields = {}

        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    # Pass ignore_tags down, so ignored tags are skipped
                    # at any depth, not only directly under the header.
                    for b, t, e in walker(child, ignore_tags):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            if isinstance(text, list):
                # Need to join it first.
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)

            # Strip verse-end markers.
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # Store the text in the snippet file and remember where it went.
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

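        # walker() flattens the tree into (start, text, end) event triples;
        # for an illustrative element like <akap>Ala <begin id="m1"/>ma</akap>
        # it yields, in order: (akap, None, None), (None, u"Ala ", None),
        # (begin, None, None), (None, None, begin), (None, u"ma", None),
        # (None, None, akap).
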
        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # Section content.
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # Handle footnotes.
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        log.debug("Adding footnote document: %s" % doc)
                        self.index.add(doc)
                        footnote = []

                    # Handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [],
                            'start_section': position, 'start_header': header.tag}

                    # Themes for this fragment.
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        log.debug("Adding fragment document: %s" % doc)
                        self.index.add(doc)

                    # Collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # In the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                log.debug("Adding section document: %s" % doc)
                self.index.add(doc)

        finally:
            snippets.close()


class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # Missing or malformed date.
            self.published_date = 0

        # Content hits.
        header_type = doc.get("header_type", None)
        # We have a content hit in some header or fragment.
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = int(header_span) if header_span else 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision')

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
            })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             self._processed_hits and len(self._processed_hits) or -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError(
                "This search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # Indices into a raw hit tuple and into its position triple.
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

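    # Illustrative shape of a raw hit tuple, indexed by the constants above:
    #
    #     ((header_type, header_index, header_span), fragment_anchor, score,
    #      {'how_found': ..., 'snippets_pos': ..., 'snippets_revision': ...,
    #       'themes': [...], 'themes_pl': [...]})
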
    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # Split raw hits into fragment hits and section hits.
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # Keep only sections not covered by fragments.
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # Remove fragments with duplicated fids, keeping the best-scoring one.
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT],
                                  lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))

        # Remove duplicate sections, keeping the best-scoring one.
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # Skip an existing section unless this one scores better.
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # Stale index.
                continue
            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

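    # A hypothetical sketch of combining several result lists; merge()
    # joins hits for the same book, and __cmp__ below drives the sort:
    #
    #     results = SearchResult.aggregate(
    #         search.search_phrase(u"lokomotywa"),
    #         search.search_some(u"lokomotywa", ['title', 'authors']))
    #     results.sort(reverse=True)
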
    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # This is inverted, because an earlier date is better.
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    # def get_tokens(self, searched, field='text', cached=None):
    #     """returns tokens analyzed by a proper (for a field) analyzer
    #     argument can be: StringReader, string/unicode, or tokens. In the last case
    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
    #     """
    #     if cached is not None and field in cached:
    #         return cached[field]

    #     if isinstance(searched, str) or isinstance(searched, unicode):
    #         searched = StringReader(searched)
    #     elif isinstance(searched, list):
    #         return searched

    #     searched.reset()
    #     tokens = self.analyzer.reusableTokenStream(field, searched)
    #     toks = []
    #     while tokens.incrementToken():
    #         cta = tokens.getAttribute(CharTermAttribute.class_)
    #         toks.append(cta.toString())

    #     if cached is not None:
    #         cached[field] = toks

    #     return toks

    # @staticmethod
    # def fuzziness(fuzzy):
    #     """Helper method to sanitize fuzziness"""
    #     if not fuzzy:
    #         return None
    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    #         return fuzzy
    #     else:
    #         return 0.5

    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
    #     """
    #     Return a PhraseQuery with a series of tokens.
    #     """
    #     if fuzzy:
    #         phrase = MultiPhraseQuery()
    #         for t in tokens:
    #             term = Term(field, t)
    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
    #             fuzzterms = []

    #             while True:
    #                 ft = fuzzterm.term()
    #                 if ft:
    #                     fuzzterms.append(ft)
    #                 if not fuzzterm.next(): break
    #             if fuzzterms:
    #                 phrase.add(JArray('object')(fuzzterms, Term))
    #             else:
    #                 phrase.add(term)
    #     else:
    #         phrase = PhraseQuery()
    #         phrase.setSlop(slop)
    #         for t in tokens:
    #             term = Term(field, t)
    #             phrase.add(term)
    #     return phrase

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns single-term queries for the words of `query`,
        joined with the boolean operator given as `modal`.
        """
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                              query.split()), q)

        return q

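    # Illustratively, make_term_query(u"pan tadeusz") builds roughly
    # Q(text=u"pan") | Q(text=u"tadeusz"); with modal=operator.and_,
    # both words would be required instead.
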
    def search_phrase(self, searched, field='text', book=False,
                      filters=None,
                      snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]

    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for perfect book matches. Just see if the query matches with some author or title,
    #     taking hints into account.
    #     """
    #     fields_to_search = ['authors', 'title']
    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #             max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
    #     return books

    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     fields_to_search = ['tags', 'authors', 'title']

    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     tokens = self.get_tokens(searched, field='SIMPLE')

    #     q = BooleanQuery()

    #     for fld in fields_to_search:
    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

    #     books = []
    #     top = self.searcher.search(q,
    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #         max_results)
    #     for found in top.scoreDocs:
    #         books.append(SearchResult(self, found, how_found="search_book"))

    #     return books

    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
    #     some part/fragment of the book.
    #     """
    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]

    #     flt = None
    #     if hint:
    #         flt = hint.part_filter()

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
    #                                                        flt]),
    #                                    max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

    #     return books

    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use search terms to match different fields of a book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and
        the rest some words from the third chapter.
        """
        books = []
        # Content-only query: themes x content.
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # Query themes/content x author/title/tags.
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books

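    # A hypothetical call mixing metadata and content terms; query_terms
    # is the lowercase term set used later for theme matching:
    #
    #     search = Search()
    #     results = search.search_everywhere(u"Mickiewicz ojczyzna",
    #                                        query_terms=set([u'mickiewicz', u'ojczyzna']))
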
    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns highlighted snippets for a search result.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    # Skip hits without stored snippets, but still advance,
                    # so the loop always terminates.
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                snips[idx] = snip
                if snip:
                    num -= 1
                idx += 1

        except IOError as e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # Remove verse end markers.
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips
        return snips

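    # A hypothetical highlighting flow for phrase results:
    #
    #     for result in search.search_phrase(u"lokomotywa"):
    #         snips = search.get_snippets(result, u"lokomotywa", num=3)
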
    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Returns auto-complete hints for tags,
        using prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q).exclude(tag_category="book")

        return self.search_tags(qu, pdcounter=pdcounter)

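    # Hypothetical auto-complete usage:
    #
    #     search.hint_tags(u"Mick")      # prefix search, e.g. for Mickiewicz
    #     search.hint_books(u"Pan Tad")  # see hint_books() further down
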
    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    # don't add the pdcounter tag if the same tag already exists

                tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Returns auto-complete hints for book titles, using prefix
        search. (Needed because we do not index 'pseudo' title tags.)
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        # Limit the result list to max_results rows.
        res = self.apply_filters(query, filters).field_limit(['book_id']).paginate(rows=max_results)
        for r in res:
            try:
                bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks

    # def make_prefix_phrase(self, toks, field):
    #     q = MultiPhraseQuery()
    #     for i in range(len(toks)):
    #         t = Term(field, toks[i])
    #         if i == len(toks) - 1:
    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
    #             if pterms:
    #                 q.add(pterms)
    #             else:
    #                 q.add(t)
    #         else:
    #             q.add(t)
    #     return q

    # @staticmethod
    # def term_filter(term, inverse=False):
    #     only_term = TermsFilter()
    #     only_term.addTerm(term)

    #     if inverse:
    #         neg = BooleanFilter()
    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    #         only_term = neg

    #     return only_term

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query

    # def filtered_categories(self, tags):
    #     """
    #     Return a list of tag categories, present in tags list.
    #     """
    #     cats = {}
    #     for t in tags:
    #         cats[t.category] = True
    #     return cats.keys()

    # def hint(self):
    #     return Hint(self)