# -*- coding: utf-8 -*-

from django.conf import settings

import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
import traceback
import logging
log = logging.getLogger('search')
import sunburnt
import custom
import operator


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    Snippets are concatenated together; their positions and lengths
    are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        """Remove the snippet file and any numbered revisions of it."""
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass

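# A minimal Snippets round-trip, as a sketch (assumes settings.SEARCH_INDEX
# is writable; book id 123 is hypothetical):
#
#     snips = Snippets(123).open('w')
#     try:
#         pos = snips.add(u"Litwo! Ojczyzno moja!")   # -> (0, 21)
#     finally:
#         snips.close()
#     snips = Snippets(123, revision=snips.revision).open()
#     try:
#         assert snips.get(pos) == u"Litwo! Ojczyzno moja!"
#     finally:
#         snips.close()

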
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

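    # delete_query pages through the matching uids 100 at a time and then
    # issues a single delete, e.g. (sketch):
    #     self.delete_query(self.index.Q(book_id=1), self.index.Q(tag_id=5))
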
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes tags from the index (the given ones, or all of them), then,
        unless remove_only is passed, indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            # delete_query takes queries as positional arguments,
            # so unpack the list instead of passing it whole.
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all, or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                log.debug("Adding tag: %s", doc)
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book_or_id - Book instance or book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed
            # by threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }
        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

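    # published_date_re grabs the last run of digits, ignoring trailing
    # brackets, dots and spaces; e.g. for a hypothetical source name
    # u"Czytelnik, Warszawa [1936]." it yields "1936".
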
    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.group(1))
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields={}):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
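
        # walker yields a stream of (start, text, end) events; e.g. for a
        # hypothetical <strofa>Ala <slowo>ma</slowo> kota</strofa> it yields:
        #     (strofa, None, None), (None, u"Ala ", None), (slowo, None, None),
        #     (None, u"ma", None), (None, None, slowo), (None, u" kota", None),
        #     (None, None, strofa)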

        def fix_format(text):
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect content
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()


class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # missing or non-numeric published_date
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc['snippets_revision']

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3
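    # Indices into a raw hit tuple, which has the shape:
    #     ((header_type, header_index, header_span), fragment_anchor, score, other)
    # so e.g. hit[POSITION][POSITION_INDEX] is the section index of the hit
    # and hit[OTHER] carries the snippets/themes metadata dict.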

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip an existing section unless this hit scores higher
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = f[self.OTHER]['themes'][i].split() + f[self.OTHER]['themes_pl'][i].split()
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    # def get_tokens(self, searched, field='text', cached=None):
    #     """returns tokens analyzed by a proper (for a field) analyzer
    #     argument can be: StringReader, string/unicode, or tokens. In the last case
    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
    #     """
    #     if cached is not None and field in cached:
    #         return cached[field]

    #     if isinstance(searched, str) or isinstance(searched, unicode):
    #         searched = StringReader(searched)
    #     elif isinstance(searched, list):
    #         return searched

    #     searched.reset()
    #     tokens = self.analyzer.reusableTokenStream(field, searched)
    #     toks = []
    #     while tokens.incrementToken():
    #         cta = tokens.getAttribute(CharTermAttribute.class_)
    #         toks.append(cta.toString())

    #     if cached is not None:
    #         cached[field] = toks

    #     return toks

    # @staticmethod
    # def fuzziness(fuzzy):
    #     """Helper method to sanitize fuzziness"""
    #     if not fuzzy:
    #         return None
    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    #         return fuzzy
    #     else:
    #         return 0.5

    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
    #     """
    #     Return a PhraseQuery with a series of tokens.
    #     """
    #     if fuzzy:
    #         phrase = MultiPhraseQuery()
    #         for t in tokens:
    #             term = Term(field, t)
    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
    #             fuzzterms = []

    #             while True:
    #                 ft = fuzzterm.term()
    #                 if ft:
    #                     fuzzterms.append(ft)
    #                 if not fuzzterm.next(): break
    #             if fuzzterms:
    #                 phrase.add(JArray('object')(fuzzterms, Term))
    #             else:
    #                 phrase.add(term)
    #     else:
    #         phrase = PhraseQuery()
    #         phrase.setSlop(slop)
    #         for t in tokens:
    #             term = Term(field, t)
    #             phrase.add(term)
    #     return phrase

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - operator to join the term queries with.
        """
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                        query.split()), q)
        return q
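
    # e.g. make_term_query(u"ala ma kota") builds (roughly)
    #     Q() | Q(text=u"ala") | Q(text=u"ma") | Q(text=u"kota"),
    # while modal=operator.and_ ANDs the terms together instead.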

    def search_phrase(self, searched, field='text', book=False,
                      filters=None,
                      snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
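
    # A typical call, as a sketch (field names as indexed above):
    #     results = search.search_some(u"mickiewicz ballady",
    #                                  ['authors', 'title', 'text'],
    #                                  query_terms=[u'mickiewicz', u'ballady'])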

    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for perfect book matches. Just see if the query matches with some author or title,
    #     taking hints into account.
    #     """
    #     fields_to_search = ['authors', 'title']
    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #             max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
    #     return books

    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     fields_to_search = ['tags', 'authors', 'title']

    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     tokens = self.get_tokens(searched, field='SIMPLE')

    #     q = BooleanQuery()

    #     for fld in fields_to_search:
    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

    #     books = []
    #     top = self.searcher.search(q,
    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #         max_results)
    #     for found in top.scoreDocs:
    #         books.append(SearchResult(self, found, how_found="search_book"))

    #     return books

    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
    #     some part/fragment of the book.
    #     """
    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]

    #     flt = None
    #     if hint:
    #         flt = hint.part_filter()

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
    #                                                        flt]),
    #                                    max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

    #     return books

    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title,
        and the rest words from the third chapter.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns highlighted snippets for a search result.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                snips[idx] = snip
                if snip:
                    num -= 1
                idx += 1

        except IOError as e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips
        return snips
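
    # Sketch of use, with a result from one of the search methods above:
    #     res = search.search_phrase(u"ojczyzno moja")
    #     if res:
    #         snippets = search.get_snippets(res[0], u"ojczyzno moja", num=3)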

    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q).exclude(tag_category="book")

        return self.search_tags(qu, pdcounter=pdcounter)

    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s", int(doc.get('tag_id')), category)
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    # don't add the pdcounter tag if the same tag already exists

                tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Return auto-complete hints for book titles
        (we do not index 'pseudo' title tags).
        Prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for r in res:
            try:
                bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks

    # def make_prefix_phrase(self, toks, field):
    #     q = MultiPhraseQuery()
    #     for i in range(len(toks)):
    #         t = Term(field, toks[i])
    #         if i == len(toks) - 1:
    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
    #             if pterms:
    #                 q.add(pterms)
    #             else:
    #                 q.add(t)
    #         else:
    #             q.add(t)
    #     return q

    # @staticmethod
    # def term_filter(term, inverse=False):
    #     only_term = TermsFilter()
    #     only_term.addTerm(term)

    #     if inverse:
    #         neg = BooleanFilter()
    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    #         only_term = neg

    #     return only_term

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query
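
    # e.g. apply_filters(self.index.query(title=u"Pan Tadeusz"),
    #                    [self.index.Q(is_book=True), None])
    # silently drops the None and ANDs the remaining filter onto the query.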

    # def filtered_categories(self, tags):
    #     """
    #     Return a list of tag categories, present in tags list.
    #     """
    #     cats = {}
    #     for t in tags:
    #         cats[t.category] = True
    #     return cats.keys()

    # def hint(self):
    #     return Hint(self)