4962cae99bb4fc4d2431927ee0d1b2fe2c9c5d81
[wolnelektury.git] / apps / search / index.py
1 # -*- coding: utf-8 -*-
2
3 from django.conf import settings
4
5 import os
6 import re
7 import errno
8 from librarian import dcparser
9 from librarian.parser import WLDocument
10 from lxml import etree
11 import catalogue.models
12 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
13 from itertools import chain
14 import traceback
15 import logging
16 log = logging.getLogger('search')
17 import sunburnt
18 import custom
19 import operator
20
21
class SolrIndex(object):
    """Base class that owns the Solr interface used by indexing and search."""

    def __init__(self, mode=None):
        """
        Create the Solr interface from ``settings.SOLR``.

        mode -- passed through unchanged to ``custom.CustomSolrInterface``.
        """
        solr_config = settings.SOLR
        self.index = custom.CustomSolrInterface(solr_config, mode=mode)
25
26
class Snippets(object):
    """
    Manage the snippet file of an indexed object (book).

    The snippets are concatenated together in a single file; their
    positions and lengths are kept in index fields.  An instance can be
    used as a context manager, in which case the file is closed on exit.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        """
        Make sure the snippet directory exists and remember the target file.

        book_id -- numeric id of the book, used as the file name.
        revision -- optional numeric revision appended to the file name.
        """
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            # An already existing directory is fine, anything else is an error.
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None
        # Byte offset at which the next add() will be recorded.
        self.position = 0

    @property
    def path(self):
        """Full path of the snippet file for the current book and revision."""
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode) and return self.

        When opened for writing and the file already exists, the first
        unused revision number is picked so the existing file is kept.
        Call .close() afterwards, or use the instance in a ``with`` block.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, tb):
        self.close()
        return False

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple, both counted in UTF-8 bytes.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return an unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        return self.file.read(pos[1]).decode('utf-8')

    def close(self):
        """Close the snippet file; a no-op when it was never opened."""
        if self.file is not None:
            self.file.close()

    def remove(self):
        """
        Delete the base snippet file and then revisions 1, 2, ... in order.

        Removal is best-effort: it stops silently at the first file that
        cannot be unlinked (normally the first revision that does not exist).
        """
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            # The failing unlink is what terminates the loop above.
            pass
107
108
109 class Index(SolrIndex):
110     """
111     Class indexing books.
112     """
113     def __init__(self):
114         super(Index, self).__init__()
115
116     def delete_query(self, *queries):
117         """
118         index.delete(queries=...) doesn't work, so let's reimplement it
119         using deletion of list of uids.
120         """
121         uids = set()
122         for q in queries:
123             if isinstance(q, sunburnt.search.LuceneQuery):
124                 q = self.index.query(q)
125             q.field_limiter.update(['uid'])
126             st = 0
127             rows = 100
128             while True:
129                 ids = q.paginate(start=st, rows=rows).execute()
130                 if not len(ids):
131                     break
132                 for res in ids:
133                     uids.add(res['uid'])
134                 st += rows
135                 #        print "Will delete %s" % ','.join([x for x in uids])
136         if uids:
137             self.index.delete(uids)
138             return True
139         else:
140             return False
141
142     def index_tags(self, *tags, **kw):
143         """
144         Re-index global tag list.
145         Removes all tags from index, then index them again.
146         Indexed fields include: id, name (with and without polish stems), category
147         """
148         remove_only = kw.get('remove_only', False)
149         # first, remove tags from index.
150         if tags:
151             tag_qs = []
152             for tag in tags:
153                 q_id = self.index.Q(tag_id=tag.id)
154
155                 if isinstance(tag, PDCounterAuthor):
156                     q_cat = self.index.Q(tag_category='pd_author')
157                 elif isinstance(tag, PDCounterBook):
158                     q_cat = self.index.Q(tag_category='pd_book')
159                 else:
160                     q_cat = self.index.Q(tag_category=tag.category)
161
162                 q_id_cat = self.index.Q(q_id & q_cat)
163                 tag_qs.append(q_id_cat)
164             self.delete_query(tag_qs)
165         else:  # all
166             q = self.index.Q(tag_id__any=True)
167             self.delete_query(q)
168
169         if not remove_only:
170             # then add them [all or just one passed]
171             if not tags:
172                 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
173                     PDCounterAuthor.objects.all(), \
174                     PDCounterBook.objects.all())
175
176             for tag in tags:
177                 if isinstance(tag, PDCounterAuthor):
178                     doc = {
179                         "tag_id": int(tag.id),
180                         "tag_name": tag.name,
181                         "tag_name_pl": tag.name,
182                         "tag_category": 'pd_author',
183                         "is_pdcounter": True,
184                         "uid": "tag%d_pd_a" % tag.id
185                         }
186                 elif isinstance(tag, PDCounterBook):
187                     doc = {
188                         "tag_id": int(tag.id),
189                         "tag_name": tag.title,
190                         "tag_name_pl": tag.title,
191                         "tag_category": 'pd_book',
192                         "is_pdcounter": True,
193                         "uid": "tag%d_pd_b" % tag.id
194                         }
195                 else:
196                     doc = {
197                         "tag_id": int(tag.id),
198                         "tag_name": tag.name,
199                         "tag_name_pl": tag.name,
200                         "tag_category": tag.category,
201                         "is_pdcounter": False,
202                         "uid": "tag%d" % tag.id
203                         }
204                 self.index.add(doc)
205
206     def create_book_doc(self, book):
207         """
208         Create a lucene document referring book id.
209         """
210         doc = {
211             'book_id': int(book.id),
212             }
213         if book.parent is not None:
214             doc["parent_id"] = int(book.parent.id)
215         return doc
216
217     def remove_book(self, book_or_id, remove_snippets=True):
218         """Removes a book from search index.
219         book - Book instance."""
220         if isinstance(book_or_id, catalogue.models.Book):
221             book_id = book_or_id.id
222         else:
223             book_id = book_or_id
224
225         self.delete_query(self.index.Q(book_id=book_id))
226
227         if remove_snippets:
228             snippets = Snippets(book_id)
229             snippets.remove()
230
231     def index_book(self, book, book_info=None, overwrite=True):
232         """
233         Indexes the book.
234         Creates a lucene document for extracted metadata
235         and calls self.index_content() to index the contents of the book.
236         """
237         if overwrite:
238             # we don't remove snippets, since they might be still needed by
239             # threads using not reopened index
240             self.remove_book(book, remove_snippets=False)
241
242         book_doc = self.create_book_doc(book)
243         meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
244         # let's not index it - it's only used for extracting publish date
245         if 'source_name' in meta_fields:
246             del meta_fields['source_name']
247
248         for n, f in meta_fields.items():
249             book_doc[n] = f
250
251         book_doc['uid'] = "book%s" % book_doc['book_id']
252         self.index.add(book_doc)
253         del book_doc
254         book_fields = {
255             'title': meta_fields['title'],
256             'authors': meta_fields['authors'],
257             'published_date': meta_fields['published_date']
258             }
259         if 'translators' in meta_fields:
260             book_fields['translators'] = meta_fields['translators']
261
262         self.index_content(book, book_fields=book_fields)
263
264     master_tags = [
265         'opowiadanie',
266         'powiesc',
267         'dramat_wierszowany_l',
268         'dramat_wierszowany_lp',
269         'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
270         'wywiad',
271         ]
272
273     ignore_content_tags = [
274         'uwaga', 'extra',
275         'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
276         'didaskalia',
277         'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
278         ]
279
280     footnote_tags = ['pa', 'pt', 'pr', 'pe']
281
282     skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
283
284     published_date_re = re.compile("([0-9]+)[\]. ]*$")
285
286     def extract_metadata(self, book, book_info=None, dc_only=None):
287         """
288         Extract metadata from book and returns a map of fields keyed by fieldname
289         """
290         fields = {}
291
292         if book_info is None:
293             book_info = dcparser.parse(open(book.xml_file.path))
294
295         fields['slug'] = book.slug
296         fields['tags'] = [t.name  for t in book.tags]
297         fields['is_book'] = True
298
299         # validator, name
300         for field in dcparser.BookInfo.FIELDS:
301             if dc_only and field.name not in dc_only:
302                 continue
303             if hasattr(book_info, field.name):
304                 if not getattr(book_info, field.name):
305                     continue
306                 # since no type information is available, we use validator
307                 type_indicator = field.validator
308                 if type_indicator == dcparser.as_unicode:
309                     s = getattr(book_info, field.name)
310                     if field.multiple:
311                         s = ', '.join(s)
312                     fields[field.name] = s
313                 elif type_indicator == dcparser.as_person:
314                     p = getattr(book_info, field.name)
315                     if isinstance(p, dcparser.Person):
316                         persons = unicode(p)
317                     else:
318                         persons = ', '.join(map(unicode, p))
319                     fields[field.name] = persons
320                 elif type_indicator == dcparser.as_date:
321                     dt = getattr(book_info, field.name)
322                     fields[field.name] = dt
323
324         # get published date
325         pd = None
326         if hasattr(book_info, 'source_name') and book_info.source_name:
327             match = self.published_date_re.search(book_info.source_name)
328             if match is not None:
329                 pd = str(match.groups()[0])
330         if not pd: pd = ""
331         fields["published_date"] = pd
332
333         return fields
334
335     # def add_gaps(self, fields, fieldname):
336     #     """
337     #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
338     #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
339     #     """
340     #     def gap():
341     #         while True:
342     #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
343     #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
344
345     def get_master(self, root):
346         """
347         Returns the first master tag from an etree.
348         """
349         for master in root.iter():
350             if master.tag in self.master_tags:
351                 return master
352
353     def index_content(self, book, book_fields={}):
354         """
355         Walks the book XML and extract content from it.
356         Adds parts for each header tag and for each fragment.
357         """
358         wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
359         root = wld.edoc.getroot()
360
361         master = self.get_master(root)
362         if master is None:
363             return []
364
365         def walker(node, ignore_tags=[]):
366
367             if node.tag not in ignore_tags:
368                 yield node, None, None
369                 if node.text is not None:
370                     yield None, node.text, None
371                 for child in list(node):
372                     for b, t, e in walker(child):
373                         yield b, t, e
374                 yield None, None, node
375
376             if node.tail is not None:
377                 yield None, node.tail, None
378             return
379
380         def fix_format(text):
381             #            separator = [u" ", u"\t", u".", u";", u","]
382             if isinstance(text, list):
383                 # need to join it first
384                 text = filter(lambda s: s is not None, content)
385                 text = u' '.join(text)
386                 # for i in range(len(text)):
387                 #     if i > 0:
388                 #         if text[i][0] not in separator\
389                 #             and text[i - 1][-1] not in separator:
390                 #          text.insert(i, u" ")
391
392             return re.sub("(?m)/$", "", text)
393
394         def add_part(snippets, **fields):
395             doc = self.create_book_doc(book)
396             for n, v in book_fields.items():
397                 doc[n] = v
398
399             doc['header_index'] = fields["header_index"]
400             doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
401             doc['header_type'] = fields['header_type']
402
403             doc['text'] = fields['text']
404
405             # snippets
406             snip_pos = snippets.add(fields["text"])
407
408             doc['snippets_position'] = snip_pos[0]
409             doc['snippets_length'] = snip_pos[1]
410             if snippets.revision:
411                 doc["snippets_revision"] = snippets.revision
412
413             if 'fragment_anchor' in fields:
414                 doc["fragment_anchor"] = fields['fragment_anchor']
415
416             if 'themes' in fields:
417                 doc['themes'] = fields['themes']
418             doc['uid'] = "part%s%s%s" % (doc['header_index'],
419                                          doc['header_span'],
420                                          doc.get('fragment_anchor', ''))
421             return doc
422
423         def give_me_utf8(s):
424             if isinstance(s, unicode):
425                 return s.encode('utf-8')
426             else:
427                 return s
428
429         fragments = {}
430         snippets = Snippets(book.id).open('w')
431         try:
432             for header, position in zip(list(master), range(len(master))):
433
434                 if header.tag in self.skip_header_tags:
435                     continue
436                 if header.tag is etree.Comment:
437                     continue
438
439                 # section content
440                 content = []
441                 footnote = []
442
443                 def all_content(text):
444                     for frag in fragments.values():
445                         frag['text'].append(text)
446                     content.append(text)
447                 handle_text = [all_content]
448
449                 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
450                     # handle footnotes
451                     if start is not None and start.tag in self.footnote_tags:
452                         footnote = []
453
454                         def collect_footnote(t):
455                             footnote.append(t)
456
457                         handle_text.append(collect_footnote)
458                     elif end is not None and footnote is not [] and end.tag in self.footnote_tags:
459                         handle_text.pop()
460                         doc = add_part(snippets, header_index=position, header_type=header.tag,
461                                        text=u''.join(footnote),
462                                        is_footnote=True)
463
464                         self.index.add(doc)
465                         #print "@ footnote text: %s" % footnote
466                         footnote = []
467
468                     # handle fragments and themes.
469                     if start is not None and start.tag == 'begin':
470                         fid = start.attrib['id'][1:]
471                         fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
472
473                     # themes for this fragment
474                     elif start is not None and start.tag == 'motyw':
475                         fid = start.attrib['id'][1:]
476                         handle_text.append(None)
477                         if start.text is not None:
478                             fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
479                     elif end is not None and end.tag == 'motyw':
480                         handle_text.pop()
481
482                     elif start is not None and start.tag == 'end':
483                         fid = start.attrib['id'][1:]
484                         if fid not in fragments:
485                             continue  # a broken <end> node, skip it
486                         frag = fragments[fid]
487                         if frag['themes'] == []:
488                             continue  # empty themes list.
489                         del fragments[fid]
490
491                         doc = add_part(snippets,
492                                        header_type=frag['start_header'],
493                                        header_index=frag['start_section'],
494                                        header_span=position - frag['start_section'] + 1,
495                                        fragment_anchor=fid,
496                                        text=fix_format(frag['text']),
497                                        themes=frag['themes'])
498                         #print '@ FRAG %s' % frag['content']
499                         self.index.add(doc)
500
501                         # Collect content.
502
503                     if text is not None and handle_text is not []:
504                         hdl = handle_text[-1]
505                         if hdl is not None:
506                             hdl(text)
507
508                         # in the end, add a section text.
509                 doc = add_part(snippets, header_index=position,
510                                header_type=header.tag, text=fix_format(content))
511                 #print '@ CONTENT: %s' % fix_format(content)
512
513                 self.index.add(doc)
514
515         finally:
516             snippets.close()
517
518
class SearchResult(object):
    """
    Search result for a single book, built from one index document.

    Results for the same book can be combined with merge(); the ``hits``
    property turns the collected raw hits into a list of dictionaries
    sorted by descending score.
    """

    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        """
        doc -- mapping with the fields of the found index document.
        how_found -- label of the search method, stored with the hit.
        query -- accepted for compatibility; not used.
        query_terms -- terms used to detect hits in theme names.
        """
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # cache for the ``hits`` property
        self.snippets = []
        self.query_terms = query_terms

        self._score = doc['score'] if 'score' in doc else 0
        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (TypeError, ValueError):
            # missing (None) or non-numeric (e.g. empty) date
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        if header_type is None:
            return

        # we have a content hit in some header or fragment
        header_span = doc.get('header_span')
        header_span = (int(header_span) or 1) if header_span is not None else 1
        position = (header_type, int(doc["header_index"]), header_span)
        fragment = doc.get("fragment_anchor", None)

        hit = (position, fragment, self._score, {
            'how_found': how_found,
            'snippets_pos': (doc['snippets_position'], doc['snippets_length']),
            # the revision is only stored for revisioned snippet files
            'snippets_revision': doc.get('snippets_revision'),
            'themes': doc.get('themes', []),
            'themes_pl': doc.get('themes_pl', [])
            })

        self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
            (self.book_id, len(self._hits), self._processed_hits and len(self._processed_hits) or -1, self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        """Raw score multiplied by the boost."""
        return self._score * self.boost

    def merge(self, other):
        """
        Take over the hits of another result for the same book and keep
        the better score.  Returns self.

        Raises ValueError when the results belong to different books.
        """
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        # the processed hits, if already computed, are stale now
        self._processed_hits = None
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        """Return the Book for this result, loading it on first use."""
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # Indexes into a raw hit tuple ...
    POSITION = 0
    FRAGMENT = 1
    # ... and into its POSITION tuple (header_type, index, span).
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        """
        Processed hits: a list of dictionaries sorted by descending score.

        Section hits lying inside a found fragment are dropped, duplicated
        sections and fragments are reduced to one entry, and fragment hits
        get their Fragment object and themes attached.  The list is
        computed once and cached.
        """
        if self._processed_hits is not None:
            return self._processed_hits

        # split into sections and fragments
        frags = [h for h in self._hits if h[self.FRAGMENT] is not None]
        sect = [h for h in self._hits if h[self.FRAGMENT] is None]

        def covered(section):
            """True when the section lies within the span of some fragment."""
            index = section[self.POSITION][self.POSITION_INDEX]
            for f in frags:
                first = f[self.POSITION][self.POSITION_INDEX]
                if first <= index < first + f[self.POSITION][self.POSITION_SPAN]:
                    return True
            return False

        # sections not covered by fragments
        sect = [s for s in sect if not covered(s)]

        # remove fragments with duplicated fid's; a later hit replaces an
        # earlier one unless the earlier one scores strictly higher
        best_frags = {}
        for f in frags:
            fid = f[self.FRAGMENT]
            if fid in best_frags and best_frags[fid][self.SCORE] > f[self.SCORE]:
                continue
            best_frags[fid] = f
        frags = list(best_frags.values())

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections and sections[si]['score'] >= s[self.SCORE]:
                continue

            m = {'score': s[self.SCORE],
                 'section_number': si + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = list(sections.values())

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                names = f[self.OTHER]['themes']
                names_pl = f[self.OTHER]['themes_pl']
                for i, name in enumerate(names):
                    words = re.split(r' +', name)
                    if i < len(names_pl):
                        words += names_pl[i].split(' ')
                    words = [w.lower() for w in words]
                    if any(qt in words for qt in self.query_terms):
                        themes_hit.add(name)

            def theme_by_name(n):
                for t in themes:
                    if t.name == n:
                        return t
                return None

            themes_hit = [t for t in map(theme_by_name, themes_hit)
                          if t is not None]

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            # NOTE(review): the stored hit data also has a 'themes' key, so
            # this replaces the theme tags set above - confirm intended.
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """Merge results from several lists so each book appears once."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """Return the (position, length) of the snippet of hit number idx."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Return the snippet revision of hit number idx, None if unavailable."""
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
718
719
720 class Search(SolrIndex):
721     """
722     Search facilities.
723     """
724     def __init__(self, default_field="text"):
725         super(Search, self).__init__()
726
727     # def get_tokens(self, searched, field='text', cached=None):
728     #     """returns tokens analyzed by a proper (for a field) analyzer
729     #     argument can be: StringReader, string/unicode, or tokens. In the last case
730     #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
731     #     """
732     #     if cached is not None and field in cached:
733     #         return cached[field]
734
735     #     if isinstance(searched, str) or isinstance(searched, unicode):
736     #         searched = StringReader(searched)
737     #     elif isinstance(searched, list):
738     #         return searched
739
740     #     searched.reset()
741     #     tokens = self.analyzer.reusableTokenStream(field, searched)
742     #     toks = []
743     #     while tokens.incrementToken():
744     #         cta = tokens.getAttribute(CharTermAttribute.class_)
745     #         toks.append(cta.toString())
746
747     #     if cached is not None:
748     #         cached[field] = toks
749
750     #     return toks
751
752     # @staticmethod
753     # def fuzziness(fuzzy):
754     #     """Helper method to sanitize fuzziness"""
755     #     if not fuzzy:
756     #         return None
757     #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
758     #         return fuzzy
759     #     else:
760     #         return 0.5
761
762     # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
763     #     """
764     #     Return a PhraseQuery with a series of tokens.
765     #     """
766     #     if fuzzy:
767     #         phrase = MultiPhraseQuery()
768     #         for t in tokens:
769     #             term = Term(field, t)
770     #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
771     #             fuzzterms = []
772
773     #             while True:
774     #                 ft = fuzzterm.term()
775     #                 if ft:
776     #                     fuzzterms.append(ft)
777     #                 if not fuzzterm.next(): break
778     #             if fuzzterms:
779     #                 phrase.add(JArray('object')(fuzzterms, Term))
780     #             else:
781     #                 phrase.add(term)
782     #     else:
783     #         phrase = PhraseQuery()
784     #         phrase.setSlop(slop)
785     #         for t in tokens:
786     #             term = Term(field, t)
787     #             phrase.add(term)
788     #     return phrase
789
790     def make_term_query(self, query, field='text', modal=operator.or_):
791         """
792         Returns term queries joined by boolean query.
793         modal - applies to boolean query
794         fuzzy - should the query by fuzzy.
795         """
796         q = self.index.Q()
797         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
798                         query.split(r" ")), q)
799
800         return q
801
802     def search_phrase(self, searched, field='text', book=False,
803                       filters=None,
804                       snippets=False):
805         if filters is None: filters = []
806         if book: filters.append(self.index.Q(is_book=True))
807
808         q = self.index.query(**{field: searched})
809         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
810         res = q.execute()
811         return [SearchResult(found, how_found=u'search_phrase') for found in res]
812
813     def search_some(self, searched, fields, book=True,
814                     filters=None, snippets=True, query_terms=None):
815         assert isinstance(fields, list)
816         if filters is None: filters = []
817         if book: filters.append(self.index.Q(is_book=True))
818
819         query = self.index.Q()
820
821         for fld in fields:
822             query = self.index.Q(query | self.make_term_query(searched, fld))
823
824         query = self.index.query(query)
825         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
826         res = query.execute()
827         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
828
829     # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
830     #     """
831     #     Search for perfect book matches. Just see if the query matches with some author or title,
832     #     taking hints into account.
833     #     """
834     #     fields_to_search = ['authors', 'title']
835     #     only_in = None
836     #     if hint:
837     #         if not hint.should_search_for_book():
838     #             return []
839     #         fields_to_search = hint.just_search_in(fields_to_search)
840     #         only_in = hint.book_filter()
841
842     #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
843
844     #     books = []
845     #     for q in qrys:
846     #         top = self.searcher.search(q,
847     #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
848     #             max_results)
849     #         for found in top.scoreDocs:
850     #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
851     #     return books
852
853     # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
854     #     fields_to_search = ['tags', 'authors', 'title']
855
856     #     only_in = None
857     #     if hint:
858     #         if not hint.should_search_for_book():
859     #             return []
860     #         fields_to_search = hint.just_search_in(fields_to_search)
861     #         only_in = hint.book_filter()
862
863     #     tokens = self.get_tokens(searched, field='SIMPLE')
864
865     #     q = BooleanQuery()
866
867     #     for fld in fields_to_search:
868     #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
869     #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
870
871     #     books = []
872     #     top = self.searcher.search(q,
873     #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
874     #         max_results)
875     #     for found in top.scoreDocs:
876     #         books.append(SearchResult(self, found, how_found="search_book"))
877
878     #     return books
879
880     # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
881     #     """
882     #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
883     #     some part/fragment of the book.
884     #     """
885     #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]
886
887     #     flt = None
888     #     if hint:
889     #         flt = hint.part_filter()
890
891     #     books = []
892     #     for q in qrys:
893     #         top = self.searcher.search(q,
894     #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
895     #                                                        flt]),
896     #                                    max_results)
897     #         for found in top.scoreDocs:
898     #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
899
900     #     return books
901
902     def search_everywhere(self, searched, query_terms=None):
903         """
904         Tries to use search terms to match different fields of book (or its parts).
905         E.g. one word can be an author survey, another be a part of the title, and the rest
906         are some words from third chapter.
907         """
908         books = []
909         # content only query : themes x content
910         q = self.make_term_query(searched, 'text')
911         q_themes = self.make_term_query(searched, 'themes_pl')
912
913         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
914         res = query.execute()
915
916         for found in res:
917             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
918
919         # query themes/content x author/title/tags
920         in_content = self.index.Q()
921         in_meta = self.index.Q()
922
923         for fld in ['themes_pl', 'text']:
924             in_content |= self.make_term_query(searched, field=fld)
925
926         for fld in ['tags', 'authors', 'title']:
927             in_meta |= self.make_term_query(searched, field=fld)
928
929         q = in_content & in_meta
930         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
931
932         for found in res:
933             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
934
935         return books
936
937     def get_snippets(self, searchresult, query, field='text', num=1):
938         """
939         Returns a snippet for found scoreDoc.
940         """
941         maxnum = len(searchresult)
942         if num is None or num < 0 or num > maxnum:
943             num = maxnum
944         book_id = searchresult.book_id
945         revision = searchresult.snippet_revision()
946         snippets = Snippets(book_id, revision=revision)
947         snips = [None] * maxnum
948         try:
949             snippets.open()
950             idx = 0
951             while idx < maxnum and num > 0:
952                 position, length = searchresult.snippet_pos(idx)
953                 if position is None or length is None:
954                     continue
955                 text = snippets.get((int(position),
956                                      int(length)))
957                 print "== %s -- %s ==" % (query, text)
958                 snip = self.index.highlight(text=text, field=field, q=query)
959                 snips[idx] = snip
960                 if snip:
961                     num -= 1
962                 idx += 1
963
964         except IOError, e:
965             log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
966             return []
967         finally:
968             snippets.close()
969
970             # remove verse end markers..
971         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
972
973         searchresult.snippets = snips
974         return snips
975
976     def hint_tags(self, query, pdcounter=True, prefix=True):
977         """
978         Return auto-complete hints for tags
979         using prefix search.
980         """
981         q = self.index.Q()
982         query = query.strip()
983         for field in ['tag_name', 'tag_name_pl']:
984             if prefix:
985                 q |= self.index.Q(**{field: query + "*"})
986             else:
987                 q |= self.make_term_query(query, field=field)
988         qu = self.index.query(q).exclude(tag_category="book")
989
990         return self.search_tags(qu, pdcounter=pdcounter)
991
992     def search_tags(self, query, filters=None, pdcounter=False):
993         """
994         Search for Tag objects using query.
995         """
996         if not filters: filters = []
997         if not pdcounter:
998             filters.append(~self.index.Q(is_pdcounter=True))
999         res = self.apply_filters(query, filters).execute()
1000
1001         tags = []
1002         for doc in res:
1003             is_pdcounter = doc.get('is_pdcounter', False)
1004             category = doc.get('tag_category')
1005             try:
1006                 if is_pdcounter == True:
1007                     if category == 'pd_author':
1008                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1009                     elif category == 'pd_book':
1010                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1011                         tag.category = 'pd_book'  # make it look more lik a tag.
1012                     else:
1013                         print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1014                 else:
1015                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1016                     # don't add the pdcounter tag if same tag already exists
1017
1018                 tags.append(tag)
1019
1020             except catalogue.models.Tag.DoesNotExist: pass
1021             except PDCounterAuthor.DoesNotExist: pass
1022             except PDCounterBook.DoesNotExist: pass
1023
1024         log.debug('search_tags: %s' % tags)
1025
1026         return tags
1027
1028     def hint_books(self, query, prefix=True):
1029         """
1030         Returns auto-complete hints for book titles
1031         Because we do not index 'pseudo' title-tags.
1032         Prefix search.
1033         """
1034         q = self.index.Q()
1035         query = query.strip()
1036         if prefix:
1037             q |= self.index.Q(title=query + "*")
1038         else:
1039             q |= self.make_term_query(query, field='title')
1040         qu = self.index.query(q)
1041         only_books = self.index.Q(is_book=True)
1042         return self.search_books(qu, [only_books])
1043
1044     def search_books(self, query, filters=None, max_results=10):
1045         """
1046         Searches for Book objects using query
1047         """
1048         bks = []
1049         res = self.apply_filters(query, filters).field_limit(['book_id'])
1050         for r in res:
1051             try:
1052                 bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
1053             except catalogue.models.Book.DoesNotExist: pass
1054         return bks
1055  
1056     # def make_prefix_phrase(self, toks, field):
1057     #     q = MultiPhraseQuery()
1058     #     for i in range(len(toks)):
1059     #         t = Term(field, toks[i])
1060     #         if i == len(toks) - 1:
1061     #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1062     #             if pterms:
1063     #                 q.add(pterms)
1064     #             else:
1065     #                 q.add(t)
1066     #         else:
1067     #             q.add(t)
1068     #     return q
1069
1070     # @staticmethod
1071     # def term_filter(term, inverse=False):
1072     #     only_term = TermsFilter()
1073     #     only_term.addTerm(term)
1074
1075     #     if inverse:
1076     #         neg = BooleanFilter()
1077     #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1078     #         only_term = neg
1079
1080     #     return only_term
1081
1082
1083
1084     @staticmethod
1085     def apply_filters(query, filters):
1086         """
1087         Apply filters to a query
1088         """
1089         if filters is None: filters = []
1090         filters = filter(lambda x: x is not None, filters)
1091         for f in filters:
1092             query = query.query(f)
1093         return query
1094
1095     # def filtered_categories(self, tags):
1096     #     """
1097     #     Return a list of tag categories, present in tags list.
1098     #     """
1099     #     cats = {}
1100     #     for t in tags:
1101     #         cats[t.category] = True
1102     #     return cats.keys()
1103
1104     # def hint(self):
1105     #     return Hint(self)