OPDS with new SOLR interface.
[wolnelektury.git] / apps/search/index.py
# -*- coding: utf-8 -*-

import os
import re
import errno
import logging
import operator
import traceback
from itertools import chain

from django.conf import settings
from lxml import etree

from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
import sunburnt
import custom

log = logging.getLogger('search')


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    This class manages snippet files for an indexed object (a book).
    The snippets are concatenated together, and their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # when writing, pick the first unused revision number, so readers
            # still holding the old index open keep their old snippet file.
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode string
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass

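# A minimal round-trip sketch (assuming settings.SEARCH_INDEX points at a
# writable directory; the book id is made up). Snippets are written once
# during indexing and later read back by (position, length):
#
#     snips = Snippets(book_id=123).open('w')
#     try:
#         pos = snips.add(u"Litwo! Ojczyzno moja!")
#     finally:
#         snips.close()
#     reader = Snippets(123, revision=snips.revision).open()
#     try:
#         assert reader.get(pos) == u"Litwo! Ojczyzno moja!"
#     finally:
#         reader.close()

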
class Index(SolrIndex):
    """
    Class indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            # print "Will delete %s" % ','.join([x for x in uids])
            self.index.delete(uids)
            return True
        else:
            return False

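    # Hedged sketch: delete_query() accepts sunburnt Q objects (or ready
    # queries), pages through the matching uids, then issues a single delete:
    #
    #     idx = Index()
    #     idx.delete_query(idx.index.Q(book_id=123))   # 123 is a made-up id
    #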
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all, or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

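    # Usage sketch (the tag lookup itself is hypothetical):
    #
    #     tag = catalogue.models.Tag.objects.get(slug='romantyzm')
    #     Index().index_tags(tag)                    # reindex a single tag
    #     Index().index_tags()                       # rebuild the whole tag index
    #     Index().index_tags(tag, remove_only=True)  # only remove it
    #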
    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book_or_id - a Book instance or a book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using a not-yet-reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

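    # End-to-end sketch (the book lookup is hypothetical):
    #
    #     book = catalogue.models.Book.objects.get(slug='pan-tadeusz')
    #     index = Index()
    #     index.index_book(book)   # metadata doc + per-section content docs
    #     index.index.commit()     # assuming an explicit commit is wanted here
    #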
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date from the trailing year in source_name
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

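    # Example: published_date_re grabs the trailing year from a source_name
    # string (the sample value below is made up):
    #
    #     m = Index.published_date_re.search(u"Czytelnik, Warszawa [1954].")
    #     m.groups()[0]   # -> '1954'
    #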
    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields={}):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            # depth-first walk yielding (start-node, text, end-node) triples
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator \
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        # print "@ footnote text: %s" % footnote
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        # print '@ FRAG %s' % frag['text']
                        self.index.add(doc)

                    # Collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))
                # print '@ CONTENT: %s' % fix_format(content)
                self.index.add(doc)

        finally:
            snippets.close()

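# For reference, index_content() emits one Solr document per section header,
# plus one per footnote and one per themed fragment. A section document looks
# roughly like this (a sketch; the field values are made up):
#
#     {'book_id': 123, 'uid': 'part01',
#      'header_index': 0, 'header_span': 1, 'header_type': 'akap',
#      'text': u'...', 'snippets_position': 0, 'snippets_length': 2345}

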
class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # missing or non-numeric published_date
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             self._processed_hits and len(self._processed_hits) or -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # indices into the hit tuples kept in self._hits
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split into sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip if the existing hit scored at least as high
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

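    # Hedged usage sketch: aggregate() merges per-book results coming from
    # several queries, so each book appears once with all of its hits:
    #
    #     s = Search()
    #     results = SearchResult.aggregate(
    #         s.search_phrase(u'lipa czarnoleska'),
    #         s.search_some(u'lipa czarnoleska', ['title', 'authors']))
    #     results.sort(reverse=True)   # uses __cmp__: score, then earlier date
    #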
    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None

class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    # def get_tokens(self, searched, field='text', cached=None):
    #     """returns tokens analyzed by a proper (for a field) analyzer
    #     argument can be: StringReader, string/unicode, or tokens. In the last case
    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
    #     """
    #     if cached is not None and field in cached:
    #         return cached[field]

    #     if isinstance(searched, str) or isinstance(searched, unicode):
    #         searched = StringReader(searched)
    #     elif isinstance(searched, list):
    #         return searched

    #     searched.reset()
    #     tokens = self.analyzer.reusableTokenStream(field, searched)
    #     toks = []
    #     while tokens.incrementToken():
    #         cta = tokens.getAttribute(CharTermAttribute.class_)
    #         toks.append(cta.toString())

    #     if cached is not None:
    #         cached[field] = toks

    #     return toks

    # @staticmethod
    # def fuzziness(fuzzy):
    #     """Helper method to sanitize fuzziness"""
    #     if not fuzzy:
    #         return None
    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    #         return fuzzy
    #     else:
    #         return 0.5

    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
    #     """
    #     Return a PhraseQuery with a series of tokens.
    #     """
    #     if fuzzy:
    #         phrase = MultiPhraseQuery()
    #         for t in tokens:
    #             term = Term(field, t)
    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
    #             fuzzterms = []

    #             while True:
    #                 ft = fuzzterm.term()
    #                 if ft:
    #                     fuzzterms.append(ft)
    #                 if not fuzzterm.next(): break
    #             if fuzzterms:
    #                 phrase.add(JArray('object')(fuzzterms, Term))
    #             else:
    #                 phrase.add(term)
    #     else:
    #         phrase = PhraseQuery()
    #         phrase.setSlop(slop)
    #         for t in tokens:
    #             term = Term(field, t)
    #             phrase.add(term)
    #     return phrase

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined by a boolean query.
        modal - the operator used to join the term queries.
        """
        if query is None:
            query = ''
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                              query.split()), q)

        return q

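    # E.g. make_term_query(u'ala ma kota') builds
    # Q(text='ala') | Q(text='ma') | Q(text='kota');
    # pass modal=operator.and_ to require all of the terms instead of any.
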
    def search_phrase(self, searched, field='text', book=False,
                      filters=None,
                      snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]

    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for perfect book matches. Just see if the query matches with some author or title,
    #     taking hints into account.
    #     """
    #     fields_to_search = ['authors', 'title']
    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #             max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
    #     return books

    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     fields_to_search = ['tags', 'authors', 'title']

    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     tokens = self.get_tokens(searched, field='SIMPLE')

    #     q = BooleanQuery()

    #     for fld in fields_to_search:
    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

    #     books = []
    #     top = self.searcher.search(q,
    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #         max_results)
    #     for found in top.scoreDocs:
    #         books.append(SearchResult(self, found, how_found="search_book"))

    #     return books

    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
    #     some part/fragment of the book.
    #     """
    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]

    #     flt = None
    #     if hint:
    #         flt = hint.part_filter()

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
    #                                                        flt]),
    #                                    max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

    #     return books

    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use the search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's name, another a part of the title, and the rest
        some words from the third chapter.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns highlighted snippets for a SearchResult.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                snips[idx] = snip
                if snip:
                    num -= 1
                idx += 1

        except IOError, e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips

        return snips

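    # Typical retrieval (a sketch; highlight() is provided by
    # custom.CustomSolrInterface):
    #
    #     s = Search()
    #     results = s.search_phrase(u'zielona lipa')
    #     if results:
    #         q = s.index.query(text=u'zielona lipa')
    #         print s.get_snippets(results[0], q, num=3)
    #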
    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags,
        using prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q).exclude(tag_category="book")

        return self.search_tags(qu, pdcounter=pdcounter)

    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))

                tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Return auto-complete hints for book titles
        (we do not index 'pseudo' title-tags).
        Uses prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for r in res:
            try:
                bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks

    # def make_prefix_phrase(self, toks, field):
    #     q = MultiPhraseQuery()
    #     for i in range(len(toks)):
    #         t = Term(field, toks[i])
    #         if i == len(toks) - 1:
    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
    #             if pterms:
    #                 q.add(pterms)
    #             else:
    #                 q.add(t)
    #         else:
    #             q.add(t)
    #     return q

    # @staticmethod
    # def term_filter(term, inverse=False):
    #     only_term = TermsFilter()
    #     only_term.addTerm(term)

    #     if inverse:
    #         neg = BooleanFilter()
    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    #         only_term = neg

    #     return only_term

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query

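    # Filters are plain sunburnt Q objects chained onto the query; None entries
    # are skipped, so callers can pass optional filters unconditionally, e.g.:
    #
    #     s = Search()
    #     q = s.index.query(title=u'Ballady i romanse')
    #     q = Search.apply_filters(q, [s.index.Q(is_book=True), None])
    #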
    # def filtered_categories(self, tags):
    #     """
    #     Return a list of tag categories, present in tags list.
    #     """
    #     cats = {}
    #     for t in tags:
    #         cats[t.category] = True
    #     return cats.keys()

    # def hint(self):
    #     return Hint(self)