[wolnelektury.git] / apps / search / index.py

# -*- coding: utf-8 -*-

import errno
import logging
import operator
import os
import re
import traceback
from itertools import chain

from django.conf import settings
from lxml import etree
import sunburnt

from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
import custom

log = logging.getLogger('search')


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    Manages snippet files for an indexed object (a book).
    The snippets are concatenated together in one file per book;
    their positions and lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            # never overwrite an existing snippet file: bump the revision
            # until we find an unused filename
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        length = len(txt)
        self.file.write(txt)
        pos = (self.position, length)
        self.position += length
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the snippet
        stored there, as unicode.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        """Remove the snippet file, including any revisioned variants."""
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass


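# A minimal usage sketch for Snippets (assuming settings.SEARCH_INDEX points
# at a writable directory; the sample text and book id are arbitrary):
#
#     snips = Snippets(book_id=1).open('w')
#     pos = snips.add(u"Litwo! Ojczyzno moja!")  # -> (0, 21)
#     snips.close()
#     snips.open('r')
#     assert snips.get(pos) == u"Litwo! Ojczyzno moja!"
#     snips.close()

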
class Index(SolrIndex):
    """
    Indexes books and their content.
    """
    def __init__(self):
        super(Index, self).__init__()

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

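    # For example, this removes every tag document from the index
    # (as index_tags below does when called with no arguments):
    #   self.delete_query(self.index.Q(tag_id__any=True))
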
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                tag_qs.append(self.index.Q(q_id & q_cat))
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all, or just the ones passed]
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)
                log.debug("%s %s", doc['tag_name'], doc['tag_category'])

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Remove a book from the search index.
        book_or_id - Book instance or its database id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed
            # by threads using an index that hasn't been reopened yet
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }
        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
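    # A quick illustration of the pattern above (the sample source_name is
    # hypothetical; the regexp picks up a trailing publication year):
    #   published_date_re.search(u"Czytelnik, Warszawa 1990.").group(1) == "1990"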

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a dict of fields
        keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Return the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=None):
        """
        Walk the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        if book_fields is None:
            book_fields = {}
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            # yield (start, text, end) events while walking the subtree
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    # pass ignore_tags down, so ignored tags are skipped
                    # at any depth, not just at the top level
                    for b, t, e in walker(child, ignore_tags):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

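        # The walker yields triples in document order; for <a>x<b>y</b>z</a>
        # it would produce, roughly:
        #   (a, None, None), (None, 'x', None), (b, None, None),
        #   (None, 'y', None), (None, None, b), (None, 'z', None),
        #   (None, None, a)
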
        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator \
                #                 and text[i - 1][-1] not in separator:
                #             text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)

                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect content
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()


class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision')

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             self._processed_hits and len(self._processed_hits) or -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # indices into the hit tuples stored in self._hits
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

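    # For reference, a single hit tuple (as built in __init__) looks like:
    #   ((header_type, header_index, header_span),  # POSITION
    #    fragment_anchor_or_None,                   # FRAGMENT
    #    score,                                     # SCORE
    #    {'how_found': ..., 'snippets_pos': ...,    # OTHER
    #     'snippets_revision': ...})
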
    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # split hits into fragment hits and section hits
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # leave only sections not covered by any fragment
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids, keeping the best score
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip an existing section unless the new score is better
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            # if self.searched is not None:
            #     tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
            #     for theme in themes:
            #         name_tokens = self.search.get_tokens(theme.name, 'POLISH')
            #         for t in tokens:
            #             if t in name_tokens:
            #                 if not theme in themes_hit:
            #                     themes_hit.append(theme)
            #                 break

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

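    # Usage sketch (the two result lists are hypothetical outputs of
    # different queries over the same corpus):
    #   results = SearchResult.aggregate(phrase_results, everywhere_results)
    #   results.sort(reverse=True)  # best score first, via __cmp__ below
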
    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__()

    # def get_tokens(self, searched, field='text', cached=None):
    #     """returns tokens analyzed by a proper (for a field) analyzer
    #     argument can be: StringReader, string/unicode, or tokens. In the last case
    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
    #     """
    #     if cached is not None and field in cached:
    #         return cached[field]

    #     if isinstance(searched, str) or isinstance(searched, unicode):
    #         searched = StringReader(searched)
    #     elif isinstance(searched, list):
    #         return searched

    #     searched.reset()
    #     tokens = self.analyzer.reusableTokenStream(field, searched)
    #     toks = []
    #     while tokens.incrementToken():
    #         cta = tokens.getAttribute(CharTermAttribute.class_)
    #         toks.append(cta.toString())

    #     if cached is not None:
    #         cached[field] = toks

    #     return toks

    # @staticmethod
    # def fuzziness(fuzzy):
    #     """Helper method to sanitize fuzziness"""
    #     if not fuzzy:
    #         return None
    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    #         return fuzzy
    #     else:
    #         return 0.5

    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
    #     """
    #     Return a PhraseQuery with a series of tokens.
    #     """
    #     if fuzzy:
    #         phrase = MultiPhraseQuery()
    #         for t in tokens:
    #             term = Term(field, t)
    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
    #             fuzzterms = []

    #             while True:
    #                 ft = fuzzterm.term()
    #                 if ft:
    #                     fuzzterms.append(ft)
    #                 if not fuzzterm.next(): break
    #             if fuzzterms:
    #                 phrase.add(JArray('object')(fuzzterms, Term))
    #             else:
    #                 phrase.add(term)
    #     else:
    #         phrase = PhraseQuery()
    #         phrase.setSlop(slop)
    #         for t in tokens:
    #             term = Term(field, t)
    #             phrase.add(term)
    #     return phrase

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Return single-term queries for each word of `query`,
        joined with the boolean operator given as `modal`.
        """
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                              query.split()), q)

        return q

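    # For example (a sketch; the query string is arbitrary):
    #   make_term_query(u"ala ma kota")
    # builds roughly Q() | Q(text=u'ala') | Q(text=u'ma') | Q(text=u'kota');
    # with modal=operator.and_ the terms are AND-ed instead.
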
    def search_phrase(self, searched, field='text', book=False,
                      filters=None,
                      snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None,
                    snippets=True):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some') for found in res]

    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for perfect book matches. Just see if the query matches with some author or title,
    #     taking hints into account.
    #     """
    #     fields_to_search = ['authors', 'title']
    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #             max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
    #     return books

    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     fields_to_search = ['tags', 'authors', 'title']

    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     tokens = self.get_tokens(searched, field='SIMPLE')

    #     q = BooleanQuery()

    #     for fld in fields_to_search:
    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

    #     books = []
    #     top = self.searcher.search(q,
    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #         max_results)
    #     for found in top.scoreDocs:
    #         books.append(SearchResult(self, found, how_found="search_book"))

    #     return books

    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
    #     some part/fragment of the book.
    #     """
    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]

    #     flt = None
    #     if hint:
    #         flt = hint.part_filter()

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
    #                                                        flt]),
    #                                    max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

    #     return books

    def search_everywhere(self, searched):
        """
        Try to use the search terms to match different fields of the book
        (or its parts). E.g. one word can match an author's name, another
        a part of the title, and the rest some words from the content.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent'))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere'))

        return books

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Return highlighted snippets for a search result.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1  # don't loop forever on a hit without a snippet
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                log.debug("== %s -- %s ==" % (query, text))
                snip = self.index.highlight(text=text, field=field, q=query)
                snips[idx] = snip
                if snip:
                    num -= 1
                idx += 1

        except IOError, e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips
        return snips

    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags,
        using a prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q).exclude(tag_category="book")

        return self.search_tags(qu, pdcounter=pdcounter)

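    # Usage sketch (a hypothetical prefix typed into an autocomplete box):
    #   Search().hint_tags(u"mick")  # tags whose names start with "mick"
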
    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    # don't add the pdcounter tag if same tag already exists

                tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Return auto-complete hints for book titles, as a prefix search.
        (Needed because we do not index 'pseudo' title-tags.)
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Search for Book objects using query;
        return at most max_results books.
        """
        bks = []
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for r in res.paginate(rows=max_results).execute():
            try:
                bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks

    # def make_prefix_phrase(self, toks, field):
    #     q = MultiPhraseQuery()
    #     for i in range(len(toks)):
    #         t = Term(field, toks[i])
    #         if i == len(toks) - 1:
    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
    #             if pterms:
    #                 q.add(pterms)
    #             else:
    #                 q.add(t)
    #         else:
    #             q.add(t)
    #     return q

    # @staticmethod
    # def term_filter(term, inverse=False):
    #     only_term = TermsFilter()
    #     only_term.addTerm(term)

    #     if inverse:
    #         neg = BooleanFilter()
    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    #         only_term = neg

    #     return only_term

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query

    # def filtered_categories(self, tags):
    #     """
    #     Return a list of tag categories, present in tags list.
    #     """
    #     cats = {}
    #     for t in tags:
    #         cats[t.category] = True
    #     return cats.keys()

    # def hint(self):
    #     return Hint(self)