557f4045c27513791a2744fe3bda6a169381222f
[wolnelektury.git] / apps / search / index.py
# -*- coding: utf-8 -*-

from django.conf import settings

import os
import re
import errno
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from itertools import chain
import traceback
import logging
log = logging.getLogger('search')
import sunburnt
import custom
import operator


class SolrIndex(object):
    def __init__(self, mode=None):
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)


class Snippets(object):
    """
    This class manages snippet files for indexed objects (books).
    Snippets are concatenated into one file per book; their positions
    and lengths are kept in fields of the Lucene index.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno != errno.EEXIST:
                raise
        self.book_id = book_id
        self.revision = revision
        self.file = None

    @property
    def path(self):
        if self.revision:
            fn = "%d.%d" % (self.book_id, self.revision)
        else:
            fn = "%d" % self.book_id

        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode:
            if os.path.exists(self.path):
                self.revision = 1
                while os.path.exists(self.path):
                    self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

    def remove(self):
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass

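# A minimal usage sketch for Snippets (the book id and text are hypothetical;
# assumes settings.SEARCH_INDEX is writable):
#
#     snips = Snippets(1).open('w')
#     try:
#         pos = snips.add(u"Litwo! Ojczyzno moja!")  # -> (0, 21)
#     finally:
#         snips.close()
#     snips = Snippets(1, revision=snips.revision).open()
#     print snips.get(pos)  # prints the snippet back
#     snips.close()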

class Index(SolrIndex):
    """
    Class for indexing books.
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of a list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
                # print "Will delete %s" % ','.join([x for x in uids])
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

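    # Usage sketch (the book id is hypothetical): delete every indexed
    # document of one book in a single call:
    #     self.delete_query(self.index.Q(book_id=42))
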
    def index_tags(self, *tags, **kw):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        remove_only = kw.get('remove_only', False)
        # first, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                q_id_cat = self.index.Q(q_id & q_cat)
                tag_qs.append(q_id_cat)
            self.delete_query(*tag_qs)
        else:  # all
            q = self.index.Q(tag_id__any=True)
            self.delete_query(q)

        if not remove_only:
            # then add them [all or just the ones passed]
            if not tags:
                tags = chain(catalogue.models.Tag.objects.exclude(category='set'),
                             PDCounterAuthor.objects.all(),
                             PDCounterBook.objects.all())

            for tag in tags:
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

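    # Usage sketch: Index().index_tags() rebuilds the whole tag index;
    # Index().index_tags(remove_only=True) just clears it.
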
    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = {
            'book_id': int(book.id),
            }
        if book.parent is not None:
            doc["parent_id"] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book from the search index.
        book_or_id - Book instance or book id."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might still be needed by
            # threads using an index that hasn't been reopened yet
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
        # let's not index source_name - it's only used for extracting the publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }
        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

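    # Reindexing one book end to end (a sketch; the slug and the commit call
    # are assumptions about the surrounding setup):
    #     idx = Index()
    #     idx.index_book(catalogue.models.Book.objects.get(slug='pan-tadeusz'))
    #     idx.index.commit()
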
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")
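    # published_date_re picks the trailing year out of source_name, e.g.
    # u"Czytelnik, Warszawa 1990" -> "1990" and u"[1887]" -> "1887".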

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from the book and return a map of fields
        keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # validator, name
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # get the published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    # def add_gaps(self, fields, fieldname):
    #     """
    #     Interposes a list of fields with gap-fields, which are indexed spaces and returns it.
    #     This allows for doing phrase queries which do not overlap the gaps (when slop is 0).
    #     """
    #     def gap():
    #         while True:
    #             yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
    #     return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields={}):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return
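        # For a tree like <a>x<b>y</b>z</a> the walker yields, in order:
        #   (a, None, None), (None, 'x', None), (b, None, None),
        #   (None, 'y', None), (None, None, b), (None, 'z', None),
        #   (None, None, a)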

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if i > 0:
                #         if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #          text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = fields.get('header_span') or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    elif end is not None and footnote and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # collect content.
                    if text is not None and handle_text:
                        hdl = handle_text[-1]
                        if hdl is not None:
                            hdl(text)

                # in the end, add the section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()


class SearchResult(object):
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc['snippets_revision']

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets" % \
            (self.book_id, len(self._hits),
             self._processed_hits and len(self._processed_hits) or -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        if hasattr(self, '_book'):
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

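    # Each raw hit in self._hits is a tuple laid out as:
    #   ((header_type, header_index, header_span), fragment_anchor, score, other)
    # e.g. (('powiesc', 3, 1), None, 1.5, {'how_found': 'search_some', ...})
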
    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[self.POSITION][self.POSITION_INDEX] >= f[self.POSITION][self.POSITION_INDEX]
            and s[self.POSITION][self.POSITION_INDEX] < f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN],
            frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fids and duplicated snippets
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))
        # frags = remove_duplicates(frags, lambda f: f[OTHER]['snippet_pos'] and f[OTHER]['snippet_pos'] or f[FRAGMENT],
        #                           lambda a, b: cmp(a[SCORE], b[SCORE]))

        # remove duplicate sections
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    tms = re.split(r' +', f[self.OTHER]['themes'][i]) + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None


class Search(SolrIndex):
    """
    Search facilities.
    """
    def __init__(self, default_field="text"):
        super(Search, self).__init__(mode='r')

    # def get_tokens(self, searched, field='text', cached=None):
    #     """returns tokens analyzed by a proper (for a field) analyzer
    #     argument can be: StringReader, string/unicode, or tokens. In the last case
    #     they will just be returned (so we can reuse tokens, if we don't change the analyzer)
    #     """
    #     if cached is not None and field in cached:
    #         return cached[field]

    #     if isinstance(searched, str) or isinstance(searched, unicode):
    #         searched = StringReader(searched)
    #     elif isinstance(searched, list):
    #         return searched

    #     searched.reset()
    #     tokens = self.analyzer.reusableTokenStream(field, searched)
    #     toks = []
    #     while tokens.incrementToken():
    #         cta = tokens.getAttribute(CharTermAttribute.class_)
    #         toks.append(cta.toString())

    #     if cached is not None:
    #         cached[field] = toks

    #     return toks

    # @staticmethod
    # def fuzziness(fuzzy):
    #     """Helper method to sanitize fuzziness"""
    #     if not fuzzy:
    #         return None
    #     if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    #         return fuzzy
    #     else:
    #         return 0.5

    # def make_phrase(self, tokens, field='text', slop=2, fuzzy=False):
    #     """
    #     Return a PhraseQuery with a series of tokens.
    #     """
    #     if fuzzy:
    #         phrase = MultiPhraseQuery()
    #         for t in tokens:
    #             term = Term(field, t)
    #             fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
    #             fuzzterms = []

    #             while True:
    #                 ft = fuzzterm.term()
    #                 if ft:
    #                     fuzzterms.append(ft)
    #                 if not fuzzterm.next(): break
    #             if fuzzterms:
    #                 phrase.add(JArray('object')(fuzzterms, Term))
    #             else:
    #                 phrase.add(term)
    #     else:
    #         phrase = PhraseQuery()
    #         phrase.setSlop(slop)
    #         for t in tokens:
    #             term = Term(field, t)
    #             phrase.add(term)
    #     return phrase

    def make_term_query(self, query, field='text', modal=operator.or_):
        """
        Returns term queries joined into a boolean query.
        modal - operator used to join the term queries.
        """
        q = self.index.Q()
        q = reduce(modal, map(lambda s: self.index.Q(**{field: s}),
                              query.split(" ")), q)

        return q

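    # E.g. make_term_query(u"pan tadeusz", field='title') builds
    # Q(title=u'pan') | Q(title=u'tadeusz'); pass modal=operator.and_
    # to require every term.
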
    def search_phrase(self, searched, field='text', book=False,
                      filters=None,
                      snippets=False):
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        q = self.index.query(**{field: searched})
        q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
        res = q.execute()
        return [SearchResult(found, how_found=u'search_phrase') for found in res]

    def search_some(self, searched, fields, book=True,
                    filters=None, snippets=True, query_terms=None):
        assert isinstance(fields, list)
        if filters is None:
            filters = []
        if book:
            filters.append(self.index.Q(is_book=True))

        query = self.index.Q()

        for fld in fields:
            query = self.index.Q(query | self.make_term_query(searched, fld))

        query = self.index.query(query)
        query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
        res = query.execute()
        return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]

    # def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for perfect book matches. Just see if the query matches with some author or title,
    #     taking hints into account.
    #     """
    #     fields_to_search = ['authors', 'title']
    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #             self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #             max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, how_found="search_perfect_book"))
    #     return books

    # def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
    #     fields_to_search = ['tags', 'authors', 'title']

    #     only_in = None
    #     if hint:
    #         if not hint.should_search_for_book():
    #             return []
    #         fields_to_search = hint.just_search_in(fields_to_search)
    #         only_in = hint.book_filter()

    #     tokens = self.get_tokens(searched, field='SIMPLE')

    #     q = BooleanQuery()

    #     for fld in fields_to_search:
    #         q.add(BooleanClause(self.make_term_query(tokens, field=fld,
    #                             fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

    #     books = []
    #     top = self.searcher.search(q,
    #                                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
    #         max_results)
    #     for found in top.scoreDocs:
    #         books.append(SearchResult(self, found, how_found="search_book"))

    #     return books

    # def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
    #     """
    #     Search for book parts which contains a phrase perfectly matching (with a slop of 2, default for make_phrase())
    #     some part/fragment of the book.
    #     """
    #     qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['text']]

    #     flt = None
    #     if hint:
    #         flt = hint.part_filter()

    #     books = []
    #     for q in qrys:
    #         top = self.searcher.search(q,
    #                                    self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
    #                                                        flt]),
    #                                    max_results)
    #         for found in top.scoreDocs:
    #             books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

    #     return books

    def search_everywhere(self, searched, query_terms=None):
        """
        Tries to use search terms to match different fields of a book (or its parts).
        E.g. one word can match an author's name, another can be part of the title,
        and the rest may be words from the third chapter.
        """
        books = []
        # content-only query: themes x content
        q = self.make_term_query(searched, 'text')
        q_themes = self.make_term_query(searched, 'themes_pl')

        query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
        res = query.execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))

        # query themes/content x author/title/tags
        in_content = self.index.Q()
        in_meta = self.index.Q()

        for fld in ['themes_pl', 'text']:
            in_content |= self.make_term_query(searched, field=fld)

        for fld in ['tags', 'authors', 'title']:
            in_meta |= self.make_term_query(searched, field=fld)

        q = in_content & in_meta
        res = self.index.query(q).field_limit(score=True, all_fields=True).execute()

        for found in res:
            books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))

        return books

    def get_snippets(self, searchresult, query, field='text', num=1):
        """
        Returns snippets for a found search result.
        """
        maxnum = len(searchresult)
        if num is None or num < 0 or num > maxnum:
            num = maxnum
        book_id = searchresult.book_id
        revision = searchresult.snippet_revision()
        snippets = Snippets(book_id, revision=revision)
        snips = [None] * maxnum
        try:
            snippets.open()
            idx = 0
            while idx < maxnum and num > 0:
                position, length = searchresult.snippet_pos(idx)
                if position is None or length is None:
                    idx += 1
                    continue
                text = snippets.get((int(position),
                                     int(length)))
                snip = self.index.highlight(text=text, field=field, q=query)
                snips[idx] = snip
                if snip:
                    num -= 1
                idx += 1

        except IOError, e:
            log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
            return []
        finally:
            snippets.close()

        # remove verse end markers..
        snips = map(lambda s: s and s.replace("/\n", "\n"), snips)

        searchresult.snippets = snips

        return snips

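    # Sketch: fetch one highlighted snippet for the top phrase hit
    # (the query string is hypothetical):
    #     results = search.search_phrase(u"Litwo")
    #     if results:
    #         search.get_snippets(results[0], u"Litwo", num=1)
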
    def hint_tags(self, query, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q |= self.index.Q(**{field: query + "*"})
            else:
                q |= self.make_term_query(query, field=field)
        qu = self.index.query(q).exclude(tag_category="book")

        return self.search_tags(qu, pdcounter=pdcounter)

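    # Usage sketch: Search().hint_tags(u"mick") returns Tag objects whose
    # name starts with "mick" (pdcounter entries included by default).
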
    def search_tags(self, query, filters=None, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not filters:
            filters = []
        if not pdcounter:
            filters.append(~self.index.Q(is_pdcounter=True))
        res = self.apply_filters(query, filters).execute()

        tags = []
        for doc in res:
            is_pdcounter = doc.get('is_pdcounter', False)
            category = doc.get('tag_category')
            try:
                if is_pdcounter:
                    if category == 'pd_author':
                        tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                    elif category == 'pd_book':
                        tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                        tag.category = 'pd_book'  # make it look more like a tag.
                    else:
                        log.warning("Cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                        continue
                else:
                    tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
                    # don't add the pdcounter tag if the same tag already exists

                tags.append(tag)

            except catalogue.models.Tag.DoesNotExist:
                pass
            except PDCounterAuthor.DoesNotExist:
                pass
            except PDCounterBook.DoesNotExist:
                pass

        log.debug('search_tags: %s' % tags)

        return tags

    def hint_books(self, query, prefix=True):
        """
        Returns auto-complete hints for book titles
        (since we do not index 'pseudo' title-tags).
        Uses prefix search.
        """
        q = self.index.Q()
        query = query.strip()
        if prefix:
            q |= self.index.Q(title=query + "*")
        else:
            q |= self.make_term_query(query, field='title')
        qu = self.index.query(q)
        only_books = self.index.Q(is_book=True)
        return self.search_books(qu, [only_books])

    def search_books(self, query, filters=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        res = self.apply_filters(query, filters).field_limit(['book_id'])
        for r in res:
            try:
                bks.append(catalogue.models.Book.objects.get(id=r['book_id']))
            except catalogue.models.Book.DoesNotExist:
                pass
        return bks

    # def make_prefix_phrase(self, toks, field):
    #     q = MultiPhraseQuery()
    #     for i in range(len(toks)):
    #         t = Term(field, toks[i])
    #         if i == len(toks) - 1:
    #             pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
    #             if pterms:
    #                 q.add(pterms)
    #             else:
    #                 q.add(t)
    #         else:
    #             q.add(t)
    #     return q

    # @staticmethod
    # def term_filter(term, inverse=False):
    #     only_term = TermsFilter()
    #     only_term.addTerm(term)

    #     if inverse:
    #         neg = BooleanFilter()
    #         neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    #         only_term = neg

    #     return only_term

    @staticmethod
    def apply_filters(query, filters):
        """
        Apply filters to a query.
        """
        if filters is None:
            filters = []
        filters = filter(lambda x: x is not None, filters)
        for f in filters:
            query = query.query(f)
        return query

    # def filtered_categories(self, tags):
    #     """
    #     Return a list of tag categories, present in tags list.
    #     """
    #     cats = {}
    #     for t in tags:
    #         cats[t.category] = True
    #     return cats.keys()

    # def hint(self):
    #     return Hint(self)