turn off hyphenator in librarian
[wolnelektury.git] / src / search / index.py
1 # -*- coding: utf-8 -*-
2 # This file is part of Wolnelektury, licensed under GNU Affero GPLv3 or later.
3 # Copyright © Fundacja Nowoczesna Polska. See NOTICE for more information.
4 #
5 from django.conf import settings
6
7 import os
8 import re
9 from librarian import dcparser
10 from librarian.parser import WLDocument
11 from lxml import etree
12 import catalogue.models
13 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
14 from itertools import chain
15 import sunburnt
16 import custom
17 import operator
18 import logging
19 from wolnelektury.utils import makedirs
20
21 log = logging.getLogger('search')
22
23
class SolrIndex(object):
    """Thin base class that opens the project's Solr connection."""
    def __init__(self, mode=None):
        # `mode` is passed straight to CustomSolrInterface (e.g. 'r' for
        # read-only searching, 'rw' for indexing) -- see custom module.
        self.index = custom.CustomSolrInterface(settings.SOLR, mode=mode)
27
28
class Snippets(object):
    """
    Manages the snippet file for one indexed object (book).

    All snippets of a book are concatenated into a single flat file;
    each snippet's (offset, length) pair is stored in the search index
    so the text can be read back later.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id, revision=None):
        makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        self.book_id = book_id
        self.revision = revision
        self.file = None
        self.position = None

    @property
    def path(self):
        # Revisioned files are named "<book_id>.<revision>", the base
        # file just "<book_id>".
        if self.revision:
            filename = "%d.%d" % (self.book_id, self.revision)
        else:
            filename = "%d" % self.book_id
        return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, filename)

    def open(self, mode='r'):
        """
        Open the snippet file (always in binary mode). Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'

        if 'w' in mode and os.path.exists(self.path):
            # Never overwrite an existing file: bump the revision until
            # we find an unused filename.
            self.revision = 1
            while os.path.exists(self.path):
                self.revision += 1

        self.file = open(self.path, mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple, both counted in bytes.
        """
        encoded = snippet.encode('utf-8')
        size = len(encoded)
        self.file.write(encoded)
        result = (self.position, size)
        self.position += size
        return result

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode snippet
        stored at that location.
        """
        position, length = pos
        self.file.seek(position, 0)
        return self.file.read(length).decode('utf-8')

    def close(self):
        """Close the snippet file, if one is open."""
        if self.file:
            self.file.close()

    def remove(self):
        # Best-effort removal of the base file and every revision; the
        # first missing file raises OSError, which ends the sweep.
        self.revision = None
        try:
            os.unlink(self.path)
            self.revision = 0
            while True:
                self.revision += 1
                os.unlink(self.path)
        except OSError:
            pass
108
109
class Index(SolrIndex):
    """
    Indexes books in Solr.

    For each book this produces one metadata document plus one document
    per content section and per tagged fragment (see index_content).
    """
    def __init__(self):
        super(Index, self).__init__(mode='rw')

    def delete_query(self, *queries):
        """
        index.delete(queries=...) doesn't work, so let's reimplement it
        using deletion of list of uids.
        """
        uids = set()
        for q in queries:
            if isinstance(q, sunburnt.search.LuceneQuery):
                q = self.index.query(q)
            q.field_limiter.update(['uid'])
            # Page through all results of the query, collecting uids.
            st = 0
            rows = 100
            while True:
                ids = q.paginate(start=st, rows=rows).execute()
                if not len(ids):
                    break
                for res in ids:
                    uids.add(res['uid'])
                st += rows
        if uids:
            self.index.delete(uids)
            return True
        else:
            return False

    def index_tags(self, *tags, **kw):
        """
        Re-index global tag list.
        Removes all tags from index, then index them again.
        Indexed fields include: id, name (with and without polish stems), category.
        With remove_only=True, tags are only deleted, not re-added.
        """
        log.debug("Indexing tags")
        remove_only = kw.get('remove_only', False)
        # First, remove tags from the index.
        if tags:
            tag_qs = []
            for tag in tags:
                q_id = self.index.Q(tag_id=tag.id)

                if isinstance(tag, PDCounterAuthor):
                    q_cat = self.index.Q(tag_category='pd_author')
                elif isinstance(tag, PDCounterBook):
                    q_cat = self.index.Q(tag_category='pd_book')
                else:
                    q_cat = self.index.Q(tag_category=tag.category)

                tag_qs.append(self.index.Q(q_id & q_cat))
            self.delete_query(*tag_qs)
        else:  # all tags
            self.delete_query(self.index.Q(tag_id__any=True))

        if not remove_only:
            # Then add them back [all, or just the ones passed].
            if not tags:
                tags = chain(
                    catalogue.models.Tag.objects.exclude(category='set'),
                    PDCounterAuthor.objects.all(),
                    PDCounterBook.objects.all())

            for tag in tags:
                # PDCounter stubs get distinct categories and uid suffixes
                # so they never collide with real catalogue tags.
                if isinstance(tag, PDCounterAuthor):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": 'pd_author',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_a" % tag.id
                        }
                elif isinstance(tag, PDCounterBook):
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.title,
                        "tag_name_pl": tag.title,
                        "tag_category": 'pd_book',
                        "is_pdcounter": True,
                        "uid": "tag%d_pd_b" % tag.id
                        }
                else:
                    doc = {
                        "tag_id": int(tag.id),
                        "tag_name": tag.name,
                        "tag_name_pl": tag.name,
                        "tag_category": tag.category,
                        "is_pdcounter": False,
                        "uid": "tag%d" % tag.id
                        }
                self.index.add(doc)

    def create_book_doc(self, book):
        """
        Create a document dict referring to the book id (and parent id, if any).
        """
        doc = {'book_id': int(book.id)}
        if book.parent is not None:
            doc['parent_id'] = int(book.parent.id)
        return doc

    def remove_book(self, book_or_id, remove_snippets=True):
        """Removes a book (Book instance or raw id) from the search index."""
        if isinstance(book_or_id, catalogue.models.Book):
            book_id = book_or_id.id
        else:
            book_id = book_or_id

        self.delete_query(self.index.Q(book_id=book_id))

        if remove_snippets:
            snippets = Snippets(book_id)
            snippets.remove()

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            # we don't remove snippets, since they might be still needed by
            # threads using not reopened index
            self.remove_book(book, remove_snippets=False)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'translators', 'title'])
        # let's not index it - it's only used for extracting publish date
        if 'source_name' in meta_fields:
            del meta_fields['source_name']

        for n, f in meta_fields.items():
            book_doc[n] = f

        book_doc['uid'] = "book%s" % book_doc['book_id']
        self.index.add(book_doc)
        del book_doc
        # These metadata fields are repeated on every content document.
        book_fields = {
            'title': meta_fields['title'],
            'authors': meta_fields['authors'],
            'published_date': meta_fields['published_date']
            }

        if 'translators' in meta_fields:
            book_fields['translators'] = meta_fields['translators']

        self.index_content(book, book_fields=book_fields)

    # Root-level WL-XML tags that hold the actual book content.
    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad',
        ]

    # Tags whose text is never indexed as content.
    ignore_content_tags = [
        'uwaga', 'extra',
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'didaskalia',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    # Footnote container tags; their text is indexed as separate documents.
    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    # Header-level tags skipped entirely (metadata, not content).
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    # Trailing year (possibly followed by "]", "." or spaces) at the end
    # of the source_name field.
    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None, dc_only=None):
        """
        Extract metadata from book and returns a map of fields keyed by fieldname.
        `dc_only` optionally limits extraction to the named Dublin Core fields.
        """
        fields = {}

        if book_info is None:
            # Close the XML file after parsing (it used to be leaked).
            with open(book.xml_file.path) as xml_file:
                book_info = dcparser.parse(xml_file)

        fields['slug'] = book.slug
        fields['tags'] = [t.name for t in book.tags]
        fields['is_book'] = True

        # Since no type information is available, dispatch on the field's
        # validator to decide how to serialize the value.
        for field in dcparser.BookInfo.FIELDS:
            if dc_only and field.name not in dc_only:
                continue
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    fields[field.name] = s
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = persons
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = dt

        # Published date: the trailing number of source_name, or "".
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = pd

        return fields

    def get_master(self, root):
        """
        Returns the first master tag from an etree, or None.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields):
        """
        Walks the book XML and extract content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=()):
            # Depth-first walk yielding (start, text, end) triples so the
            # caller sees tag starts, text chunks and tag ends in document
            # order. Ignored tags still yield their tail text, since the
            # tail belongs to the surrounding content.
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
            return

        def fix_format(text):
            # Join list input into one string, then strip verse-end "/"
            # markers at end of lines.
            if isinstance(text, list):
                # BUGFIX: this used to filter the enclosing `content` list
                # instead of the passed-in `text` (wrong text was indexed
                # for fragments).
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            # Build one content document (section, fragment or footnote),
            # storing its text in the snippet file.
            doc = self.create_book_doc(book)
            for n, v in book_fields.items():
                doc[n] = v

            doc['header_index'] = fields["header_index"]
            doc['header_span'] = 'header_span' in fields and fields['header_span'] or 1
            doc['header_type'] = fields['header_type']

            doc['text'] = fields['text']

            # snippets
            snip_pos = snippets.add(fields["text"])

            doc['snippets_position'] = snip_pos[0]
            doc['snippets_length'] = snip_pos[1]
            if snippets.revision:
                doc["snippets_revision"] = snippets.revision

            if 'fragment_anchor' in fields:
                doc["fragment_anchor"] = fields['fragment_anchor']

            if 'themes' in fields:
                doc['themes'] = fields['themes']
            doc['uid'] = "part%s%s%s" % (doc['header_index'],
                                         doc['header_span'],
                                         doc.get('fragment_anchor', ''))
            return doc

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for position, header in enumerate(master):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []
                footnote = []

                def all_content(text):
                    # Default handler: text goes both into the section and
                    # into every currently open fragment.
                    for frag in fragments.values():
                        frag['text'].append(text)
                    content.append(text)
                # Stack of text handlers; the innermost (last) one wins.
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # handle footnotes
                    if start is not None and start.tag in self.footnote_tags:
                        footnote = []

                        def collect_footnote(t):
                            footnote.append(t)

                        handle_text.append(collect_footnote)
                    # (the old `footnote is not []` guard compared identity
                    # with a fresh list and was always true, so it's dropped)
                    elif end is not None and end.tag in self.footnote_tags:
                        handle_text.pop()
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       text=u''.join(footnote),
                                       is_footnote=True)
                        self.index.add(doc)
                        footnote = []

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {
                            'text': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        # Theme names are collected separately, not as text.
                        handle_text.append(lambda text: None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(unicode.strip, map(unicode, (start.text.split(','))))
                    elif end is not None and end.tag == 'motyw':
                        handle_text.pop()

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if not frag['themes']:
                            continue  # empty themes list.
                        del fragments[fid]

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       text=fix_format(frag['text']),
                                       themes=frag['themes'])
                        self.index.add(doc)

                    # Route collected text through the innermost handler.
                    if text is not None and handle_text:
                        handle_text[-1](text)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position,
                               header_type=header.tag, text=fix_format(content))

                self.index.add(doc)

        finally:
            snippets.close()
516
517
class SearchResult(object):
    """
    One search hit: a single book together with the list of content hits
    (sections/fragments) that matched, and helpers to merge/score them.
    """
    def __init__(self, doc, how_found=None, query=None, query_terms=None):
        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits
        self.snippets = []
        self.query_terms = query_terms
        self._book = None

        if 'score' in doc:
            self._score = doc['score']
        else:
            self._score = 0

        self.book_id = int(doc["book_id"])

        try:
            self.published_date = int(doc.get("published_date"))
        except (ValueError, TypeError):
            # TypeError: the field is missing (doc.get returned None);
            # ValueError: it is present but not a number.
            self.published_date = 0

        # content hits
        header_type = doc.get("header_type", None)
        # we have a content hit in some header of fragment
        if header_type is not None:
            sec = (header_type, int(doc["header_index"]))
            header_span = doc['header_span']
            header_span = header_span is not None and int(header_span) or 1
            fragment = doc.get("fragment_anchor", None)
            snippets_pos = (doc['snippets_position'], doc['snippets_length'])
            snippets_rev = doc.get('snippets_revision', None)

            hit = (sec + (header_span,), fragment, self._score, {
                'how_found': how_found,
                'snippets_pos': snippets_pos,
                'snippets_revision': snippets_rev,
                'themes': doc.get('themes', []),
                'themes_pl': doc.get('themes_pl', [])
                })

            self._hits.append(hit)

    def __unicode__(self):
        return u"<SR id=%d %d(%d) hits score=%f %d snippets>" % \
            (self.book_id, len(self._hits),
             len(self._processed_hits) if self._processed_hits else -1,
             self._score, len(self.snippets))

    def __str__(self):
        return unicode(self).encode('utf-8')

    @property
    def score(self):
        # Effective score: raw Solr score scaled by the boost factor.
        return self._score * self.boost

    def merge(self, other):
        """Fold another result for the same book into this one."""
        if self.book_id != other.book_id:
            raise ValueError(
                "this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        # Lazily fetch and cache the Book object.
        if self._book is not None:
            return self._book
        self._book = catalogue.models.Book.objects.get(id=self.book_id)
        return self._book

    book = property(get_book)

    # Indices into a hit tuple / its position sub-tuple.
    POSITION = 0
    FRAGMENT = 1
    POSITION_INDEX = 1
    POSITION_SPAN = 2
    SCORE = 2
    OTHER = 3

    @property
    def hits(self):
        """Processed hits: deduplicated dicts for sections and fragments."""
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[self.FRAGMENT] is not None, self._hits)

        sect = filter(lambda r: r[self.FRAGMENT] is None, self._hits)

        # sections not covered by fragments
        sect = filter(lambda s: 0 == len(filter(
            lambda f: f[self.POSITION][self.POSITION_INDEX] <= s[self.POSITION][self.POSITION_INDEX] <
                      f[self.POSITION][self.POSITION_INDEX] + f[self.POSITION][self.POSITION_SPAN], frags)), sect)

        def remove_duplicates(lst, keyfn, compare):
            # Keep, per key, the element winning the `compare` ordering.
            els = {}
            for e in lst:
                eif = keyfn(e)
                if eif in els:
                    if compare(els[eif], e) >= 1:
                        continue
                els[eif] = e
            return els.values()

        # remove fragments with duplicated fid's, keeping the best score
        frags = remove_duplicates(frags, lambda f: f[self.FRAGMENT], lambda a, b: cmp(a[self.SCORE], b[self.SCORE]))

        # remove duplicate sections, keeping the best score per section
        sections = {}

        for s in sect:
            si = s[self.POSITION][self.POSITION_INDEX]
            # skip existing
            if si in sections:
                if sections[si]['score'] >= s[self.SCORE]:
                    continue

            m = {'score': s[self.SCORE],
                 'section_number': s[self.POSITION][self.POSITION_INDEX] + 1,
                 }
            m.update(s[self.OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[self.FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = set()
            if self.query_terms is not None:
                for i in range(0, len(f[self.OTHER]['themes'])):
                    # BUGFIX: split(r' +') split on the literal two-char
                    # string " +", not on whitespace; split on spaces.
                    tms = f[self.OTHER]['themes'][i].split(' ') + f[self.OTHER]['themes_pl'][i].split(' ')
                    tms = map(unicode.lower, tms)
                    for qt in self.query_terms:
                        if qt in tms:
                            themes_hit.add(f[self.OTHER]['themes'][i])
                            break

            def theme_by_name(n):
                th = filter(lambda t: t.name == n, themes)
                if th:
                    return th[0]
                else:
                    return None
            themes_hit = filter(lambda a: a is not None, map(theme_by_name, themes_hit))

            m = {'score': f[self.SCORE],
                 'fragment': frag,
                 'section_number': f[self.POSITION][self.POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[self.OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits

        return hits

    @staticmethod
    def aggregate(*result_lists):
        """Merge results across lists so each book appears once."""
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

    def __len__(self):
        return len(self.hits)

    def snippet_pos(self, idx=0):
        """(position, length) of the idx-th hit's snippet."""
        return self.hits[idx]['snippets_pos']

    def snippet_revision(self, idx=0):
        """Snippet file revision for the idx-th hit, or None."""
        try:
            return self.hits[idx]['snippets_revision']
        except (IndexError, KeyError):
            return None
716
717
718 class Search(SolrIndex):
719     """
720     Search facilities.
721     """
    def __init__(self, default_field="text"):
        # NOTE(review): `default_field` is accepted but never used here;
        # kept for backward compatibility with existing callers.
        super(Search, self).__init__(mode='r')
724
725     def make_term_query(self, query, field='text', modal=operator.or_):
726         """
727         Returns term queries joined by boolean query.
728         modal - applies to boolean query
729         fuzzy - should the query by fuzzy.
730         """
731         if query is None:
732             query = ''
733         q = self.index.Q()
734         q = reduce(modal, map(lambda s: self.index.Q(**{field: s}), query.split(r" ")), q)
735
736         return q
737
738     def search_phrase(self, searched, field='text', book=False,
739                       filters=None,
740                       snippets=False):
741         if filters is None:
742             filters = []
743         if book:
744             filters.append(self.index.Q(is_book=True))
745
746         q = self.index.query(**{field: searched})
747         q = self.apply_filters(q, filters).field_limit(score=True, all_fields=True)
748         res = q.execute()
749         return [SearchResult(found, how_found=u'search_phrase') for found in res]
750
751     def search_some(self, searched, fields, book=True,
752                     filters=None, snippets=True, query_terms=None):
753         assert isinstance(fields, list)
754         if filters is None:
755             filters = []
756         if book:
757             filters.append(self.index.Q(is_book=True))
758
759         query = self.index.Q()
760
761         for fld in fields:
762             query = self.index.Q(query | self.make_term_query(searched, fld))
763
764         query = self.index.query(query)
765         query = self.apply_filters(query, filters).field_limit(score=True, all_fields=True)
766         res = query.execute()
767         return [SearchResult(found, how_found='search_some', query_terms=query_terms) for found in res]
768
769     def search_everywhere(self, searched, query_terms=None):
770         """
771         Tries to use search terms to match different fields of book (or its parts).
772         E.g. one word can be an author survey, another be a part of the title, and the rest
773         are some words from third chapter.
774         """
775         books = []
776         # content only query : themes x content
777         q = self.make_term_query(searched, 'text')
778         q_themes = self.make_term_query(searched, 'themes_pl')
779
780         query = self.index.query(q).query(q_themes).field_limit(score=True, all_fields=True)
781         res = query.execute()
782
783         for found in res:
784             books.append(SearchResult(found, how_found='search_everywhere_themesXcontent', query_terms=query_terms))
785
786         # query themes/content x author/title/tags
787         in_content = self.index.Q()
788         in_meta = self.index.Q()
789
790         for fld in ['themes_pl', 'text']:
791             in_content |= self.make_term_query(searched, field=fld)
792
793         for fld in ['tags', 'authors', 'title']:
794             in_meta |= self.make_term_query(searched, field=fld)
795
796         q = in_content & in_meta
797         res = self.index.query(q).field_limit(score=True, all_fields=True).execute()
798
799         for found in res:
800             books.append(SearchResult(found, how_found='search_everywhere', query_terms=query_terms))
801
802         return books
803
804     def get_snippets(self, searchresult, query, field='text', num=1):
805         """
806         Returns a snippet for found scoreDoc.
807         """
808         maxnum = len(searchresult)
809         if num is None or num < 0 or num > maxnum:
810             num = maxnum
811         book_id = searchresult.book_id
812         revision = searchresult.snippet_revision()
813         snippets = Snippets(book_id, revision=revision)
814         snips = [None] * maxnum
815         try:
816             snippets.open()
817             idx = 0
818             while idx < maxnum and num > 0:
819                 position, length = searchresult.snippet_pos(idx)
820                 if position is None or length is None:
821                     continue
822                 text = snippets.get((int(position),
823                                      int(length)))
824                 snip = self.index.highlight(text=text, field=field, q=query)
825                 snips[idx] = snip
826                 if snip:
827                     num -= 1
828                 idx += 1
829
830         except IOError, e:
831             log.error("Cannot open snippet file for book id = %d [rev=%s], %s" % (book_id, revision, e))
832             return []
833         finally:
834             snippets.close()
835
836             # remove verse end markers..
837         snips = map(lambda s: s and s.replace("/\n", "\n"), snips)
838
839         searchresult.snippets = snips
840
841         return snips
842
843     def hint_tags(self, query, pdcounter=True, prefix=True):
844         """
845         Return auto-complete hints for tags
846         using prefix search.
847         """
848         q = self.index.Q()
849         query = query.strip()
850         for field in ['tag_name', 'tag_name_pl']:
851             if prefix:
852                 q |= self.index.Q(**{field: query + "*"})
853             else:
854                 q |= self.make_term_query(query, field=field)
855         qu = self.index.query(q)
856
857         return self.search_tags(qu, pdcounter=pdcounter)
858
859     def search_tags(self, query, filters=None, pdcounter=False):
860         """
861         Search for Tag objects using query.
862         """
863         if not filters:
864             filters = []
865         if not pdcounter:
866             filters.append(~self.index.Q(is_pdcounter=True))
867         res = self.apply_filters(query, filters).execute()
868
869         tags = []
870         pd_tags = []
871
872         for doc in res:
873             is_pdcounter = doc.get('is_pdcounter', False)
874             category = doc.get('tag_category')
875             try:
876                 if is_pdcounter:
877                     if category == 'pd_author':
878                         tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
879                     elif category == 'pd_book':
880                         tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
881                         tag.category = 'pd_book'  # make it look more lik a tag.
882                     else:
883                         # WTF
884                         print ("Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (
885                             int(doc.get('tag_id')), category)).encode('utf-8')
886                     pd_tags.append(tag)
887                 else:
888                     tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
889                     tags.append(tag)
890
891             except catalogue.models.Tag.DoesNotExist:
892                 pass
893             except PDCounterAuthor.DoesNotExist:
894                 pass
895             except PDCounterBook.DoesNotExist:
896                 pass
897
898         tags_slugs = set(map(lambda t: t.slug, tags))
899         tags = tags + filter(lambda t: t.slug not in tags_slugs, pd_tags)
900
901         log.debug('search_tags: %s' % tags)
902
903         return tags
904
905     def hint_books(self, query, prefix=True):
906         """
907         Returns auto-complete hints for book titles
908         Because we do not index 'pseudo' title-tags.
909         Prefix search.
910         """
911         q = self.index.Q()
912         query = query.strip()
913         if prefix:
914             q |= self.index.Q(title=query + "*")
915         else:
916             q |= self.make_term_query(query, field='title')
917         qu = self.index.query(q)
918         only_books = self.index.Q(is_book=True)
919         return self.search_books(qu, [only_books])
920
921     def search_books(self, query, filters=None, max_results=10):
922         """
923         Searches for Book objects using query
924         """
925         bks = []
926         bks_found = set()
927         query = query.query(is_book=True)
928         res = self.apply_filters(query, filters).field_limit(['book_id'])
929         for r in res:
930             try:
931                 bid = r['book_id']
932                 if bid not in bks_found:
933                     bks.append(catalogue.models.Book.objects.get(id=bid))
934                     bks_found.add(bid)
935             except catalogue.models.Book.DoesNotExist:
936                 pass
937         return bks
938
939     @staticmethod
940     def apply_filters(query, filters):
941         """
942         Apply filters to a query
943         """
944         if filters is None:
945             filters = []
946         filters = filter(lambda x: x is not None, filters)
947         for f in filters:
948             query = query.query(f)
949         return query
950
951
# Allow dev/test setups to stub out search entirely: when SEARCH_MOCK is
# truthy in settings, shadow the real Search class with a mock implementation.
if getattr(settings, 'SEARCH_MOCK', False):
    from .mock_search import Search