1 # -*- coding: utf-8 -*-
3 from django.conf import settings
4 from django.dispatch import Signal
from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
File, Field, Integer, \
NumericField, Version, Document, JavaError, IndexSearcher, \
QueryParser, PerFieldAnalyzerWrapper, \
SimpleAnalyzer, PolishAnalyzer, ArrayList, \
KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
HashSet, BooleanClause, Term, CharTermAttribute, \
PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
BooleanFilter, FilterClause, QueryWrapperFilter, \
initVM, CLASSPATH, JArray
21 JVM = initVM(CLASSPATH, maxheap=settings.JVM_MAXHEAP)
27 from librarian import dcparser
28 from librarian.parser import WLDocument
29 from lxml import etree
30 import catalogue.models
31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
32 from multiprocessing.pool import ThreadPool
33 from threading import current_thread
34 from itertools import chain
38 log = logging.getLogger('search')
40 class WLAnalyzer(PerFieldAnalyzerWrapper):
42 polish = PolishAnalyzer(Version.LUCENE_34)
43 # polish_gap.setPositionIncrementGap(999)
45 simple = SimpleAnalyzer(Version.LUCENE_34)
46 # simple_gap.setPositionIncrementGap(999)
48 keyword = KeywordAnalyzer(Version.LUCENE_34)
50 # not sure if needed: there's NOT_ANALYZED meaning basically the same
52 PerFieldAnalyzerWrapper.__init__(self, polish)
54 self.addAnalyzer("tags", simple)
55 self.addAnalyzer("technical_editors", simple)
56 self.addAnalyzer("editors", simple)
57 self.addAnalyzer("url", keyword)
58 self.addAnalyzer("source_url", keyword)
59 self.addAnalyzer("source_name", simple)
60 self.addAnalyzer("publisher", simple)
61 self.addAnalyzer("authors", simple)
62 self.addAnalyzer("title", simple)
64 self.addAnalyzer("is_book", keyword)
65 # shouldn't the title have two forms? _pl and simple?
67 self.addAnalyzer("themes", simple)
68 self.addAnalyzer("themes_pl", polish)
70 self.addAnalyzer("tag_name", simple)
71 self.addAnalyzer("tag_name_pl", polish)
73 self.addAnalyzer("translators", simple)
75 self.addAnalyzer("KEYWORD", keyword)
76 self.addAnalyzer("SIMPLE", simple)
77 self.addAnalyzer("POLISH", polish)
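# A quick illustration (sketch) of what the per-field wrapping above gives us:
# any field not listed falls back to the default PolishAnalyzer passed to
# PerFieldAnalyzerWrapper.__init__, so e.g.:
#
#   analyzer = WLAnalyzer()
#   # "title"   -> SimpleAnalyzer   (lowercased words, no stemming)
#   # "url"     -> KeywordAnalyzer  (whole value kept as a single token)
#   # "content" -> PolishAnalyzer   (default, with Polish stemming)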
80 class IndexStore(object):
82 Provides access to search index.
84 self.store - lucene index directory
88 self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
90 def make_index_dir(self):
92 os.makedirs(settings.SEARCH_INDEX)
93 except OSError as exc:
94 if exc.errno == errno.EEXIST:
102 class IndexChecker(IndexStore):
104 IndexStore.__init__(self)
107 checker = CheckIndex(self.store)
108 status = checker.checkIndex()
112 class Snippets(object):
This class manages snippet files for an indexed object (book).
The snippets are concatenated together, and their positions and
lengths are kept in lucene index fields.
(See the usage sketch at the end of this class.)
118 SNIPPET_DIR = "snippets"
120 def __init__(self, book_id, revision=None):
122 os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
123 except OSError as exc:
124 if exc.errno == errno.EEXIST:
127 self.book_id = book_id
128 self.revision = revision
133 if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
134 else: fn = "%d" % self.book_id
136 return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
138 def open(self, mode='r'):
140 Open the snippet file. Call .close() afterwards.
146 if os.path.exists(self.path):
149 if not os.path.exists(self.path):
153 self.file = open(self.path, mode)
157 def add(self, snippet):
159 Append a snippet (unicode) to the snippet file.
160 Return a (position, length) tuple
162 txt = snippet.encode('utf-8')
165 pos = (self.position, l)
Given a (position, length) tuple, return the unicode snippet
stored at that location.
174 self.file.seek(pos[0], 0)
175 txt = self.file.read(pos[1]).decode('utf-8')
179 """Close snippet file"""
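# Usage sketch for Snippets (book id 123 is hypothetical). The (position, length)
# pair returned by add() is what later gets stored in the "snippets_position" /
# "snippets_length" fields of the index:
#
#   snippets = Snippets(123).open('w')
#   try:
#       pos = snippets.add(u"Litwo! Ojczyzno moja! ty jesteś jak zdrowie;")
#   finally:
#       snippets.close()
#
#   snippets = Snippets(123).open()
#   try:
#       text = snippets.get(pos)   # -> the same unicode snippet back
#   finally:
#       snippets.close()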
194 class BaseIndex(IndexStore):
197 Provides basic operations on index: opening, closing, optimizing.
199 def __init__(self, analyzer=None):
200 super(BaseIndex, self).__init__()
203 analyzer = WLAnalyzer()
204 self.analyzer = analyzer
206 def open(self, timeout=None):
208 raise Exception("Index is already opened")
209 conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
211 conf.setWriteLockTimeout(long(timeout))
212 self.index = IndexWriter(self.store, conf)
216 self.index.optimize()
220 self.index.optimize()
except JavaError as je:
222 log.error("Error during optimize phase, check index: %s" % je)
227 index_changed.send_robust(self)
229 super(BaseIndex, self).close()
235 def __exit__(self, type, value, tb):
239 index_changed = Signal()
242 class Index(BaseIndex):
244 Class indexing books.
246 def __init__(self, analyzer=None):
247 super(Index, self).__init__(analyzer)
249 def index_tags(self, *tags, **kw):
Re-index global tag list.
Removes all tags from the index, then indexes them again.
Indexed fields include: id, name (with and without Polish stems), category.
255 remove_only = kw.get('remove_only', False)
256 # first, remove tags from index.
260 b_id_cat = BooleanQuery()
262 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
263 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
265 if isinstance(tag, PDCounterAuthor):
266 q_cat = TermQuery(Term('tag_category', 'pd_author'))
267 elif isinstance(tag, PDCounterBook):
268 q_cat = TermQuery(Term('tag_category', 'pd_book'))
270 q_cat = TermQuery(Term('tag_category', tag.category))
271 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
273 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
275 q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
276 self.index.deleteDocuments(q)
279 # then add them [all or just one passed]
281 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
282 PDCounterAuthor.objects.all(), \
283 PDCounterBook.objects.all())
286 if isinstance(tag, PDCounterAuthor):
288 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
289 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
290 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
291 doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
292 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
293 self.index.addDocument(doc)
294 elif isinstance(tag, PDCounterBook):
296 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
297 doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
298 doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
299 doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
300 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
301 self.index.addDocument(doc)
304 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
305 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
306 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
307 doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
308 self.index.addDocument(doc)
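# Usage sketch (not part of the original flow): re-indexing the tag list,
# either all tags or only the objects passed in; remove_only=True just
# deletes the index entries.
#
#   index = Index()
#   index.open()
#   try:
#       index.index_tags()                          # everything
#       # index.index_tags(tag, remove_only=True)   # or remove a single tag
#   finally:
#       index.close()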
310 def create_book_doc(self, book):
Create a lucene document referring to the book id.
315 doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
316 if book.parent is not None:
317 doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
320 def remove_book(self, book_or_id, remove_snippets=True):
"""Removes a book from the search index.
book_or_id - Book instance or its id."""
323 if isinstance(book_or_id, catalogue.models.Book):
324 book_id = book_or_id.id
328 q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
329 self.index.deleteDocuments(q)
332 snippets = Snippets(book_id)
335 def index_book(self, book, book_info=None, overwrite=True):
338 Creates a lucene document for extracted metadata
339 and calls self.index_content() to index the contents of the book.
# we don't remove snippets, since they might still be needed by
# threads using a not-yet-reopened index
344 self.remove_book(book, remove_snippets=False)
346 book_doc = self.create_book_doc(book)
347 meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
348 # let's not index it - it's only used for extracting publish date
349 if 'source_name' in meta_fields:
350 del meta_fields['source_name']
352 for f in meta_fields.values():
353 if isinstance(f, list) or isinstance(f, tuple):
358 self.index.addDocument(book_doc)
361 self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
366 'dramat_wierszowany_l',
367 'dramat_wierszowany_lp',
368 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
372 ignore_content_tags = [
374 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
376 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
379 footnote_tags = ['pa', 'pt', 'pr', 'pe']
381 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
383 published_date_re = re.compile("([0-9]+)[\]. ]*$")
385 def extract_metadata(self, book, book_info=None, dc_only=None):
Extract metadata from the book and return a map of fields keyed by field name.
391 if book_info is None:
392 book_info = dcparser.parse(open(book.xml_file.path))
394 fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
395 fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
396 fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
399 for field in dcparser.BookInfo.FIELDS:
400 if dc_only and field.name not in dc_only:
402 if hasattr(book_info, field.name):
403 if not getattr(book_info, field.name):
405 # since no type information is available, we use validator
406 type_indicator = field.validator
407 if type_indicator == dcparser.as_unicode:
408 s = getattr(book_info, field.name)
412 fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
413 except JavaError as je:
414 raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
415 elif type_indicator == dcparser.as_person:
416 p = getattr(book_info, field.name)
417 if isinstance(p, dcparser.Person):
420 persons = ', '.join(map(unicode, p))
421 fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
422 elif type_indicator == dcparser.as_date:
423 dt = getattr(book_info, field.name)
424 fields[field.name] = Field(field.name, "%04d%02d%02d" %\
425 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
429 if hasattr(book_info, 'source_name') and book_info.source_name:
430 match = self.published_date_re.search(book_info.source_name)
431 if match is not None:
432 pd = str(match.groups()[0])
434 fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
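# Example values for published_date_re (illustrative only): a trailing number
# in source_name becomes the published_date, e.g.
#   "Czytelnik, Warszawa 1990"   -> "1990"
#   "PIW, Warszawa 1977 [1978]." -> "1978"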
438 def add_gaps(self, fields, fieldname):
Interposes a list of fields with gap-fields (indexed spaces) and returns it.
This allows phrase queries that do not cross the gaps (when slop is 0).
445 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
446 return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
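# Sketch of the effect: add_gaps([f1, f2, f3], 'tags') returns
#   (f1, gap, f2, gap, f3)
# where each gap is a single-space NOT_ANALYZED field, so a slop-0 phrase
# query on 'tags' cannot match across the boundary of two tag values.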
448 def get_master(self, root):
450 Returns the first master tag from an etree.
452 for master in root.iter():
453 if master.tag in self.master_tags:
456 def index_content(self, book, book_fields=[]):
Walks the book XML and extracts content from it.
459 Adds parts for each header tag and for each fragment.
461 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
462 root = wld.edoc.getroot()
464 master = self.get_master(root)
468 def walker(node, ignore_tags=[]):
470 if node.tag not in ignore_tags:
471 yield node, None, None
472 if node.text is not None:
473 yield None, node.text, None
474 for child in list(node):
475 for b, t, e in walker(child):
477 yield None, None, node
479 if node.tail is not None:
480 yield None, node.tail, None
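# Sketch of the event stream walker() produces for a tiny (hypothetical)
# element <a>x<b/>y</a>, assuming the original nesting of the generator:
#   (a, None, None)     - start of <a>
#   (None, u"x", None)  - text inside <a>
#   (b, None, None)     - start of <b>
#   (None, None, b)     - end of <b>
#   (None, u"y", None)  - tail text after <b>
#   (None, None, a)     - end of <a>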
483 def fix_format(text):
484 # separator = [u" ", u"\t", u".", u";", u","]
485 if isinstance(text, list):
486 # need to join it first
text = filter(lambda s: s is not None, text)
488 text = u' '.join(text)
489 # for i in range(len(text)):
491 # if text[i][0] not in separator\
492 # and text[i - 1][-1] not in separator:
493 # text.insert(i, u" ")
495 return re.sub("(?m)/$", "", text)
497 def add_part(snippets, **fields):
498 doc = self.create_book_doc(book)
499 for f in book_fields:
502 doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
503 doc.add(NumericField("header_span", Field.Store.YES, True)\
504 .setIntValue('header_span' in fields and fields['header_span'] or 1))
505 doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
507 doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
508 Field.TermVector.WITH_POSITIONS_OFFSETS))
510 snip_pos = snippets.add(fields["content"])
511 doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
512 doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
513 if snippets.revision:
514 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
516 if 'fragment_anchor' in fields:
517 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
518 Field.Store.YES, Field.Index.NOT_ANALYZED))
520 if 'themes' in fields:
521 themes, themes_pl = zip(*[
522 (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
523 Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
524 for theme in fields['themes']])
526 themes = self.add_gaps(themes, 'themes')
527 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
537 if isinstance(s, unicode):
538 return s.encode('utf-8')
543 snippets = Snippets(book.id).open('w')
545 for header, position in zip(list(master), range(len(master))):
547 if header.tag in self.skip_header_tags:
549 if header.tag is etree.Comment:
556 def all_content(text):
557 for frag in fragments.values():
558 frag['content'].append(text)
560 handle_text = [all_content]
563 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
565 if start is not None and start.tag in self.footnote_tags:
567 def collect_footnote(t):
569 handle_text.append(collect_footnote)
elif end is not None and footnote and end.tag in self.footnote_tags:
572 doc = add_part(snippets, header_index=position, header_type=header.tag,
573 content=u''.join(footnote),
574 is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
576 self.index.addDocument(doc)
577 #print "@ footnote text: %s" % footnote
580 # handle fragments and themes.
581 if start is not None and start.tag == 'begin':
582 fid = start.attrib['id'][1:]
583 fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
585 # themes for this fragment
586 elif start is not None and start.tag == 'motyw':
587 fid = start.attrib['id'][1:]
588 handle_text.append(None)
589 if start.text is not None:
590 fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
591 elif end is not None and end.tag == 'motyw':
594 elif start is not None and start.tag == 'end':
595 fid = start.attrib['id'][1:]
596 if fid not in fragments:
597 continue # a broken <end> node, skip it
598 frag = fragments[fid]
599 if frag['themes'] == []:
600 continue # empty themes list.
603 doc = add_part(snippets,
604 header_type=frag['start_header'],
605 header_index=frag['start_section'],
606 header_span=position - frag['start_section'] + 1,
608 content=fix_format(frag['content']),
609 themes=frag['themes'])
610 #print '@ FRAG %s' % frag['content']
611 self.index.addDocument(doc)
if text is not None and handle_text:
616 hdl = handle_text[-1]
620 # in the end, add a section text.
621 doc = add_part(snippets, header_index=position, header_type=header.tag,
622 content=fix_format(content))
623 #print '@ CONTENT: %s' % fix_format(content)
625 self.index.addDocument(doc)
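# Usage sketch: indexing one book end to end (assumes `book` is a
# catalogue.models.Book with an XML file attached):
#
#   index = Index()
#   index.open()
#   try:
#       index.index_book(book)
#   finally:
#       index.close()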
631 def log_exception_wrapper(f):
636 log.error("Error in indexing thread: %s" % e)
637 traceback.print_exc()
642 class ReusableIndex(Index):
Works like Index, but does not close/optimize the Lucene index
until program exit (uses an atexit hook).
This is useful for the importbooks command.
If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
652 def open(self, analyzer=None, **kw):
653 if ReusableIndex.index:
654 self.index = ReusableIndex.index
656 Index.open(self, analyzer, **kw)
657 ReusableIndex.index = self.index
658 atexit.register(ReusableIndex.close_reusable)
660 # def index_book(self, *args, **kw):
661 # job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
662 # ReusableIndex.pool_jobs.append(job)
665 def close_reusable():
666 if ReusableIndex.index:
667 ReusableIndex.index.optimize()
668 ReusableIndex.index.close()
669 ReusableIndex.index = None
671 index_changed.send_robust(None)
674 if ReusableIndex.index:
675 ReusableIndex.index.commit()
678 class JoinSearch(object):
680 This mixin could be used to handle block join queries.
683 def __init__(self, *args, **kw):
684 super(JoinSearch, self).__init__(*args, **kw)
686 def wrapjoins(self, query, fields=[]):
This function modifies the query recursively, so that contained
Term and Phrase queries matching the provided fields are wrapped
in a BlockJoinQuery and thus delegated to child documents.
693 if BooleanQuery.instance_(query):
694 qs = BooleanQuery.cast_(query)
696 clause = BooleanClause.cast_(clause)
697 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
701 query.extractTerms(termset)
704 if t.field() not in fields:
706 return BlockJoinQuery(query, self.parent_filter,
707 BlockJoinQuery.ScoreMode.Total)
709 def bsearch(self, query, max_results=50):
710 q = self.query(query)
711 bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
713 tops = self.searcher.search(bjq, max_results)
715 for found in tops.scoreDocs:
716 doc = self.searcher.doc(found.doc)
717 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
718 return (bks, tops.totalHits)
721 class SearchResult(object):
722 def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
723 if tokens_cache is None: tokens_cache = {}
728 self._score = scoreDocs.score
733 self._processed_hits = None # processed hits
735 stored = search.searcher.doc(scoreDocs.doc)
736 self.book_id = int(stored.get("book_id"))
738 pd = stored.get("published_date")
740 self.published_date = int(pd)
742 self.published_date = 0
744 header_type = stored.get("header_type")
# we have a content hit in some header or fragment
746 if header_type is not None:
747 sec = (header_type, int(stored.get("header_index")))
748 header_span = stored.get('header_span')
749 header_span = header_span is not None and int(header_span) or 1
751 fragment = stored.get("fragment_anchor")
754 snippets = snippets.replace("/\n", "\n")
755 hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
757 self._hits.append(hit)
760 self.searched = searched
761 self.tokens_cache = tokens_cache
765 return self._score * self.boost
767 def merge(self, other):
768 if self.book_id != other.book_id:
raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
770 self._hits += other._hits
771 if other.score > self.score:
772 self._score = other._score
776 if hasattr(self, '_book'):
778 return catalogue.models.Book.objects.get(id=self.book_id)
780 book = property(get_book)
784 if self._processed_hits is not None:
785 return self._processed_hits
794 # to sections and fragments
795 frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
797 sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
799 # sections not covered by fragments
800 sect = filter(lambda s: 0 == len(filter(
801 lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
802 and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
807 def remove_duplicates(lst, keyfn, compare):
812 if compare(els[eif], e) >= 1:
817 # remove fragments with duplicated fid's and duplicated snippets
818 frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
819 frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
820 lambda a, b: cmp(a[SCORE], b[SCORE]))
822 # remove duplicate sections
826 si = s[POSITION][POSITION_INDEX]
829 if sections[si]['score'] >= s[SCORE]:
832 m = {'score': s[SCORE],
833 'section_number': s[POSITION][POSITION_INDEX] + 1,
838 hits = sections.values()
842 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
843 except catalogue.models.Fragment.DoesNotExist:
847 # Figure out if we were searching for a token matching some word in theme name.
848 themes = frag.tags.filter(category='theme')
850 if self.searched is not None:
851 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
853 name_tokens = self.search.get_tokens(theme.name, 'POLISH')
856 if not theme in themes_hit:
857 themes_hit.append(theme)
860 m = {'score': f[SCORE],
862 'section_number': f[POSITION][POSITION_INDEX] + 1,
864 'themes_hit': themes_hit
869 hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
871 self._processed_hits = hits
875 def __unicode__(self):
876 return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
879 def aggregate(*result_lists):
881 for rl in result_lists:
883 if r.book_id in books:
884 books[r.book_id].merge(r)
887 return books.values()
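# Usage sketch: merging result lists coming from different query strategies
# (aggregate() collapses hits for the same book via merge()); `search` is a
# Search instance, defined further below:
#
#   results = SearchResult.aggregate(
#       search.search_perfect_book(u"pan tadeusz"),
#       search.search_everywhere(u"pan tadeusz"))
#   results.sort(reverse=True)        # uses __cmp__ below: score, then date
#   for r in results:
#       print r.book_id, r.score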
889 def __cmp__(self, other):
890 c = cmp(self.score, other.score)
892 # this is inverted, because earlier date is better
893 return cmp(other.published_date, self.published_date)
Given some hint information (things we already know about our search
target, like the author, the title of a specific book, epoch, genre or kind),
we can narrow down the search using filters.
(See the usage sketch at the end of this class.)
904 def __init__(self, search):
Accepts a Search instance.
913 def books(self, *books):
915 Give a hint that we search these books.
919 def tags(self, tags):
921 Give a hint that these Tag objects (a list of)
925 if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
926 lst = self.book_tags.get(t.category, [])
928 self.book_tags[t.category] = lst
929 if t.category in ['theme', 'theme_pl']:
930 self.part_tags.append(t)
932 def tag_filter(self, tags, field='tags'):
Given a list of tags and an optional field (they are normally in the 'tags' field),
returns a filter accepting only books with these specific tags.
940 toks = self.search.get_tokens(tag.name, field=field)
941 tag_phrase = PhraseQuery()
943 tag_phrase.add(Term(field, tok))
944 q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
946 return QueryWrapperFilter(q)
948 def book_filter(self):
950 Filters using book tags (all tag kinds except a theme)
952 tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
954 return self.tag_filter(tags)
958 def part_filter(self):
960 This filter can be used to look for book parts.
961 It filters on book id and/or themes.
965 fs.append(self.tag_filter(self.part_tags, field='themes'))
967 if self._books != []:
969 for b in self._books:
970 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
971 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
974 return Search.chain_filters(fs)
976 def should_search_for_book(self):
977 return self._books == []
979 def just_search_in(self, all):
"""Holds logic to figure out which indexes should be searched, when we already have some hints."""
983 if field == 'authors' and 'author' in self.book_tags:
985 if field == 'title' and self._books != []:
987 if (field == 'themes' or field == 'themes_pl') and self.part_tags:
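# Usage sketch for Hint together with Search (the tag/book variables are
# hypothetical model instances):
#
#   search = Search()
#   hint = Hint(search)
#   hint.tags([author_tag, theme_tag])   # narrows books by author, parts by theme
#   hint.books(some_book)                # and/or pin down a concrete book
#   books = search.search_perfect_book(u"lokomotywa", hint=hint)
#   parts = search.search_perfect_parts(u"lokomotywa", hint=hint)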
993 class Search(IndexStore):
997 def __init__(self, default_field="content"):
998 IndexStore.__init__(self)
999 self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
1001 reader = IndexReader.open(self.store, True)
1002 self.searcher = IndexSearcher(reader)
1003 self.parser = QueryParser(Version.LUCENE_34, default_field,
1006 self.parent_filter = TermsFilter()
1007 self.parent_filter.addTerm(Term("is_book", "true"))
1008 index_changed.connect(self.reopen)
1011 reader = self.searcher.getIndexReader()
1012 self.searcher.close()
1014 super(Search, self).close()
1015 index_changed.disconnect(self.reopen)
1017 def reopen(self, **unused):
1018 reader = self.searcher.getIndexReader()
1019 rdr = reader.reopen()
1020 if not rdr.equals(reader):
1021 log.debug('Reopening index')
1022 oldsearch = self.searcher
1023 self.searcher = IndexSearcher(rdr)
1027 def query(self, query):
1028 """Parse query in default Lucene Syntax. (for humans)
1030 return self.parser.parse(query)
1032 def simple_search(self, query, max_results=50):
1033 """Runs a query for books using lucene syntax. (for humans)
1034 Returns (books, total_hits)
1037 tops = self.searcher.search(self.query(query), max_results)
1039 for found in tops.scoreDocs:
1040 doc = self.searcher.doc(found.doc)
1041 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1042 return (bks, tops.totalHits)
1044 def get_tokens(self, searched, field='content', cached=None):
"""Returns tokens analyzed by the proper analyzer for the given field.
The argument can be a StringReader, a string/unicode, or a list of tokens.
In the last case they are just returned (so we can reuse tokens, if we don't change the analyzer).
1049 if cached is not None and field in cached:
1050 return cached[field]
1052 if isinstance(searched, str) or isinstance(searched, unicode):
1053 searched = StringReader(searched)
1054 elif isinstance(searched, list):
1058 tokens = self.analyzer.reusableTokenStream(field, searched)
1060 while tokens.incrementToken():
1061 cta = tokens.getAttribute(CharTermAttribute.class_)
1062 toks.append(cta.toString())
1064 if cached is not None:
1065 cached[field] = toks
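# Sketch: a tokens_cache dict avoids re-analyzing the same searched string for
# the same field within one request (this is how search_phrase()/search_some()
# below share work); `search` is a Search instance:
#
#   cache = {}
#   toks = search.get_tokens(u"Pan Tadeusz", field='SIMPLE', cached=cache)
#   toks = search.get_tokens(u"Pan Tadeusz", field='SIMPLE', cached=cache)  # served from cache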
1070 def fuzziness(fuzzy):
1071 """Helper method to sanitize fuzziness"""
1074 if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
1079 def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
1081 Return a PhraseQuery with a series of tokens.
1084 phrase = MultiPhraseQuery()
1086 term = Term(field, t)
1087 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
1091 ft = fuzzterm.term()
1093 fuzzterms.append(ft)
1094 if not fuzzterm.next(): break
1096 phrase.add(JArray('object')(fuzzterms, Term))
1100 phrase = PhraseQuery()
1101 phrase.setSlop(slop)
1103 term = Term(field, t)
1108 def make_term_query(tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
Returns term queries joined in a boolean query.
modal - applies to the boolean query
fuzzy - should the query be fuzzy
1116 term = Term(field, t)
1118 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1120 term = TermQuery(term)
1121 q.add(BooleanClause(term, modal))
1124 def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1125 filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1126 if filters is None: filters = []
1127 if tokens_cache is None: tokens_cache = {}
1129 tokens = self.get_tokens(searched, field, cached=tokens_cache)
1131 query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1133 filters.append(self.term_filter(Term('is_book', 'true')))
1134 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1136 return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1138 def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1139 filters=None, tokens_cache=None, boost=None, snippets=True):
1140 if filters is None: filters = []
1141 if tokens_cache is None: tokens_cache = {}
1144 filters.append(self.term_filter(Term('is_book', 'true')))
1146 query = BooleanQuery()
1149 tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1151 query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1152 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1154 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1156 return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1157 snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1159 def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1161 Search for perfect book matches. Just see if the query matches with some author or title,
1162 taking hints into account.
1164 fields_to_search = ['authors', 'title']
1167 if not hint.should_search_for_book():
1169 fields_to_search = hint.just_search_in(fields_to_search)
1170 only_in = hint.book_filter()
1172 qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1176 top = self.searcher.search(q,
1177 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1179 for found in top.scoreDocs:
1180 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1183 def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1184 fields_to_search = ['tags', 'authors', 'title']
1188 if not hint.should_search_for_book():
1190 fields_to_search = hint.just_search_in(fields_to_search)
1191 only_in = hint.book_filter()
1193 tokens = self.get_tokens(searched, field='SIMPLE')
1197 for fld in fields_to_search:
1198 q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1199 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1202 top = self.searcher.search(q,
1203 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1205 for found in top.scoreDocs:
1206 books.append(SearchResult(self, found, how_found="search_book"))
1210 def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
Search for book parts which contain a phrase perfectly matching
(with a slop of 2, the default for make_phrase()) some part/fragment of the book.
1215 qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1219 flt = hint.part_filter()
1223 top = self.searcher.search(q,
1224 self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1227 for found in top.scoreDocs:
1228 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1232 def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
Tries to use search terms to match different fields of the book (or its parts).
E.g. one word can be an author's surname, another a part of the title, and the rest
some words from the third chapter.
1238 if tokens_cache is None: tokens_cache = {}
1243 only_in = hint.part_filter()
1245 # content only query : themes x content
1248 tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1249 tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1251 # only search in themes when we do not already filter by themes
1252 if hint is None or hint.just_search_in(['themes']) != []:
1253 q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1254 fuzzy=fuzzy), BooleanClause.Occur.MUST))
1256 q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1257 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1259 topDocs = self.searcher.search(q, only_in, max_results)
1260 for found in topDocs.scoreDocs:
1261 books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1263 # query themes/content x author/title/tags
1265 in_content = BooleanQuery()
1266 in_meta = BooleanQuery()
1268 for fld in ['themes_pl', 'content']:
1269 in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1271 for fld in ['tags', 'authors', 'title']:
1272 in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1274 q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1275 q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1277 topDocs = self.searcher.search(q, only_in, max_results)
1278 for found in topDocs.scoreDocs:
1279 books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1283 # def multisearch(self, query, max_results=50):
1286 # - (phrase) OR -> content
1289 # - (keywords) -> authors
1294 # queryreader = StringReader(query)
1295 # tokens = self.get_tokens(queryreader)
1297 # top_level = BooleanQuery()
1298 # Should = BooleanClause.Occur.SHOULD
1300 # phrase_level = BooleanQuery()
1301 # phrase_level.setBoost(1.3)
1303 # p_content = self.make_phrase(tokens, joined=True)
1304 # p_title = self.make_phrase(tokens, 'title')
1305 # p_author = self.make_phrase(tokens, 'author')
1307 # phrase_level.add(BooleanClause(p_content, Should))
1308 # phrase_level.add(BooleanClause(p_title, Should))
1309 # phrase_level.add(BooleanClause(p_author, Should))
1311 # kw_level = BooleanQuery()
1313 # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1314 # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1315 # kw_level.add(j_themes, Should)
1316 # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1317 # j_con = self.make_term_query(tokens, joined=True)
1318 # kw_level.add(j_con, Should)
1320 # top_level.add(BooleanClause(phrase_level, Should))
1321 # top_level.add(BooleanClause(kw_level, Should))
1325 def get_snippets(self, scoreDoc, query, field='content'):
1327 Returns a snippet for found scoreDoc.
1329 htmlFormatter = SimpleHTMLFormatter()
1330 highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1332 stored = self.searcher.doc(scoreDoc.doc)
1334 position = stored.get('snippets_position')
1335 length = stored.get('snippets_length')
1336 if position is None or length is None:
1338 revision = stored.get('snippets_revision')
1339 if revision: revision = int(revision)
1342 book_id = int(stored.get('book_id'))
1343 snippets = Snippets(book_id, revision=revision)
1348 log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
1353 text = snippets.get((int(position),
1358 tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1359 # highlighter.getBestTextFragments(tokenStream, text, False, 10)
1360 snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
except Exception as e:
1364 if hasattr(e, 'getJavaException'):
1365 e2 = unicode(e.getJavaException())
1366 raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1371 def enum_to_array(enum):
Converts a lucene TermEnum to an array of Terms, suitable for adding to queries.
1382 if not enum.next(): break
1385 return JArray('object')(terms, Term)
1387 def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
1389 Search for Tag objects using query.
filt = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])  # reuse filt so the chained filter is actually applied below
1393 tops = self.searcher.search(query, filt, max_results)
1396 for found in tops.scoreDocs:
1397 doc = self.searcher.doc(found.doc)
1398 is_pdcounter = doc.get('is_pdcounter')
1399 category = doc.get('tag_category')
1401 if is_pdcounter == 'true':
1402 if category == 'pd_author':
1403 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1404 elif category == 'pd_book':
1405 tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
tag.category = 'pd_book' # make it look more like a tag.
print "Warning: cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1410 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1411 # don't add the pdcounter tag if same tag already exists
1415 except catalogue.models.Tag.DoesNotExist: pass
1416 except PDCounterAuthor.DoesNotExist: pass
1417 except PDCounterBook.DoesNotExist: pass
1419 log.debug('search_tags: %s' % tags)
1423 def search_books(self, query, filt=None, max_results=10):
1425 Searches for Book objects using query
1428 tops = self.searcher.search(query, filt, max_results)
1429 for found in tops.scoreDocs:
1430 doc = self.searcher.doc(found.doc)
1432 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1433 except catalogue.models.Book.DoesNotExist: pass
1436 def make_prefix_phrase(self, toks, field):
1437 q = MultiPhraseQuery()
1438 for i in range(len(toks)):
1439 t = Term(field, toks[i])
1440 if i == len(toks) - 1:
1441 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1451 def term_filter(term, inverse=False):
1452 only_term = TermsFilter()
1453 only_term.addTerm(term)
1456 neg = BooleanFilter()
1457 neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
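# Sketch: term_filter() is used both ways elsewhere in this class, e.g.
#
#   only_books = self.term_filter(Term("is_book", "true"))                 # books only
#   no_books = self.term_filter(Term("is_book", "true"), inverse=True)     # parts only
#
# (compare search_phrase() and search_perfect_parts() above).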
1462 def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1464 Return auto-complete hints for tags
1465 using prefix search.
1467 toks = self.get_tokens(string, field='SIMPLE')
1468 top = BooleanQuery()
1470 for field in ['tag_name', 'tag_name_pl']:
1472 q = self.make_prefix_phrase(toks, field)
1474 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1475 top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1477 no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1479 return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1481 def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
Returns auto-complete hints for book titles,
because we do not index 'pseudo' title-tags.
1487 toks = self.get_tokens(string, field='SIMPLE')
1490 q = self.make_prefix_phrase(toks, 'title')
1492 q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1494 return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1497 def chain_filters(filters, op=ChainedFilter.AND):
1499 Chains a filter list together
1501 filters = filter(lambda x: x is not None, filters)
if not filters:
1504 chf = ChainedFilter(JArray('object')(filters, Filter), op)
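# Sketch: combining optional filters into one (None entries are dropped above),
# the same pattern search_perfect_book() uses:
#
#   flt = self.chain_filters([hint and hint.book_filter() or None,
#                             self.term_filter(Term('is_book', 'true'))])
#   top = self.searcher.search(query, flt, max_results)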
1507 def filtered_categories(self, tags):
Return a list of tag categories present in the tags list.
1513 cats[t.category] = True