1 # -*- coding: utf-8 -*-
3 from django.conf import settings
4 from django.dispatch import Signal
5 from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
6 File, Field, Integer, \
7 NumericField, Version, Document, JavaError, IndexSearcher, \
8 QueryParser, PerFieldAnalyzerWrapper, \
9 SimpleAnalyzer, PolishAnalyzer, ArrayList, \
10 KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
11 BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
12 HashSet, BooleanClause, Term, CharTermAttribute, \
13 PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
14 FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
15 SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
16 BooleanFilter, FilterClause, QueryWrapperFilter, \
17 initVM, CLASSPATH, JArray
21 JVM = initVM(CLASSPATH)
27 from librarian import dcparser
28 from librarian.parser import WLDocument
29 from lxml import etree
30 import catalogue.models
31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
32 from multiprocessing.pool import ThreadPool
33 from threading import current_thread
34 from itertools import chain
38 log = logging.getLogger('search')
40 class WLAnalyzer(PerFieldAnalyzerWrapper):
42 polish = PolishAnalyzer(Version.LUCENE_34)
43 # polish_gap.setPositionIncrementGap(999)
45 simple = SimpleAnalyzer(Version.LUCENE_34)
46 # simple_gap.setPositionIncrementGap(999)
48 keyword = KeywordAnalyzer(Version.LUCENE_34)
50 # not sure if needed: there's NOT_ANALYZED meaning basically the same
52 PerFieldAnalyzerWrapper.__init__(self, polish)
54 self.addAnalyzer("tags", simple)
55 self.addAnalyzer("technical_editors", simple)
56 self.addAnalyzer("editors", simple)
57 self.addAnalyzer("url", keyword)
58 self.addAnalyzer("source_url", keyword)
59 self.addAnalyzer("source_name", simple)
60 self.addAnalyzer("publisher", simple)
61 self.addAnalyzer("authors", simple)
62 self.addAnalyzer("title", simple)
64 self.addAnalyzer("is_book", keyword)
65 # shouldn't the title have two forms? _pl and simple?
67 self.addAnalyzer("themes", simple)
68 self.addAnalyzer("themes_pl", polish)
70 self.addAnalyzer("tag_name", simple)
71 self.addAnalyzer("tag_name_pl", polish)
73 self.addAnalyzer("translators", simple)
75 self.addAnalyzer("KEYWORD", keyword)
76 self.addAnalyzer("SIMPLE", simple)
77 self.addAnalyzer("POLISH", polish)
80 class IndexStore(object):
82 Provides access to search index.
84 self.store - lucene index directory
88 self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
90 def make_index_dir(self):
92 os.makedirs(settings.SEARCH_INDEX)
93 except OSError as exc:
94 if exc.errno == errno.EEXIST:
102 class IndexChecker(IndexStore):
104 IndexStore.__init__(self)
107 checker = CheckIndex(self.store)
108 status = checker.checkIndex()
112 class Snippets(object):
114 This class manages snippet files for an indexed object (book).
115 The snippets are concatenated together, and their positions and
116 lengths are kept in lucene index fields.
118 SNIPPET_DIR = "snippets"
120 def __init__(self, book_id, revision=None):
122 os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
123 except OSError as exc:
124 if exc.errno == errno.EEXIST:
127 self.book_id = book_id
128 self.revision = revision
133 if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
134 else: fn = "%d" % self.book_id
136 return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
138 def open(self, mode='r'):
140 Open the snippet file. Call .close() afterwards.
146 if os.path.exists(self.path):
149 if not os.path.exists(self.path):
153 self.file = open(self.path, mode)
157 def add(self, snippet):
159 Append a snippet (unicode) to the snippet file.
160 Return a (position, length) tuple
162 txt = snippet.encode('utf-8')
165 pos = (self.position, l)
171 Given a (position, length) tuple, return the unicode
172 snippet stored there.
174 self.file.seek(pos[0], 0)
175 txt = self.file.read(pos[1]).decode('utf-8')
179 """Close snippet file"""
194 class BaseIndex(IndexStore):
197 Provides basic operations on index: opening, closing, optimizing.
199 def __init__(self, analyzer=None):
200 super(BaseIndex, self).__init__()
203 analyzer = WLAnalyzer()
204 self.analyzer = analyzer
206 def open(self, timeout=None):
208 raise Exception("Index is already opened")
209 conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
211 conf.setWriteLockTimeout(long(timeout))
212 self.index = IndexWriter(self.store, conf)
216 self.index.optimize()
220 self.index.optimize()
221 except JavaError as je:
222 log.error("Error during optimize phase, check index: %s" % je)
227 index_changed.send_robust(self)
229 super(BaseIndex, self).close()
235 def __exit__(self, type, value, tb):
239 index_changed = Signal()
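# BaseIndex subclasses are meant to be used as context managers (see __exit__
# above); a hedged sketch, assuming __enter__ opens the writer as usual:
#
#     with Index() as index:
#         index.index_tags()    # work against the open IndexWriter
#     # on exit the index is optimized and closed, and index_changed is sent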
242 class Index(BaseIndex):
244 Class indexing books.
246 def __init__(self, analyzer=None):
247 super(Index, self).__init__(analyzer)
249 def index_tags(self, *tags, **kw):
251 Re-index the global tag list.
252 Removes all tags from the index, then indexes them again.
253 Indexed fields include: id, name (with and without Polish stems), category.
255 remove_only = kw.get('remove_only', False)
256 # first, remove tags from index.
260 b_id_cat = BooleanQuery()
262 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
263 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
265 if isinstance(tag, PDCounterAuthor):
266 q_cat = TermQuery(Term('tag_category', 'pd_author'))
267 elif isinstance(tag, PDCounterBook):
268 q_cat = TermQuery(Term('tag_category', 'pd_book'))
270 q_cat = TermQuery(Term('tag_category', tag.category))
271 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
273 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
275 q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
276 self.index.deleteDocuments(q)
279 # then add them [all or just one passed]
281 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
282 PDCounterAuthor.objects.all(), \
283 PDCounterBook.objects.all())
286 if isinstance(tag, PDCounterAuthor):
288 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
289 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
290 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
291 doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
292 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
293 self.index.addDocument(doc)
294 elif isinstance(tag, PDCounterBook):
296 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
297 doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
298 doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
299 doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
300 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
301 self.index.addDocument(doc)
304 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
305 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
306 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
307 doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
308 self.index.addDocument(doc)
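# Hedged usage sketch for index_tags (some_tag below is a hypothetical Tag
# instance; assumes an open writer):
#
#     with Index() as index:
#         index.index_tags()                            # rebuild the whole tag index
#         index.index_tags(some_tag, remove_only=True)  # or just drop one tag's documents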
310 def create_book_doc(self, book):
312 Create a lucene document referring to the book id.
315 doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
316 if book.parent is not None:
317 doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
320 def remove_book(self, book_or_id, remove_snippets=True):
321 """Removes a book from search index.
322 book - Book instance."""
323 if isinstance(book_or_id, catalogue.models.Book):
324 book_id = book_or_id.id
328 q = NumericRangeQuery.newIntRange("book_id", book_id, book_id, True, True)
329 self.index.deleteDocuments(q)
332 snippets = Snippets(book_id)
335 def index_book(self, book, book_info=None, overwrite=True):
338 Creates a lucene document for extracted metadata
339 and calls self.index_content() to index the contents of the book.
342 # we don't remove snippets, since they might still be needed by
343 # threads using a not-yet-reopened index
344 self.remove_book(book, remove_snippets=False)
346 book_doc = self.create_book_doc(book)
347 meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
348 # let's not index it - it's only used for extracting publish date
349 del meta_fields['source_name']
351 for f in meta_fields.values():
352 if isinstance(f, list) or isinstance(f, tuple):
357 self.index.addDocument(book_doc)
360 self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
365 'dramat_wierszowany_l',
366 'dramat_wierszowany_lp',
367 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
371 ignore_content_tags = [
373 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
375 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
378 footnote_tags = ['pa', 'pt', 'pr', 'pe']
380 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
382 published_date_re = re.compile("([0-9]+)[\]. ]*$")
384 def extract_metadata(self, book, book_info=None, dc_only=None):
386 Extracts metadata from the book and returns a map of fields keyed by field name
390 if book_info is None:
391 book_info = dcparser.parse(open(book.xml_file.path))
393 fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
394 fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
395 fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
398 for field in dcparser.BookInfo.FIELDS:
399 if dc_only and field.name not in dc_only:
401 if hasattr(book_info, field.name):
402 if not getattr(book_info, field.name):
404 # since no type information is available, we use validator
405 type_indicator = field.validator
406 if type_indicator == dcparser.as_unicode:
407 s = getattr(book_info, field.name)
411 fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
412 except JavaError as je:
413 raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
414 elif type_indicator == dcparser.as_person:
415 p = getattr(book_info, field.name)
416 if isinstance(p, dcparser.Person):
419 persons = ', '.join(map(unicode, p))
420 fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
421 elif type_indicator == dcparser.as_date:
422 dt = getattr(book_info, field.name)
423 fields[field.name] = Field(field.name, "%04d%02d%02d" %\
424 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
428 if hasattr(book_info, 'source_name') and book_info.source_name:
429 match = self.published_date_re.search(book_info.source_name)
430 if match is not None:
431 pd = str(match.groups()[0])
433 fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
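# Illustrative example only: for a source_name ending in e.g. ", Warszawa 1937."
# published_date_re captures the trailing digit run, so published_date == "1937".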
437 def add_gaps(self, fields, fieldname):
439 Interposes a list of fields with gap fields (indexed spaces) and returns it.
440 This allows phrase queries which do not cross the gaps (when slop is 0).
444 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
445 return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
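# Hedged illustration of the gap trick: for two "tags" values [u"Lalka", u"Prus"],
# add_gaps yields roughly
#     Field("tags", u"Lalka"), Field("tags", u" "), Field("tags", u"Prus")
# so a slop-0 phrase query cannot match across the boundary between tag values.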
447 def get_master(self, root):
449 Returns the first master tag from an etree.
451 for master in root.iter():
452 if master.tag in self.master_tags:
455 def index_content(self, book, book_fields=[]):
457 Walks the book XML and extracts content from it.
458 Adds parts for each header tag and for each fragment.
460 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
461 root = wld.edoc.getroot()
463 master = self.get_master(root)
467 def walker(node, ignore_tags=[]):
469 if node.tag not in ignore_tags:
470 yield node, None, None
471 if node.text is not None:
472 yield None, node.text, None
473 for child in list(node):
474 for b, t, e in walker(child):
476 yield None, None, node
478 if node.tail is not None:
479 yield None, node.tail, None
482 def fix_format(text):
483 # separator = [u" ", u"\t", u".", u";", u","]
484 if isinstance(text, list):
485 # need to join it first
486 text = filter(lambda s: s is not None, text)
487 text = u' '.join(text)
488 # for i in range(len(text)):
490 # if text[i][0] not in separator\
491 # and text[i - 1][-1] not in separator:
492 # text.insert(i, u" ")
494 return re.sub("(?m)/$", "", text)
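# Illustrative example: verse lines end with a slash, so
#     fix_format(u"Wers pierwszy/\nWers drugi/")
# returns u"Wers pierwszy\nWers drugi" (a list argument is joined first).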
496 def add_part(snippets, **fields):
497 doc = self.create_book_doc(book)
498 for f in book_fields:
501 doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
502 doc.add(NumericField("header_span", Field.Store.YES, True)\
503 .setIntValue('header_span' in fields and fields['header_span'] or 1))
504 doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
506 doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
507 Field.TermVector.WITH_POSITIONS_OFFSETS))
509 snip_pos = snippets.add(fields["content"])
510 doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
511 doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
512 if snippets.revision:
513 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
515 if 'fragment_anchor' in fields:
516 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
517 Field.Store.YES, Field.Index.NOT_ANALYZED))
519 if 'themes' in fields:
520 themes, themes_pl = zip(*[
521 (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
522 Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
523 for theme in fields['themes']])
525 themes = self.add_gaps(themes, 'themes')
526 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
536 if isinstance(s, unicode):
537 return s.encode('utf-8')
542 snippets = Snippets(book.id).open('w')
544 for header, position in zip(list(master), range(len(master))):
546 if header.tag in self.skip_header_tags:
548 if header.tag is etree.Comment:
555 def all_content(text):
556 for frag in fragments.values():
557 frag['content'].append(text)
559 handle_text = [all_content]
562 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
564 if start is not None and start.tag in self.footnote_tags:
566 def collect_footnote(t):
568 handle_text.append(collect_footnote)
569 elif end is not None and footnote != [] and end.tag in self.footnote_tags:
571 doc = add_part(snippets, header_index=position, header_type=header.tag,
572 content=u''.join(footnote),
573 is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
575 self.index.addDocument(doc)
576 #print "@ footnote text: %s" % footnote
579 # handle fragments and themes.
580 if start is not None and start.tag == 'begin':
581 fid = start.attrib['id'][1:]
582 fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
584 # themes for this fragment
585 elif start is not None and start.tag == 'motyw':
586 fid = start.attrib['id'][1:]
587 handle_text.append(None)
588 if start.text is not None:
589 fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
590 elif end is not None and end.tag == 'motyw':
593 elif start is not None and start.tag == 'end':
594 fid = start.attrib['id'][1:]
595 if fid not in fragments:
596 continue # a broken <end> node, skip it
597 frag = fragments[fid]
598 if frag['themes'] == []:
599 continue # empty themes list.
602 doc = add_part(snippets,
603 header_type=frag['start_header'],
604 header_index=frag['start_section'],
605 header_span=position - frag['start_section'] + 1,
607 content=fix_format(frag['content']),
608 themes=frag['themes'])
609 #print '@ FRAG %s' % frag['content']
610 self.index.addDocument(doc)
614 if text is not None and handle_text != []:
615 hdl = handle_text[-1]
619 # in the end, add a section text.
620 doc = add_part(snippets, header_index=position, header_type=header.tag,
621 content=fix_format(content))
622 #print '@ CONTENT: %s' % fix_format(content)
624 self.index.addDocument(doc)
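# Hedged end-to-end sketch: indexing a single catalogue Book (assumes `book`
# is a catalogue.models.Book with an xml_file and a writable index directory):
#
#     with Index() as index:
#         index.index_book(book)   # one metadata document plus per-section documents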
630 def log_exception_wrapper(f):
635 log.error("Error in indexing thread: %s" % e)
636 traceback.print_exc()
641 class ReusableIndex(Index):
643 Works like Index, but does not close/optimize the Lucene index
644 until program exit (uses an atexit hook).
645 This is useful for the importbooks command.
647 If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
651 def open(self, analyzer=None, **kw):
652 if ReusableIndex.index:
653 self.index = ReusableIndex.index
655 Index.open(self, analyzer, **kw)
656 ReusableIndex.index = self.index
657 atexit.register(ReusableIndex.close_reusable)
659 # def index_book(self, *args, **kw):
660 # job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
661 # ReusableIndex.pool_jobs.append(job)
664 def close_reusable():
665 if ReusableIndex.index:
666 ReusableIndex.index.optimize()
667 ReusableIndex.index.close()
668 ReusableIndex.index = None
670 index_changed.send_robust(None)
673 if ReusableIndex.index:
674 ReusableIndex.index.commit()
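# Hedged usage sketch for ReusableIndex (books_to_import is hypothetical):
#
#     idx = ReusableIndex()
#     idx.open()
#     for book in books_to_import:
#         idx.index_book(book)
#     # the shared writer is optimized and closed by the atexit hook,
#     # or explicitly via ReusableIndex.close_reusable()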
677 class JoinSearch(object):
679 This mixin could be used to handle block join queries.
682 def __init__(self, *args, **kw):
683 super(JoinSearch, self).__init__(*args, **kw)
685 def wrapjoins(self, query, fields=[]):
687 This function modifies the query recursively, so that
688 contained Term and Phrase queries which match the
689 provided fields are wrapped in a BlockJoinQuery
690 and thus delegated to child documents.
692 if BooleanQuery.instance_(query):
693 qs = BooleanQuery.cast_(query)
695 clause = BooleanClause.cast_(clause)
696 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
700 query.extractTerms(termset)
703 if t.field() not in fields:
705 return BlockJoinQuery(query, self.parent_filter,
706 BlockJoinQuery.ScoreMode.Total)
708 def bsearch(self, query, max_results=50):
709 q = self.query(query)
710 bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
712 tops = self.searcher.search(bjq, max_results)
714 for found in tops.scoreDocs:
715 doc = self.searcher.doc(found.doc)
716 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
717 return (bks, tops.totalHits)
720 class SearchResult(object):
721 def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
722 if tokens_cache is None: tokens_cache = {}
727 self._score = scoreDocs.score
732 self._processed_hits = None # processed hits
734 stored = search.searcher.doc(scoreDocs.doc)
735 self.book_id = int(stored.get("book_id"))
737 pd = stored.get("published_date")
739 self.published_date = int(pd)
741 self.published_date = 0
743 header_type = stored.get("header_type")
744 # we have a content hit in some header or fragment
745 if header_type is not None:
746 sec = (header_type, int(stored.get("header_index")))
747 header_span = stored.get('header_span')
748 header_span = header_span is not None and int(header_span) or 1
750 fragment = stored.get("fragment_anchor")
753 snippets = snippets.replace("/\n", "\n")
754 hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
756 self._hits.append(hit)
759 self.searched = searched
760 self.tokens_cache = tokens_cache
764 return self._score * self.boost
766 def merge(self, other):
767 if self.book_id != other.book_id:
768 raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
769 self._hits += other._hits
770 if other.score > self.score:
771 self._score = other._score
775 if hasattr(self, '_book'):
777 return catalogue.models.Book.objects.get(id=self.book_id)
779 book = property(get_book)
783 if self._processed_hits is not None:
784 return self._processed_hits
793 # to sections and fragments
794 frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
796 sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
798 # sections not covered by fragments
799 sect = filter(lambda s: 0 == len(filter(
800 lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
801 and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
806 def remove_duplicates(lst, keyfn, compare):
811 if compare(els[eif], e) >= 1:
816 # remove fragments with duplicated fid's and duplicated snippets
817 frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
818 frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
819 lambda a, b: cmp(a[SCORE], b[SCORE]))
821 # remove duplicate sections
825 si = s[POSITION][POSITION_INDEX]
828 if sections[si]['score'] >= s[SCORE]:
831 m = {'score': s[SCORE],
832 'section_number': s[POSITION][POSITION_INDEX] + 1,
837 hits = sections.values()
841 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
842 except catalogue.models.Fragment.DoesNotExist:
846 # Figure out if we were searching for a token matching some word in theme name.
847 themes = frag.tags.filter(category='theme')
849 if self.searched is not None:
850 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
852 name_tokens = self.search.get_tokens(theme.name, 'POLISH')
855 if not theme in themes_hit:
856 themes_hit.append(theme)
859 m = {'score': f[SCORE],
861 'section_number': f[POSITION][POSITION_INDEX] + 1,
863 'themes_hit': themes_hit
868 hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
870 self._processed_hits = hits
874 def __unicode__(self):
875 return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
878 def aggregate(*result_lists):
880 for rl in result_lists:
882 if r.book_id in books:
883 books[r.book_id].merge(r)
886 return books.values()
888 def __cmp__(self, other):
889 c = cmp(self.score, other.score)
891 # this is inverted, because earlier date is better
892 return cmp(other.published_date, self.published_date)
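# Hedged example of combining partial results from several queries
# (phrase_hits and everywhere_hits are hypothetical lists of SearchResult):
#
#     results = SearchResult.aggregate(phrase_hits, everywhere_hits)
#     results.sort(reverse=True)   # __cmp__ sorts by score, then by earlier publication date
#     for r in results:
#         print r.book, r.score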
899 Given some hint information (things we already know about our search
900 target - like author, title (a specific book), epoch, genre, kind),
901 we can narrow down the search using filters.
903 def __init__(self, search):
905 Accepts a Searcher instance.
912 def books(self, *books):
914 Give a hint that we search these books.
918 def tags(self, tags):
920 Give a hint that these Tag objects (a list of)
924 if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
925 lst = self.book_tags.get(t.category, [])
927 self.book_tags[t.category] = lst
928 if t.category in ['theme', 'theme_pl']:
929 self.part_tags.append(t)
931 def tag_filter(self, tags, field='tags'):
933 Given a list of tags and an optional field (they are normally in the 'tags' field),
934 returns a filter accepting only books with specific tags.
939 toks = self.search.get_tokens(tag.name, field=field)
940 tag_phrase = PhraseQuery()
942 tag_phrase.add(Term(field, tok))
943 q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
945 return QueryWrapperFilter(q)
947 def book_filter(self):
949 Filters using book tags (all tag kinds except themes)
951 tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
953 return self.tag_filter(tags)
957 def part_filter(self):
959 This filter can be used to look for book parts.
960 It filters on book id and/or themes.
964 fs.append(self.tag_filter(self.part_tags, field='themes'))
966 if self._books != []:
968 for b in self._books:
969 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
970 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
973 return Search.chain_filters(fs)
975 def should_search_for_book(self):
976 return self._books == []
978 def just_search_in(self, all):
979 """Holds logic to figure out which indexes should be search, when we have some hinst already"""
982 if field == 'authors' and 'author' in self.book_tags:
984 if field == 'title' and self._books != []:
986 if (field == 'themes' or field == 'themes_pl') and self.part_tags:
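# Hedged usage sketch for Hint (tag_list and book are hypothetical):
#
#     search = Search()
#     hint = Hint(search)
#     hint.tags(tag_list)            # author/title/epoch/genre/kind narrow the book filter
#     hint.books(book)               # restrict to a specific book
#     flt = hint.book_filter()       # filter for book-level searches
#     part_flt = hint.part_filter()  # filter for part/fragment searches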
992 class Search(IndexStore):
996 def __init__(self, default_field="content"):
997 IndexStore.__init__(self)
998 self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
999 # self.analyzer = WLAnalyzer()
1000 reader = IndexReader.open(self.store, True)
1001 self.searcher = IndexSearcher(reader)
1002 self.parser = QueryParser(Version.LUCENE_34, default_field,
1005 self.parent_filter = TermsFilter()
1006 self.parent_filter.addTerm(Term("is_book", "true"))
1007 index_changed.connect(self.reopen)
1010 reader = self.searcher.getIndexReader()
1011 self.searcher.close()
1013 super(Search, self).close()
1014 index_changed.disconnect(self.reopen)
1016 def reopen(self, **unused):
1017 reader = self.searcher.getIndexReader()
1018 rdr = reader.reopen()
1019 if not rdr.equals(reader):
1020 log.debug('Reopening index')
1021 oldsearch = self.searcher
1022 self.searcher = IndexSearcher(rdr)
1026 def query(self, query):
1027 """Parse query in default Lucene Syntax. (for humans)
1029 return self.parser.parse(query)
1031 def simple_search(self, query, max_results=50):
1032 """Runs a query for books using lucene syntax. (for humans)
1033 Returns (books, total_hits)
1036 tops = self.searcher.search(self.query(query), max_results)
1038 for found in tops.scoreDocs:
1039 doc = self.searcher.doc(found.doc)
1040 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1041 return (bks, tops.totalHits)
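# Hedged example (for humans, as the docstring says):
#
#     search = Search()
#     books, total = search.simple_search(u"pan tadeusz")   # -> (list of Book, totalHits)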
1043 def get_tokens(self, searched, field='content', cached=None):
1044 """returns tokens analyzed by a proper (for a field) analyzer
1045 argument can be: StringReader, string/unicode, or tokens. In the last case
1046 they will just be returned (so we can reuse tokens, if we don't change the analyzer)
1048 if cached is not None and field in cached:
1049 return cached[field]
1051 if isinstance(searched, str) or isinstance(searched, unicode):
1052 searched = StringReader(searched)
1053 elif isinstance(searched, list):
1057 tokens = self.analyzer.reusableTokenStream(field, searched)
1059 while tokens.incrementToken():
1060 cta = tokens.getAttribute(CharTermAttribute.class_)
1061 toks.append(cta.toString())
1063 if cached is not None:
1064 cached[field] = toks
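# Illustrative only (actual tokens depend on the analyzer bound to the field):
#
#     search = Search()
#     search.get_tokens(u"Pan Tadeusz", field='SIMPLE')   # -> [u'pan', u'tadeusz']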
1069 def fuzziness(self, fuzzy):
1070 """Helper method to sanitize fuzziness"""
1073 if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
1078 def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
1080 Return a PhraseQuery with a series of tokens.
1083 phrase = MultiPhraseQuery()
1085 term = Term(field, t)
1086 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
1090 ft = fuzzterm.term()
1092 fuzzterms.append(ft)
1093 if not fuzzterm.next(): break
1095 phrase.add(JArray('object')(fuzzterms, Term))
1099 phrase = PhraseQuery()
1100 phrase.setSlop(slop)
1102 term = Term(field, t)
1107 def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
1109 Returns term queries joined by a boolean query.
1110 modal - applies to the boolean query
1111 fuzzy - should the query be fuzzy.
1115 term = Term(field, t)
1117 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1119 term = TermQuery(term)
1120 q.add(BooleanClause(term, modal))
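# Hedged sketch of building queries from analyzed tokens:
#
#     search = Search()
#     toks = search.get_tokens(u"lalka", field='title')
#     phrase_q = search.make_phrase(toks, field='title', slop=2)
#     term_q = search.make_term_query(toks, field='title', modal=BooleanClause.Occur.MUST)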
1123 def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1124 filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1125 if filters is None: filters = []
1126 if tokens_cache is None: tokens_cache = {}
1128 tokens = self.get_tokens(searched, field, cached=tokens_cache)
1130 query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1132 filters.append(self.term_filter(Term('is_book', 'true')))
1133 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1135 return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
1137 def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1138 filters=None, tokens_cache=None, boost=None, snippets=True):
1139 if filters is None: filters = []
1140 if tokens_cache is None: tokens_cache = {}
1143 filters.append(self.term_filter(Term('is_book', 'true')))
1145 query = BooleanQuery()
1148 tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1150 query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1151 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1153 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1155 return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1156 snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1158 def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1160 Search for perfect book matches. Just see if the query matches some author or title,
1161 taking hints into account.
1163 fields_to_search = ['authors', 'title']
1166 if not hint.should_search_for_book():
1168 fields_to_search = hint.just_search_in(fields_to_search)
1169 only_in = hint.book_filter()
1171 qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1175 top = self.searcher.search(q,
1176 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1178 for found in top.scoreDocs:
1179 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1182 def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1183 fields_to_search = ['tags', 'authors', 'title']
1187 if not hint.should_search_for_book():
1189 fields_to_search = hint.just_search_in(fields_to_search)
1190 only_in = hint.book_filter()
1192 tokens = self.get_tokens(searched, field='SIMPLE')
1196 for fld in fields_to_search:
1197 q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1198 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1201 top = self.searcher.search(q,
1202 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1204 for found in top.scoreDocs:
1205 books.append(SearchResult(self, found, how_found="search_book"))
1209 def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1211 Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1212 some part/fragment of the book.
1214 qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1218 flt = hint.part_filter()
1222 top = self.searcher.search(q,
1223 self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1226 for found in top.scoreDocs:
1227 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1231 def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1233 Tries to use search terms to match different fields of a book (or its parts).
1234 E.g. one word can be an author's surname, another a part of the title, and the rest
1235 some words from the third chapter.
1237 if tokens_cache is None: tokens_cache = {}
1242 only_in = hint.part_filter()
1244 # content only query : themes x content
1247 tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1248 tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1250 # only search in themes when we do not already filter by themes
1251 if hint is None or hint.just_search_in(['themes']) != []:
1252 q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1253 fuzzy=fuzzy), BooleanClause.Occur.MUST))
1255 q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1256 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1258 topDocs = self.searcher.search(q, only_in, max_results)
1259 for found in topDocs.scoreDocs:
1260 books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1262 # query themes/content x author/title/tags
1264 in_content = BooleanQuery()
1265 in_meta = BooleanQuery()
1267 for fld in ['themes_pl', 'content']:
1268 in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1270 for fld in ['tags', 'authors', 'title']:
1271 in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1273 q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1274 q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1276 topDocs = self.searcher.search(q, only_in, max_results)
1277 for found in topDocs.scoreDocs:
1278 books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1282 # def multisearch(self, query, max_results=50):
1285 # - (phrase) OR -> content
1288 # - (keywords) -> authors
1293 # queryreader = StringReader(query)
1294 # tokens = self.get_tokens(queryreader)
1296 # top_level = BooleanQuery()
1297 # Should = BooleanClause.Occur.SHOULD
1299 # phrase_level = BooleanQuery()
1300 # phrase_level.setBoost(1.3)
1302 # p_content = self.make_phrase(tokens, joined=True)
1303 # p_title = self.make_phrase(tokens, 'title')
1304 # p_author = self.make_phrase(tokens, 'author')
1306 # phrase_level.add(BooleanClause(p_content, Should))
1307 # phrase_level.add(BooleanClause(p_title, Should))
1308 # phrase_level.add(BooleanClause(p_author, Should))
1310 # kw_level = BooleanQuery()
1312 # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1313 # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1314 # kw_level.add(j_themes, Should)
1315 # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1316 # j_con = self.make_term_query(tokens, joined=True)
1317 # kw_level.add(j_con, Should)
1319 # top_level.add(BooleanClause(phrase_level, Should))
1320 # top_level.add(BooleanClause(kw_level, Should))
1324 def get_snippets(self, scoreDoc, query, field='content'):
1326 Returns a snippet for the found scoreDoc.
1328 htmlFormatter = SimpleHTMLFormatter()
1329 highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1331 stored = self.searcher.doc(scoreDoc.doc)
1333 position = stored.get('snippets_position')
1334 length = stored.get('snippets_length')
1335 if position is None or length is None:
1337 revision = stored.get('snippets_revision')
1338 if revision: revision = int(revision)
1341 book_id = int(stored.get('book_id'))
1342 snippets = Snippets(book_id, revision=revision)
1347 log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
1352 text = snippets.get((int(position),
1357 tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1358 # highlighter.getBestTextFragments(tokenStream, text, False, 10)
1359 snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1361 except Exception as e:
1363 if hasattr(e, 'getJavaException'):
1364 e2 = unicode(e.getJavaException())
1365 raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1370 def enum_to_array(enum):
1372 Converts a lucene TermEnum to array of Terms, suitable for
1381 if not enum.next(): break
1384 return JArray('object')(terms, Term)
1386 def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
1388 Search for Tag objects using query.
1391 filt = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1392 tops = self.searcher.search(query, filt, max_results)
1395 for found in tops.scoreDocs:
1396 doc = self.searcher.doc(found.doc)
1397 is_pdcounter = doc.get('is_pdcounter')
1398 category = doc.get('tag_category')
1400 if is_pdcounter == 'true':
1401 if category == 'pd_author':
1402 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1403 elif category == 'pd_book':
1404 tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1405 tag.category = 'pd_book' # make it look more like a tag.
1407 print "Warning: cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1409 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1410 # don't add the pdcounter tag if same tag already exists
1411 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1413 except catalogue.models.Tag.DoesNotExist: pass
1414 except PDCounterAuthor.DoesNotExist: pass
1415 except PDCounterBook.DoesNotExist: pass
1417 log.debug('search_tags: %s' % tags)
1421 def search_books(self, query, filt=None, max_results=10):
1423 Searches for Book objects using query
1426 tops = self.searcher.search(query, filt, max_results)
1427 for found in tops.scoreDocs:
1428 doc = self.searcher.doc(found.doc)
1430 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1431 except catalogue.models.Book.DoesNotExist: pass
1434 def make_prefix_phrase(self, toks, field):
1435 q = MultiPhraseQuery()
1436 for i in range(len(toks)):
1437 t = Term(field, toks[i])
1438 if i == len(toks) - 1:
1439 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1449 def term_filter(term, inverse=False):
1450 only_term = TermsFilter()
1451 only_term.addTerm(term)
1454 neg = BooleanFilter()
1455 neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1460 def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1462 Return auto-complete hints for tags
1463 using prefix search.
1465 toks = self.get_tokens(string, field='SIMPLE')
1466 top = BooleanQuery()
1468 for field in ['tag_name', 'tag_name_pl']:
1470 q = self.make_prefix_phrase(toks, field)
1472 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1473 top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1475 no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1477 return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
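# Hedged example of tag auto-completion (given a Search instance `search`):
#
#     search.hint_tags(u"mick", max_results=10)   # prefix search over the tag_name fields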
1479 def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1481 Returns auto-complete hints for book titles,
1482 because we do not index 'pseudo' title-tags.
1485 toks = self.get_tokens(string, field='SIMPLE')
1488 q = self.make_prefix_phrase(toks, 'title')
1490 q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1492 return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1495 def chain_filters(filters, op=ChainedFilter.AND):
1497 Chains a filter list together
1499 filters = filter(lambda x: x is not None, filters)
1500 if not filters:
1502 chf = ChainedFilter(JArray('object')(filters, Filter), op)
1505 def filtered_categories(self, tags):
1507 Return a list of tag categories present in the tags list.
1511 cats[t.category] = True