1 # -*- coding: utf-8 -*-
3 from django.conf import settings
4 from django.dispatch import Signal
from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
21 JVM = initVM(CLASSPATH)
27 from librarian import dcparser
28 from librarian.parser import WLDocument
29 from lxml import etree
30 import catalogue.models
31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
32 from multiprocessing.pool import ThreadPool
33 from threading import current_thread
34 from itertools import chain
38 log = logging.getLogger('search')
40 class WLAnalyzer(PerFieldAnalyzerWrapper):
42 polish = PolishAnalyzer(Version.LUCENE_34)
43 # polish_gap.setPositionIncrementGap(999)
45 simple = SimpleAnalyzer(Version.LUCENE_34)
46 # simple_gap.setPositionIncrementGap(999)
48 keyword = KeywordAnalyzer(Version.LUCENE_34)
50 # not sure if needed: there's NOT_ANALYZED meaning basically the same
52 PerFieldAnalyzerWrapper.__init__(self, polish)
54 self.addAnalyzer("tags", simple)
55 self.addAnalyzer("technical_editors", simple)
56 self.addAnalyzer("editors", simple)
57 self.addAnalyzer("url", keyword)
58 self.addAnalyzer("source_url", keyword)
59 self.addAnalyzer("source_name", simple)
60 self.addAnalyzer("publisher", simple)
61 self.addAnalyzer("authors", simple)
62 self.addAnalyzer("title", simple)
64 self.addAnalyzer("is_book", keyword)
65 # shouldn't the title have two forms? _pl and simple?
67 self.addAnalyzer("themes", simple)
68 self.addAnalyzer("themes_pl", polish)
70 self.addAnalyzer("tag_name", simple)
71 self.addAnalyzer("tag_name_pl", polish)
73 self.addAnalyzer("translators", simple)
75 self.addAnalyzer("KEYWORD", keyword)
76 self.addAnalyzer("SIMPLE", simple)
77 self.addAnalyzer("POLISH", polish)
80 class IndexStore(object):
Provides access to the search index.
self.store - the Lucene index directory
88 self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
90 def make_index_dir(self):
92 os.makedirs(settings.SEARCH_INDEX)
93 except OSError as exc:
94 if exc.errno == errno.EEXIST:
102 class IndexChecker(IndexStore):
104 IndexStore.__init__(self)
107 checker = CheckIndex(self.store)
108 status = checker.checkIndex()
112 class Snippets(object):
This class manages snippet files for an indexed object (book).
The snippets are concatenated together, and their positions and
lengths are kept in Lucene index fields.
118 SNIPPET_DIR = "snippets"
120 def __init__(self, book_id, revision=None):
122 os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
123 except OSError as exc:
124 if exc.errno == errno.EEXIST:
127 self.book_id = book_id
128 self.revision = revision
133 if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
134 else: fn = "%d" % self.book_id
136 return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
138 def open(self, mode='r'):
140 Open the snippet file. Call .close() afterwards.
146 if os.path.exists(self.path):
149 if not os.path.exists(self.path):
153 self.file = open(self.path, mode)
157 def add(self, snippet):
159 Append a snippet (unicode) to the snippet file.
160 Return a (position, length) tuple
162 txt = snippet.encode('utf-8')
165 pos = (self.position, l)
Given a (position, length) tuple, return the snippet stored there
as a unicode string.
174 self.file.seek(pos[0], 0)
175 txt = self.file.read(pos[1]).decode('utf-8')
179 """Close snippet file"""
194 class BaseIndex(IndexStore):
Provides basic operations on the index: opening, closing, optimizing.
199 def __init__(self, analyzer=None):
200 super(BaseIndex, self).__init__()
203 analyzer = WLAnalyzer()
204 self.analyzer = analyzer
206 def open(self, timeout=None):
208 raise Exception("Index is already opened")
209 conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
211 conf.setWriteLockTimeout(long(timeout))
212 self.index = IndexWriter(self.store, conf)
216 self.index.optimize()
220 self.index.optimize()
except JavaError as je:
222 log.error("Error during optimize phase, check index: %s" % je)
227 index_changed.send_robust(self)
229 super(BaseIndex, self).close()
235 def __exit__(self, type, value, tb):
239 index_changed = Signal()
242 class Index(BaseIndex):
244 Class indexing books.
246 def __init__(self, analyzer=None):
247 super(Index, self).__init__(analyzer)
249 def index_tags(self, *tags, **kw):
Re-index the global tag list.
Removes all tags from the index, then indexes them again.
Indexed fields include: id, name (with and without Polish stems), category.
255 remove_only = kw.get('remove_only', False)
256 # first, remove tags from index.
260 b_id_cat = BooleanQuery()
262 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
263 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
265 if isinstance(tag, PDCounterAuthor):
266 q_cat = TermQuery(Term('tag_category', 'pd_author'))
267 elif isinstance(tag, PDCounterBook):
268 q_cat = TermQuery(Term('tag_category', 'pd_book'))
270 q_cat = TermQuery(Term('tag_category', tag.category))
271 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
273 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
275 q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
276 self.index.deleteDocuments(q)
279 # then add them [all or just one passed]
281 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
282 PDCounterAuthor.objects.all(), \
283 PDCounterBook.objects.all())
286 if isinstance(tag, PDCounterAuthor):
288 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
289 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
290 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
291 doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
292 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
293 self.index.addDocument(doc)
294 elif isinstance(tag, PDCounterBook):
296 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
297 doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
298 doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
299 doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
300 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
301 self.index.addDocument(doc)
304 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
305 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
306 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
307 doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
308 self.index.addDocument(doc)
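# Hedged usage sketch: tags are normally re-indexed in bulk after catalogue
# changes. BaseIndex defines __exit__, so a context-manager form is assumed to
# work here (the index is closed on exit); `some_tag` is a hypothetical Tag
# instance.
#
#   with Index() as index:
#       index.index_tags()                           # rebuild the whole tag list
#   with Index() as index:
#       index.index_tags(some_tag, remove_only=True) # just drop one tag from the index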
310 def create_book_doc(self, book):
Create a Lucene document referring to the book id.
315 doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
316 if book.parent is not None:
317 doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
320 def remove_book(self, book, remove_snippets=True):
321 """Removes a book from search index.
322 book - Book instance."""
323 q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
324 self.index.deleteDocuments(q)
327 snippets = Snippets(book.id)
330 def index_book(self, book, book_info=None, overwrite=True):
333 Creates a lucene document for extracted metadata
334 and calls self.index_content() to index the contents of the book.
# we don't remove snippets, since they might still be needed by
# threads using a not-yet-reopened index
339 self.remove_book(book, remove_snippets=False)
341 book_doc = self.create_book_doc(book)
342 meta_fields = self.extract_metadata(book, book_info, dc_only=['source_name', 'authors', 'title'])
343 # let's not index it - it's only used for extracting publish date
344 del meta_fields['source_name']
346 for f in meta_fields.values():
347 if isinstance(f, list) or isinstance(f, tuple):
352 self.index.addDocument(book_doc)
355 self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
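# Hedged usage sketch (`book` is a hypothetical catalogue.models.Book instance):
#
#   with Index() as index:
#       index.index_book(book)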
360 'dramat_wierszowany_l',
361 'dramat_wierszowany_lp',
362 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
366 ignore_content_tags = [
368 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
370 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
373 footnote_tags = ['pa', 'pt', 'pr', 'pe']
375 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
377 published_date_re = re.compile("([0-9]+)[\]. ]*$")
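# Illustrative note: the regexp captures the trailing run of digits in source_name
# (optionally followed by ']', '.' or spaces) as the publication year, e.g. for a
# source_name ending in u"Lwów 1894." the captured group is "1894".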
379 def extract_metadata(self, book, book_info=None, dc_only=None):
Extracts metadata from the book and returns a map of fields keyed by field name.
385 if book_info is None:
386 book_info = dcparser.parse(open(book.xml_file.path))
388 fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
389 fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
390 fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
393 for field in dcparser.BookInfo.FIELDS:
394 if dc_only and field.name not in dc_only:
396 if hasattr(book_info, field.name):
397 if not getattr(book_info, field.name):
399 # since no type information is available, we use validator
400 type_indicator = field.validator
401 if type_indicator == dcparser.as_unicode:
402 s = getattr(book_info, field.name)
406 fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
407 except JavaError as je:
408 raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
409 elif type_indicator == dcparser.as_person:
410 p = getattr(book_info, field.name)
411 if isinstance(p, dcparser.Person):
414 persons = ', '.join(map(unicode, p))
415 fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
416 elif type_indicator == dcparser.as_date:
417 dt = getattr(book_info, field.name)
418 fields[field.name] = Field(field.name, "%04d%02d%02d" %\
419 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
423 if hasattr(book_info, 'source_name') and book_info.source_name:
424 match = self.published_date_re.search(book_info.source_name)
425 if match is not None:
426 pd = str(match.groups()[0])
428 fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
432 def add_gaps(self, fields, fieldname):
Interleaves a list of fields with gap fields (indexed spaces) and returns the result.
This allows phrase queries that do not cross the gaps (when slop is 0).
439 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
440 return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
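# Illustrative note: for fields [f1, f2, f3] this returns [f1, gap, f2, gap, f3],
# where each gap is a single indexed space in the same field, so a slop-0 phrase
# query cannot match across the boundaries between the original values.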
442 def get_master(self, root):
444 Returns the first master tag from an etree.
446 for master in root.iter():
447 if master.tag in self.master_tags:
450 def index_content(self, book, book_fields=[]):
Walks the book XML and extracts content from it.
Adds parts for each header tag and for each fragment.
455 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
456 root = wld.edoc.getroot()
458 master = self.get_master(root)
462 def walker(node, ignore_tags=[]):
464 if node.tag not in ignore_tags:
465 yield node, None, None
466 if node.text is not None:
467 yield None, node.text, None
468 for child in list(node):
469 for b, t, e in walker(child):
471 yield None, None, node
473 if node.tail is not None:
474 yield None, node.tail, None
477 def fix_format(text):
478 # separator = [u" ", u"\t", u".", u";", u","]
479 if isinstance(text, list):
480 # need to join it first
text = filter(lambda s: s is not None, text)
482 text = u' '.join(text)
483 # for i in range(len(text)):
485 # if text[i][0] not in separator\
486 # and text[i - 1][-1] not in separator:
487 # text.insert(i, u" ")
489 return re.sub("(?m)/$", "", text)
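# Illustrative note (assuming the list is joined as above): fix_format strips the
# trailing verse marker, e.g. fix_format([u"Czego chcesz od nas, Panie/", None])
# gives u"Czego chcesz od nas, Panie".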
491 def add_part(snippets, **fields):
492 doc = self.create_book_doc(book)
493 for f in book_fields:
496 doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
497 doc.add(NumericField("header_span", Field.Store.YES, True)\
498 .setIntValue('header_span' in fields and fields['header_span'] or 1))
499 doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
501 doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
502 Field.TermVector.WITH_POSITIONS_OFFSETS))
504 snip_pos = snippets.add(fields["content"])
505 doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
506 doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
507 if snippets.revision:
508 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
510 if 'fragment_anchor' in fields:
511 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
512 Field.Store.YES, Field.Index.NOT_ANALYZED))
514 if 'themes' in fields:
515 themes, themes_pl = zip(*[
516 (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
517 Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
518 for theme in fields['themes']])
520 themes = self.add_gaps(themes, 'themes')
521 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
531 if isinstance(s, unicode):
532 return s.encode('utf-8')
537 snippets = Snippets(book.id).open('w')
539 for header, position in zip(list(master), range(len(master))):
541 if header.tag in self.skip_header_tags:
543 if header.tag is etree.Comment:
550 def all_content(text):
551 for frag in fragments.values():
552 frag['content'].append(text)
554 handle_text = [all_content]
557 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
559 if start is not None and start.tag in self.footnote_tags:
561 def collect_footnote(t):
563 handle_text.append(collect_footnote)
elif end is not None and footnote and end.tag in self.footnote_tags:
566 doc = add_part(snippets, header_index=position, header_type=header.tag,
567 content=u''.join(footnote),
568 is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
570 self.index.addDocument(doc)
571 #print "@ footnote text: %s" % footnote
574 # handle fragments and themes.
575 if start is not None and start.tag == 'begin':
576 fid = start.attrib['id'][1:]
577 fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
579 # themes for this fragment
580 elif start is not None and start.tag == 'motyw':
581 fid = start.attrib['id'][1:]
582 handle_text.append(None)
583 if start.text is not None:
584 fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
585 elif end is not None and end.tag == 'motyw':
588 elif start is not None and start.tag == 'end':
589 fid = start.attrib['id'][1:]
590 if fid not in fragments:
591 continue # a broken <end> node, skip it
592 frag = fragments[fid]
593 if frag['themes'] == []:
594 continue # empty themes list.
597 doc = add_part(snippets,
598 header_type=frag['start_header'],
599 header_index=frag['start_section'],
600 header_span=position - frag['start_section'] + 1,
602 content=fix_format(frag['content']),
603 themes=frag['themes'])
604 #print '@ FRAG %s' % frag['content']
605 self.index.addDocument(doc)
if text is not None and handle_text:
610 hdl = handle_text[-1]
614 # in the end, add a section text.
615 doc = add_part(snippets, header_index=position, header_type=header.tag,
616 content=fix_format(content))
617 #print '@ CONTENT: %s' % fix_format(content)
619 self.index.addDocument(doc)
625 def log_exception_wrapper(f):
630 log.error("Error in indexing thread: %s" % e)
631 traceback.print_exc()
636 class ReusableIndex(Index):
Works like Index, but does not close/optimize the Lucene index
until program exit (uses an atexit hook).
This is useful for the importbooks command.
If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
646 def open(self, analyzer=None, **kw):
647 if ReusableIndex.index:
648 self.index = ReusableIndex.index
650 Index.open(self, analyzer, **kw)
651 ReusableIndex.index = self.index
652 atexit.register(ReusableIndex.close_reusable)
654 # def index_book(self, *args, **kw):
655 # job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
656 # ReusableIndex.pool_jobs.append(job)
659 def close_reusable():
660 if ReusableIndex.index:
661 ReusableIndex.index.optimize()
662 ReusableIndex.index.close()
663 ReusableIndex.index = None
665 index_changed.send_robust(None)
668 if ReusableIndex.index:
669 ReusableIndex.index.commit()
672 class JoinSearch(object):
674 This mixin could be used to handle block join queries.
677 def __init__(self, *args, **kw):
678 super(JoinSearch, self).__init__(*args, **kw)
680 def wrapjoins(self, query, fields=[]):
This function modifies the query recursively, so that contained
Term and Phrase queries which match the provided fields are wrapped
in a BlockJoinQuery and thus delegated to child documents.
687 if BooleanQuery.instance_(query):
688 qs = BooleanQuery.cast_(query)
690 clause = BooleanClause.cast_(clause)
691 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
695 query.extractTerms(termset)
698 if t.field() not in fields:
700 return BlockJoinQuery(query, self.parent_filter,
701 BlockJoinQuery.ScoreMode.Total)
703 def bsearch(self, query, max_results=50):
704 q = self.query(query)
705 bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
707 tops = self.searcher.search(bjq, max_results)
709 for found in tops.scoreDocs:
710 doc = self.searcher.doc(found.doc)
711 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
712 return (bks, tops.totalHits)
715 class SearchResult(object):
716 def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
717 if tokens_cache is None: tokens_cache = {}
722 self._score = scoreDocs.score
727 self._processed_hits = None # processed hits
729 stored = search.searcher.doc(scoreDocs.doc)
730 self.book_id = int(stored.get("book_id"))
732 pd = stored.get("published_date")
734 self.published_date = int(pd)
736 self.published_date = 0
738 header_type = stored.get("header_type")
# we have a content hit in some header or fragment
740 if header_type is not None:
741 sec = (header_type, int(stored.get("header_index")))
742 header_span = stored.get('header_span')
743 header_span = header_span is not None and int(header_span) or 1
745 fragment = stored.get("fragment_anchor")
748 snippets = snippets.replace("/\n", "\n")
749 hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
751 self._hits.append(hit)
754 self.searched = searched
755 self.tokens_cache = tokens_cache
759 return self._score * self.boost
761 def merge(self, other):
762 if self.book_id != other.book_id:
raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
764 self._hits += other._hits
765 if other.score > self.score:
766 self._score = other._score
770 if hasattr(self, '_book'):
772 return catalogue.models.Book.objects.get(id=self.book_id)
774 book = property(get_book)
778 if self._processed_hits is not None:
779 return self._processed_hits
788 # to sections and fragments
789 frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
791 sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
793 # sections not covered by fragments
794 sect = filter(lambda s: 0 == len(filter(
795 lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
796 and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
801 def remove_duplicates(lst, keyfn, compare):
806 if compare(els[eif], e) >= 1:
811 # remove fragments with duplicated fid's and duplicated snippets
812 frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
813 frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
814 lambda a, b: cmp(a[SCORE], b[SCORE]))
816 # remove duplicate sections
820 si = s[POSITION][POSITION_INDEX]
823 if sections[si]['score'] >= s[SCORE]:
826 m = {'score': s[SCORE],
827 'section_number': s[POSITION][POSITION_INDEX] + 1,
832 hits = sections.values()
836 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
837 except catalogue.models.Fragment.DoesNotExist:
841 # Figure out if we were searching for a token matching some word in theme name.
842 themes = frag.tags.filter(category='theme')
844 if self.searched is not None:
845 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
847 name_tokens = self.search.get_tokens(theme.name, 'POLISH')
850 if not theme in themes_hit:
851 themes_hit.append(theme)
854 m = {'score': f[SCORE],
856 'section_number': f[POSITION][POSITION_INDEX] + 1,
858 'themes_hit': themes_hit
863 hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
865 self._processed_hits = hits
869 def __unicode__(self):
870 return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
873 def aggregate(*result_lists):
875 for rl in result_lists:
877 if r.book_id in books:
878 books[r.book_id].merge(r)
881 return books.values()
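# Hedged usage sketch: results obtained by different strategies for the same query
# can be merged per book and ordered via __cmp__ (score, then earlier publication
# date); assumes `search` is an open Search instance.
#
#   results = SearchResult.aggregate(
#       search.search_perfect_book(u"pan tadeusz"),
#       search.search_everywhere(u"pan tadeusz"))
#   results.sort(reverse=True)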
883 def __cmp__(self, other):
884 c = cmp(self.score, other.score)
886 # this is inverted, because earlier date is better
887 return cmp(other.published_date, self.published_date)
Given some hint information (things we already know) about
our search target - like author, title (a specific book), epoch, genre, kind -
we can narrow down the search using filters.
898 def __init__(self, search):
Accepts a Search instance.
907 def books(self, *books):
Give a hint that we are searching within these books.
913 def tags(self, tags):
Give a hint that these Tag objects (a list) constrain the search.
919 if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
920 lst = self.book_tags.get(t.category, [])
922 self.book_tags[t.category] = lst
923 if t.category in ['theme', 'theme_pl']:
924 self.part_tags.append(t)
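# Hedged usage sketch (`author_tag` is a hypothetical Tag with category 'author'):
#
#   hint = Hint(search)
#   hint.tags([author_tag])   # 'authors' is dropped from searched fields, a tag filter is applied
#   results = search.search_perfect_book(u"lalka", hint=hint)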
926 def tag_filter(self, tags, field='tags'):
Given a list of tags and an optional field (they are normally in the 'tags' field),
returns a filter accepting only books with these specific tags.
934 toks = self.search.get_tokens(tag.name, field=field)
935 tag_phrase = PhraseQuery()
937 tag_phrase.add(Term(field, tok))
938 q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
940 return QueryWrapperFilter(q)
942 def book_filter(self):
Filters using book tags (all tag categories except theme).
946 tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
948 return self.tag_filter(tags)
952 def part_filter(self):
954 This filter can be used to look for book parts.
955 It filters on book id and/or themes.
959 fs.append(self.tag_filter(self.part_tags, field='themes'))
961 if self._books != []:
963 for b in self._books:
964 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
965 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
968 return Search.chain_filters(fs)
970 def should_search_for_book(self):
971 return self._books == []
973 def just_search_in(self, all):
974 """Holds logic to figure out which indexes should be search, when we have some hinst already"""
977 if field == 'authors' and 'author' in self.book_tags:
979 if field == 'title' and self._books != []:
981 if (field == 'themes' or field == 'themes_pl') and self.part_tags:
987 class Search(IndexStore):
991 def __init__(self, default_field="content"):
992 IndexStore.__init__(self)
993 self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
994 # self.analyzer = WLAnalyzer()
995 reader = IndexReader.open(self.store, True)
996 self.searcher = IndexSearcher(reader)
997 self.parser = QueryParser(Version.LUCENE_34, default_field,
1000 self.parent_filter = TermsFilter()
1001 self.parent_filter.addTerm(Term("is_book", "true"))
1002 index_changed.connect(self.reopen)
1005 reader = self.searcher.getIndexReader()
1006 self.searcher.close()
1008 super(Search, self).close()
1009 index_changed.disconnect(self.reopen)
1011 def reopen(self, **unused):
1012 reader = self.searcher.getIndexReader()
1013 rdr = reader.reopen()
1014 if not rdr.equals(reader):
1015 log.debug('Reopening index')
1016 oldsearch = self.searcher
1017 self.searcher = IndexSearcher(rdr)
1021 def query(self, query):
1022 """Parse query in default Lucene Syntax. (for humans)
1024 return self.parser.parse(query)
1026 def simple_search(self, query, max_results=50):
1027 """Runs a query for books using lucene syntax. (for humans)
1028 Returns (books, total_hits)
1031 tops = self.searcher.search(self.query(query), max_results)
1033 for found in tops.scoreDocs:
1034 doc = self.searcher.doc(found.doc)
1035 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1036 return (bks, tops.totalHits)
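# Hedged usage sketch: the query string is plain Lucene syntax, e.g.
#
#   books, total = search.simple_search(u"authors:mickiewicz AND title:tadeusz")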
1038 def get_tokens(self, searched, field='content', cached=None):
1039 """returns tokens analyzed by a proper (for a field) analyzer
1040 argument can be: StringReader, string/unicode, or tokens. In the last case
1041 they will just be returned (so we can reuse tokens, if we don't change the analyzer)
1043 if cached is not None and field in cached:
1044 return cached[field]
1046 if isinstance(searched, str) or isinstance(searched, unicode):
1047 searched = StringReader(searched)
1048 elif isinstance(searched, list):
1052 tokens = self.analyzer.reusableTokenStream(field, searched)
1054 while tokens.incrementToken():
1055 cta = tokens.getAttribute(CharTermAttribute.class_)
1056 toks.append(cta.toString())
1058 if cached is not None:
1059 cached[field] = toks
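# Illustrative note: the per-field analyzers configured in WLAnalyzer decide the
# output; indicative values (not guaranteed):
#
#   search.get_tokens(u"Pan Tadeusz", field='SIMPLE')   # -> [u'pan', u'tadeusz']
#   search.get_tokens(u"Pan Tadeusz", field='POLISH')   # -> Polish-stemmed forms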
1063 def fuzziness(self, fuzzy):
1064 """Helper method to sanitize fuzziness"""
1067 if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
1072 def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
1074 Return a PhraseQuery with a series of tokens.
1077 phrase = MultiPhraseQuery()
1079 term = Term(field, t)
1080 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
1084 ft = fuzzterm.term()
1086 fuzzterms.append(ft)
1087 if not fuzzterm.next(): break
1089 phrase.add(JArray('object')(fuzzterms, Term))
1093 phrase = PhraseQuery()
1094 phrase.setSlop(slop)
1096 term = Term(field, t)
1100 def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
Returns term queries joined by a boolean query.
modal - applies to the boolean query
fuzzy - whether the query should be fuzzy.
1108 term = Term(field, t)
1110 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1112 term = TermQuery(term)
1113 q.add(BooleanClause(term, modal))
1116 def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1117 filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1118 if filters is None: filters = []
1119 if tokens_cache is None: tokens_cache = {}
1121 tokens = self.get_tokens(searched, field, cached=tokens_cache)
1123 query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1125 filters.append(self.term_filter(Term('is_book', 'true')))
1126 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1128 return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
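# Hedged usage sketch: look for a (near-)exact phrase in book content, with
# snippets prepared for display; book=False so part-level documents are searched.
#
#   hits = search.search_phrase(u"ty jestes jak zdrowie", 'content',
#                               book=False, snippets=True)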
1130 def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1131 filters=None, tokens_cache=None, boost=None, snippets=True):
1132 if filters is None: filters = []
1133 if tokens_cache is None: tokens_cache = {}
1136 filters.append(self.term_filter(Term('is_book', 'true')))
1138 query = BooleanQuery()
1141 tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1143 query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1144 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1146 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1148 return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1149 snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1151 def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
Search for perfect book matches. Just see if the query matches some author or title,
taking hints into account.
1156 fields_to_search = ['authors', 'title']
1159 if not hint.should_search_for_book():
1161 fields_to_search = hint.just_search_in(fields_to_search)
1162 only_in = hint.book_filter()
1164 qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1168 top = self.searcher.search(q,
1169 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1171 for found in top.scoreDocs:
1172 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1175 def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1176 fields_to_search = ['tags', 'authors', 'title']
1180 if not hint.should_search_for_book():
1182 fields_to_search = hint.just_search_in(fields_to_search)
1183 only_in = hint.book_filter()
1185 tokens = self.get_tokens(searched, field='SIMPLE')
1189 for fld in fields_to_search:
1190 q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1191 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1194 top = self.searcher.search(q,
1195 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1197 for found in top.scoreDocs:
1198 books.append(SearchResult(self, found, how_found="search_book"))
1202 def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
some part/fragment of the book.
1207 qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1211 flt = hint.part_filter()
1215 top = self.searcher.search(q,
1216 self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1219 for found in top.scoreDocs:
1220 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1224 def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
Tries to use search terms to match different fields of a book (or its parts).
E.g. one word can be an author's surname, another a part of the title, and the rest
some words from the third chapter.
1230 if tokens_cache is None: tokens_cache = {}
1235 only_in = hint.part_filter()
1237 # content only query : themes x content
1240 tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1241 tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1243 # only search in themes when we do not already filter by themes
1244 if hint is None or hint.just_search_in(['themes']) != []:
1245 q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1246 fuzzy=fuzzy), BooleanClause.Occur.MUST))
1248 q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1249 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1251 topDocs = self.searcher.search(q, only_in, max_results)
1252 for found in topDocs.scoreDocs:
1253 books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1255 # query themes/content x author/title/tags
1257 in_content = BooleanQuery()
1258 in_meta = BooleanQuery()
1260 for fld in ['themes_pl', 'content']:
1261 in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1263 for fld in ['tags', 'authors', 'title']:
1264 in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1266 q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1267 q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1269 topDocs = self.searcher.search(q, only_in, max_results)
1270 for found in topDocs.scoreDocs:
1271 books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1275 # def multisearch(self, query, max_results=50):
1278 # - (phrase) OR -> content
1281 # - (keywords) -> authors
1286 # queryreader = StringReader(query)
1287 # tokens = self.get_tokens(queryreader)
1289 # top_level = BooleanQuery()
1290 # Should = BooleanClause.Occur.SHOULD
1292 # phrase_level = BooleanQuery()
1293 # phrase_level.setBoost(1.3)
1295 # p_content = self.make_phrase(tokens, joined=True)
1296 # p_title = self.make_phrase(tokens, 'title')
1297 # p_author = self.make_phrase(tokens, 'author')
1299 # phrase_level.add(BooleanClause(p_content, Should))
1300 # phrase_level.add(BooleanClause(p_title, Should))
1301 # phrase_level.add(BooleanClause(p_author, Should))
1303 # kw_level = BooleanQuery()
1305 # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1306 # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1307 # kw_level.add(j_themes, Should)
1308 # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1309 # j_con = self.make_term_query(tokens, joined=True)
1310 # kw_level.add(j_con, Should)
1312 # top_level.add(BooleanClause(phrase_level, Should))
1313 # top_level.add(BooleanClause(kw_level, Should))
1317 def get_snippets(self, scoreDoc, query, field='content'):
1319 Returns a snippet for found scoreDoc.
1321 htmlFormatter = SimpleHTMLFormatter()
1322 highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1324 stored = self.searcher.doc(scoreDoc.doc)
1326 position = stored.get('snippets_position')
1327 length = stored.get('snippets_length')
1328 if position is None or length is None:
1330 revision = stored.get('snippets_revision')
1331 if revision: revision = int(revision)
1334 book_id = int(stored.get('book_id'))
1335 snippets = Snippets(book_id, revision=revision)
1340 log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
1345 text = snippets.get((int(position),
1350 tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1351 # highlighter.getBestTextFragments(tokenStream, text, False, 10)
1352 snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
except Exception as e:
1356 if hasattr(e, 'getJavaException'):
1357 e2 = unicode(e.getJavaException())
1358 raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1363 def enum_to_array(enum):
Converts a Lucene TermEnum to an array of Terms, suitable for adding to queries.
1374 if not enum.next(): break
1377 return JArray('object')(terms, Term)
1379 def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
1381 Search for Tag objects using query.
1384 filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
tops = self.searcher.search(query, filters, max_results)
1388 for found in tops.scoreDocs:
1389 doc = self.searcher.doc(found.doc)
1390 is_pdcounter = doc.get('is_pdcounter')
1391 category = doc.get('tag_category')
1393 if is_pdcounter == 'true':
1394 if category == 'pd_author':
1395 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1396 elif category == 'pd_book':
1397 tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
tag.category = 'pd_book'  # make it look more like a tag.
1400 print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1402 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1403 # don't add the pdcounter tag if same tag already exists
1404 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1406 except catalogue.models.Tag.DoesNotExist: pass
1407 except PDCounterAuthor.DoesNotExist: pass
1408 except PDCounterBook.DoesNotExist: pass
1410 log.debug('search_tags: %s' % tags)
1414 def search_books(self, query, filt=None, max_results=10):
1416 Searches for Book objects using query
1419 tops = self.searcher.search(query, filt, max_results)
1420 for found in tops.scoreDocs:
1421 doc = self.searcher.doc(found.doc)
1423 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1424 except catalogue.models.Book.DoesNotExist: pass
1427 def make_prefix_phrase(self, toks, field):
1428 q = MultiPhraseQuery()
1429 for i in range(len(toks)):
1430 t = Term(field, toks[i])
1431 if i == len(toks) - 1:
1432 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1442 def term_filter(term, inverse=False):
1443 only_term = TermsFilter()
1444 only_term.addTerm(term)
1447 neg = BooleanFilter()
1448 neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1453 def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1455 Return auto-complete hints for tags
1456 using prefix search.
1458 toks = self.get_tokens(string, field='SIMPLE')
1459 top = BooleanQuery()
1461 for field in ['tag_name', 'tag_name_pl']:
1463 q = self.make_prefix_phrase(toks, field)
1465 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1466 top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1468 no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1470 return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1472 def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
Returns auto-complete hints for book titles
(since we do not index 'pseudo' title tags).
1478 toks = self.get_tokens(string, field='SIMPLE')
1481 q = self.make_prefix_phrase(toks, 'title')
1483 q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1485 return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
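# Hedged usage sketch for the auto-complete helpers:
#
#   search.hint_tags(u"mick")     # Tag (and PDCounter) objects matching the prefix
#   search.hint_books(u"pan ta")  # Book objects whose titles match the prefix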
1488 def chain_filters(filters, op=ChainedFilter.AND):
1490 Chains a filter list together
1492 filters = filter(lambda x: x is not None, filters)
if not filters:
1495 chf = ChainedFilter(JArray('object')(filters, Filter), op)
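# Hedged usage sketch, mirroring how the searches above combine filters
# (`hint` is a hypothetical Hint instance):
#
#   flt = self.chain_filters([hint.book_filter(),
#                             self.term_filter(Term('is_book', 'true'))])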
1498 def filtered_categories(self, tags):
Return a list of tag categories present in the tags list.
1504 cats[t.category] = True