# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before anything else touches lucene classes.
JVM = initVM(CLASSPATH)

import os
import errno
import re
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together; their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()


class BaseIndex(IndexStore):
    """
    Base index class.
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if analyzer is None:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
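
# Illustrative sketch (assumption, not original code): BaseIndex is a
# context manager, so subclasses can be used like this:
#
#     with Index() as idx:
#         idx.index_tags()
#     # __exit__ optimizes and closes the underlying IndexWriter.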


class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(tag.id))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Indexes the book.
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)
        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors']])

    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}
        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
        This allows phrase queries that do not cross the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
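
    # Illustrative note (assumption, not original code): with two tag fields
    # f1 and f2, add_gaps([f1, f2], 'tags') yields [f1, <gap>, f2] -- zip
    # pairs each field with a gap and the [0:-1] slice drops the trailing
    # one, so a slop-0 phrase query cannot match across two tag values.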

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node):
            # depth-first walk yielding (start, end) events for each node
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

        def fix_format(text):
            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue

                content = u' '.join([t for t in header.itertext()])
                content = fix_format(content)

                doc = add_part(snippets, header_index=position, header_type=header.tag, content=content)

                self.index.addDocument(doc)

                for start, end in walker(header):
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                        fragments[fid]['content'].append(start.tail)
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        del fragments[fid]

                        def jstr(l):
                            # join list elements, rendering None as u'(none)'
                            return u' '.join(map(
                                lambda x: x is None and u'(none)' or unicode(x),
                                l))

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=u' '.join(filter(lambda s: s is not None, frag['content'])),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
        finally:
            snippets.close()
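
# Illustrative sketch (assumption, not original code): a typical indexing run
# for a hypothetical `book` instance, using the context-manager protocol
# inherited from BaseIndex.
#
#     with Index() as idx:
#         idx.index_book(book)   # metadata document + per-header content parts
#         idx.index_tags()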


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None
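
# Illustrative sketch (assumption, not original code): several import jobs
# sharing one writer; the atexit hook (or an explicit call) finalizes it.
#
#     for book in books_to_import:        # hypothetical iterable
#         idx = ReusableIndex()
#         idx.open()
#         idx.index_book(book)            # the shared writer stays open
#     ReusableIndex.close_reusable()      # optimize + close once, at the end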


class JoinSearch(object):
    """
    This mixin can be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        Modifies the query recursively, so that contained Term and Phrase
        queries which match the provided fields are wrapped in a
        BlockJoinQuery and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, how_found=None, snippets=None):
        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self.hits = []

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        header_type = stored.get("header_type")
        if not header_type:
            return

        sec = (header_type, int(stored.get("header_index")))
        header_span = stored.get('header_span')
        header_span = header_span is not None and int(header_span) or 1

        fragment = stored.get("fragment_anchor")

        hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': [snippets]})

        self.hits.append(hit)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.hits += other.hits
        if other.score > self.score:
            self.score = other.score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        # fragment hits carry an anchor; section hits do not
        frags = filter(lambda r: r[1] is not None, self.hits)
        sect = filter(lambda r: r[1] is None, self.hits)
        # drop sections already covered by some fragment's header span
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[0][1] >= f[0][1] and s[0][1] < f[0][1] + f[0][2],
            frags)), sect)

        hits = []

        for s in sect:
            m = {'score': s[2],
                 'header_index': s[0][1]
                 }
            m.update(s[3])
            hits.append(m)

        for f in frags:
            frag = catalogue.models.Fragment.objects.get(anchor=f[1])
            m = {'score': f[2],
                 'fragment': frag,
                 'themes': frag.tags.filter(category='theme')
                 }
            m.update(f[3])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        print("--- %s" % hits)

        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)
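
# Illustrative sketch (assumption, not original code): merging results that
# refer to the same book and flattening them for display.
#
#     results = SearchResult.aggregate(perfect_books, everywhere_hits)
#     for r in sorted(results, reverse=True):     # uses __cmp__ on score
#         print(u"%s: %s" % (r.book, r.process_hits()))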


class Hint(object):
    """
    Given some hint information (information we already know about
    our search target) - like author, title (specific book), epoch, genre, kind -
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list) are relevant.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they normally live in the tags field),
        returns a filter accepting only books with the specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Decides which indexes should be searched, given the hints we already have."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
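
# Illustrative sketch (assumption, not original code): narrowing a search
# with hints for a hypothetical author tag and book instance.
#
#     hint = Hint(search)
#     hint.tags([author_tag])          # author/title/epoch/genre/kind tags
#     hint.books(book)                 # restrict part search to this book
#     search.search_perfect_parts(u"szabla", hint=hint)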


class Search(IndexStore):
    """
    Search facilities.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)"""
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits).
        """
        tops = self.searcher.search(self.query(query), max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
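
    # Illustrative sketch (assumption, not original code):
    #
    #     search = Search()
    #     books, total = search.simple_search(u'authors:mickiewicz AND title:tadeusz')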

    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or tokens. In the last case
        they are just returned (so we can reuse tokens, if we don't change the analyzer).
        """
        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery (or a MultiPhraseQuery, if fuzzy) over a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break

                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined into a boolean query.
        modal - the occur mode applied to each clause
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q
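
    # Illustrative sketch (assumption, not original code): the two query
    # helpers above, fed by the per-field tokenizer.
    #
    #     toks = self.get_tokens(u'pan tadeusz', field='title')
    #     self.make_phrase(toks, field='title')       # phrase "pan tadeusz"~2
    #     self.make_term_query(toks, field='title')   # title:pan OR title:tadeusz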

    # def content_query(self, query):
    #     return BlockJoinQuery(query, self.parent_filter,
    #                           BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2,
        the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                    flt]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found, snippets=self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Tries to use search terms to match different fields of a book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        if hint is None or hint.just_search_in(['themes_pl']) != []:
            q.add(BooleanClause(self.make_term_query(tokens, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        # in_meta = BooleanQuery()
        in_content = BooleanQuery()

        for fld in ['themes', 'content', 'tags', 'authors', 'title']:
            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        return books
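
    # Illustrative sketch (assumption, not original code): the search
    # strategies above are complementary and their results can be merged.
    #
    #     res = SearchResult.aggregate(
    #         self.search_perfect_book(u"pan tadeusz"),
    #         self.search_perfect_parts(u"pan tadeusz"),
    #         self.search_everywhere(u"pan tadeusz"))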

    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords) -> authors
    #
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        # locate the content in the snippet store
        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(stored.get('snippets_position')),
                                 int(stored.get('snippets_length'))))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        # highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []
        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filter=None, max_results=40):
        """
        Search for Tag objects using query.
        """
        tops = self.searcher.search(query, filter, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            tags.append(tag)
            print("%s (%d) -> %f" % (tag, tag.id, found.score))

        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def create_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                # expand the last token to all terms it prefixes
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50):
        """
        Returns auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            q = self.create_prefix_phrase(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results)

    def hint_books(self, string, max_results=50):
        """
        Returns auto-complete hints for book titles
        (we do not index 'pseudo' title-tags).
        Uses prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        q = self.create_prefix_phrase(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
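
    # Illustrative sketch (assumption, not original code): both hint methods
    # tokenize with the 'SIMPLE' analyzer and prefix-match on the last token.
    #
    #     search.hint_tags(u"mick")     # -> Tag objects, e.g. the author Mickiewicz
    #     search.hint_books(u"pan ta")  # -> Book objects with matching titles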

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
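
    # Illustrative sketch (assumption, not original code): None entries are
    # dropped first, so optional filters can be passed unconditionally.
    #
    #     flt = Search.chain_filters([
    #         hint.book_filter(),                           # may be None
    #         Search.term_filter(Term('is_book', 'true')),
    #     ])  # -> ChainedFilter ANDing the non-None filters, or None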

    def filtered_categories(self, tags):
        """
        Returns a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()