# -*- coding: utf-8 -*-
from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
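
# Note (assumption about PyLucene behaviour): initVM() must be called once per process
# before any of the Lucene classes imported above are instantiated; the returned JVM
# handle is kept at module level so it is not garbage-collected.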
JVM = initVM(CLASSPATH)
import os
import re
import errno
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from multiprocessing.pool import ThreadPool
from threading import current_thread

class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
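
# A rough usage sketch (illustrative, not part of the original file): the wrapper picks
# an analyzer by field name, so the same text is tokenized differently per field, e.g.
#
#     analyzer = WLAnalyzer()
#     stream = analyzer.reusableTokenStream("title", StringReader(u"Pan Tadeusz"))
#
# "title" goes through the SimpleAnalyzer, while "themes_pl" would use the PolishAnalyzer.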

class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise

class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status

class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together; their positions and
    lengths are kept in Lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        if 'b' not in mode:
            mode += 'b'
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()

class BaseIndex(IndexStore):
    """
    Provides basic operations on the index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, timeout=None):
        if self.index:
            raise Exception("Index is already opened")
        conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
        if timeout:
            conf.setWriteLockTimeout(long(timeout))
        self.index = IndexWriter(self.store, conf)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
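
# Usage sketch (illustrative): BaseIndex subclasses work as context managers, so an
# indexing session can be written as
#
#     with Index() as index:
#         index.index_tags()
#
# which opens the writer on entry and optimizes/closes it on exit.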

class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterBook.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Remove a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a Lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)
        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])

    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day),
                                               Field.Store.NO, Field.Index.NOT_ANALYZED)

        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields
    def add_gaps(self, fields, fieldname):
        """
        Interpose a list of fields with gap fields (indexed single spaces) and return the result.
        This allows phrase queries that do not cross the gaps (when slop is 0).
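        For example, fields [A, B, C] come back as [A, &lt;gap&gt;, B, &lt;gap&gt;, C].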
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
    def get_master(self, root):
        """
        Return the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None

            if node.text is not None:
                yield None, node.text, None

            for child in list(node):
                for b, t, e in walker(child):
                    yield b, t, e

            yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator \
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s
        fragments = {}
        snippets = Snippets(book.id).open('w')

        for header, position in zip(list(master), range(len(master))):

            if header.tag in self.skip_header_tags:
                continue
            if header.tag is etree.Comment:
                continue

            content = []
            footnote = []

            def all_content(text):
                for frag in fragments.values():
                    frag['content'].append(text)
                content.append(text)
            handle_text = [all_content]

            for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                if start is not None and start.tag in self.footnote_tags:
                    footnote = []
                    def collect_footnote(t):
                        footnote.append(t)
                    handle_text.append(collect_footnote)
                elif end is not None and footnote != [] and end.tag in self.footnote_tags:
                    handle_text.pop()
                    doc = add_part(snippets, header_index=position, header_type=header.tag,
                                   content=u''.join(footnote),
                                   is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                    self.index.addDocument(doc)
                    #print "@ footnote text: %s" % footnote
                    footnote = []

                # handle fragments and themes.
                if start is not None and start.tag == 'begin':
                    fid = start.attrib['id'][1:]
                    fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                # themes for this fragment
                elif start is not None and start.tag == 'motyw':
                    fid = start.attrib['id'][1:]
                    handle_text.append(None)
                    if start.text is not None:
                        fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                elif end is not None and end.tag == 'motyw':
                    handle_text.pop()

                elif start is not None and start.tag == 'end':
                    fid = start.attrib['id'][1:]
                    if fid not in fragments:
                        continue  # a broken <end> node, skip it
                    frag = fragments[fid]
                    if frag['themes'] == []:
                        continue  # empty themes list.
                    del fragments[fid]

                    doc = add_part(snippets,
                                   header_type=frag['start_header'],
                                   header_index=frag['start_section'],
                                   header_span=position - frag['start_section'] + 1,
                                   fragment_anchor=fid,
                                   content=fix_format(frag['content']),
                                   themes=frag['themes'])
                    #print '@ FRAG %s' % frag['content']
                    self.index.addDocument(doc)

                if text is not None and handle_text != []:
                    hdl = handle_text[-1]
                    if hdl is not None:
                        hdl(text)

            # in the end, add a section text.
            doc = add_part(snippets, header_index=position, header_type=header.tag,
                           content=fix_format(content))
            #print '@ CONTENT: %s' % fix_format(content)

            self.index.addDocument(doc)

def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap

class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, **kw):
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer, **kw)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index:
            print("closing index")
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        if ReusableIndex.index:
            ReusableIndex.index.commit()
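
# Usage sketch (roughly how the importbooks command is expected to drive it; `books`
# below is illustrative):
#
#     index = ReusableIndex()
#     index.open()
#     for book in books:
#         index.index_book(book)
#     index.close()   # only commits; the atexit hook (close_reusable) optimizes and closes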

class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively, so that contained Term and
        Phrase queries matching the provided fields are wrapped in a BlockJoinQuery
        and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)
    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        if pd is None:
            pd = 0
        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache
    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)
    def process_hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)
                            break

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit,
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()
    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c

class Hint(object):
    """
    Given some hint information (information we already know about
    our search target, like author, title (specific book), epoch, genre, kind),
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Search instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search within these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list) are necessary.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the 'tags' field),
        return a filter accepting only books with those specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filter using book tags (all tag categories except themes).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds logic to figure out which indexes should be searched when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
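
    # Usage sketch (illustrative; the tag lookup below is an assumption, not taken from
    # this file): narrow a search to one author's books, e.g.
    #
    #     hint = Hint(search)
    #     hint.tags(catalogue.models.Tag.objects.filter(category='author', slug='some-author'))
    #     results = search.search_perfect_book(u"some title", hint=hint)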

class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        # self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
    def query(self, query):
        """Parse a query in the default Lucene syntax (for humans)."""
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Run a query for books using Lucene syntax (for humans).
        Returns (books, total_hits).
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
    def get_tokens(self, searched, field='content', cached=None):
        """Return tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a token list; in the
        last case the tokens are returned as-is (so tokens can be reused as long as the
        analyzer does not change).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks
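
    # e.g. (sketch) get_tokens(u"Pan Tadeusz", field='SIMPLE') should yield
    # [u'pan', u'tadeusz'], while field='POLISH' would additionally apply Polish stemming.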
    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5
    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break

                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)

        return phrase
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Return term queries joined into a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q
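
    # e.g. (sketch) tokens [u'pan', u'tadeusz'] on field='authors' with the default SHOULD
    # modal build a query roughly equivalent to: authors:pan OR authors:tadeusz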
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched)
                for found in top.scoreDocs]
    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                         fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None))
                for found in top.scoreDocs]
    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books
    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                     fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))
        return books
    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2,
        the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
        return books
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}

        books = []
        only_in = None
        if hint:
            only_in = hint.part_filter()

        # content-only query: themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print("* %s theme x content: %s" % (searched, books[-1]._hits))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print("* %s scatter search: %s" % (searched, books[-1]._hits))

        return books
    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords)  -> authors
    #
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))
    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Return a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None

        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id).open()
        try:
            try:
                text = snippets.get((int(position),
                                     int(length)))
            finally:
                snippets.close()

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            # highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                            e2)
        return snip
    @staticmethod
    def enum_to_array(enum):
        """
        Convert a Lucene TermEnum to an array of Terms, suitable for adding to queries.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)
    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            category = doc.get('tag_category')
            if is_pdcounter == 'true':
                if category == 'pd_author':
                    tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                elif category == 'pd_book':
                    tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                    tag.category = 'pd_book'  # make it look more like a tag.
                else:
                    print("Warning: cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category))
                    continue
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                tags.append(tag)
        # print "%s (%d) -> %f" % (tag, tag.id, found.score)
        print('returning %s' % tags)
        return tags
    def search_books(self, query, filter=None, max_results=10):
        """
        Search for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks
    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q
    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term
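
    # e.g. term_filter(Term('is_book', 'true'), inverse=True) keeps only documents that are
    # not whole-book records, i.e. section/fragment parts (as used in search_perfect_parts).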
    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Return auto-complete hints for tags, using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Return auto-complete hints for book titles (we do not index 'pseudo' title tags).
        Uses prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chain a filter list together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
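
    # e.g. chain_filters([only_in, None, flt]) drops the None entry and ANDs the remaining
    # filters; with an empty (or all-None) list it returns None, meaning "no filtering".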
    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()