# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
JVM = initVM(CLASSPATH)

import os
import re
import errno
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from multiprocessing.pool import ThreadPool
from threading import current_thread

class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED, meaning basically the same
        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
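
# Illustrative usage sketch (not from the original source): the wrapper routes
# analysis by field name, so "themes_pl" or "tag_name_pl" go through Polish
# stemming, while "url" or "is_book" stay single keyword tokens.
#
#   analyzer = WLAnalyzer()
#   stream = analyzer.reusableTokenStream("themes_pl", StringReader(u"Zemsta"))
#   while stream.incrementToken():
#       print stream.getAttribute(CharTermAttribute.class_).toString()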

class IndexStore(object):
    """
    Provides access to search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise

class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status

class Snippets(object):
    """
    This class manages snippet files for indexed objects (books):
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Appends a snippet (unicode) to the snippet file.
        Returns a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, returns a unicode string
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close snippet file"""
        self.file.close()
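
# Illustrative sketch (assumptions, not in the original file): all snippets of a
# book live in one flat file; the lucene index only stores (position, length).
# For a catalogue Book instance `book`:
#
#   snippets = Snippets(book.id).open('w')
#   pos = snippets.add(u"Fragment of the indexed text...")  # -> (position, length)
#   snippets.close()
#   text = Snippets(book.id).open().get(pos)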

class BaseIndex(IndexStore):
    """
    Provides basic operations on index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if analyzer is None:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, timeout=None):
        if self.index:
            raise Exception("Index is already opened")
        conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
        if timeout:
            conf.setWriteLockTimeout(long(timeout))
        self.index = IndexWriter(self.store, conf)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je
        self.index.close()
        self.index = None

    def __exit__(self, type, value, tb):
        self.close()
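
    # Usage sketch (illustrative): BaseIndex subclasses are meant to be used
    # either explicitly (open()/close()) or as a context manager - assuming the
    # elided __enter__ opens the index, e.g.:
    #
    #   with Index() as index:
    #       index.index_tags()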

class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterBook.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)
        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])

    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get published date
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields (indexed single spaces) and returns it.
        This allows phrase queries that do not cross the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
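
    # For example (illustrative): add_gaps([f1, f2, f3], 'tags') returns
    # (f1, gap, f2, gap, f3) - a single indexed-space field between every two
    # real fields, so a slop-0 phrase query cannot match across two tag values.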

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
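
        # Illustrative example: fix_format([u"Ala ", None, u"ma kota/"]) drops the
        # None part, joins the rest into u"Ala  ma kota/" and strips the
        # verse-ending "/" at end of line, giving u"Ala  ma kota".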

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')

        for header, position in zip(list(master), range(len(master))):

            if header.tag in self.skip_header_tags:
                continue
            if header.tag is etree.Comment:
                continue

            content = []
            footnote = []

            def all_content(text):
                for frag in fragments.values():
                    frag['content'].append(text)
                content.append(text)
            handle_text = [all_content]

            for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                # handle footnotes
                if start is not None and start.tag in self.footnote_tags:
                    footnote = []

                    def collect_footnote(t):
                        footnote.append(t)
                    handle_text.append(collect_footnote)
                elif end is not None and footnote != [] and end.tag in self.footnote_tags:
                    handle_text.pop()
                    doc = add_part(snippets, header_index=position, header_type=header.tag,
                                   content=u''.join(footnote),
                                   is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                    self.index.addDocument(doc)
                    #print "@ footnote text: %s" % footnote
                    footnote = []

                # handle fragments and themes.
                if start is not None and start.tag == 'begin':
                    fid = start.attrib['id'][1:]
                    fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                # themes for this fragment
                elif start is not None and start.tag == 'motyw':
                    fid = start.attrib['id'][1:]
                    handle_text.append(None)
                    if start.text is not None:
                        fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                elif end is not None and end.tag == 'motyw':
                    handle_text.pop()

                elif start is not None and start.tag == 'end':
                    fid = start.attrib['id'][1:]
                    if fid not in fragments:
                        continue  # a broken <end> node, skip it
                    frag = fragments[fid]
                    if frag['themes'] == []:
                        continue  # empty themes list.

                    doc = add_part(snippets,
                                   header_type=frag['start_header'],
                                   header_index=frag['start_section'],
                                   header_span=position - frag['start_section'] + 1,
                                   fragment_anchor=fid,
                                   content=fix_format(frag['content']),
                                   themes=frag['themes'])
                    #print '@ FRAG %s' % frag['content']
                    self.index.addDocument(doc)

                # collect regular text
                if text is not None and handle_text != []:
                    hdl = handle_text[-1]
                    if hdl is not None:
                        hdl(text)

            # in the end, add a section text.
            doc = add_part(snippets, header_index=position, header_type=header.tag,
                           content=fix_format(content))
            #print '@ CONTENT: %s' % fix_format(content)

            self.index.addDocument(doc)

def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise e
    return _wrap

class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, **kw):
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer, **kw)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index:
            print("closing index")
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        if ReusableIndex.index:
            ReusableIndex.index.commit()
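
# Usage sketch (illustrative): several import steps can share one writer; the
# index is optimized and closed only once, at interpreter exit.
#
#   idx = ReusableIndex()
#   idx.open()
#   idx.index_book(book)             # a catalogue Book instance
#   idx.close()                      # commits, but keeps the shared writer open
#   # ReusableIndex.close_reusable() # call explicitly if atexit cannot be relied on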

class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query in a recursive way,
        so that Term and Phrase Queries contained in it which match
        the provided fields are wrapped in a BlockJoinQuery
        and so delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    @property
    def hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # drop sections that are already covered by a fragment hit
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit,
                 }
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c
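
    # Usage sketch (illustrative): results from several partial searches are
    # merged per book and ordered best-first; __cmp__ above breaks score ties
    # in favour of the earlier publication date.
    #
    #   results = SearchResult.aggregate(phrase_hits, everywhere_hits)
    #   results.sort(reverse=True)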

class Hint(object):
    """
    Given some hint information (information we already know about)
    our search target - like author, title (specific book), epoch, genre, kind -
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search for these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of them) are relevant to the search.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the 'tags' field),
        returns a filter accepting only books with the specified tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])

        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched, when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
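
# Usage sketch (illustrative): a Hint narrows queries before they are run,
# e.g. when the author or theme context is already known from the URL:
#
#   hint = Hint(search)
#   hint.tags(selected_tags)     # author/epoch/genre/kind tags and themes
#   hint.books(book)             # restrict to a single catalogue Book
#   results = search.search_perfect_book(u"pan tadeusz", hint=hint)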

class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        # self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parses the query in the default Lucene syntax (for humans)."""
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using Lucene syntax (for humans).
        Returns (books, total_hits).
        """
        tops = self.searcher.search(self.query(query), max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
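
    # Illustrative example: the query string is plain Lucene syntax, so field
    # prefixes work, e.g.:
    #
    #   search = Search()
    #   books, total = search.simple_search(u'authors:mickiewicz AND title:dziady')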

    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, string/unicode, or tokens. In the last case
        they are just returned (so tokens can be reused if the analyzer does not change).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Returns a PhraseQuery built from a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break
                phrase.add(JArray('object')(fuzzterms, Term))
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)

        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
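
    # Illustrative example: a slop-2 phrase search in the 'content' field; with
    # the default book=True the hits are additionally filtered by is_book:true.
    #
    #   hits = search.search_phrase(u"szlachcic na zagrodzie", 'content')
    #   results = SearchResult.aggregate(hits)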

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                         fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Searches for perfect book matches: just checks whether the query matches
        some author or title, taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()
        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                     fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Searches for book parts which contain a phrase perfectly matching (with a slop of 2,
        the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)

            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}

        books = []
        only_in = None
        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)

        return books

    # def multisearch(self, query, max_results=50):

    #     - (phrase) OR -> content

    #     - (keywords) -> authors

    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)

    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD

    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)

    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')

    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))

    #     kw_level = BooleanQuery()

    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)

    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None

        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            #  highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                            e2)

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        adding to queries.
        """
        terms = []
        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        return JArray('object')(terms, Term)

    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            category = doc.get('tag_category')
            if is_pdcounter == 'true':
                if category == 'pd_author':
                    tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                elif category == 'pd_book':
                    tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                    tag.category = 'pd_book'  # make it look more like a tag.
                else:
                    print "Warning: cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                tags.append(tag)
            # print "%s (%d) -> %f" % (tag, tag.id, found.score)
        print 'returning %s' % tags
        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for tags
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
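
    # Illustrative example: prefix-based auto-complete over tag names and book
    # titles (both tokenize the query string with the 'SIMPLE' analyzer):
    #
    #   tags = search.hint_tags(u"mick", pdcounter=False)
    #   titles = search.hint_books(u"pan t")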

    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for book titles,
        because we do not index 'pseudo' title-tags.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        """
        Returns a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()