# -*- coding: utf-8 -*-
import os
import re
import errno
import atexit
import traceback

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

JVM = initVM(CLASSPATH)

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor
from multiprocessing.pool import ThreadPool
from threading import current_thread
class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same
        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
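
    # Sketch (not from the original file): "KEYWORD"/"SIMPLE"/"POLISH" are pseudo-fields,
    # so an analyzer can be requested by type rather than by a real index field,
    # e.g. the way Search.get_tokens() does it:
    #
    #   analyzer = WLAnalyzer()
    #   ts = analyzer.reusableTokenStream("SIMPLE", StringReader(u"Pan Tadeusz"))
    #   # iterating the stream via CharTermAttribute yields lower-cased word tokens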
class IndexStore(object):
    """
    Provides access to search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status
class Snippets(object):
    """
    This class manages snippet files for an indexed object (a book):
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.position = 0

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length) return a unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close snippet file"""
        self.file.close()
class BaseIndex(IndexStore):
    """
    Provides basic operations on index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None, timeout=None):
        if self.index:
            raise Exception("Index is already opened")
        conf = IndexWriterConfig(Version.LUCENE_34, analyzer)
        if timeout:
            conf.setWriteLockTimeout(long(timeout))
        self.index = IndexWriter(self.store, conf)

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je
        self.index.close()
        self.index = None

    def __exit__(self, type, value, tb):
        self.close()
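
    # Sketch: __exit__ suggests context-manager use (assuming a matching __enter__,
    # not shown here, that calls open() and returns self):
    #
    #   with Index() as index:
    #       index.index_tags()      # writer is optimized and closed on exit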
class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)
    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
    def create_book_doc(self, book):
        """
        Create a lucene document referring to a book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)
    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for the extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)
        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        pd = ""
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields
    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
        This allows for phrase queries which do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
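
    # Worked example (sketch): for fields F1, F2, F3 on the "tags" field, add_gaps
    # returns [F1, gap, F2, gap, F3] where each gap is an indexed single space.
    # With slop=0 a PhraseQuery can then match inside one tag value, but never
    # across two neighbouring tag values.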
    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master
    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator \
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s
        fragments = {}
        snippets = Snippets(book.id).open('w')

        for header, position in zip(list(master), range(len(master))):

            if header.tag in self.skip_header_tags:
                continue
            if header.tag is etree.Comment:
                continue

            # section content
            content = []
            footnote = []

            def all_content(text):
                for frag in fragments.values():
                    frag['content'].append(text)
                content.append(text)
            handle_text = [all_content]

            for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                # handle footnotes
                if start is not None and start.tag in self.footnote_tags:
                    footnote = []

                    def collect_footnote(t):
                        footnote.append(t)
                    handle_text.append(collect_footnote)
                elif end is not None and footnote != [] and end.tag in self.footnote_tags:
                    handle_text.pop()
                    doc = add_part(snippets, header_index=position, header_type=header.tag,
                                   content=u''.join(footnote),
                                   is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                    self.index.addDocument(doc)
                    #print "@ footnote text: %s" % footnote
                    footnote = []

                # handle fragments and themes.
                if start is not None and start.tag == 'begin':
                    fid = start.attrib['id'][1:]
                    fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                # themes for this fragment
                elif start is not None and start.tag == 'motyw':
                    fid = start.attrib['id'][1:]
                    handle_text.append(None)
                    if start.text is not None:
                        fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                elif end is not None and end.tag == 'motyw':
                    handle_text.pop()

                elif start is not None and start.tag == 'end':
                    fid = start.attrib['id'][1:]
                    if fid not in fragments:
                        continue  # a broken <end> node, skip it
                    frag = fragments[fid]
                    if frag['themes'] == []:
                        continue  # empty themes list.

                    doc = add_part(snippets,
                                   header_type=frag['start_header'],
                                   header_index=frag['start_section'],
                                   header_span=position - frag['start_section'] + 1,
                                   content=fix_format(frag['content']),
                                   themes=frag['themes'])
                    #print '@ FRAG %s' % frag['content']
                    self.index.addDocument(doc)

                # collect content
                if text is not None and handle_text != []:
                    hdl = handle_text[-1]
                    if hdl is not None:
                        hdl(text)

            # in the end, add a section text.
            doc = add_part(snippets, header_index=position, header_type=header.tag,
                           content=fix_format(content))
            #print '@ CONTENT: %s' % fix_format(content)

            self.index.addDocument(doc)
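
# Example (sketch, assuming a catalogue Book instance with an xml_file attached):
#
#   index = Index()
#   index.open()
#   try:
#       index.index_book(book)   # one metadata document + one document per section/fragment
#   finally:
#       index.close()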
def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
    return _wrap
class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, **kw):
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer, **kw)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index:
            print("closing index")
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        if ReusableIndex.index:
            ReusableIndex.index.commit()
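
# Sketch of intended use (e.g. in a long-running import command):
#
#   for book in books:
#       index = ReusableIndex()
#       index.open()
#       index.index_book(book)
#       index.close()            # only commits; the shared writer stays open
#   # the atexit hook optimizes and really closes the writer at program exit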
class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query in a recursive way,
        so that Term and Phrase queries contained in it which match
        the provided fields are wrapped in a BlockJoinQuery,
        and so delegated to children documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score
        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        if pd is None:
            pd = 0
        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache
    def get_score(self):
        return self._score * self.boost
    score = property(get_score)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)
    def get_hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # indices into a hit tuple and its position tuple
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments and fragments[fid][SCORE] >= f[SCORE]:
                continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            if si in sections and sections[si]['score'] >= s[SCORE]:
                continue
            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens and theme not in themes_hit:
                            themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
        return hits

    hits = property(get_hits)
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c
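
    # Sketch: merging hit lists from several search strategies into one result per book,
    # best first (__cmp__ sorts by score, then prefers the earlier publication date):
    #
    #   results = SearchResult.aggregate(phrase_hits, everywhere_hits)
    #   results.sort(reverse=True)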
class Hint(object):
    """
    Given some hint information (information we already know about
    our search target) - like author, title (specific book), epoch, genre, kind -
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """Accepts a Searcher instance."""
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of) are part of the search target.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (but they are normally in the tags field),
        returns a filter accepting only books with the specified tags.
        """
        q = BooleanQuery()
        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme)
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))
        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)
        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds logic to figure out which indexes should be searched, when we already have some hints."""
        skip_fields = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                skip_fields.append(field)
            if field == 'title' and self._books != []:
                skip_fields.append(field)
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                skip_fields.append(field)
        return filter(lambda x: x not in skip_fields, all)
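
    # Usage sketch (assuming `search = Search()` and some Tag objects at hand):
    #
    #   hint = Hint(search)
    #   hint.tags(tag_list)              # author/epoch/... tags narrow the book filter
    #   hint.books(book)                 # limit part searches to a single book
    #   flt = hint.book_filter()         # use while looking for whole books
    #   flt_parts = hint.part_filter()   # use while looking for parts/fragments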
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        # self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)
    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits)
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
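
    # Example (sketch):
    #
    #   search = Search()
    #   books, total = search.simple_search(u'title:lalka')
    #   # `books` are catalogue.models.Book instances, `total` is Lucene's totalHits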
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by a proper (for a field) analyzer.
        The argument can be: StringReader, string/unicode, or tokens. In the last case
        they will just be returned (so we can reuse tokens, if we don't change the analyzer)
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if not fuzzy:
            return False
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5
    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []
                while True:
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break
                phrase.add(JArray('object')(fuzzterms, Term))
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q
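
    # Sketch: make_term_query([u'pan', u'tadeusz'], field='title') builds roughly
    #   title:pan OR title:tadeusz        (each clause added with Occur.SHOULD)
    # and with fuzzy=True each TermQuery is replaced by a FuzzyQuery.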
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                    fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books
    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()
        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))
        return books
    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
        some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
        flt = None
        if hint:
            flt = hint.part_filter()
        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
        return books
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use the search terms to match different fields of the book (or its parts).
        E.g. one word can be an author surname, another a part of the title, and the rest
        some words from a chapter.
        """
        if tokens_cache is None: tokens_cache = {}

        books = []
        only_in = None
        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                            fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)

        return books
    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords) -> authors
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #     kw_level = BooleanQuery()
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))
    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        # locate content.
        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))
            snippets.close()

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            # highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                            e2)
        return snip
    @staticmethod
    def enum_to_array(enum):
        """Converts a lucene TermEnum to an array of Terms, suitable for addition to queries."""
        terms = []
        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break
        return JArray('object')(terms, Term)
    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            if is_pdcounter:
                tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                tags.append(tag)
            # print "%s (%d) -> %f" % (tag, tag.id, found.score)
        print 'returning %s' % tags
        return tags
    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks
    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q
    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term
    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
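
    # Sketch: auto-complete endpoints typically call these with a user-typed prefix, e.g.
    #
    #   search.hint_tags(u'mick', pdcounter=True)   # Tag / PDCounterAuthor objects
    #   search.hint_books(u'pan tad')               # Book objects matching the title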
    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for book titles (we do not index 'pseudo' title-tags),
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
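
    # Sketch: combining an optional hint filter with the "books only" restriction;
    # None entries are dropped first, so callers can pass filters unconditionally:
    #
    #   flt = self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))])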
    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()