[wolnelektury.git] / apps/search/index.py
# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, File, Field, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, TermsFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, Sort, Integer, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    initVM, CLASSPATH, JArray

JVM = initVM(CLASSPATH)
import sys
import os
import errno
from librarian import dcparser
from librarian.parser import WLDocument
import catalogue.models
from multiprocessing.pool import ThreadPool
from threading import current_thread
import atexit
import traceback


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        keyword = KeywordAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("author", simple)
        self.addAnalyzer("is_book", keyword)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("NATURAL", polish)


class IndexStore(object):
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class Index(IndexStore):
    def __init__(self, analyzer=None):
        IndexStore.__init__(self)
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def close(self):
        self.index.optimize()
        self.index.close()
        self.index = None

    def remove_book(self, book):
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, overwrite=True):
        if overwrite:
            self.remove_book(book)

        doc = self.extract_metadata(book)
        parts = self.extract_content(book)
        block = ArrayList().of_(Document)

        for p in parts:
            block.add(p)
        block.add(doc)
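        # Hedged note: Lucene block indexing (addDocuments below) stores the list
        # as one contiguous block, and block-join queries expect the parent to be
        # the last document of the block -- hence the book's metadata document is
        # added after all of its part documents.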
        self.index.addDocuments(block)

    master_tags = [
        'opowiadanie',
        'powiesc',
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        'wywiad'
        ]

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne']

    def create_book_doc(self, book):
        """
        Create a Lucene document connected to the book.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(book.id))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(book.parent.id))
        return doc

    def extract_metadata(self, book):
        book_info = dcparser.parse(book.xml_file)

        print("extract metadata for book %s id=%d, thread %d" % (book.slug, book.id, current_thread().ident))

        doc = self.create_book_doc(book)
        doc.add(Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS))
        doc.add(Field("tags", ','.join([t.name for t in book.tags]), Field.Store.NO, Field.Index.ANALYZED))
        doc.add(Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

        # Walk through the Dublin Core fields of the book.
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no explicit type information is available, we dispatch on the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    if field.multiple:
                        s = ', '.join(s)
                    try:
                        doc.add(Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED))
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    doc.add(Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED))
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    doc.add(Field(field.name, "%04d%02d%02d" % (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED))
        return doc

    def get_master(self, root):
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def extract_content(self, book):
        wld = WLDocument.from_file(book.xml_file.path)
        root = wld.edoc.getroot()

        # First we build a sequence of top-level items:
        #   book_id
        #   header_index - the 0-based position of the header element
        #   content
        master = self.get_master(root)
        if master is None:
            return []

        header_docs = []
        for position, header in enumerate(master):
            if header.tag in self.skip_header_tags:
                continue
            doc = self.create_book_doc(book)
            doc.add(NumericField("header_index", Field.Store.YES, True).setIntValue(position))
            doc.add(Field("header_type", header.tag, Field.Store.YES, Field.Index.NOT_ANALYZED))
            content = u' '.join(header.itertext())
            doc.add(Field("content", content, Field.Store.YES, Field.Index.ANALYZED))
            header_docs.append(doc)

        def walker(node):
            yield node, None
            for child in list(node):
                for b, e in walker(child):
                    yield b, e
            yield None, node
            return

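        # Hedged reading of walker(): it yields a pre/post-order event stream --
        # (node, None) when a node is entered, (None, node) when it is left.
        # For a tree <a><b/></a> the sequence would be:
        #   (a, None), (b, None), (None, b), (None, a)
        # This lets the loop below track which fragments are currently open.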
        # Then we create a document for each fragment:
        #   fragment_anchor - the anchor
        #   themes - list of themes [not indexed]
        fragment_docs = []
        # will contain: fragment id -> {'content': [], 'themes': []}
        fragments = {}
        for start, end in walker(master):
            if start is not None and start.tag == 'begin':
                fid = start.attrib['id'][1:]
                fragments[fid] = {'content': [], 'themes': []}
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'motyw':
                fid = start.attrib['id'][1:]
                fragments[fid]['themes'].append(start.text)
                fragments[fid]['content'].append(start.tail)
            elif start is not None and start.tag == 'end':
                fid = start.attrib['id'][1:]
                if fid not in fragments:
                    continue  # a broken <end> node, skip it
                frag = fragments[fid]
                del fragments[fid]

                def jstr(l):
                    return u' '.join(map(
                        lambda x: u'(none)' if x is None else unicode(x),
                        l))

                doc = self.create_book_doc(book)
                doc.add(Field("fragment_anchor", fid,
                              Field.Store.YES, Field.Index.NOT_ANALYZED))
                doc.add(Field("content",
                              u' '.join(filter(lambda s: s is not None, frag['content'])),
                              Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
                doc.add(Field("themes",
                              u' '.join(filter(lambda s: s is not None, frag['themes'])),
                              Field.Store.NO, Field.Index.ANALYZED))

                fragment_docs.append(doc)
            elif start is not None:
                for frag in fragments.values():
                    frag['content'].append(start.text)
            elif end is not None:
                for frag in fragments.values():
                    frag['content'].append(end.tail)

        return header_docs + fragment_docs

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()

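# A minimal indexing sketch (hedged; "book" stands for any catalogue.models.Book
# with an unpacked xml_file), using the context-manager protocol defined above:
#
#     with Index() as index:
#         index.index_book(book)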

def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None
    pool = None
    pool_jobs = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            ReusableIndex.pool = ThreadPool(threads, initializer=lambda: JVM.attachCurrentThread())
            ReusableIndex.pool_jobs = []
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    def index_book(self, *args, **kw):
        job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
        ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            print("wait for indexing to finish")
            for job in ReusableIndex.pool_jobs:
                job.get()
                sys.stdout.write('.')
                sys.stdout.flush()
            print("done.")
            ReusableIndex.pool.close()

            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        pass

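# Hedged usage sketch for ReusableIndex: index_book() only queues a job on the
# thread pool, and the shared writer is optimized/closed once, via the atexit
# hook registered in open() (or an explicit close_reusable() call):
#
#     idx = ReusableIndex()
#     idx.open(threads=4)
#     for book in catalogue.models.Book.objects.all():
#         idx.index_book(book)
#     # ReusableIndex.close_reusable() waits for the queued jobs at exit.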

class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
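        # Hedged note: parent_filter matches only the "book" documents (those
        # indexed with is_book=true), letting block-join queries map hits in
        # part documents back to their parent book.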

    def query(self, query):
        return self.parser.parse(query)

    def wrapjoins(self, query, fields=[]):
        """
        Recursively modifies the query so that Term and Phrase queries
        on the given fields are wrapped in a BlockJoinQuery and thereby
        delegated to the child (part) documents.
        """
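        # Hedged example: a TermQuery such as content:dziady (a child-document
        # field) would come back as BlockJoinQuery(content:dziady, parent_filter,
        # ScoreMode.Total), so it is scored on the parent book document with the
        # children's scores summed.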
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def simple_search(self, query, max_results=50):
        """Returns (books, total_hits)."""
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def search(self, query, max_results=50):
        query = self.query(query)
        query = self.wrapjoins(query, ["content", "themes"])

        tops = self.searcher.search(query, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

# Reference snippet (Java) for iterating a Lucene token stream:
#
# TokenStream tokenStream = analyzer.tokenStream(fieldName, reader);
# OffsetAttribute offsetAttribute = tokenStream.getAttribute(OffsetAttribute.class);
# CharTermAttribute charTermAttribute = tokenStream.getAttribute(CharTermAttribute.class);
#
# while (tokenStream.incrementToken()) {
#     int startOffset = offsetAttribute.startOffset();
#     int endOffset = offsetAttribute.endOffset();
#     String term = charTermAttribute.toString();
# }


class SearchResult(object):
    def __init__(self, searcher, scoreDocs, score=None, highlight_query=None):
        if score:
            self.score = score
        else:
            self.score = scoreDocs.score

        self.fragments = []
        self.scores = {}
        self.sections = []

        stored = searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        fragment = stored.get("fragment_anchor")
        if fragment:
            self.fragments.append(fragment)
            self.scores[fragment] = scoreDocs.score

        header_type = stored.get("header_type")
        if header_type:
            sec = (header_type, int(stored.get("header_index")))
            self.sections.append(sec)
            self.scores[sec] = scoreDocs.score

        self.snippets = []

    def add_snippets(self, snippets):
        self.snippets += snippets
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def get_parts(self):
        book = self.book
        parts = [{"header": s[0], "position": s[1], '_score_key': s} for s in self.sections] \
            + [{"fragment": book.fragments.get(anchor=f), '_score_key': f} for f in self.fragments]

        parts.sort(lambda a, b: cmp(self.scores[a['_score_key']], self.scores[b['_score_key']]))
        print("bookid: %d parts: %s" % (self.book_id, parts))
        return parts

    parts = property(get_parts)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self.fragments += other.fragments
        self.sections += other.sections
        self.snippets += other.snippets
        self.scores.update(other.scores)
        if other.score > self.score:
            self.score = other.score
        return self

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        return cmp(self.score, other.score)


class MultiSearch(Search):
    """Class capable of IMDb-like searching."""
    def get_tokens(self, searched, field='content'):
        """Returns tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a list of tokens.
        In the last case the tokens are simply returned (so tokens can be reused
        as long as the analyzer does not change).
        """
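        # Hedged example: get_tokens(u"Ala ma kota") feeds the text through the
        # analyzer configured for the "content" field (the PolishAnalyzer) and
        # returns the token strings; the exact forms depend on its stemming.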
        if isinstance(searched, basestring):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        searched.reset()
        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        return toks

    def fuzziness(self, fuzzy):
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and 0.0 < fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
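        # Hedged sketch: for tokens [u'ala', u'ma', u'kota'] this builds roughly
        # content:"ala ma kota"~2 (a PhraseQuery with slop=2); with fuzzy set, a
        # MultiPhraseQuery is built instead, expanding each token via FuzzyTermEnum.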
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
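        # Hedged sketch: builds a BooleanQuery combining one TermQuery (or
        # FuzzyQuery, when fuzzy is set) per token with the given occurrence,
        # e.g. roughly content:ala content:ma content:kota for the default SHOULD.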
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def content_query(self, query):
        return BlockJoinQuery(query, self.parent_filter,
                              BlockJoinQuery.ScoreMode.Total)

    def search_perfect_book(self, searched, max_results=20, fuzzy=False):
        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in ['author', 'title']]

        books = []
        for q in qrys:
            top = self.searcher.search(q, max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False):
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        books = []
        for q in qrys:
            top = self.searcher.search(q, max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self.searcher, found).add_snippets(self.get_snippets(found, q)))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False):
        books = []

        # content-only query: themes x content
        q = BooleanQuery()

        tokens = self.get_tokens(searched)
        q.add(BooleanClause(self.make_term_query(tokens, field='themes', fuzzy=fuzzy), BooleanClause.Occur.MUST))
        q.add(BooleanClause(self.make_term_query(tokens, field='content', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self.searcher, found))

        # joined query: themes/content x author/title/epochs/genres/kinds
        q = BooleanQuery()
        in_meta = BooleanQuery()
        in_content = BooleanQuery()

        for fld in ['themes', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        in_meta.add(BooleanClause(self.make_term_query(self.get_tokens(searched, field='author'), field='author', fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        for fld in ['title', 'epochs', 'genres', 'kinds']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_meta, BooleanClause.Occur.MUST))
        in_content_join = self.content_query(in_content)
        q.add(BooleanClause(in_content_join, BooleanClause.Occur.MUST))

        collector = BlockJoinCollector(Sort.RELEVANCE, 100, True, True)
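        # Hedged note: the BlockJoinCollector collects matching parent (book)
        # documents while remembering their matching children; getTopGroups()
        # below returns those children grouped per book, which is why each
        # SearchResult gets the group's maxScore.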

        self.searcher.search(q, collector)

        top_groups = collector.getTopGroups(in_content_join, Sort.RELEVANCE, 0, max_results, 0, True)
        if top_groups:
            for grp in top_groups.groups:
                for part in grp.scoreDocs:
                    books.append(SearchResult(self.searcher, part, score=grp.maxScore))
        return books

    def multisearch(self, query, max_results=50):
        """
        Search strategy:
        - (phrase) OR -> content
                      -> title
                      -> author
        - (keywords)  -> author
                      -> motyw
                      -> tags
                      -> content
        """
        # Not implemented yet; the commented-out draft below sketches the intended query.

        # queryreader = StringReader(query)
        # tokens = self.get_tokens(queryreader)

        # top_level = BooleanQuery()
        # Should = BooleanClause.Occur.SHOULD

        # phrase_level = BooleanQuery()
        # phrase_level.setBoost(1.3)

        # p_content = self.make_phrase(tokens, joined=True)
        # p_title = self.make_phrase(tokens, 'title')
        # p_author = self.make_phrase(tokens, 'author')

        # phrase_level.add(BooleanClause(p_content, Should))
        # phrase_level.add(BooleanClause(p_title, Should))
        # phrase_level.add(BooleanClause(p_author, Should))

        # kw_level = BooleanQuery()

        # kw_level.add(self.make_term_query(tokens, 'author'), Should)
        # j_themes = self.make_term_query(tokens, 'themes', joined=True)
        # kw_level.add(j_themes, Should)
        # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
        # j_con = self.make_term_query(tokens, joined=True)
        # kw_level.add(j_con, Should)

        # top_level.add(BooleanClause(phrase_level, Should))
        # top_level.add(BooleanClause(kw_level, Should))

        return None


    def do_search(self, query, max_results=50, collector=None):
        tops = self.searcher.search(query, max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            b = catalogue.models.Book.objects.get(id=doc.get("book_id"))
            bks.append(b)
            print("%s (%d) -> %f" % (b, b.id, found.score))
        return (bks, tops.totalHits)

    def get_snippets(self, scoreDoc, query, field='content'):
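        # Hedged note: SimpleHTMLFormatter wraps matched terms in <B>...</B> by
        # default, and the three best fragments are joined with "..." by the
        # getBestFragments call below.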
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)
        text = stored.get(field)
        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        # highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
        print('snips: %s' % snip)

        return [snip]