# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, IndexWriterConfig, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

# Initialize the JVM before anything else touches Lucene.
JVM = initVM(CLASSPATH)

import os
import re
import errno
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
from multiprocessing.pool import ThreadPool
from threading import current_thread


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        # "virtual" field names used by get_tokens() to pick an analyzer explicitly
        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
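
# Rough usage sketch (illustrative only, not part of the original module): the wrapper
# dispatches on field name, so the same text tokenizes differently per field, e.g.
#   analyzer = WLAnalyzer()
#   stream = analyzer.reusableTokenStream("authors", StringReader(u"Adam Mickiewicz"))
# goes through SimpleAnalyzer, while field "themes_pl" would use PolishAnalyzer stemming.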


class IndexStore(object):
    """
    Provides access to search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book):
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a (position, length) tuple, return the unicode
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close snippet file"""
        self.file.close()


class BaseIndex(IndexStore):
    """
    Provides basic operations on index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if not analyzer:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, timeout=None):
        if self.index:
            raise Exception("Index is already opened")
        conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
        if timeout:
            conf.setWriteLockTimeout(long(timeout))
        self.index = IndexWriter(self.store, conf)

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

        self.index.close()
        self.index = None

    def __exit__(self, type, value, tb):
        self.close()


class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.exclude(category='set'):
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterBook.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.title, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a Lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])

    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # published year, extracted from source_name (if any)
        pd = None
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        if not pd:
            pd = ""
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields (indexed spaces) and returns it.
        This allows phrase queries that do not span the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
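
    # Illustrative note (not part of the original code): for fields [A, B, C] the
    # result is (A, gap, B, gap, C) - a single-space NOT_ANALYZED field is interposed
    # between consecutive values, so a slop=0 phrase query cannot match across two
    # different tag or author values.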

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #         and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')

        for header, position in zip(list(master), range(len(master))):

            if header.tag in self.skip_header_tags:
                continue
            if header.tag is etree.Comment:
                continue

            # section content
            content = []
            footnote = []

            def all_content(text):
                for frag in fragments.values():
                    frag['content'].append(text)
                content.append(text)
            handle_text = [all_content]

            for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                # handle footnotes
                if start is not None and start.tag in self.footnote_tags:
                    footnote = []

                    def collect_footnote(t):
                        footnote.append(t)
                    handle_text.append(collect_footnote)
                elif end is not None and footnote and end.tag in self.footnote_tags:
                    handle_text.pop()
                    doc = add_part(snippets, header_index=position, header_type=header.tag,
                                   content=u''.join(footnote),
                                   is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                    self.index.addDocument(doc)
                    #print "@ footnote text: %s" % footnote
                    footnote = []

                # handle fragments and themes.
                if start is not None and start.tag == 'begin':
                    fid = start.attrib['id'][1:]
                    fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                # themes for this fragment
                elif start is not None and start.tag == 'motyw':
                    fid = start.attrib['id'][1:]
                    handle_text.append(None)
                    if start.text is not None:
                        fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                elif end is not None and end.tag == 'motyw':
                    handle_text.pop()

                elif start is not None and start.tag == 'end':
                    fid = start.attrib['id'][1:]
                    if fid not in fragments:
                        continue  # a broken <end> node, skip it
                    frag = fragments[fid]
                    if frag['themes'] == []:
                        continue  # empty themes list.
                    del fragments[fid]

                    doc = add_part(snippets,
                                   header_type=frag['start_header'],
                                   header_index=frag['start_section'],
                                   header_span=position - frag['start_section'] + 1,
                                   fragment_anchor=fid,
                                   content=fix_format(frag['content']),
                                   themes=frag['themes'])
                    #print '@ FRAG %s' % frag['content']
                    self.index.addDocument(doc)

                # collect content for the current section
                if text is not None and handle_text:
                    hdl = handle_text[-1]
                    if hdl is not None:
                        hdl(text)

            # in the end, add a section text.
            doc = add_part(snippets, header_index=position, header_type=header.tag,
                           content=fix_format(content))
            #print '@ CONTENT: %s' % fix_format(content)

            self.index.addDocument(doc)

        snippets.close()
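
    # Typical indexing flow (an illustrative sketch, not part of the original code):
    #   idx = Index()
    #   idx.open()
    #   try:
    #       idx.index_book(book)   # metadata document + one "part" document per section/fragment
    #   finally:
    #       idx.close()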


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, **kw):
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer, **kw)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index:
            print("closing index")
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        if ReusableIndex.index:
            ReusableIndex.index.commit()


class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively: contained Term and Phrase
        queries which match the provided fields are wrapped in a BlockJoinQuery
        and so delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                t = Term.cast_(t)
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        try:
            self.published_date = int(pd)
        except (TypeError, ValueError):
            self.published_date = 0

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # indices into the hit tuples built in __init__:
        # hit = ((header_type, header_index, header_span), fragment_anchor, score, other)
        POSITION, FRAGMENT, SCORE, OTHER = 0, 1, 2, 3
        POSITION_INDEX, POSITION_SPAN = 1, 2

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue
            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index entry
                continue

            # Figure out if we were searching for a token matching some word in the theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if not theme in themes_hit:
                                themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()
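
    # Illustrative sketch (assumed usage): hits for the same book coming from several
    # queries can be folded into one SearchResult per book and then ordered:
    #   results = SearchResult.aggregate(phrase_hits, everywhere_hits)
    #   results.sort(reverse=True)   # uses __cmp__ below: score first, then earlier date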

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier published date is better
            return cmp(other.published_date, self.published_date)
        return c


class Hint(object):
    """
    Given some hint information (information we already know about)
    our search target - like author, title (specific book), epoch, genre, kind -
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Search instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list) are relevant to the search.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the tags field),
        returns a filter accepting only books with these specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Decides which indexes should be searched, given the hints we already have."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some
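
# Illustrative sketch (assumed usage, not part of the original module):
#   search = Search()
#   hint = Hint(search)
#   hint.tags(catalogue.models.Tag.objects.filter(category='author', slug='adam-mickiewicz'))
#   books = search.search_perfect_book(u"Pan Tadeusz", hint=hint)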


class Search(IndexStore):
    """
    Provides search facilities on top of the index.
    """
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        # self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits)
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be a StringReader, a string/unicode, or a list of tokens;
        in the last case the tokens are just returned (so we can reuse tokens
        if we don't change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks
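
    # Illustrative sketch (assumed usage): the "virtual" field names registered in
    # WLAnalyzer select the analyzer explicitly, e.g.
    #   search.get_tokens(u"Pan Tadeusz", field='SIMPLE')   # lowercased word tokens
    #   search.get_tokens(u"ojczyzna", field='POLISH')      # Polish-stemmed tokens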

    def fuzziness(self, fuzzy):
        """Helper method to sanitize the fuzziness parameter."""
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []
                while True:
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break
                phrase.add(JArray('object')(fuzzterms, Term))
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase
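
    # Illustrative sketch (assumed usage): a phrase search over the Polish-analyzed
    # content field, with the default slop of 2:
    #   q = search.make_phrase(search.get_tokens(u"litwo ojczyzno moja"), field='content')
    #   top = search.searcher.search(q, 10)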

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched)
                for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                    fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None))
                for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()
        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
                                   self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                                   max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2,
        the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                                       self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                                           flt]),
                                       max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the
        rest some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}

        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                            fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)

        return books

    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords)  -> authors
    #
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None
        # locate the content in the snippet file
        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id).open()
        try:
            try:
                text = snippets.get((int(position),
                                     int(length)))
            finally:
                snippets.close()

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            # highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                            e2)
        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        addition to queries.
        """
        terms = []
        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            category = doc.get('tag_category')
            if is_pdcounter == 'true':
                if category == 'pd_author':
                    tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                elif category == 'pd_book':
                    tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
                    tag.category = 'pd_book'  # make it look more like a tag.
                else:
                    print "Warning: cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
                    continue
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                tags.append(tag)
            # print "%s (%d) -> %f" % (tag, tag.id, found.score)
        print 'returning %s' % tags
        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
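
    # Illustrative sketch (assumed usage): prefix-based autocompletion.
    #   search.hint_tags(u"mick")    # Tag / pdcounter objects whose names start with "mick"
    #   search.hint_books(u"pan t")  # Book objects whose titles match the prefix phrase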

    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for book titles.
        We search the title field directly, because we do not index 'pseudo' title-tags.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together, skipping None entries.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
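
    # Illustrative sketch (assumed usage): combine several filters with AND; None
    # entries (e.g. a missing hint filter) are dropped first.
    #   flt = Search.chain_filters([hint.book_filter(), search.term_filter(Term('is_book', 'true'))])
    #   top = search.searcher.search(query, flt, 20)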

    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()