1 # -*- coding: utf-8 -*-
3 from django.conf import settings
4 from django.dispatch import Signal
5 from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
6 File, Field, Integer, \
7 NumericField, Version, Document, JavaError, IndexSearcher, \
8 QueryParser, PerFieldAnalyzerWrapper, \
9 SimpleAnalyzer, PolishAnalyzer, ArrayList, \
10 KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
11 BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
12 HashSet, BooleanClause, Term, CharTermAttribute, \
13 PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
14 FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
15 SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
16 BooleanFilter, FilterClause, QueryWrapperFilter, \
17 initVM, CLASSPATH, JArray
21 JVM = initVM(CLASSPATH)
27 from librarian import dcparser
28 from librarian.parser import WLDocument
29 from lxml import etree
30 import catalogue.models
31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
32 from multiprocessing.pool import ThreadPool
33 from threading import current_thread
38 class WLAnalyzer(PerFieldAnalyzerWrapper):
40 polish = PolishAnalyzer(Version.LUCENE_34)
41 # polish_gap.setPositionIncrementGap(999)
43 simple = SimpleAnalyzer(Version.LUCENE_34)
44 # simple_gap.setPositionIncrementGap(999)
46 keyword = KeywordAnalyzer(Version.LUCENE_34)
48 # possibly redundant: Field.Index.NOT_ANALYZED achieves essentially the same effect
50 PerFieldAnalyzerWrapper.__init__(self, polish)
52 self.addAnalyzer("tags", simple)
53 self.addAnalyzer("technical_editors", simple)
54 self.addAnalyzer("editors", simple)
55 self.addAnalyzer("url", keyword)
56 self.addAnalyzer("source_url", keyword)
57 self.addAnalyzer("source_name", simple)
58 self.addAnalyzer("publisher", simple)
59 self.addAnalyzer("authors", simple)
60 self.addAnalyzer("title", simple)
62 self.addAnalyzer("is_book", keyword)
63 # TODO: should the title be indexed in two forms, _pl (stemmed) and simple?
65 self.addAnalyzer("themes", simple)
66 self.addAnalyzer("themes_pl", polish)
68 self.addAnalyzer("tag_name", simple)
69 self.addAnalyzer("tag_name_pl", polish)
71 self.addAnalyzer("translators", simple)
73 self.addAnalyzer("KEYWORD", keyword)
74 self.addAnalyzer("SIMPLE", simple)
75 self.addAnalyzer("POLISH", polish)
78 class IndexStore(object):
80 Provides access to the search index.
82 self.store - Lucene index directory
86 self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
88 def make_index_dir(self):
90 os.makedirs(settings.SEARCH_INDEX)
91 except OSError as exc:
92 if exc.errno == errno.EEXIST:
100 class IndexChecker(IndexStore):
102 IndexStore.__init__(self)
105 checker = CheckIndex(self.store)
106 status = checker.checkIndex()
110 class Snippets(object):
112 This class manages snippet files for an indexed object (book).
113 The snippets are concatenated together, and their positions and
114 lengths are kept in Lucene index fields.
116 SNIPPET_DIR = "snippets"
118 def __init__(self, book_id, revision=None):
120 os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
121 except OSError as exc:
122 if exc.errno == errno.EEXIST:
125 self.book_id = book_id
126 self.revision = revision
131 if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
132 else: fn = "%d" % self.book_id
134 return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
136 def open(self, mode='r'):
138 Open the snippet file. Call .close() afterwards.
144 if os.path.exists(self.path):
147 if not os.path.exists(self.path):
150 print "using %s" % self.path
152 self.file = open(self.path, mode)
156 def add(self, snippet):
158 Append a snippet (unicode) to the snippet file.
159 Return a (position, length) tuple
161 txt = snippet.encode('utf-8')
164 pos = (self.position, l)
170 Given a (position, length) tuple, return a unicode string
171 of the snippet stored there.
173 self.file.seek(pos[0], 0)
174 txt = self.file.read(pos[1]).decode('utf-8')
178 """Close snippet file"""
193 class BaseIndex(IndexStore):
196 Provides basic operations on index: opening, closing, optimizing.
198 def __init__(self, analyzer=None):
199 super(BaseIndex, self).__init__()
202 analyzer = WLAnalyzer()
203 self.analyzer = analyzer
205 def open(self, timeout=None):
207 raise Exception("Index is already opened")
208 conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
210 conf.setWriteLockTimeout(long(timeout))
211 self.index = IndexWriter(self.store, conf)
215 self.index.optimize()
219 self.index.optimize()
220 except JavaError as je:
221 print "Error during optimize phase, check index: %s" % je
226 index_changed.send_robust(self)
228 super(BaseIndex, self).close()
234 def __exit__(self, type, value, tb):
238 index_changed = Signal()
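# Hedged usage sketch (an assumption, not present in the original module):
# components that keep an IndexReader open can react to index changes by
# connecting a receiver to this signal, e.g.:
#
#     def reopen_reader(sender, **kwargs):
#         pass  # reopen the cached reader here
#     index_changed.connect(reopen_reader)
#
# Search.reopen() below is connected in exactly this way.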
241 class Index(BaseIndex):
243 Class indexing books.
245 def __init__(self, analyzer=None):
246 super(Index, self).__init__(analyzer)
248 def index_tags(self, *tags, **kw):
250 Re-index the global tag list.
251 Removes all tags from the index, then indexes them again.
252 Indexed fields include: id, name (with and without Polish stems), category.
254 remove_only = kw.get('remove_only', False)
255 # first, remove tags from index.
259 b_id_cat = BooleanQuery()
261 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
262 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
264 if isinstance(tag, PDCounterAuthor):
265 q_cat = TermQuery(Term('tag_category', 'pd_author'))
266 elif isinstance(tag, PDCounterBook):
267 q_cat = TermQuery(Term('tag_category', 'pd_book'))
269 q_cat = TermQuery(Term('tag_category', tag.category))
270 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
272 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
274 q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
275 self.index.deleteDocuments(q)
278 # then add them back [all, or just the ones passed in]
280 tags = list(catalogue.models.Tag.objects.exclude(category='set')) + \
281 list(PDCounterAuthor.objects.all()) + \
282 list(PDCounterBook.objects.all())
285 if isinstance(tag, PDCounterAuthor):
287 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
288 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
289 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
290 doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
291 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
292 self.index.addDocument(doc)
293 elif isinstance(tag, PDCounterBook):
295 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
296 doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
297 doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
298 doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
299 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
300 self.index.addDocument(doc)
303 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
304 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
305 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
306 doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
307 self.index.addDocument(doc)
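    # Usage sketch (assumption): with the index writer open, a single changed
    # tag can be re-indexed, or just dropped from the index:
    #
    #     index = Index()
    #     index.open()
    #     index.index_tags(tag)                     # re-index one tag
    #     index.index_tags(tag, remove_only=True)   # or only remove it
    #     index.close()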
309 def create_book_doc(self, book):
311 Create a Lucene document referring to the book id.
314 doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
315 if book.parent is not None:
316 doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
319 def remove_book(self, book, remove_snippets=True):
320 """Removes a book from search index.
321 book - Book instance."""
322 q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
323 self.index.deleteDocuments(q)
326 snippets = Snippets(book.id)
329 def index_book(self, book, book_info=None, overwrite=True):
332 Creates a lucene document for extracted metadata
333 and calls self.index_content() to index the contents of the book.
336 # we don't remove snippets, since they might still be needed by
337 # threads using an index that hasn't been reopened yet
338 self.remove_book(book, remove_snippets=False)
340 book_doc = self.create_book_doc(book)
341 meta_fields = self.extract_metadata(book, book_info)
342 for f in meta_fields.values():
343 if isinstance(f, list) or isinstance(f, tuple):
348 self.index.addDocument(book_doc)
351 self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
356 'dramat_wierszowany_l',
357 'dramat_wierszowany_lp',
358 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
362 ignore_content_tags = [
364 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
366 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
369 footnote_tags = ['pa', 'pt', 'pr', 'pe']
371 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
373 published_date_re = re.compile("([0-9]+)[\]. ]*$")
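    # For example (illustration, not from the original source): for a
    # source_name like u"Czytelnik, Warszawa 1990." the regex captures "1990";
    # the trailing "[\]. ]*$" tolerates closing brackets, dots and spaces
    # after the year.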
375 def extract_metadata(self, book, book_info=None):
377 Extract metadata from the book and return a map of fields keyed by field name.
381 if book_info is None:
382 book_info = dcparser.parse(open(book.xml_file.path))
384 fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
385 fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
386 fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
389 for field in dcparser.BookInfo.FIELDS:
390 if hasattr(book_info, field.name):
391 if not getattr(book_info, field.name):
393 # since no type information is available, we use the validator to infer the type
394 type_indicator = field.validator
395 if type_indicator == dcparser.as_unicode:
396 s = getattr(book_info, field.name)
400 fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
401 except JavaError as je:
402 raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
403 elif type_indicator == dcparser.as_person:
404 p = getattr(book_info, field.name)
405 if isinstance(p, dcparser.Person):
408 persons = ', '.join(map(unicode, p))
409 fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
410 elif type_indicator == dcparser.as_date:
411 dt = getattr(book_info, field.name)
412 fields[field.name] = Field(field.name, "%04d%02d%02d" %\
413 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
417 if hasattr(book_info, 'source_name') and book_info.source_name:
418 match = self.published_date_re.search(book_info.source_name)
419 if match is not None:
420 pd = str(match.groups()[0])
422 fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
426 def add_gaps(self, fields, fieldname):
428 Interposes a list of fields with gap fields (indexed spaces) and returns the result.
429 This allows phrase queries that do not cross the gaps (when slop is 0).
433 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
434 return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
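    # Sketch of the effect (hypothetical field list): [A, B, C] becomes
    # [A, gap, B, gap, C]; zip() pairs each field with a gap and the final
    # [0:-1] slice drops the trailing gap.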
436 def get_master(self, root):
438 Returns the first master tag from an etree.
440 for master in root.iter():
441 if master.tag in self.master_tags:
444 def index_content(self, book, book_fields=[]):
446 Walks the book XML and extracts content from it.
447 Adds parts for each header tag and for each fragment.
449 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
450 root = wld.edoc.getroot()
452 master = self.get_master(root)
456 def walker(node, ignore_tags=[]):
458 if node.tag not in ignore_tags:
459 yield node, None, None
460 if node.text is not None:
461 yield None, node.text, None
462 for child in list(node):
463 for b, t, e in walker(child):
465 yield None, None, node
467 if node.tail is not None:
468 yield None, node.tail, None
471 def fix_format(text):
472 # separator = [u" ", u"\t", u".", u";", u","]
473 if isinstance(text, list):
474 # need to join it first
475 text = filter(lambda s: s is not None, text)
476 text = u' '.join(text)
477 # for i in range(len(text)):
479 # if text[i][0] not in separator\
480 # and text[i - 1][-1] not in separator:
481 # text.insert(i, u" ")
483 return re.sub("(?m)/$", "", text)
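        # Example of the effect (illustration): fix_format(u"Litwo! Ojczyzno moja!/")
        # returns u"Litwo! Ojczyzno moja!" - the "(?m)" flag strips the trailing
        # verse marker "/" from every line of the joined text.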
485 def add_part(snippets, **fields):
486 doc = self.create_book_doc(book)
487 for f in book_fields:
490 doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
491 doc.add(NumericField("header_span", Field.Store.YES, True)\
492 .setIntValue('header_span' in fields and fields['header_span'] or 1))
493 doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
495 print ">>[%s]>%s<<<" % (fields.get('fragment_anchor', ''), fields['content'])
497 doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
498 Field.TermVector.WITH_POSITIONS_OFFSETS))
500 snip_pos = snippets.add(fields["content"])
501 doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
502 doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
503 if snippets.revision:
504 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
506 if 'fragment_anchor' in fields:
507 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
508 Field.Store.YES, Field.Index.NOT_ANALYZED))
510 if 'themes' in fields:
511 themes, themes_pl = zip(*[
512 (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
513 Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
514 for theme in fields['themes']])
516 themes = self.add_gaps(themes, 'themes')
517 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
527 if isinstance(s, unicode):
528 return s.encode('utf-8')
533 snippets = Snippets(book.id).open('w')
535 for position, header in enumerate(master):
537 if header.tag in self.skip_header_tags:
539 if header.tag is etree.Comment:
546 def all_content(text):
547 for frag in fragments.values():
548 frag['content'].append(text)
550 handle_text = [all_content]
553 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
555 if start is not None and start.tag in self.footnote_tags:
557 def collect_footnote(t):
559 handle_text.append(collect_footnote)
560 elif end is not None and footnote and end.tag in self.footnote_tags:
562 doc = add_part(snippets, header_index=position, header_type=header.tag,
563 content=u''.join(footnote),
564 is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
566 self.index.addDocument(doc)
567 #print "@ footnote text: %s" % footnote
570 # handle fragments and themes.
571 if start is not None and start.tag == 'begin':
572 fid = start.attrib['id'][1:]
573 fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
575 # themes for this fragment
576 elif start is not None and start.tag == 'motyw':
577 fid = start.attrib['id'][1:]
578 handle_text.append(None)
579 if start.text is not None:
580 fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
581 elif end is not None and end.tag == 'motyw':
584 elif start is not None and start.tag == 'end':
585 fid = start.attrib['id'][1:]
586 if fid not in fragments:
587 continue # a broken <end> node, skip it
588 frag = fragments[fid]
589 if frag['themes'] == []:
590 continue # empty themes list.
593 doc = add_part(snippets,
594 header_type=frag['start_header'],
595 header_index=frag['start_section'],
596 header_span=position - frag['start_section'] + 1,
598 content=fix_format(frag['content']),
599 themes=frag['themes'])
600 #print '@ FRAG %s' % frag['content']
601 self.index.addDocument(doc)
605 if text is not None and handle_text:
606 hdl = handle_text[-1]
610 # in the end, add a section text.
611 doc = add_part(snippets, header_index=position, header_type=header.tag,
612 content=fix_format(content))
613 #print '@ CONTENT: %s' % fix_format(content)
615 self.index.addDocument(doc)
621 def log_exception_wrapper(f):
626 print("Error in indexing thread: %s" % e)
627 traceback.print_exc()
632 class ReusableIndex(Index):
634 Works like Index, but does not close/optimize the Lucene index
635 until program exit (uses an atexit hook).
636 This is useful for the importbooks command.
638 If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
642 def open(self, analyzer=None, **kw):
643 if ReusableIndex.index:
644 self.index = ReusableIndex.index
646 print("opening index")
647 Index.open(self, analyzer, **kw)
648 ReusableIndex.index = self.index
649 atexit.register(ReusableIndex.close_reusable)
651 # def index_book(self, *args, **kw):
652 # job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
653 # ReusableIndex.pool_jobs.append(job)
656 def close_reusable():
657 if ReusableIndex.index:
658 print("closing index")
659 ReusableIndex.index.optimize()
660 ReusableIndex.index.close()
661 ReusableIndex.index = None
663 index_changed.send_robust(None)
666 if ReusableIndex.index:
667 ReusableIndex.index.commit()
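# Hedged usage sketch (never called here), mirroring the docstring above:
# a long-running import script keeps one writer open across many books.
def _example_reusable_index_usage(books):
    idx = ReusableIndex()
    idx.open()
    for book in books:
        idx.index_book(book)
    # The atexit hook registered in open() will eventually optimize and close
    # the shared writer; call close_reusable() explicitly if atexit cannot be
    # relied on.
    ReusableIndex.close_reusable()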
670 class JoinSearch(object):
672 This mixin could be used to handle block join queries.
675 def __init__(self, *args, **kw):
676 super(JoinSearch, self).__init__(*args, **kw)
678 def wrapjoins(self, query, fields=[]):
680 This function modifies the query recursively, so that
681 contained Term and Phrase queries which match the
682 provided fields are wrapped in a BlockJoinQuery
683 and thus delegated to child documents.
685 if BooleanQuery.instance_(query):
686 qs = BooleanQuery.cast_(query)
688 clause = BooleanClause.cast_(clause)
689 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
693 query.extractTerms(termset)
696 if t.field() not in fields:
698 return BlockJoinQuery(query, self.parent_filter,
699 BlockJoinQuery.ScoreMode.Total)
701 def bsearch(self, query, max_results=50):
702 q = self.query(query)
703 bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
705 tops = self.searcher.search(bjq, max_results)
707 for found in tops.scoreDocs:
708 doc = self.searcher.doc(found.doc)
709 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
710 return (bks, tops.totalHits)
713 class SearchResult(object):
714 def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
715 if tokens_cache is None: tokens_cache = {}
720 self._score = scoreDocs.score
725 self._processed_hits = None # processed hits
727 stored = search.searcher.doc(scoreDocs.doc)
728 self.book_id = int(stored.get("book_id"))
730 pd = stored.get("published_date")
732 self.published_date = int(pd)
734 self.published_date = 0
736 header_type = stored.get("header_type")
737 # we have a content hit in some header or fragment
738 if header_type is not None:
739 sec = (header_type, int(stored.get("header_index")))
740 header_span = stored.get('header_span')
741 header_span = header_span is not None and int(header_span) or 1
743 fragment = stored.get("fragment_anchor")
746 snippets = snippets.replace("/\n", "\n")
747 hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
749 self._hits.append(hit)
752 self.searched = searched
753 self.tokens_cache = tokens_cache
757 return self._score * self.boost
759 def merge(self, other):
760 if self.book_id != other.book_id:
761 raise ValueError("this search result is or book %d; tried to merge with %d" % (self.book_id, other.book_id))
762 self._hits += other._hits
763 if other.score > self.score:
764 self._score = other._score
768 if hasattr(self, '_book'):
770 return catalogue.models.Book.objects.get(id=self.book_id)
772 book = property(get_book)
776 if self._processed_hits is not None:
777 return self._processed_hits
786 # to sections and fragments
787 frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
789 sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
791 # sections not covered by fragments
792 sect = filter(lambda s: 0 == len(filter(
793 lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
794 and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
799 def remove_duplicates(lst, keyfn, compare):
804 if compare(els[eif], e) >= 1:
809 # remove fragments with duplicated fid's and duplicated snippets
810 frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
811 frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or hash(f),
812 lambda a, b: cmp(a[SCORE], b[SCORE]))
814 # remove duplicate sections
818 si = s[POSITION][POSITION_INDEX]
821 if sections[si]['score'] >= s[SCORE]:
824 m = {'score': s[SCORE],
825 'section_number': s[POSITION][POSITION_INDEX] + 1,
830 hits = sections.values()
834 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
835 except catalogue.models.Fragment.DoesNotExist:
839 # Figure out if we were searching for a token matching some word in theme name.
840 themes = frag.tags.filter(category='theme')
842 if self.searched is not None:
843 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
845 name_tokens = self.search.get_tokens(theme.name, 'POLISH')
848 if theme not in themes_hit:
849 themes_hit.append(theme)
852 m = {'score': f[SCORE],
854 'section_number': f[POSITION][POSITION_INDEX] + 1,
856 'themes_hit': themes_hit
861 hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
863 self._processed_hits = hits
867 def __unicode__(self):
868 return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
871 def aggregate(*result_lists):
873 for rl in result_lists:
875 if r.book_id in books:
876 books[r.book_id].merge(r)
877 #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
880 return books.values()
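    # Usage sketch (assumption): merge per-book hits coming from different
    # query strategies into a single SearchResult per book, e.g.:
    #
    #     results = SearchResult.aggregate(
    #         search.search_perfect_book(query),
    #         search.search_everywhere(query))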
882 def __cmp__(self, other):
883 c = cmp(self.score, other.score)
885 # this is inverted, because earlier date is better
886 return cmp(other.published_date, self.published_date)
893 Given some hint information (things we already know) about
894 our search target - like author, title (a specific book), epoch, genre, kind -
895 we can narrow down the search using filters.
897 def __init__(self, search):
899 Accepts a Searcher instance.
906 def books(self, *books):
908 Give a hint that we are searching within these books.
912 def tags(self, tags):
914 Give a hint that these Tag objects (a list of)
918 if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
919 lst = self.book_tags.get(t.category, [])
921 self.book_tags[t.category] = lst
922 if t.category in ['theme', 'theme_pl']:
923 self.part_tags.append(t)
925 def tag_filter(self, tags, field='tags'):
927 Given a list of tags and an optional field (they are normally in the 'tags' field),
928 returns a filter accepting only books with specific tags.
933 toks = self.search.get_tokens(tag.name, field=field)
934 tag_phrase = PhraseQuery()
936 tag_phrase.add(Term(field, tok))
937 q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
939 return QueryWrapperFilter(q)
941 def book_filter(self):
943 Filters using book tags (all tag categories except theme).
945 tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
947 return self.tag_filter(tags)
951 def part_filter(self):
953 This filter can be used to look for book parts.
954 It filters on book id and/or themes.
958 fs.append(self.tag_filter(self.part_tags, field='themes'))
960 if self._books != []:
962 for b in self._books:
963 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
964 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
967 return Search.chain_filters(fs)
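    # Usage sketch (assumption): narrow a part search down to books and themes
    # we already know about:
    #
    #     hint = Hint(search)
    #     hint.books(book)
    #     hint.tags(tags)
    #     results = search.search_perfect_parts(u"szabla", hint=hint)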
969 def should_search_for_book(self):
970 return self._books == []
972 def just_search_in(self, all):
973 """Holds logic to figure out which indexes should be search, when we have some hinst already"""
976 if field == 'authors' and 'author' in self.book_tags:
978 if field == 'title' and self._books != []:
980 if (field == 'themes' or field == 'themes_pl') and self.part_tags:
986 class Search(IndexStore):
990 def __init__(self, default_field="content"):
991 IndexStore.__init__(self)
992 self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
993 # self.analyzer = WLAnalyzer()
994 reader = IndexReader.open(self.store, True)
995 self.searcher = IndexSearcher(reader)
996 self.parser = QueryParser(Version.LUCENE_34, default_field,
999 self.parent_filter = TermsFilter()
1000 self.parent_filter.addTerm(Term("is_book", "true"))
1001 index_changed.connect(self.reopen)
1004 reader = self.searcher.getIndexReader()
1005 self.searcher.close()
1007 super(Search, self).close()
1008 index_changed.disconnect(self.reopen)
1010 def reopen(self, **unused):
1011 reader = self.searcher.getIndexReader()
1012 rdr = reader.reopen()
1013 print "got signal to reopen index"
1014 if not rdr.equals(reader):
1015 print "will reopen index"
1016 oldsearch = self.searcher
1017 self.searcher = IndexSearcher(rdr)
1021 def query(self, query):
1022 """Parse query in default Lucene Syntax. (for humans)
1024 return self.parser.parse(query)
1026 def simple_search(self, query, max_results=50):
1027 """Runs a query for books using lucene syntax. (for humans)
1028 Returns (books, total_hits)
1031 tops = self.searcher.search(self.query(query), max_results)
1033 for found in tops.scoreDocs:
1034 doc = self.searcher.doc(found.doc)
1035 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1036 return (bks, tops.totalHits)
1038 def get_tokens(self, searched, field='content', cached=None):
1039 """returns tokens analyzed by a proper (for a field) analyzer
1040 argument can be: StringReader, string/unicode, or tokens. In the last case
1041 they will just be returned (so we can reuse tokens, if we don't change the analyzer)
1043 if cached is not None and field in cached:
1044 return cached[field]
1046 if isinstance(searched, str) or isinstance(searched, unicode):
1047 searched = StringReader(searched)
1048 elif isinstance(searched, list):
1052 tokens = self.analyzer.reusableTokenStream(field, searched)
1054 while tokens.incrementToken():
1055 cta = tokens.getAttribute(CharTermAttribute.class_)
1056 toks.append(cta.toString())
1058 if cached is not None:
1059 cached[field] = toks
1063 def fuzziness(self, fuzzy):
1064 """Helper method to sanitize fuzziness"""
1067 if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
1072 def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
1074 Return a PhraseQuery with a series of tokens.
1077 phrase = MultiPhraseQuery()
1079 term = Term(field, t)
1080 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
1084 # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
1085 ft = fuzzterm.term()
1087 fuzzterms.append(ft)
1088 if not fuzzterm.next(): break
1090 phrase.add(JArray('object')(fuzzterms, Term))
1094 phrase = PhraseQuery()
1095 phrase.setSlop(slop)
1097 term = Term(field, t)
1101 def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
1103 Returns term queries joined into a boolean query.
1104 modal - applies to the boolean query
1105 fuzzy - should the query be fuzzy.
1109 term = Term(field, t)
1111 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1113 term = TermQuery(term)
1114 q.add(BooleanClause(term, modal))
1117 def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1118 filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1119 if filters is None: filters = []
1120 if tokens_cache is None: tokens_cache = {}
1122 tokens = self.get_tokens(searched, field, cached=tokens_cache)
1124 query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1126 filters.append(self.term_filter(Term('is_book', 'true')))
1127 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1129 return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
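    # Usage sketch (assumption): a phrase search against book content, with
    # snippets for display:
    #
    #     hits = search.search_phrase(u"lasy litewskie", 'content', snippets=True)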
1131 def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1132 filters=None, tokens_cache=None, boost=None, snippets=True):
1133 if filters is None: filters = []
1134 if tokens_cache is None: tokens_cache = {}
1137 filters.append(self.term_filter(Term('is_book', 'true')))
1139 query = BooleanQuery()
1142 tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1144 query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1145 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1147 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1149 return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1150 snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1152 def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1154 Search for perfect book matches. Just see if the query matches with some author or title,
1155 taking hints into account.
1157 fields_to_search = ['authors', 'title']
1160 if not hint.should_search_for_book():
1162 fields_to_search = hint.just_search_in(fields_to_search)
1163 only_in = hint.book_filter()
1165 qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1169 top = self.searcher.search(q,
1170 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1172 for found in top.scoreDocs:
1173 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1176 def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1177 fields_to_search = ['tags', 'authors', 'title']
1181 if not hint.should_search_for_book():
1183 fields_to_search = hint.just_search_in(fields_to_search)
1184 only_in = hint.book_filter()
1186 tokens = self.get_tokens(searched, field='SIMPLE')
1190 for fld in fields_to_search:
1191 q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1192 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1195 top = self.searcher.search(q,
1196 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1198 for found in top.scoreDocs:
1199 books.append(SearchResult(self, found, how_found="search_book"))
1203 def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1205 Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1206 some part/fragment of the book.
1208 qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1212 flt = hint.part_filter()
1216 top = self.searcher.search(q,
1217 self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1220 for found in top.scoreDocs:
1221 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1225 def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1227 Tries to use search terms to match different fields of book (or its parts).
1228 E.g. one word can be an author's surname, another can be part of the title, and the rest
1229 can be some words from the third chapter.
1231 if tokens_cache is None: tokens_cache = {}
1236 only_in = hint.part_filter()
1238 # content only query : themes x content
1241 tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1242 tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1244 # only search in themes when we do not already filter by themes
1245 if hint is None or hint.just_search_in(['themes']) != []:
1246 q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1247 fuzzy=fuzzy), BooleanClause.Occur.MUST))
1249 q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1250 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1252 topDocs = self.searcher.search(q, only_in, max_results)
1253 for found in topDocs.scoreDocs:
1254 books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1255 print "* %s theme x content: %s" % (searched, books[-1]._hits)
1257 # query themes/content x author/title/tags
1259 in_content = BooleanQuery()
1260 in_meta = BooleanQuery()
1262 for fld in ['themes_pl', 'content']:
1263 in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1265 for fld in ['tags', 'authors', 'title']:
1266 in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1268 q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1269 q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1271 topDocs = self.searcher.search(q, only_in, max_results)
1272 for found in topDocs.scoreDocs:
1273 books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1274 print "* %s scatter search: %s" % (searched, books[-1]._hits)
1278 # def multisearch(self, query, max_results=50):
1281 # - (phrase) OR -> content
1284 # - (keywords) -> authors
1289 # queryreader = StringReader(query)
1290 # tokens = self.get_tokens(queryreader)
1292 # top_level = BooleanQuery()
1293 # Should = BooleanClause.Occur.SHOULD
1295 # phrase_level = BooleanQuery()
1296 # phrase_level.setBoost(1.3)
1298 # p_content = self.make_phrase(tokens, joined=True)
1299 # p_title = self.make_phrase(tokens, 'title')
1300 # p_author = self.make_phrase(tokens, 'author')
1302 # phrase_level.add(BooleanClause(p_content, Should))
1303 # phrase_level.add(BooleanClause(p_title, Should))
1304 # phrase_level.add(BooleanClause(p_author, Should))
1306 # kw_level = BooleanQuery()
1308 # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1309 # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1310 # kw_level.add(j_themes, Should)
1311 # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1312 # j_con = self.make_term_query(tokens, joined=True)
1313 # kw_level.add(j_con, Should)
1315 # top_level.add(BooleanClause(phrase_level, Should))
1316 # top_level.add(BooleanClause(kw_level, Should))
1320 def get_snippets(self, scoreDoc, query, field='content'):
1322 Returns a snippet for found scoreDoc.
1324 htmlFormatter = SimpleHTMLFormatter()
1325 highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1327 stored = self.searcher.doc(scoreDoc.doc)
1329 position = stored.get('snippets_position')
1330 length = stored.get('snippets_length')
1331 if position is None or length is None:
1333 revision = stored.get('snippets_revision')
1334 if revision: revision = int(revision)
1336 book_id = int(stored.get('book_id'))
1337 snippets = Snippets(book_id, revision=revision).open()
1340 text = snippets.get((int(position),
1345 tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1346 # highlighter.getBestTextFragments(tokenStream, text, False, 10)
1347 snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1349 except Exception as e:
1351 if hasattr(e, 'getJavaException'):
1352 e2 = unicode(e.getJavaException())
1353 raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1358 def enum_to_array(enum):
1360 Converts a Lucene TermEnum to an array of Terms, suitable for
1369 if not enum.next(): break
1372 return JArray('object')(terms, Term)
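    # Sketch: used by make_prefix_phrase() below to expand the last token of a
    # phrase into every indexed term sharing that prefix, e.g. a PrefixTermEnum
    # over Term('title', u'pan') might yield terms like u'pan', u'pana',
    # u'panna' (hypothetical index contents).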
1374 def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1376 Search for Tag objects using query.
1379 filters = self.chain_filters([filter, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1380 tops = self.searcher.search(query, filters, max_results)
1383 for found in tops.scoreDocs:
1384 doc = self.searcher.doc(found.doc)
1385 is_pdcounter = doc.get('is_pdcounter')
1386 category = doc.get('tag_category')
1388 if is_pdcounter == 'true':
1389 if category == 'pd_author':
1390 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1391 elif category == 'pd_book':
1392 tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1393 tag.category = 'pd_book' # make it look more like a tag.
1395 print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1397 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1398 # don't add the pdcounter tag if same tag already exists
1399 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1401 except catalogue.models.Tag.DoesNotExist: pass
1402 except PDCounterAuthor.DoesNotExist: pass
1403 except PDCounterBook.DoesNotExist: pass
1405 # print "%s (%d) -> %f" % (tag, tag.id, found.score)
1406 print 'returning %s' % tags
1409 def search_books(self, query, filter=None, max_results=10):
1411 Searches for Book objects using query
1414 tops = self.searcher.search(query, filter, max_results)
1415 for found in tops.scoreDocs:
1416 doc = self.searcher.doc(found.doc)
1418 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1419 except catalogue.models.Book.DoesNotExist: pass
1422 def make_prefix_phrase(self, toks, field):
1423 q = MultiPhraseQuery()
1424 for i in range(len(toks)):
1425 t = Term(field, toks[i])
1426 if i == len(toks) - 1:
1427 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1437 def term_filter(term, inverse=False):
1438 only_term = TermsFilter()
1439 only_term.addTerm(term)
1442 neg = BooleanFilter()
1443 neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1448 def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1450 Return auto-complete hints for tags
1451 using prefix search.
1453 toks = self.get_tokens(string, field='SIMPLE')
1454 top = BooleanQuery()
1456 for field in ['tag_name', 'tag_name_pl']:
1458 q = self.make_prefix_phrase(toks, field)
1460 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1461 top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1463 no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1465 return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
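    # Usage sketch (assumption): typeahead for the tag search box, e.g.
    # search.hint_tags(u"mick", max_results=10) could match the author tag
    # "Adam Mickiewicz" (hypothetical data).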
1467 def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1469 Returns auto-complete hints for book titles
1470 (needed because we do not index 'pseudo' title tags).
1473 toks = self.get_tokens(string, field='SIMPLE')
1476 q = self.make_prefix_phrase(toks, 'title')
1478 q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1480 return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
1483 def chain_filters(filters, op=ChainedFilter.AND):
1485 Chains a filter list together
1487 filters = filter(lambda x: x is not None, filters)
1488 if not filters:
1490 chf = ChainedFilter(JArray('object')(filters, Filter), op)
1493 def filtered_categories(self, tags):
1495 Return a list of tag categories, present in tags list.
1499 cats[t.category] = True