1 # -*- coding: utf-8 -*-
3 from django.conf import settings
4 from django.dispatch import Signal
5 from lucene import SimpleFSDirectory, NIOFSDirectory, IndexWriter, IndexReader, IndexWriterConfig, CheckIndex, \
6 File, Field, Integer, \
7 NumericField, Version, Document, JavaError, IndexSearcher, \
8 QueryParser, PerFieldAnalyzerWrapper, \
9 SimpleAnalyzer, PolishAnalyzer, ArrayList, \
10 KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
11 BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
12 HashSet, BooleanClause, Term, CharTermAttribute, \
13 PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
14 FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
15 SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
16 BooleanFilter, FilterClause, QueryWrapperFilter, \
17 initVM, CLASSPATH, JArray
21 JVM = initVM(CLASSPATH)
27 from librarian import dcparser
28 from librarian.parser import WLDocument
29 from lxml import etree
30 import catalogue.models
31 from pdcounter.models import Author as PDCounterAuthor, BookStub as PDCounterBook
32 from multiprocessing.pool import ThreadPool
33 from threading import current_thread
34 from itertools import chain
38 log = logging.getLogger('search')
40 class WLAnalyzer(PerFieldAnalyzerWrapper):
42 polish = PolishAnalyzer(Version.LUCENE_34)
43 # polish_gap.setPositionIncrementGap(999)
45 simple = SimpleAnalyzer(Version.LUCENE_34)
46 # simple_gap.setPositionIncrementGap(999)
48 keyword = KeywordAnalyzer(Version.LUCENE_34)
50 # not sure if needed: there's NOT_ANALYZED meaning basically the same
52 PerFieldAnalyzerWrapper.__init__(self, polish)
54 self.addAnalyzer("tags", simple)
55 self.addAnalyzer("technical_editors", simple)
56 self.addAnalyzer("editors", simple)
57 self.addAnalyzer("url", keyword)
58 self.addAnalyzer("source_url", keyword)
59 self.addAnalyzer("source_name", simple)
60 self.addAnalyzer("publisher", simple)
61 self.addAnalyzer("authors", simple)
62 self.addAnalyzer("title", simple)
64 self.addAnalyzer("is_book", keyword)
65 # shouldn't the title have two forms? _pl and simple?
67 self.addAnalyzer("themes", simple)
68 self.addAnalyzer("themes_pl", polish)
70 self.addAnalyzer("tag_name", simple)
71 self.addAnalyzer("tag_name_pl", polish)
73 self.addAnalyzer("translators", simple)
75 self.addAnalyzer("KEYWORD", keyword)
76 self.addAnalyzer("SIMPLE", simple)
77 self.addAnalyzer("POLISH", polish)
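# Usage sketch (not part of the app): the analyzer chosen depends on the field name,
# so the same string tokenizes differently per field (see Search.get_tokens below):
#   >>> search = Search()
#   >>> search.get_tokens(u"Ballady i romanse", field='SIMPLE')   # SimpleAnalyzer, no stemming
#   >>> search.get_tokens(u"Ballady i romanse", field='POLISH')   # PolishAnalyzer, Polish stems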
80 class IndexStore(object):
82 Provides access to search index.
84 self.store - lucene index directory
88 self.store = NIOFSDirectory(File(settings.SEARCH_INDEX))
90 def make_index_dir(self):
92 os.makedirs(settings.SEARCH_INDEX)
93 except OSError as exc:
94 if exc.errno == errno.EEXIST:
102 class IndexChecker(IndexStore):
104 IndexStore.__init__(self)
107 checker = CheckIndex(self.store)
108 status = checker.checkIndex()
112 class Snippets(object):
114 This class manages snippet files for an indexed object (book).
115 The snippets are concatenated together, and their positions and
116 lengths are kept in lucene index fields.
118 SNIPPET_DIR = "snippets"
120 def __init__(self, book_id, revision=None):
122 os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
123 except OSError as exc:
124 if exc.errno == errno.EEXIST:
127 self.book_id = book_id
128 self.revision = revision
133 if self.revision: fn = "%d.%d" % (self.book_id, self.revision)
134 else: fn = "%d" % self.book_id
136 return os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, fn)
138 def open(self, mode='r'):
140 Open the snippet file. Call .close() afterwards.
146 if os.path.exists(self.path):
149 if not os.path.exists(self.path):
153 self.file = open(self.path, mode)
157 def add(self, snippet):
159 Append a snippet (unicode) to the snippet file.
160 Return a (position, length) tuple
162 txt = snippet.encode('utf-8')
165 pos = (self.position, l)
171 Given a (position, length) tuple, return a unicode string
172 of the snippet stored there.
174 self.file.seek(pos[0], 0)
175 txt = self.file.read(pos[1]).decode('utf-8')
179 """Close snippet file"""
194 class BaseIndex(IndexStore):
197 Provides basic operations on index: opening, closing, optimizing.
199 def __init__(self, analyzer=None):
200 super(BaseIndex, self).__init__()
203 analyzer = WLAnalyzer()
204 self.analyzer = analyzer
206 def open(self, timeout=None):
208 raise Exception("Index is already opened")
209 conf = IndexWriterConfig(Version.LUCENE_34, self.analyzer)
211 conf.setWriteLockTimeout(long(timeout))
212 self.index = IndexWriter(self.store, conf)
216 self.index.optimize()
220 self.index.optimize()
221 except JavaError as je:
222 log.error("Error during optimize phase, check index: %s" % je)
227 index_changed.send_robust(self)
229 super(BaseIndex, self).close()
235 def __exit__(self, type, value, tb):
239 index_changed = Signal()
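# Usage sketch (assumption: __enter__, elided above, opens the index and returns self,
# mirroring the __exit__ defined in BaseIndex):
#   >>> with Index() as index:
#   ...     index.index_tags()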
242 class Index(BaseIndex):
244 Class indexing books.
246 def __init__(self, analyzer=None):
247 super(Index, self).__init__(analyzer)
249 def index_tags(self, *tags, **kw):
251 Re-index the global tag list.
252 Removes all tags from the index, then indexes them again.
253 Indexed fields include: id, name (with and without Polish stems), category.
255 remove_only = kw.get('remove_only', False)
256 # first, remove tags from index.
260 b_id_cat = BooleanQuery()
262 q_id = NumericRangeQuery.newIntRange("tag_id", tag.id, tag.id, True, True)
263 b_id_cat.add(q_id, BooleanClause.Occur.MUST)
265 if isinstance(tag, PDCounterAuthor):
266 q_cat = TermQuery(Term('tag_category', 'pd_author'))
267 elif isinstance(tag, PDCounterBook):
268 q_cat = TermQuery(Term('tag_category', 'pd_book'))
270 q_cat = TermQuery(Term('tag_category', tag.category))
271 b_id_cat.add(q_cat, BooleanClause.Occur.MUST)
273 q.add(b_id_cat, BooleanClause.Occur.SHOULD)
275 q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
276 self.index.deleteDocuments(q)
279 # then add them [all or just one passed]
281 tags = chain(catalogue.models.Tag.objects.exclude(category='set'), \
282 PDCounterAuthor.objects.all(), \
283 PDCounterBook.objects.all())
286 if isinstance(tag, PDCounterAuthor):
288 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
289 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
290 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
291 doc.add(Field("tag_category", 'pd_author', Field.Store.YES, Field.Index.NOT_ANALYZED))
292 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
293 self.index.addDocument(doc)
294 elif isinstance(tag, PDCounterBook):
296 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
297 doc.add(Field("tag_name", tag.title, Field.Store.NO, Field.Index.ANALYZED))
298 doc.add(Field("tag_name_pl", tag.title, Field.Store.NO, Field.Index.ANALYZED))
299 doc.add(Field("tag_category", 'pd_book', Field.Store.YES, Field.Index.NOT_ANALYZED))
300 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
301 self.index.addDocument(doc)
304 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
305 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
306 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
307 doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
308 self.index.addDocument(doc)
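# Example calls (sketch; open()/close() come from BaseIndex above, 'some_tag' is a
# placeholder Tag instance):
#   >>> index = Index()
#   >>> index.open()
#   >>> index.index_tags()                              # drop and re-add the whole global tag list
#   >>> index.index_tags(some_tag, remove_only=True)    # only remove one tag from the index
#   >>> index.close()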
310 def create_book_doc(self, book):
312 Create a lucene document referring to the book id.
315 doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
316 if book.parent is not None:
317 doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
320 def remove_book(self, book, remove_snippets=True):
321 """Removes a book from search index.
322 book - Book instance."""
323 q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
324 self.index.deleteDocuments(q)
327 snippets = Snippets(book.id)
330 def index_book(self, book, book_info=None, overwrite=True):
333 Creates a lucene document for extracted metadata
334 and calls self.index_content() to index the contents of the book.
337 # we don't remove snippets, since they might still be needed by
338 # threads using a not-yet-reopened index
339 self.remove_book(book, remove_snippets=False)
341 book_doc = self.create_book_doc(book)
342 meta_fields = self.extract_metadata(book, book_info)
343 for f in meta_fields.values():
344 if isinstance(f, list) or isinstance(f, tuple):
349 self.index.addDocument(book_doc)
352 self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
357 'dramat_wierszowany_l',
358 'dramat_wierszowany_lp',
359 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
363 ignore_content_tags = [
365 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
367 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
370 footnote_tags = ['pa', 'pt', 'pr', 'pe']
372 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
374 published_date_re = re.compile("([0-9]+)[\]. ]*$")
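# Sketch of what published_date_re extracts from a source note (the trailing year):
#   >>> Index.published_date_re.search("Czytelnik, Warszawa [1974].").groups()[0]
#   '1974'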
376 def extract_metadata(self, book, book_info=None):
378 Extract metadata from the book and return a map of fields keyed by field name.
382 if book_info is None:
383 book_info = dcparser.parse(open(book.xml_file.path))
385 fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
386 fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
387 fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
390 for field in dcparser.BookInfo.FIELDS:
391 if hasattr(book_info, field.name):
392 if not getattr(book_info, field.name):
394 # since no type information is available, we use validator
395 type_indicator = field.validator
396 if type_indicator == dcparser.as_unicode:
397 s = getattr(book_info, field.name)
401 fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
402 except JavaError as je:
403 raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
404 elif type_indicator == dcparser.as_person:
405 p = getattr(book_info, field.name)
406 if isinstance(p, dcparser.Person):
409 persons = ', '.join(map(unicode, p))
410 fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
411 elif type_indicator == dcparser.as_date:
412 dt = getattr(book_info, field.name)
413 fields[field.name] = Field(field.name, "%04d%02d%02d" %\
414 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
418 if hasattr(book_info, 'source_name') and book_info.source_name:
419 match = self.published_date_re.search(book_info.source_name)
420 if match is not None:
421 pd = str(match.groups()[0])
423 fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
427 def add_gaps(self, fields, fieldname):
429 Interposes a list of fields with gap fields (indexed spaces) and returns the result.
430 This allows phrase queries that do not cross the gaps (when slop is 0).
434 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
435 return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
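# E.g. (sketch) for two author fields this yields
#   [Field(authors: "Jan Kochanowski"), Field(authors: " "), Field(authors: "Adam Mickiewicz")]
# so a slop-0 phrase query cannot match across two different author values.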
437 def get_master(self, root):
439 Returns the first master tag from an etree.
441 for master in root.iter():
442 if master.tag in self.master_tags:
445 def index_content(self, book, book_fields=[]):
447 Walks the book XML and extracts content from it.
448 Adds parts for each header tag and for each fragment.
450 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
451 root = wld.edoc.getroot()
453 master = self.get_master(root)
457 def walker(node, ignore_tags=[]):
459 if node.tag not in ignore_tags:
460 yield node, None, None
461 if node.text is not None:
462 yield None, node.text, None
463 for child in list(node):
464 for b, t, e in walker(child):
466 yield None, None, node
468 if node.tail is not None:
469 yield None, node.tail, None
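# Sketch of the walker's yield order for <a>x<b>y</b>z</a>:
#   (a, None, None), (None, "x", None), (b, None, None), (None, "y", None),
#   (None, None, b), (None, "z", None), (None, None, a)
# i.e. an element start, its text, its children (recursively), its end, then its tail.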
472 def fix_format(text):
473 # separator = [u" ", u"\t", u".", u";", u","]
474 if isinstance(text, list):
475 # need to join it first
476 text = filter(lambda s: s is not None, text)
477 text = u' '.join(text)
478 # for i in range(len(text)):
480 # if text[i][0] not in separator\
481 # and text[i - 1][-1] not in separator:
482 # text.insert(i, u" ")
484 return re.sub("(?m)/$", "", text)
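# e.g. (sketch): fix_format([u"Litwo! Ojczyzno moja!/", None]) == u"Litwo! Ojczyzno moja!"
# (None entries are dropped, pieces are joined with spaces, a trailing verse slash is stripped)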
486 def add_part(snippets, **fields):
487 doc = self.create_book_doc(book)
488 for f in book_fields:
491 doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
492 doc.add(NumericField("header_span", Field.Store.YES, True)\
493 .setIntValue('header_span' in fields and fields['header_span'] or 1))
494 doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
496 doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
497 Field.TermVector.WITH_POSITIONS_OFFSETS))
499 snip_pos = snippets.add(fields["content"])
500 doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
501 doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
502 if snippets.revision:
503 doc.add(NumericField("snippets_revision", Field.Store.YES, True).setIntValue(snippets.revision))
505 if 'fragment_anchor' in fields:
506 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
507 Field.Store.YES, Field.Index.NOT_ANALYZED))
509 if 'themes' in fields:
510 themes, themes_pl = zip(*[
511 (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
512 Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
513 for theme in fields['themes']])
515 themes = self.add_gaps(themes, 'themes')
516 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
526 if isinstance(s, unicode):
527 return s.encode('utf-8')
532 snippets = Snippets(book.id).open('w')
534 for header, position in zip(list(master), range(len(master))):
536 if header.tag in self.skip_header_tags:
538 if header.tag is etree.Comment:
545 def all_content(text):
546 for frag in fragments.values():
547 frag['content'].append(text)
549 handle_text = [all_content]
552 for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
554 if start is not None and start.tag in self.footnote_tags:
556 def collect_footnote(t):
558 handle_text.append(collect_footnote)
559 elif end is not None and footnote and end.tag in self.footnote_tags:
561 doc = add_part(snippets, header_index=position, header_type=header.tag,
562 content=u''.join(footnote),
563 is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))
565 self.index.addDocument(doc)
566 #print "@ footnote text: %s" % footnote
569 # handle fragments and themes.
570 if start is not None and start.tag == 'begin':
571 fid = start.attrib['id'][1:]
572 fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
574 # themes for this fragment
575 elif start is not None and start.tag == 'motyw':
576 fid = start.attrib['id'][1:]
577 handle_text.append(None)
578 if start.text is not None:
579 fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
580 elif end is not None and end.tag == 'motyw':
583 elif start is not None and start.tag == 'end':
584 fid = start.attrib['id'][1:]
585 if fid not in fragments:
586 continue # a broken <end> node, skip it
587 frag = fragments[fid]
588 if frag['themes'] == []:
589 continue # empty themes list.
592 doc = add_part(snippets,
593 header_type=frag['start_header'],
594 header_index=frag['start_section'],
595 header_span=position - frag['start_section'] + 1,
597 content=fix_format(frag['content']),
598 themes=frag['themes'])
599 #print '@ FRAG %s' % frag['content']
600 self.index.addDocument(doc)
604 if text is not None and handle_text:
605 hdl = handle_text[-1]
609 # in the end, add a section text.
610 doc = add_part(snippets, header_index=position, header_type=header.tag,
611 content=fix_format(content))
612 #print '@ CONTENT: %s' % fix_format(content)
614 self.index.addDocument(doc)
620 def log_exception_wrapper(f):
625 log.error("Error in indexing thread: %s" % e)
626 traceback.print_exc()
631 class ReusableIndex(Index):
633 Works like Index, but does not close/optimize the Lucene index
634 until program exit (uses an atexit hook).
635 This is useful for the importbooks command.
637 If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
641 def open(self, analyzer=None, **kw):
642 if ReusableIndex.index:
643 self.index = ReusableIndex.index
645 Index.open(self, analyzer, **kw)
646 ReusableIndex.index = self.index
647 atexit.register(ReusableIndex.close_reusable)
649 # def index_book(self, *args, **kw):
650 # job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
651 # ReusableIndex.pool_jobs.append(job)
654 def close_reusable():
655 if ReusableIndex.index:
656 ReusableIndex.index.optimize()
657 ReusableIndex.index.close()
658 ReusableIndex.index = None
660 index_changed.send_robust(None)
663 if ReusableIndex.index:
664 ReusableIndex.index.commit()
667 class JoinSearch(object):
669 This mixin could be used to handle block join queries.
672 def __init__(self, *args, **kw):
673 super(JoinSearch, self).__init__(*args, **kw)
675 def wrapjoins(self, query, fields=[]):
677 This function modifies the query recursively, so that
678 contained Term and Phrase queries which match the
679 provided fields are wrapped in a BlockJoinQuery
680 and thus delegated to child documents.
682 if BooleanQuery.instance_(query):
683 qs = BooleanQuery.cast_(query)
685 clause = BooleanClause.cast_(clause)
686 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
690 query.extractTerms(termset)
693 if t.field() not in fields:
695 return BlockJoinQuery(query, self.parent_filter,
696 BlockJoinQuery.ScoreMode.Total)
698 def bsearch(self, query, max_results=50):
699 q = self.query(query)
700 bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
702 tops = self.searcher.search(bjq, max_results)
704 for found in tops.scoreDocs:
705 doc = self.searcher.doc(found.doc)
706 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
707 return (bks, tops.totalHits)
710 class SearchResult(object):
711 def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
712 if tokens_cache is None: tokens_cache = {}
717 self._score = scoreDocs.score
722 self._processed_hits = None # processed hits
724 stored = search.searcher.doc(scoreDocs.doc)
725 self.book_id = int(stored.get("book_id"))
727 pd = stored.get("published_date")
729 self.published_date = int(pd)
731 self.published_date = 0
733 header_type = stored.get("header_type")
734 # we have a content hit in some header or fragment
735 if header_type is not None:
736 sec = (header_type, int(stored.get("header_index")))
737 header_span = stored.get('header_span')
738 header_span = header_span is not None and int(header_span) or 1
740 fragment = stored.get("fragment_anchor")
743 snippets = snippets.replace("/\n", "\n")
744 hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
746 self._hits.append(hit)
749 self.searched = searched
750 self.tokens_cache = tokens_cache
754 return self._score * self.boost
756 def merge(self, other):
757 if self.book_id != other.book_id:
758 raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
759 self._hits += other._hits
760 if other.score > self.score:
761 self._score = other._score
765 if hasattr(self, '_book'):
767 return catalogue.models.Book.objects.get(id=self.book_id)
769 book = property(get_book)
773 if self._processed_hits is not None:
774 return self._processed_hits
783 # to sections and fragments
784 frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
786 sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
788 # sections not covered by fragments
789 sect = filter(lambda s: 0 == len(filter(
790 lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
791 and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
796 def remove_duplicates(lst, keyfn, compare):
801 if compare(els[eif], e) >= 1:
806 # remove fragments with duplicated fid's and duplicated snippets
807 frags = remove_duplicates(frags, lambda f: f[FRAGMENT], lambda a, b: cmp(a[SCORE], b[SCORE]))
808 frags = remove_duplicates(frags, lambda f: f[OTHER]['snippets'] and f[OTHER]['snippets'][0] or f[FRAGMENT],
809 lambda a, b: cmp(a[SCORE], b[SCORE]))
811 # remove duplicate sections
815 si = s[POSITION][POSITION_INDEX]
818 if sections[si]['score'] >= s[SCORE]:
821 m = {'score': s[SCORE],
822 'section_number': s[POSITION][POSITION_INDEX] + 1,
827 hits = sections.values()
831 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
832 except catalogue.models.Fragment.DoesNotExist:
836 # Figure out if we were searching for a token matching some word in theme name.
837 themes = frag.tags.filter(category='theme')
839 if self.searched is not None:
840 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
842 name_tokens = self.search.get_tokens(theme.name, 'POLISH')
845 if not theme in themes_hit:
846 themes_hit.append(theme)
849 m = {'score': f[SCORE],
851 'section_number': f[POSITION][POSITION_INDEX] + 1,
853 'themes_hit': themes_hit
858 hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
860 self._processed_hits = hits
864 def __unicode__(self):
865 return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
868 def aggregate(*result_lists):
870 for rl in result_lists:
872 if r.book_id in books:
873 books[r.book_id].merge(r)
876 return books.values()
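# Example (sketch; 'phrase_hits' and 'meta_hits' are hypothetical lists of SearchResult
# objects coming from different query strategies):
#   >>> merged = SearchResult.aggregate(phrase_hits, meta_hits)
#   >>> merged.sort(reverse=True)   # __cmp__ below: best score first, earlier date wins ties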
878 def __cmp__(self, other):
879 c = cmp(self.score, other.score)
881 # this is inverted, because earlier date is better
882 return cmp(other.published_date, self.published_date)
889 Given some hint information (things we already know about
890 our search target - like author, title of a specific book, epoch, genre, kind)
891 we can narrow down the search using filters.
893 def __init__(self, search):
895 Accepts a Searcher instance.
902 def books(self, *books):
904 Give a hint that we are searching within these books.
908 def tags(self, tags):
910 Give a hint that these Tag objects (a list of)
914 if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
915 lst = self.book_tags.get(t.category, [])
917 self.book_tags[t.category] = lst
918 if t.category in ['theme', 'theme_pl']:
919 self.part_tags.append(t)
921 def tag_filter(self, tags, field='tags'):
923 Given a list of tags and an optional field (they are normally in the 'tags' field),
924 returns a filter accepting only books with the specified tags.
929 toks = self.search.get_tokens(tag.name, field=field)
930 tag_phrase = PhraseQuery()
932 tag_phrase.add(Term(field, tok))
933 q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
935 return QueryWrapperFilter(q)
937 def book_filter(self):
939 Filters using book tags (all tag kinds except themes)
941 tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
943 return self.tag_filter(tags)
947 def part_filter(self):
949 This filter can be used to look for book parts.
950 It filters on book id and/or themes.
954 fs.append(self.tag_filter(self.part_tags, field='themes'))
956 if self._books != []:
958 for b in self._books:
959 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
960 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
963 return Search.chain_filters(fs)
965 def should_search_for_book(self):
966 return self._books == []
968 def just_search_in(self, all):
969 """Holds logic to figure out which indexes should be search, when we have some hinst already"""
972 if field == 'authors' and 'author' in self.book_tags:
974 if field == 'title' and self._books != []:
976 if (field == 'themes' or field == 'themes_pl') and self.part_tags:
982 class Search(IndexStore):
986 def __init__(self, default_field="content"):
987 IndexStore.__init__(self)
988 self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
989 # self.analyzer = WLAnalyzer()
990 reader = IndexReader.open(self.store, True)
991 self.searcher = IndexSearcher(reader)
992 self.parser = QueryParser(Version.LUCENE_34, default_field,
995 self.parent_filter = TermsFilter()
996 self.parent_filter.addTerm(Term("is_book", "true"))
997 index_changed.connect(self.reopen)
1000 reader = self.searcher.getIndexReader()
1001 self.searcher.close()
1003 super(Search, self).close()
1004 index_changed.disconnect(self.reopen)
1006 def reopen(self, **unused):
1007 reader = self.searcher.getIndexReader()
1008 rdr = reader.reopen()
1009 if not rdr.equals(reader):
1010 log.debug('Reopening index')
1011 oldsearch = self.searcher
1012 self.searcher = IndexSearcher(rdr)
1016 def query(self, query):
1017 """Parse query in default Lucene Syntax. (for humans)
1019 return self.parser.parse(query)
1021 def simple_search(self, query, max_results=50):
1022 """Runs a query for books using lucene syntax. (for humans)
1023 Returns (books, total_hits)
1026 tops = self.searcher.search(self.query(query), max_results)
1028 for found in tops.scoreDocs:
1029 doc = self.searcher.doc(found.doc)
1030 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1031 return (bks, tops.totalHits)
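# Example (sketch; Lucene query syntax, parsed by self.parser):
#   >>> search = Search()
#   >>> books, total = search.simple_search(u'authors: Mickiewicz AND title: Dziady')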
1033 def get_tokens(self, searched, field='content', cached=None):
1034 """returns tokens analyzed by a proper (for a field) analyzer
1035 argument can be: StringReader, string/unicode, or tokens. In the last case
1036 they will just be returned (so we can reuse tokens, if we don't change the analyzer)
1038 if cached is not None and field in cached:
1039 return cached[field]
1041 if isinstance(searched, str) or isinstance(searched, unicode):
1042 searched = StringReader(searched)
1043 elif isinstance(searched, list):
1047 tokens = self.analyzer.reusableTokenStream(field, searched)
1049 while tokens.incrementToken():
1050 cta = tokens.getAttribute(CharTermAttribute.class_)
1051 toks.append(cta.toString())
1053 if cached is not None:
1054 cached[field] = toks
1058 def fuzziness(self, fuzzy):
1059 """Helper method to sanitize fuzziness"""
1062 if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
1067 def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
1069 Return a PhraseQuery with a series of tokens.
1072 phrase = MultiPhraseQuery()
1074 term = Term(field, t)
1075 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
1079 ft = fuzzterm.term()
1081 fuzzterms.append(ft)
1082 if not fuzzterm.next(): break
1084 phrase.add(JArray('object')(fuzzterms, Term))
1088 phrase = PhraseQuery()
1089 phrase.setSlop(slop)
1091 term = Term(field, t)
1095 def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
1097 Returns term queries joined by boolean query.
1098 modal - applies to boolean query
1099 fuzzy - should the query be fuzzy.
1103 term = Term(field, t)
1105 term = FuzzyQuery(term, self.fuzziness(fuzzy))
1107 term = TermQuery(term)
1108 q.add(BooleanClause(term, modal))
1111 def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
1112 filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
1113 if filters is None: filters = []
1114 if tokens_cache is None: tokens_cache = {}
1116 tokens = self.get_tokens(searched, field, cached=tokens_cache)
1118 query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
1120 filters.append(self.term_filter(Term('is_book', 'true')))
1121 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1123 return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
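# Example (sketch): phrase search against book titles (book=True keeps only book-level documents):
#   >>> search = Search()
#   >>> results = search.search_phrase(u"Pan Tadeusz", 'title')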
1125 def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
1126 filters=None, tokens_cache=None, boost=None, snippets=True):
1127 if filters is None: filters = []
1128 if tokens_cache is None: tokens_cache = {}
1131 filters.append(self.term_filter(Term('is_book', 'true')))
1133 query = BooleanQuery()
1136 tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1138 query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1139 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1141 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1143 return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1144 snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1146 def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1148 Search for perfect book matches. Just see if the query matches some author or title,
1149 taking hints into account.
1151 fields_to_search = ['authors', 'title']
1154 if not hint.should_search_for_book():
1156 fields_to_search = hint.just_search_in(fields_to_search)
1157 only_in = hint.book_filter()
1159 qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1163 top = self.searcher.search(q,
1164 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1166 for found in top.scoreDocs:
1167 books.append(SearchResult(self, found, how_found="search_perfect_book"))
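# Example (sketch; the author tag lookup is illustrative):
#   >>> search = Search()
#   >>> hint = Hint(search)
#   >>> hint.tags(catalogue.models.Tag.objects.filter(category='author', slug='adam-mickiewicz'))
#   >>> results = search.search_perfect_book(u"dziady", hint=hint)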
1170 def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1171 fields_to_search = ['tags', 'authors', 'title']
1175 if not hint.should_search_for_book():
1177 fields_to_search = hint.just_search_in(fields_to_search)
1178 only_in = hint.book_filter()
1180 tokens = self.get_tokens(searched, field='SIMPLE')
1184 for fld in fields_to_search:
1185 q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1186 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1189 top = self.searcher.search(q,
1190 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1192 for found in top.scoreDocs:
1193 books.append(SearchResult(self, found, how_found="search_book"))
1197 def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1199 Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1200 some part/fragment of the book.
1202 qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1206 flt = hint.part_filter()
1210 top = self.searcher.search(q,
1211 self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1214 for found in top.scoreDocs:
1215 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1219 def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1221 Tries to use search terms to match different fields of the book (or its parts).
1222 E.g. one word can be an author's surname, another a part of the title, and the rest
1223 some words from the third chapter.
1225 if tokens_cache is None: tokens_cache = {}
1230 only_in = hint.part_filter()
1232 # content only query : themes x content
1235 tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1236 tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1238 # only search in themes when we do not already filter by themes
1239 if hint is None or hint.just_search_in(['themes']) != []:
1240 q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1241 fuzzy=fuzzy), BooleanClause.Occur.MUST))
1243 q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1244 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1246 topDocs = self.searcher.search(q, only_in, max_results)
1247 for found in topDocs.scoreDocs:
1248 books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1250 # query themes/content x author/title/tags
1252 in_content = BooleanQuery()
1253 in_meta = BooleanQuery()
1255 for fld in ['themes_pl', 'content']:
1256 in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1258 for fld in ['tags', 'authors', 'title']:
1259 in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1261 q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1262 q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1264 topDocs = self.searcher.search(q, only_in, max_results)
1265 for found in topDocs.scoreDocs:
1266 books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1270 # def multisearch(self, query, max_results=50):
1273 # - (phrase) OR -> content
1276 # - (keywords) -> authors
1281 # queryreader = StringReader(query)
1282 # tokens = self.get_tokens(queryreader)
1284 # top_level = BooleanQuery()
1285 # Should = BooleanClause.Occur.SHOULD
1287 # phrase_level = BooleanQuery()
1288 # phrase_level.setBoost(1.3)
1290 # p_content = self.make_phrase(tokens, joined=True)
1291 # p_title = self.make_phrase(tokens, 'title')
1292 # p_author = self.make_phrase(tokens, 'author')
1294 # phrase_level.add(BooleanClause(p_content, Should))
1295 # phrase_level.add(BooleanClause(p_title, Should))
1296 # phrase_level.add(BooleanClause(p_author, Should))
1298 # kw_level = BooleanQuery()
1300 # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1301 # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1302 # kw_level.add(j_themes, Should)
1303 # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1304 # j_con = self.make_term_query(tokens, joined=True)
1305 # kw_level.add(j_con, Should)
1307 # top_level.add(BooleanClause(phrase_level, Should))
1308 # top_level.add(BooleanClause(kw_level, Should))
1312 def get_snippets(self, scoreDoc, query, field='content'):
1314 Returns a snippet for found scoreDoc.
1316 htmlFormatter = SimpleHTMLFormatter()
1317 highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1319 stored = self.searcher.doc(scoreDoc.doc)
1321 position = stored.get('snippets_position')
1322 length = stored.get('snippets_length')
1323 if position is None or length is None:
1325 revision = stored.get('snippets_revision')
1326 if revision: revision = int(revision)
1329 book_id = int(stored.get('book_id'))
1330 snippets = Snippets(book_id, revision=revision)
1335 log.error("Cannot open snippet file for book id = %d [rev=%d], %s" % (book_id, revision, e))
1340 text = snippets.get((int(position),
1345 tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1346 # highlighter.getBestTextFragments(tokenStream, text, False, 10)
1347 snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1349 except Exception as e:
1351 if hasattr(e, 'getJavaException'):
1352 e2 = unicode(e.getJavaException())
1353 raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
1358 def enum_to_array(enum):
1360 Converts a lucene TermEnum to an array of Terms, suitable for
1369 if not enum.next(): break
1372 return JArray('object')(terms, Term)
1374 def search_tags(self, query, filt=None, max_results=40, pdcounter=False):
1376 Search for Tag objects using query.
1379 filters = self.chain_filters([filt, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1380 tops = self.searcher.search(query, filt, max_results)
1383 for found in tops.scoreDocs:
1384 doc = self.searcher.doc(found.doc)
1385 is_pdcounter = doc.get('is_pdcounter')
1386 category = doc.get('tag_category')
1388 if is_pdcounter == 'true':
1389 if category == 'pd_author':
1390 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1391 elif category == 'pd_book':
1392 tag = PDCounterBook.objects.get(id=doc.get('tag_id'))
1393 tag.category = 'pd_book' # make it look more like a tag.
1395 print "Warning. cannot get pdcounter tag_id=%d from db; cat=%s" % (int(doc.get('tag_id')), category)
1397 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1398 # don't add the pdcounter tag if same tag already exists
1399 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1401 except catalogue.models.Tag.DoesNotExist: pass
1402 except PDCounterAuthor.DoesNotExist: pass
1403 except PDCounterBook.DoesNotExist: pass
1405 log.debug('search_tags: %s' % tags)
1409 def search_books(self, query, filt=None, max_results=10):
1411 Searches for Book objects using query
1414 tops = self.searcher.search(query, filt, max_results)
1415 for found in tops.scoreDocs:
1416 doc = self.searcher.doc(found.doc)
1418 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1419 except catalogue.models.Book.DoesNotExist: pass
1422 def make_prefix_phrase(self, toks, field):
1423 q = MultiPhraseQuery()
1424 for i in range(len(toks)):
1425 t = Term(field, toks[i])
1426 if i == len(toks) - 1:
1427 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1437 def term_filter(term, inverse=False):
1438 only_term = TermsFilter()
1439 only_term.addTerm(term)
1442 neg = BooleanFilter()
1443 neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1448 def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
1450 Return auto-complete hints for tags
1451 using prefix search.
1453 toks = self.get_tokens(string, field='SIMPLE')
1454 top = BooleanQuery()
1456 for field in ['tag_name', 'tag_name_pl']:
1458 q = self.make_prefix_phrase(toks, field)
1460 q = self.make_term_query(toks, field, fuzzy=fuzzy)
1461 top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1463 no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1465 return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1467 def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
1469 Returns auto-complete hints for book titles
1470 (because we do not index 'pseudo' title-tags).
1473 toks = self.get_tokens(string, field='SIMPLE')
1476 q = self.make_prefix_phrase(toks, 'title')
1478 q = self.make_term_query(toks, 'title', fuzzy=fuzzy)
1480 return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
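# Example (sketch): autocomplete for a typed prefix:
#   >>> search = Search()
#   >>> tags = search.hint_tags(u"mick")       # Tag / PDCounter objects with matching names
#   >>> books = search.hint_books(u"pan tad")  # Book objects with matching titles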
1483 def chain_filters(filters, op=ChainedFilter.AND):
1485 Chains a filter list together
1487 filters = filter(lambda x: x is not None, filters)
1488 if not filters:
1490 chf = ChainedFilter(JArray('object')(filters, Filter), op)
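# Example (sketch; book id 1 is illustrative): combine filters, e.g. keep only
# non-book (part) documents belonging to a given book:
#   >>> search = Search()
#   >>> flt = search.chain_filters([
#   ...     search.term_filter(Term('is_book', 'true'), inverse=True),
#   ...     NumericRangeFilter.newIntRange('book_id', 1, 1, True, True)])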
1493 def filtered_categories(self, tags):
1495 Return a list of tag categories present in the tags list.
1499 cats[t.category] = True