# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
JVM = initVM(CLASSPATH)

import os
import re
import errno
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor
from multiprocessing.pool import ThreadPool
from threading import current_thread
class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
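
        # Note: fields not registered above fall back to the default analyzer
        # passed to PerFieldAnalyzerWrapper.__init__ (the Polish one), so e.g.
        # "content" is stemmed with PolishAnalyzer.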
class IndexStore(object):
    Provides access to search index.

    self.store - lucene index directory

        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:


class IndexChecker(IndexStore):
        IndexStore.__init__(self)

        checker = CheckIndex(self.store)
        status = checker.checkIndex()
class Snippets(object):
    This class manages snippet files for an indexed object (book);
    the snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.

    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:

        self.book_id = book_id

    def open(self, mode='r'):
        Open the snippet file. Call .close() afterwards.

        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)

    def add(self, snippet):
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.

        txt = snippet.encode('utf-8')

        pos = (self.position, l)

        Given a tuple of (position, length), return the unicode snippet
        stored there.

        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')

        """Close snippet file"""
class BaseIndex(IndexStore):
    Provides basic operations on index: opening, closing, optimizing.

    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,\
                                 IndexWriter.MaxFieldLength.LIMITED)

        self.index.optimize()

        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

    def __exit__(self, type, value, tb):
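
# BaseIndex defines __exit__ above (and is presumably paired with an __enter__
# that opens the index), so an Index can be used as a context manager.
# A rough sketch, assuming `book` is a catalogue Book instance:
#
#     with Index() as index:
#         index.index_book(book)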
class Index(BaseIndex):
    Class indexing books.

    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without polish stems), category.

        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
    def create_book_doc(self, book):
        Create a lucene document that refers to the book by id.

        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))

    def remove_book(self, book):
        """Removes a book from search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.

            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):

        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
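
    # published_date_re picks the trailing year out of source_name, e.g. a
    # value like u"Czytelnik, Warszawa 1990." yields "1990" (example made up).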
    def extract_metadata(self, book, book_info=None):
        Extracts metadata from the book and returns a map of fields keyed by field name.

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get published date from source_name, if present
        if hasattr(book_info, 'source_name'):
            source = book_info.source_name
            match = self.published_date_re.search(source)
            if match is not None:
                fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
    def add_gaps(self, fields, fieldname):
        Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
        This allows phrase queries that do not cross the gaps (when slop is 0).

        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]

    def get_master(self, root):
        Returns the first master tag from an etree.

        for master in root.iter():
            if master.tag in self.master_tags:
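
    # Illustration: add_gaps([authors_field_1, authors_field_2], 'authors')
    # returns [authors_field_1, <space gap>, authors_field_2], so a slop-0
    # phrase query cannot match across two distinct author values.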
    def index_content(self, book, book_fields=[]):
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.

        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)

        def walker(node, ignore_tags=[]):

            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)\
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
        snippets = Snippets(book.id).open('w')

        for header, position in zip(list(master), range(len(master))):

            if header.tag in self.skip_header_tags:
                continue
            if header.tag is etree.Comment:
                continue

            def all_content(text):
                for frag in fragments.values():
                    frag['content'].append(text)
            handle_text = [all_content]

            for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                if start is not None and start.tag in self.footnote_tags:
                    def collect_footnote(t):
                        footnote.append(t)
                    handle_text.append(collect_footnote)
                elif end is not None and footnote and end.tag in self.footnote_tags:
                    doc = add_part(snippets, header_index=position, header_type=header.tag,
                                   content=u''.join(footnote),
                                   is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                    self.index.addDocument(doc)
                    print "@ footnote text: %s" % footnote

                # handle fragments and themes.
                if start is not None and start.tag == 'begin':
                    fid = start.attrib['id'][1:]
                    fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                # themes for this fragment
                elif start is not None and start.tag == 'motyw':
                    fid = start.attrib['id'][1:]
                    handle_text.append(None)
                    if start.text is not None:
                        fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                elif end is not None and end.tag == 'motyw':
                    handle_text.pop()

                elif start is not None and start.tag == 'end':
                    fid = start.attrib['id'][1:]
                    if fid not in fragments:
                        continue  # a broken <end> node, skip it
                    frag = fragments[fid]
                    if frag['themes'] == []:
                        continue  # empty themes list.

                    doc = add_part(snippets,
                                   header_type=frag['start_header'],
                                   header_index=frag['start_section'],
                                   header_span=position - frag['start_section'] + 1,
                                   content=fix_format(frag['content']),
                                   themes=frag['themes'])
                    print '@ FRAG %s' % frag['content']
                    self.index.addDocument(doc)

                if text is not None and handle_text:
                    hdl = handle_text[-1]

            # in the end, add a section text.
            doc = add_part(snippets, header_index=position, header_type=header.tag,
                           content=fix_format(content))
            print '@ CONTENT: %s' % fix_format(content)

            self.index.addDocument(doc)
def log_exception_wrapper(f):
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
class ReusableIndex(Index):
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None
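
# Rough usage sketch for ReusableIndex (as in the importbooks command);
# close_reusable() is registered with atexit, so it only needs to be called
# explicitly when atexit cannot be relied on:
#
#     index = ReusableIndex()
#     index.open()
#     index.index_book(book)   # `book` assumed to be a catalogue Book
#     ReusableIndex.close_reusable()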
class JoinSearch(object):
    This mixin could be used to handle block join queries.

    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        This function modifies the query recursively, so that Term and
        Phrase queries matching the provided fields are wrapped in a
        BlockJoinQuery and thus delegated to child documents.

        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))

            query.extractTerms(termset)

                if t.field() not in fields:

            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

            self._score = scoreDocs.score

        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")

        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.searched = searched
        self.tokens_cache = tokens_cache
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score

        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],

        # remove duplicate fragments
                if fragments[fid][SCORE] >= f[SCORE]:
        frags = fragments.values()

        # remove duplicate sections
            si = s[POSITION][POSITION_INDEX]
                if sections[si]['score'] >= s[SCORE]:

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,

        hits = sections.values()

            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                        if not theme in themes_hit:
                            themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes_hit': themes_hit

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    def aggregate(*result_lists):
        for rl in result_lists:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
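
# Results of several searches are typically merged per book with
# SearchResult.aggregate() and then ordered via __cmp__ (score first; for
# equal scores an earlier published_date ranks higher). A rough sketch,
# assuming `phrase_hits` and `other_hits` are lists of SearchResult:
#
#     results = SearchResult.aggregate(phrase_hits, other_hits)
#     results.sort(reverse=True)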
    Given some hint information (things we already know about our search
    target, like author, title (a specific book), epoch, genre or kind),
    we can narrow the search down using filters.

    def __init__(self, search):
        Accepts a Searcher instance.

    def books(self, *books):
        Give a hint that we search these books.

    def tags(self, tags):
        Give a hint that these Tag objects (a list of)

            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)
    def tag_filter(self, tags, field='tags'):
        Given a list of tags and an optional field (they are normally in the 'tags' field),
        returns a filter accepting only books with the specific tags.

            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        Filters using book tags (all tag kinds except a theme).

        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
            return self.tag_filter(tags)

    def part_filter(self):
        This filter can be used to look for book parts.
        It filters on book id and/or themes.

            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds logic to figure out which indexes should be searched when we already have some hints."""
            if field == 'authors' and 'author' in self.book_tags:
            if field == 'title' and self._books != []:
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
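
# A rough sketch of how a Hint narrows a search (the tag and book objects are
# assumed to come from the catalogue app):
#
#     search = Search()
#     hint = Hint(search)
#     hint.tags(tags)    # author/epoch/genre/kind tags become book filters
#     hint.books(book)   # restrict part searches to this one book
#     results = search.search_perfect_book(u"pan tadeusz", hint=hint)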
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        # self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
    def query(self, query):
        """Parse query in default Lucene syntax (for humans)."""
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax (for humans).
        Returns (books, total_hits)."""

        tops = self.searcher.search(self.query(query), max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
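
    # Example (illustrative): simple_search(u"Mickiewicz") parses the string
    # with the default QueryParser and returns (matching Book objects,
    # total hit count).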
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a list of tokens;
        in the last case they are just returned (so tokens can be reused if we
        don't change the analyzer)."""
        if cached is not None and field in cached:

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):

        tokens = self.analyzer.reusableTokenStream(field, searched)
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        Return a PhraseQuery with a series of tokens.

            phrase = MultiPhraseQuery()
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))

                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))

                    if not fuzzterm.next(): break
                phrase.add(JArray('object')(fuzzterms, Term))

            phrase = PhraseQuery()
                term = Term(field, t)

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.

            term = Term(field, t)
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                    fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
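
    # A rough usage sketch (query string made up): a slop-2 phrase search over
    # indexed content, returning SearchResult objects with snippets attached:
    #
    #     results = search.search_phrase(u"litwo ojczyzno moja", 'content',
    #                                    snippets=True)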
    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.

        fields_to_search = ['authors', 'title']
            if not hint.should_search_for_book():
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),

            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

            if not hint.should_search_for_book():
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(q,
            self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),

        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))
    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
        some part/fragment of the book.

        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

            flt = hint.part_filter()

            top = self.searcher.search(q,
                self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),

            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and
        the rest some words from the third chapter.

        if tokens_cache is None: tokens_cache = {}

            only_in = hint.part_filter()

        # content only query : themes x content

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                            fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags

        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)
    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords)  -> authors

    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)

    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD

    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)

    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')

    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))

    #     kw_level = BooleanQuery()

    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)

    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))
    def get_snippets(self, scoreDoc, query, field='content'):
        Returns a snippet for the found scoreDoc.

        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:

        snippets = Snippets(stored.get('book_id')).open()
        text = snippets.get((int(position),

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        # highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
    def enum_to_array(enum):
        Converts a lucene TermEnum to an array of Terms, suitable for

            if not enum.next(): break

            return JArray('object')(terms, Term)
    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        Search for Tag objects using query.

            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
                tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
            # print "%s (%d) -> %f" % (tag, tag.id, found.score)

        print 'returning %s' % tags
    def search_books(self, query, filter=None, max_results=10):
        Searches for Book objects using query.

        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))

    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
        Return auto-complete hints for tags,
        using prefix search.

        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
                q = self.make_prefix_phrase(toks, field)
                q = self.make_term_query(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)

    def hint_books(self, string, max_results=50, prefix=True):
        Returns auto-complete hints for book titles,
        because we do not index 'pseudo' title tags. Uses prefix search.

        toks = self.get_tokens(string, field='SIMPLE')

            q = self.make_prefix_phrase(toks, 'title')
            q = self.make_term_query(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
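
    # Example (value made up): hint_tags(u"mick") builds prefix phrase queries
    # on tag_name / tag_name_pl and returns matching Tag objects, excluding
    # the 'book' pseudo-category via the inverse term filter above.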
    def chain_filters(filters, op=ChainedFilter.AND):
        Chains a filter list together.

        filters = filter(lambda x: x is not None, filters)
        if not filters:
        chf = ChainedFilter(JArray('object')(filters, Filter), op)

    def filtered_categories(self, tags):
        Return a list of tag categories present in the tags list.

            cats[t.category] = True