# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

JVM = initVM(CLASSPATH)

import os
import re
import errno
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor
from multiprocessing.pool import ThreadPool
from threading import current_thread


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
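
    # A hedged usage sketch (assumption, not part of the original module): the
    # per-field wrapper picks an analyzer by field name, so the same text is
    # tokenized differently depending on where it is indexed, e.g.:
    #
    #   analyzer = WLAnalyzer()
    #   # "tag_name" -> SimpleAnalyzer, "tag_name_pl" -> PolishAnalyzer (stemming),
    #   # "url" -> KeywordAnalyzer (kept verbatim).
    #   stream = analyzer.reusableTokenStream("tag_name_pl", StringReader(u"zamkach"))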


class IndexStore(object):
    """
    Provides access to the search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length), return a unicode string
        of the snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close the snippet file."""
        self.file.close()


class BaseIndex(IndexStore):
    """
    Provides basic operations on index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if analyzer is None:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je
        self.index.close()
        self.index = None

    def __enter__(self):
        self.open()
        return self

    def __exit__(self, type, value, tb):
        self.close()
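
    # Hedged usage sketch (assumption, not part of the original): BaseIndex
    # implements the context-manager protocol, so subclasses can be used as
    #
    #   with Index() as index:
    #       index.index_tags()
    #
    # which opens the IndexWriter on entry and optimizes/closes it on exit.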


class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)
        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
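
    # Hedged example (assumption, not in the original): a typical re-indexing
    # run for a single book looks roughly like
    #
    #   with Index() as index:
    #       index.index_book(book)    # one metadata document + per-section documents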

    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
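
    # Illustration (assumption, not original code): the regex above grabs the
    # trailing digits of the "source_name" metadata, e.g.
    #   published_date_re.search(u"Czytelnik, Warszawa 1987.").groups()[0] == u"1987"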

    def extract_metadata(self, book, book_info=None):
        """
        Extracts metadata from the book and returns a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        pd = ""
        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields, which are indexed spaces, and returns it.
        This allows phrase queries that do not overlap the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
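
    # Small illustration (assumption, not original code):
    #   add_gaps([tag_field_1, tag_field_2], 'tags') -> (tag_field_1, gap, tag_field_2)
    # where the gap is an unanalyzed single-space Field, so a slop=0 phrase
    # query cannot match across the boundary between two neighbouring values.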

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):

            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                        yield b, t, e
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator \
                #        and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for tf in themes:
                    doc.add(tf)
                for tf in themes_pl:
                    doc.add(tf)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')

        for header, position in zip(list(master), range(len(master))):

            if header.tag in self.skip_header_tags:
                continue
            if header.tag is etree.Comment:
                continue

            # section content
            content = []
            footnote = []

            def all_content(text):
                for frag in fragments.values():
                    frag['content'].append(text)
                content.append(text)
            handle_text = [all_content]

            for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                # handle footnotes
                if start is not None and start.tag in self.footnote_tags:
                    footnote = []
                    def collect_footnote(t):
                        footnote.append(t)
                    handle_text.append(collect_footnote)
                elif end is not None and footnote != [] and end.tag in self.footnote_tags:
                    handle_text.pop()
                    doc = add_part(snippets, header_index=position, header_type=header.tag,
                                   content=u''.join(footnote),
                                   is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                    self.index.addDocument(doc)
                    #print "@ footnote text: %s" % footnote
                    footnote = []

                # handle fragments and themes.
                if start is not None and start.tag == 'begin':
                    fid = start.attrib['id'][1:]
                    fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                # themes for this fragment
                elif start is not None and start.tag == 'motyw':
                    fid = start.attrib['id'][1:]
                    handle_text.append(None)
                    if start.text is not None:
                        fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                elif end is not None and end.tag == 'motyw':
                    handle_text.pop()

                elif start is not None and start.tag == 'end':
                    fid = start.attrib['id'][1:]
                    if fid not in fragments:
                        continue  # a broken <end> node, skip it
                    frag = fragments[fid]
                    if frag['themes'] == []:
                        continue  # empty themes list.
                    del fragments[fid]

                    doc = add_part(snippets,
                                   header_type=frag['start_header'],
                                   header_index=frag['start_section'],
                                   header_span=position - frag['start_section'] + 1,
                                   fragment_anchor=fid,
                                   content=fix_format(frag['content']),
                                   themes=frag['themes'])
                    #print '@ FRAG %s' % frag['content']
                    self.index.addDocument(doc)

                if text is not None and handle_text != []:
                    hdl = handle_text[-1]
                    if hdl is not None:
                        hdl(text)

            # in the end, add a section text.
            doc = add_part(snippets, header_index=position, header_type=header.tag,
                           content=fix_format(content))
            #print '@ CONTENT: %s' % fix_format(content)

            self.index.addDocument(doc)

        snippets.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
            raise
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index:
            print("closing index")
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None

    def close(self):
        if ReusableIndex.index:
            ReusableIndex.index.commit()
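
    # Hedged usage sketch (assumption, not part of the original): batch jobs such
    # as importbooks can keep one writer open across many books and let the
    # atexit hook (or an explicit call) finalize it:
    #
    #   index = ReusableIndex()
    #   index.open()
    #   for book in books:
    #       index.index_book(book)
    #   ReusableIndex.close_reusable()    # optimize + close, or rely on atexit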


class JoinSearch(object):
    """
    This mixin could be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        Modifies the query recursively, so that contained Term and Phrase
        queries which match the provided fields are wrapped in a
        BlockJoinQuery and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score
        self.boost = 1.0

        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        if pd is None:
            pd = 0
        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    @property
    def score(self):
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # indices into the hit tuples built in __init__:
        # (position, fragment_anchor, score, other), position = (type, index, span)
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT], book__id=self.book_id)
            except catalogue.models.Fragment.DoesNotExist:
                # stale index entry
                continue

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if not theme in themes_hit:
                                themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes': themes,
                 'themes_hit': themes_hit
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(key=lambda h: h['score'], reverse=True)

        self._processed_hits = hits
        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        else:
            return c
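
    # Hedged example (assumption, not in the original): results coming from
    # several query strategies are merged per book before display, e.g.
    #
    #   results = SearchResult.aggregate(
    #       search.search_perfect_book(query),
    #       search.search_everywhere(query))
    #   for r in sorted(results, reverse=True):    # __cmp__: score, then earlier date
    #       print r.book, r.score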


class Hint(object):
    """
    Given some hint information (things we already know about our search
    target) - like author, title (specific book), epoch, genre, kind -
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of)
        are relevant to the search.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they are normally in the tags field),
        returns a filter accepting only books with the specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
        if tags:
            return self.tag_filter(tags)
        else:
            return None

    def part_filter(self):
        """
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
        """
        fs = []
        if self.part_tags:
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            bf = BooleanFilter()
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
            fs.append(bf)

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []
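
    # Hedged usage sketch (assumption, not part of the original): a Hint narrows
    # later queries, e.g. when the user has already picked an author tag:
    #
    #   hint = Hint(search)
    #   hint.tags(author_tags)                                # lands in book_tags['author']
    #   fields = hint.just_search_in(['authors', 'title'])    # drops 'authors'
    #   flt = hint.book_filter()                              # Lucene filter over the tags field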

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched when we already have some hints."""
        some = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            some.append(field)
        return some


class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse a query in the default Lucene syntax (for humans)."""
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using Lucene syntax (for humans).
        Returns (books, total_hits).
        """
        tops = self.searcher.search(self.query(query), max_results)

        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or tokens. In the last case
        they will just be returned (so we can reuse tokens, if we don't change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness."""
        if not fuzzy:
            return None
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5  # Lucene's default minimum similarity for fuzzy matching

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    ft = fuzzterm.term()
                    if ft is not None:
                        fuzzterms.append(ft)
                    if not fuzzterm.next(): break
                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
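
    # Hedged example (assumption, not in the original): a phrase lookup in book
    # content, with highlighted snippets attached to each SearchResult:
    #
    #   results = search.search_phrase(u"szlachcic na zagrodzie", 'content',
    #                                  book=False, snippets=True)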

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                         fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches with some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                     fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
            self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
            max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
        some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                    flt]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}

        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)

        return books

    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords) -> authors
    #
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None

        book_id = int(stored.get('book_id'))
        snippets = Snippets(book_id).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))

            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            # highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        except Exception as e:
            e2 = e
            if hasattr(e, 'getJavaException'):
                e2 = unicode(e.getJavaException())
            raise Exception("Problem fetching snippets for book %d, @%d len=%d" % (book_id, int(position), int(length)),
                            e2)
        finally:
            snippets.close()

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        adding to a query.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next(): break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            if is_pdcounter:
                tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                tags.append(tag)
            # print "%s (%d) -> %f" % (tag, tag.id, found.score)
        print 'returning %s' % tags
        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True, fuzzy=False):
        """
        Return auto-complete hints for tags,
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field, fuzzy=fuzzy)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)

    def hint_books(self, string, max_results=50, prefix=True, fuzzy=False):
        """
        Returns auto-complete hints for book titles
        (we do not index 'pseudo' title tags), using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title', fuzzy=fuzzy)

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
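
    # Hedged example (assumption, not part of the original): both hint_* helpers
    # back simple auto-complete endpoints, e.g.
    #
    #   tags = search.hint_tags(u"mick", max_results=10)       # prefix match on tag names
    #   books = search.hint_books(u"pan tad", max_results=10)  # prefix match on titles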

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf

    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()