# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, Integer, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, TermsFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray, JavaError

JVM = initVM(CLASSPATH)
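# Editorial note: initVM() is called once at import time; the JCC environment it
# returns is kept in JVM so that worker threads created later (see the
# ThreadPool/current_thread imports below) can attach themselves to the same JVM.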
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor
from multiprocessing.pool import ThreadPool
from threading import current_thread
class WLAnalyzer(PerFieldAnalyzerWrapper):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
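
# Usage sketch (editorial, not part of the original module): the wrapper above routes
# each named field to a matching analyzer - Polish stemming for e.g. "themes_pl",
# plain lowercasing for "authors", verbatim keyword indexing for "url" - so a writer
# built with it analyzes every field appropriately:
#
#     writer = IndexWriter(store, WLAnalyzer(), IndexWriter.MaxFieldLength.LIMITED)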
class IndexStore(object):
    Provides access to search index.

    self.store - lucene index directory
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
class IndexChecker(IndexStore):
        IndexStore.__init__(self)

        checker = CheckIndex(self.store)
        status = checker.checkIndex()
class Snippets(object):
    This class manages snippet files for an indexed object (a book).
    The snippets are concatenated together, and their positions and
    lengths are kept in Lucene index fields.
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
        self.book_id = book_id

    def open(self, mode='r'):
        Open the snippet file. Call .close() afterwards.
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)

    def add(self, snippet):
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        txt = snippet.encode('utf-8')
        pos = (self.position, l)

        Given a tuple of (position, length), return a unicode string
        of the snippet stored there.
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')

        """Close snippet file"""
class BaseIndex(IndexStore):
    Provides basic operations on index: opening, closing, optimizing.

    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,\
                                 IndexWriter.MaxFieldLength.LIMITED)

        self.index.optimize()

            self.index.optimize()
        except JavaError as je:
            print "Error during optimize phase, check index: %s" % je

    def __exit__(self, type, value, tb):
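
    # Editorial note: together with __enter__ (not shown in this excerpt), __exit__
    # lets an index be used as a context manager; a typical sketch is:
    #
    #     with Index() as index:
    #         index.index_tags()
    #     # the writer is closed (after a final optimize) when the block exits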
class Index(BaseIndex):
    Class indexing books.

    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        Re-index the global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
    def create_book_doc(self, book):
        Create a lucene document referring to the book id.
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))

    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
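        # Editorial note: NumericField values are stored as trie-encoded terms, so an
        # exact-match NumericRangeQuery (min == max == book.id) is the usual way to
        # delete such documents; a plain Term with the textual id would not match.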
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):

        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile("([0-9]+)[\]. ]*$")
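    # The regular expression above picks out the trailing year from the free-text
    # source description, e.g. a source_name ending in "[...] 1921." yields "1921"
    # (the sample value is illustrative).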
    def extract_metadata(self, book, book_info=None):
        Extracts metadata from the book and returns a map of fields keyed by field name.
        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                            (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        if hasattr(book_info, 'source_name') and book_info.source_name:
            match = self.published_date_re.search(book_info.source_name)
            if match is not None:
                pd = str(match.groups()[0])
        fields["published_date"] = Field("published_date", pd, Field.Store.YES, Field.Index.NOT_ANALYZED)
    def add_gaps(self, fields, fieldname):
        Interposes the list of fields with gap fields (indexed single spaces) and returns the result.
        This allows phrase queries that do not cross the gaps (when slop is 0).
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
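        # Illustration: for fields f1, f2, f3 the expression above yields
        # f1, gap, f2, gap, f3 (a single-space NOT_ANALYZED field between every pair),
        # so a zero-slop phrase cannot match across two adjacent tag values.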
    def get_master(self, root):
        Returns the first master tag from an etree.
        for master in root.iter():
            if master.tag in self.master_tags:

    def index_content(self, book, book_fields=[]):
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        def walker(node, ignore_tags=[]):
            if node.tag not in ignore_tags:
                yield node, None, None
                if node.text is not None:
                    yield None, node.text, None
                for child in list(node):
                    for b, t, e in walker(child):
                yield None, None, node

            if node.tail is not None:
                yield None, node.tail, None
        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator\
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)\
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                          Field.TermVector.WITH_POSITIONS_OFFSETS))
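            # Editorial note: term vectors with positions and offsets are stored so
            # that get_snippets() can rebuild a token stream for highlighting (via
            # TokenSources.getAnyTokenStream) without re-analyzing the snippet text.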
            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

            if isinstance(s, unicode):
                return s.encode('utf-8')
        snippets = Snippets(book.id).open('w')

            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                if header.tag is etree.Comment:

                def all_content(text):
                    for frag in fragments.values():
                        frag['content'].append(text)
                handle_text = [all_content]

                for start, text, end in walker(header, ignore_tags=self.ignore_content_tags):
                    if start is not None and start.tag in self.footnote_tags:
                        def collect_footnote(t):
                        handle_text.append(collect_footnote)
                    elif end is not None and footnote != [] and end.tag in self.footnote_tags:
                        doc = add_part(snippets, header_index=position, header_type=header.tag,
                                       content=u''.join(footnote),
                                       is_footnote=Field("is_footnote", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED))

                        self.index.addDocument(doc)
                        #print "@ footnote text: %s" % footnote
                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    # themes for this fragment
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        handle_text.append(None)
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                    elif end is not None and end.tag == 'motyw':

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])
                        #print '@ FRAG %s' % frag['content']
                        self.index.addDocument(doc)
                    if text is not None and handle_text != []:
                        hdl = handle_text[-1]

                # at the end, add the section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))
                #print '@ CONTENT: %s' % fix_format(content)

                self.index.addDocument(doc)
def log_exception_wrapper(f):
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
class ReusableIndex(Index):
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None
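
    # Usage sketch (editorial): a long-running import can share one writer, e.g.
    #
    #     idx = ReusableIndex()
    #     idx.open()
    #     for book in books:
    #         idx.index_book(book)
    #     # close_reusable() runs at interpreter exit via the atexit hook above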
class JoinSearch(object):
    This mixin could be used to handle block join queries.

    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        This function modifies the query recursively, so that contained Term and
        Phrase queries which match the provided fields are wrapped in a
        BlockJoinQuery and thus delegated to child documents.
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            query.extractTerms(termset)
                if t.field() not in fields:
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

            self._score = scoreDocs.score

        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.searched = searched
        self.tokens_cache = tokens_cache
        return self._score * self.boost

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score

        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)
        if self._processed_hits is not None:
            return self._processed_hits

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],

        # remove duplicate fragments
                if fragments[fid][SCORE] >= f[SCORE]:
        frags = fragments.values()

        # remove duplicate sections
            si = s[POSITION][POSITION_INDEX]
                if sections[si]['score'] >= s[SCORE]:

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,

        hits = sections.values()

                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                            if not theme in themes_hit:
                                themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes_hit': themes_hit

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
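
        # Editorial note: each processed hit is a plain dict carrying at least
        # 'score' and 'section_number'; fragment hits additionally carry the Fragment
        # object, its themes and the subset of themes actually hit by the query.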
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    def aggregate(*result_lists):
        for rl in result_lists:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
        return books.values()

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
    Given some hint information (things we already know about our search
    target, like author, title (a specific book), epoch, genre or kind),
    we can narrow down the search using filters.
    def __init__(self, search):
        Accepts a Searcher instance.

    def books(self, *books):
        Give a hint that we search these books.

    def tags(self, tags):
        Give a hint that these Tag objects (a list of them)
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)
    def tag_filter(self, tags, field='tags'):
        Given a list of tags and an optional field (normally the tags field),
        returns a filter accepting only books with the specified tags.
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        Filters using book tags (all tag kinds except theme).
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
            return self.tag_filter(tags)
    def part_filter(self):
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
            fs.append(self.tag_filter(self.part_tags, field='themes'))

        if self._books != []:
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))

        return Search.chain_filters(fs)

    def should_search_for_book(self):
        return self._books == []

    def just_search_in(self, all):
        """Holds the logic to figure out which indexes should be searched, when we already have some hints."""
            if field == 'authors' and 'author' in self.book_tags:
            if field == 'title' and self._books != []:
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        # self.analyzer = WLAnalyzer()
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))

    def query(self, query):
        """Parse query in default Lucene Syntax. (for humans)
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits)
        tops = self.searcher.search(self.query(query), max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
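
    # Usage sketch (editorial): a human-entered query can be run directly, e.g.
    #
    #     search = Search()
    #     books, total = search.simple_search(u'authors:mickiewicz')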
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the analyzer appropriate for the given field.
        The argument can be a StringReader, a string/unicode, or a list of tokens; in the
        last case the tokens are simply returned (so we can reuse tokens if we don't
        change the analyzer).
        if cached is not None and field in cached:

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):

        tokens = self.analyzer.reusableTokenStream(field, searched)
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        Return a PhraseQuery with a series of tokens.
            phrase = MultiPhraseQuery()
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    if not fuzzterm.next(): break
                    phrase.add(JArray('object')(fuzzterms, Term))
            phrase = PhraseQuery()
                term = Term(field, t)

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
            term = Term(field, t)
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
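        # Illustration: for tokens [u'pan', u'tadeusz'] and the default SHOULD modal
        # this builds, in effect, the boolean query (content:pan OR content:tadeusz);
        # with modal=MUST every term becomes required instead.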
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                         fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        Search for perfect book matches. Just see if the query matches some author or title,
        taking hints into account.
        fields_to_search = ['authors', 'title']

            if not hint.should_search_for_book():
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']
            if not hint.should_search_for_book():
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                     fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(q,
            self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))
    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
        some part/fragment of the book.
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

            flt = hint.part_filter()

            top = self.searcher.search(q,
                self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),

            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        Tries to use the search terms to match different fields of the book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        some words from the third chapter.
        if tokens_cache is None: tokens_cache = {}

            only_in = hint.part_filter()

        # content-only query: themes x content

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print "* %s theme x content: %s" % (searched, books[-1]._hits)

        # query themes/content x author/title/tags

        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print "* %s scatter search: %s" % (searched, books[-1]._hits)
    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords) -> authors
    #
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))
    def get_snippets(self, scoreDoc, query, field='content'):
        Returns a snippet for found scoreDoc.
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:

        snippets = Snippets(stored.get('book_id')).open()
            text = snippets.get((int(position),
            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            # highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
    def enum_to_array(enum):
        Converts a lucene TermEnum to array of Terms, suitable for
            if not enum.next(): break

            return JArray('object')(terms, Term)
    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        Search for Tag objects using query.
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
                tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                # print "%s (%d) -> %f" % (tag, tag.id, found.score)
        print 'returning %s' % tags
    def search_books(self, query, filter=None, max_results=10):
        Searches for Book objects using query
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
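                # Editorial note: the last token is expanded with PrefixTermEnum into
                # every term sharing that prefix, so the resulting MultiPhraseQuery is
                # what gives hint_tags()/hint_books() their prefix/autocomplete match.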
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
        Return auto-complete hints for tags
        using prefix search.
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
                q = self.make_prefix_phrase(toks, field)
                q = self.make_term_query(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
    def hint_books(self, string, max_results=50, prefix=True):
        Returns auto-complete hints for book titles
        (we do not index 'pseudo' title tags).
        toks = self.get_tokens(string, field='SIMPLE')

            q = self.make_prefix_phrase(toks, 'title')
            q = self.make_term_query(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
    def chain_filters(filters, op=ChainedFilter.AND):
        Chains a filter list together.
        filters = filter(lambda x: x is not None, filters)
        if not filters or filters is []:
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
    def filtered_categories(self, tags):
        Return a list of tag categories present in the tags list.
            cats[t.category] = True