# -*- coding: utf-8 -*-
from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray
JVM = initVM(CLASSPATH)
from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor
from multiprocessing.pool import ThreadPool
from threading import current_thread
class WLAnalyzer(PerFieldAnalyzerWrapper):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)
        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)
        keyword = KeywordAnalyzer(Version.LUCENE_34)
        # not sure if needed: there's NOT_ANALYZED meaning basically the same
        PerFieldAnalyzerWrapper.__init__(self, polish)
        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)
        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?
        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)
        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)
        self.addAnalyzer("translators", simple)
        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
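        # Routing sketch (assumption: PolishAnalyzer stems, SimpleAnalyzer only
        # lowercases/splits, KeywordAnalyzer keeps the whole value as one token):
        #
        #   analyzer = WLAnalyzer()
        #   stream = analyzer.reusableTokenStream("themes_pl", StringReader(u"morze"))
        #
        # so "themes_pl" gets Polish stemming, "themes" plain lowercasing,
        # and "url" / "source_url" stay verbatim.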
class IndexStore(object):
    Provides access to search index.
    self.store - lucene index directory
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
    def make_index_dir(self):
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
class IndexChecker(IndexStore):
        IndexStore.__init__(self)
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
class Snippets(object):
    This class manages snippet files for an indexed object (book).
    The snippets are concatenated together, and their positions and
    lengths are kept in lucene index fields.
    SNIPPET_DIR = "snippets"
    def __init__(self, book_id):
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
        self.book_id = book_id
    def open(self, mode='r'):
        Open the snippet file. Call .close() afterwards.
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
    def add(self, snippet):
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        txt = snippet.encode('utf-8')
        pos = (self.position, l)
        Given a tuple of (position, length), return a unicode string
        of the snippet stored there.
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        """Close snippet file"""
class BaseIndex(IndexStore):
    Provides basic operations on index: opening, closing, optimizing.
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
            analyzer = WLAnalyzer()
        self.analyzer = analyzer
    def open(self, analyzer=None):
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,\
            IndexWriter.MaxFieldLength.LIMITED)
        self.index.optimize()
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)
    def __exit__(self, type, value, tb):
class Index(BaseIndex):
    Class indexing books.
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)
    def index_tags(self):
        Re-index global tag list.
        Removes all tags from index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)
        for tag in catalogue.models.Tag.objects.all():
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
        for pdtag in PDCounterAuthor.objects.all():
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)
    def create_book_doc(self, book):
        Create a lucene document referring to the book id.
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
    def remove_book(self, book):
        """Removes a book from the search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)
    def index_book(self, book, book_info=None, overwrite=True):
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
            self.remove_book(book)
        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
        self.index.addDocument(book_doc)
        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
    footnote_tags = ['pa', 'pt', 'pr', 'pe']
    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
    published_date_re = re.compile("([0-9]+)[\]. ]*$")
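    # E.g. for a hypothetical source_name of u"Czytelnik, Warszawa 1990"
    # the regexp above captures "1990" as the published date.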
    def extract_metadata(self, book, book_info=None):
        Extracts metadata from the book and returns a map of fields keyed by field name.
        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))
        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                # since no type information is available, we use the validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %\
                        (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
        if hasattr(book_info, 'source_name'):
            source = book_info.source_name
            match = self.published_date_re.search(source)
            if match is not None:
                fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
    def add_gaps(self, fields, fieldname):
        Interleaves a list of fields with gap fields (indexed spaces) and returns the result.
        This allows phrase queries which do not overlap the gaps (when slop is 0).
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
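        # Sketch: for a hypothetical pair of values of one multi-valued field,
        #   add_gaps([Field("authors", u"Adam Mickiewicz", ...), Field("authors", u"Juliusz Slowacki", ...)], 'authors')
        # interleaves an indexed " " gap, so a slop-0 phrase query cannot match
        # u"Mickiewicz Juliusz" across the boundary between the two values.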
    def get_master(self, root):
        Returns the first master tag from an etree.
        for master in root.iter():
            if master.tag in self.master_tags:
    def index_content(self, book, book_fields=[]):
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()
        master = self.get_master(root)
        def walker(node, ignore_tags=[]):
            for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
                for b, e in walker(child):
        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
            # for i in range(len(text)):
            #     if text[i][0] not in separator\
            #         and text[i - 1][-1] not in separator:
            #         text.insert(i, u" ")
            return re.sub("(?m)/$", "", text)
        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)\
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
                    Field.TermVector.WITH_POSITIONS_OFFSETS))
            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))
            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])
                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')
            if isinstance(s, unicode):
                return s.encode('utf-8')
        snippets = Snippets(book.id).open('w')
            for header, position in zip(list(master), range(len(master))):
                if header.tag in self.skip_header_tags:
                if header.tag is etree.Comment:
                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # if start is not None and start.tag in self.footnote_tags:
                    #     footnote = ' '.join(start.itertext())
                    # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
                    #     doc = add_part(snippets, header_index=position, header_type=header.tag,
                    #     self.index.addDocument(doc)
                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.
                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])
                        self.index.addDocument(doc)
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                        content.append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
                        content.append(end.tail)
                # in the end, add a section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))
                self.index.addDocument(doc)
def log_exception_wrapper(f):
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
class ReusableIndex(Index):
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.
    If you cannot rely on atexit, use ReusableIndex.close_reusable() yourself.
    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)
    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None
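    # Usage sketch for a batch import (e.g. the importbooks management command);
    # the atexit hook registered in open() optimizes and closes the shared writer:
    #
    #   index = ReusableIndex()
    #   index.open()
    #   for book in catalogue.models.Book.objects.all():
    #       index.index_book(book)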
class JoinSearch(object):
    This mixin could be used to handle block join queries.
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)
    def wrapjoins(self, query, fields=[]):
        This function modifies the query recursively: contained Term and Phrase
        queries which match the provided fields are wrapped in a BlockJoinQuery
        and so delegated to child documents.
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            query.extractTerms(termset)
                if t.field() not in fields:
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)
    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
        tops = self.searcher.search(bjq, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}
            self._score = scoreDocs.score
        self._processed_hits = None  # processed hits
        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))
        pd = stored.get("published_date")
        self.published_date = int(pd)
        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1
            fragment = stored.get("fragment_anchor")
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
            self._hits.append(hit)
        self.searched = searched
        self.tokens_cache = tokens_cache
        return self._score * self.boost
    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return catalogue.models.Book.objects.get(id=self.book_id)
    book = property(get_book)
        if self._processed_hits is not None:
            return self._processed_hits
        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
        # remove duplicate fragments
                if fragments[fid][SCORE] >= f[SCORE]:
        frags = fragments.values()
        # remove duplicate sections
            si = s[POSITION][POSITION_INDEX]
                if sections[si]['score'] >= s[SCORE]:
            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
        hits = sections.values()
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:
            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                            if theme not in themes_hit:
                                themes_hit.append(theme)
            m = {'score': f[SCORE],
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes_hit': themes_hit
        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
        self._processed_hits = hits
    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)
    def aggregate(*result_lists):
        for rl in result_lists:
                if r.book_id in books:
                    books[r.book_id].merge(r)
        return books.values()
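    # Folding sketch (assumes aggregate is used as a static helper, as its
    # *result_lists signature suggests): results from different strategies can
    # be merged per book and ordered best-first via __cmp__ below, e.g.
    #
    #   results = SearchResult.aggregate(
    #       search.search_perfect_book(u"pan tadeusz"),
    #       search.search_everywhere(u"pan tadeusz"))
    #   results.sort(reverse=True)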
    def __cmp__(self, other):
        c = cmp(self.score, other.score)
            if not hasattr(other, 'published_date') or not hasattr(self, 'published_date'):
                import pdb; pdb.set_trace()
            # this is inverted, because earlier date is better
            return cmp(other.published_date, self.published_date)
    Given some hint information (things we already know about our search
    target - like author, title (a specific book), epoch, genre, kind)
    we can narrow down the search using filters.
    def __init__(self, search):
        Accepts a Searcher instance.
    def books(self, *books):
        Give a hint that we search these books.
    def tags(self, tags):
        Give a hint that these Tag objects (a list of)
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)
    def tag_filter(self, tags, field='tags'):
        Given a list of tags and an optional field (but they are normally in the tags field),
        returns a filter accepting only books with the specific tags.
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
        return QueryWrapperFilter(q)
    def book_filter(self):
        Filters using book tags (all tag kinds except a theme)
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
            return self.tag_filter(tags)
    def part_filter(self):
        This filter can be used to look for book parts.
        It filters on book id and/or themes.
            fs.append(self.tag_filter(self.part_tags, field='themes'))
        if self._books != []:
            for b in self._books:
                id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
                bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
        return Search.chain_filters(fs)
    def should_search_for_book(self):
        return self._books == []
    def just_search_in(self, all):
852 """Holds logic to figure out which indexes should be search, when we have some hinst already"""
855 if field == 'authors' and 'author' in self.book_tags:
857 if field == 'title' and self._books != []:
859 if (field == 'themes' or field == 'themes_pl') and self.part_tags:
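# Hint usage sketch (hypothetical tag queryset; Hint.__init__ takes the Search
# instance, and the hint is then passed to the search_* methods below):
#
#   hint = Hint(search)
#   hint.tags(catalogue.models.Tag.objects.filter(category='author'))
#   results = search.search_perfect_book(u"pan tadeusz", hint=hint)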
class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
    def query(self, query):
        """Parse a query in default Lucene syntax (for humans).
        return self.parser.parse(query)
    def simple_search(self, query, max_results=50):
        """Runs a query for books using Lucene syntax (for humans).
        Returns (books, total_hits).
        tops = self.searcher.search(self.query(query), max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)
    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: a StringReader, a string/unicode, or tokens. In the last case
        they are just returned (so we can reuse tokens, if we don't change the analyzer).
        if cached is not None and field in cached:
        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
        tokens = self.analyzer.reusableTokenStream(field, searched)
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())
        if cached is not None:
    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        Return a PhraseQuery with a series of tokens.
            phrase = MultiPhraseQuery()
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                    # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                    if not fuzzterm.next(): break
                phrase.add(JArray('object')(fuzzterms, Term))
            phrase = PhraseQuery()
                term = Term(field, t)
    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
            term = Term(field, t)
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}
        tokens = self.get_tokens(searched, field, cached=tokens_cache)
        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)
        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
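    # Usage sketch (hypothetical phrase; field names as indexed above):
    #
    #   hits = search.search_phrase(u"szklane domy", 'content', snippets=True)
    #   best = hits and hits[0] or None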
    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}
            filters.append(self.term_filter(Term('is_book', 'true')))
        query = BooleanQuery()
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)
            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)
        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        Search for perfect book matches. Just see if the query matches with some author or title,
        taking hints into account.
        fields_to_search = ['authors', 'title']
            if not hint.should_search_for_book():
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()
        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']
            if not hint.should_search_for_book():
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()
        tokens = self.get_tokens(searched, field='SIMPLE')
        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
        top = self.searcher.search(q,
            self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))
    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        Search for book parts which contain a phrase perfectly matching (with a slop of 2, default for make_phrase())
        some part/fragment of the book.
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
            flt = hint.part_filter()
            top = self.searcher.search(q,
                self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        Tries to use search terms to match different fields of a book (or its parts).
        E.g. one word can be an author's surname, another a part of the title, and the rest
        some words from the third chapter.
        if tokens_cache is None: tokens_cache = {}
            only_in = hint.part_filter()
        # content only query : themes x content
        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                fuzzy=fuzzy), BooleanClause.Occur.MUST))
        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
            fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print("* %s theme x content: %s" % (searched, books[-1]._hits))
        # query themes/content x author/title/tags
        in_content = BooleanQuery()
        in_meta = BooleanQuery()
        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print("* %s scatter search: %s" % (searched, books[-1]._hits))
    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords) -> authors
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #     kw_level = BooleanQuery()
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))
    def get_snippets(self, scoreDoc, query, field='content'):
        Returns a snippet for a found scoreDoc.
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))
        stored = self.searcher.doc(scoreDoc.doc)
        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
        snippets = Snippets(stored.get('book_id')).open()
            text = snippets.get((int(position),
            tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
            # highlighter.getBestTextFragments(tokenStream, text, False, 10)
            snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
    def enum_to_array(enum):
        Converts a lucene TermEnum to an array of Terms, suitable for
            if not enum.next(): break
            return JArray('object')(terms, Term)
    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        Search for Tag objects using query.
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
                tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
        # print("%s (%d) -> %f" % (tag, tag.id, found.score))
        print('returning %s' % tags)
    def search_books(self, query, filter=None, max_results=10):
        Searches for Book objects using query
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
        Return auto-complete hints for tags
        using prefix search.
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()
        for field in ['tag_name', 'tag_name_pl']:
                q = self.make_prefix_phrase(toks, field)
                q = self.make_term_query(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
    def hint_books(self, string, max_results=50, prefix=True):
        Returns auto-complete hints for book titles.
        (We do not index 'pseudo' title-tags.)
        toks = self.get_tokens(string, field='SIMPLE')
            q = self.make_prefix_phrase(toks, 'title')
            q = self.make_term_query(toks, 'title')
        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
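    # Autocomplete sketch (hypothetical prefixes):
    #
    #   tags = search.hint_tags(u"mick")       # prefix match on tag_name / tag_name_pl
    #   books = search.hint_books(u"pan tad")  # prefix match on title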
    def chain_filters(filters, op=ChainedFilter.AND):
        Chains a filter list together.
        filters = filter(lambda x: x is not None, filters)
        if not filters:
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
    def filtered_categories(self, tags):
        Return a list of tag categories present in the tags list.
            cats[t.category] = True