# -*- coding: utf-8 -*-

from django.conf import settings
from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
    File, Field, Integer, \
    NumericField, Version, Document, JavaError, IndexSearcher, \
    QueryParser, PerFieldAnalyzerWrapper, \
    SimpleAnalyzer, PolishAnalyzer, ArrayList, \
    KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
    BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
    HashSet, BooleanClause, Term, CharTermAttribute, \
    PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
    FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
    SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
    BooleanFilter, FilterClause, QueryWrapperFilter, \
    initVM, CLASSPATH, JArray

JVM = initVM(CLASSPATH)

import os
import re
import errno
import atexit
import traceback

from librarian import dcparser
from librarian.parser import WLDocument
from lxml import etree
import catalogue.models
from pdcounter.models import Author as PDCounterAuthor
from multiprocessing.pool import ThreadPool
from threading import current_thread


class WLAnalyzer(PerFieldAnalyzerWrapper):
    def __init__(self):
        polish = PolishAnalyzer(Version.LUCENE_34)
        # polish_gap.setPositionIncrementGap(999)

        simple = SimpleAnalyzer(Version.LUCENE_34)
        # simple_gap.setPositionIncrementGap(999)

        keyword = KeywordAnalyzer(Version.LUCENE_34)

        # not sure if needed: there's NOT_ANALYZED meaning basically the same

        PerFieldAnalyzerWrapper.__init__(self, polish)

        self.addAnalyzer("tags", simple)
        self.addAnalyzer("technical_editors", simple)
        self.addAnalyzer("editors", simple)
        self.addAnalyzer("url", keyword)
        self.addAnalyzer("source_url", keyword)
        self.addAnalyzer("source_name", simple)
        self.addAnalyzer("publisher", simple)
        self.addAnalyzer("authors", simple)
        self.addAnalyzer("title", simple)

        self.addAnalyzer("is_book", keyword)
        # shouldn't the title have two forms? _pl and simple?

        self.addAnalyzer("themes", simple)
        self.addAnalyzer("themes_pl", polish)

        self.addAnalyzer("tag_name", simple)
        self.addAnalyzer("tag_name_pl", polish)

        self.addAnalyzer("translators", simple)

        self.addAnalyzer("KEYWORD", keyword)
        self.addAnalyzer("SIMPLE", simple)
        self.addAnalyzer("POLISH", polish)
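
# Illustrative sketch (not part of the original module): the same analyzer is
# expected to be shared by the indexing and the searching side, e.g.:
#
#   analyzer = WLAnalyzer()
#   writer = IndexWriter(store, analyzer, IndexWriter.MaxFieldLength.LIMITED)
#
# Field names not registered above fall back to the default PolishAnalyzer.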


class IndexStore(object):
    """
    Provides access to search index.

    self.store - lucene index directory
    """
    def __init__(self):
        self.make_index_dir()
        self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))

    def make_index_dir(self):
        try:
            os.makedirs(settings.SEARCH_INDEX)
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise


class IndexChecker(IndexStore):
    def __init__(self):
        IndexStore.__init__(self)

    def check(self):
        checker = CheckIndex(self.store)
        status = checker.checkIndex()
        return status


class Snippets(object):
    """
    This class manages snippet files for indexed objects (books).
    The snippets are concatenated together; their positions and
    lengths are kept in lucene index fields.
    """
    SNIPPET_DIR = "snippets"

    def __init__(self, book_id):
        try:
            os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
        except OSError as exc:
            if exc.errno == errno.EEXIST:
                pass
            else:
                raise
        self.book_id = book_id
        self.file = None

    def open(self, mode='r'):
        """
        Open the snippet file. Call .close() afterwards.
        """
        self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
        self.position = 0
        return self

    def add(self, snippet):
        """
        Append a snippet (unicode) to the snippet file.
        Return a (position, length) tuple.
        """
        txt = snippet.encode('utf-8')
        l = len(txt)
        self.file.write(txt)
        pos = (self.position, l)
        self.position += l
        return pos

    def get(self, pos):
        """
        Given a tuple of (position, length), return the unicode
        snippet stored there.
        """
        self.file.seek(pos[0], 0)
        txt = self.file.read(pos[1]).decode('utf-8')
        return txt

    def close(self):
        """Close snippet file"""
        self.file.close()
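
# Illustrative sketch (not part of the original module): the snippet store is a
# plain append-only file per book; add() returns the (position, length) pair
# that is later stored in the lucene document, e.g.:
#
#   snippets = Snippets(book.id).open('w')
#   pos, length = snippets.add(u"fragment text")
#   snippets.close()
#   # ...later, during highlighting:
#   text = Snippets(book.id).open().get((pos, length))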


class BaseIndex(IndexStore):
    """
    Provides basic operations on index: opening, closing, optimizing.
    """
    def __init__(self, analyzer=None):
        super(BaseIndex, self).__init__()
        self.index = None
        if analyzer is None:
            analyzer = WLAnalyzer()
        self.analyzer = analyzer

    def open(self, analyzer=None):
        if self.index:
            raise Exception("Index is already opened")
        self.index = IndexWriter(self.store, self.analyzer,
                                 IndexWriter.MaxFieldLength.LIMITED)
        return self.index

    def optimize(self):
        self.index.optimize()

    def close(self):
        try:
            self.index.optimize()
        except JavaError as je:
            print("Error during optimize phase, check index: %s" % je)

        self.index.close()
        self.index = None

    def __exit__(self, type, value, tb):
        self.close()
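
# Illustrative sketch (not part of the original module), assuming the elided
# __enter__ counterpart opens the index: subclasses can then be used as context
# managers so the writer is always closed after a batch of writes, e.g.:
#
#   with Index() as index:
#       index.index_tags()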


class Index(BaseIndex):
    """
    Class indexing books.
    """
    def __init__(self, analyzer=None):
        super(Index, self).__init__(analyzer)

    def index_tags(self):
        """
        Re-index global tag list.
        Removes all tags from the index, then indexes them again.
        Indexed fields include: id, name (with and without Polish stems), category.
        """
        q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
        self.index.deleteDocuments(q)

        for tag in catalogue.models.Tag.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
            doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

        for pdtag in PDCounterAuthor.objects.all():
            doc = Document()
            doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
            doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
            doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
            doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
            self.index.addDocument(doc)

    def create_book_doc(self, book):
        """
        Create a lucene document referring to the book id.
        """
        doc = Document()
        doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
        if book.parent is not None:
            doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
        return doc

    def remove_book(self, book):
        """Removes a book from search index.
        book - Book instance."""
        q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
        self.index.deleteDocuments(q)

    def index_book(self, book, book_info=None, overwrite=True):
        """
        Creates a lucene document for extracted metadata
        and calls self.index_content() to index the contents of the book.
        """
        if overwrite:
            self.remove_book(book)

        book_doc = self.create_book_doc(book)
        meta_fields = self.extract_metadata(book, book_info)
        for f in meta_fields.values():
            if isinstance(f, list) or isinstance(f, tuple):
                for elem in f:
                    book_doc.add(elem)
            else:
                book_doc.add(f)

        self.index.addDocument(book_doc)

        self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
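
    # Illustrative sketch (not part of the original module): a typical import
    # pass indexes one book at a time and commits by closing the writer, e.g.:
    #
    #   index = Index()
    #   index.open()
    #   try:
    #       index.index_book(book)
    #   finally:
    #       index.close()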

    master_tags = [
        'dramat_wierszowany_l',
        'dramat_wierszowany_lp',
        'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
        ]

    ignore_content_tags = [
        'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
        'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
        ]

    footnote_tags = ['pa', 'pt', 'pr', 'pe']

    skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne',
                        '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']

    published_date_re = re.compile(r"([0-9]+)[\]. ]*$")

    def extract_metadata(self, book, book_info=None):
        """
        Extract metadata from the book and return a map of fields keyed by field name.
        """
        fields = {}

        if book_info is None:
            book_info = dcparser.parse(open(book.xml_file.path))

        fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
        fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
        fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)

        for field in dcparser.BookInfo.FIELDS:
            if hasattr(book_info, field.name):
                if not getattr(book_info, field.name):
                    continue
                # since no type information is available, we use validator
                type_indicator = field.validator
                if type_indicator == dcparser.as_unicode:
                    s = getattr(book_info, field.name)
                    try:
                        fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
                    except JavaError as je:
                        raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
                elif type_indicator == dcparser.as_person:
                    p = getattr(book_info, field.name)
                    if isinstance(p, dcparser.Person):
                        persons = unicode(p)
                    else:
                        persons = ', '.join(map(unicode, p))
                    fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
                elif type_indicator == dcparser.as_date:
                    dt = getattr(book_info, field.name)
                    fields[field.name] = Field(field.name, "%04d%02d%02d" %
                                               (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)

        # get published date from the source note
        if hasattr(book_info, 'source_name'):
            source = book_info.source_name
            match = self.published_date_re.search(source)
            if match is not None:
                fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)

        return fields

    def add_gaps(self, fields, fieldname):
        """
        Interposes a list of fields with gap-fields (indexed single spaces) and returns it.
        This allows phrase queries which do not cross the gaps (when slop is 0).
        """
        def gap():
            while True:
                yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
        return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
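
    # Illustrative note (not part of the original module): for tag values
    # ["Adam Mickiewicz", "Juliusz Słowacki"] the returned field list is roughly
    # [tags:"Adam Mickiewicz", tags:" ", tags:"Juliusz Słowacki"], so a slop-0
    # phrase query cannot match "Mickiewicz Juliusz" across the two values.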

    def get_master(self, root):
        """
        Returns the first master tag from an etree.
        """
        for master in root.iter():
            if master.tag in self.master_tags:
                return master

    def index_content(self, book, book_fields=[]):
        """
        Walks the book XML and extracts content from it.
        Adds parts for each header tag and for each fragment.
        """
        wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
        root = wld.edoc.getroot()

        master = self.get_master(root)
        if master is None:
            return []

        def walker(node, ignore_tags=[]):
            yield node, None
            for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
                for b, e in walker(child):
                    yield b, e
            yield None, node

        def fix_format(text):
            # separator = [u" ", u"\t", u".", u";", u","]
            if isinstance(text, list):
                # need to join it first
                text = filter(lambda s: s is not None, text)
                text = u' '.join(text)
                # for i in range(len(text)):
                #     if text[i][0] not in separator \
                #             and text[i - 1][-1] not in separator:
                #         text.insert(i, u" ")

            return re.sub("(?m)/$", "", text)

        def add_part(snippets, **fields):
            doc = self.create_book_doc(book)
            for f in book_fields:
                doc.add(f)

            doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
            doc.add(NumericField("header_span", Field.Store.YES, True)
                    .setIntValue('header_span' in fields and fields['header_span'] or 1))
            doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))

            doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED,
                          Field.TermVector.WITH_POSITIONS_OFFSETS))

            snip_pos = snippets.add(fields["content"])
            doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
            doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))

            if 'fragment_anchor' in fields:
                doc.add(Field("fragment_anchor", fields['fragment_anchor'],
                              Field.Store.YES, Field.Index.NOT_ANALYZED))

            if 'themes' in fields:
                themes, themes_pl = zip(*[
                    (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
                     Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
                    for theme in fields['themes']])

                themes = self.add_gaps(themes, 'themes')
                themes_pl = self.add_gaps(themes_pl, 'themes_pl')

                for t in themes:
                    doc.add(t)
                for t in themes_pl:
                    doc.add(t)

            return doc

        def give_me_utf8(s):
            if isinstance(s, unicode):
                return s.encode('utf-8')
            else:
                return s

        fragments = {}
        snippets = Snippets(book.id).open('w')
        try:
            for header, position in zip(list(master), range(len(master))):

                if header.tag in self.skip_header_tags:
                    continue
                if header.tag is etree.Comment:
                    continue

                # section content
                content = []

                for start, end in walker(header, ignore_tags=self.ignore_content_tags):
                    # footnote handling is currently disabled; kept for reference:
                    # if start is not None and start.tag in self.footnote_tags:
                    #     footnote = ' '.join(start.itertext())
                    # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
                    #     doc = add_part(snippets, header_index=position, header_type=header.tag,
                    #     self.index.addDocument(doc)

                    # handle fragments and themes.
                    if start is not None and start.tag == 'begin':
                        fid = start.attrib['id'][1:]
                        fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}

                    elif start is not None and start.tag == 'motyw':
                        fid = start.attrib['id'][1:]
                        if start.text is not None:
                            fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))

                    elif start is not None and start.tag == 'end':
                        fid = start.attrib['id'][1:]
                        if fid not in fragments:
                            continue  # a broken <end> node, skip it
                        frag = fragments[fid]
                        if frag['themes'] == []:
                            continue  # empty themes list.

                        doc = add_part(snippets,
                                       header_type=frag['start_header'],
                                       header_index=frag['start_section'],
                                       header_span=position - frag['start_section'] + 1,
                                       fragment_anchor=fid,
                                       content=fix_format(frag['content']),
                                       themes=frag['themes'])

                        self.index.addDocument(doc)

                    # collect plain text for the current section and for open fragments.
                    elif start is not None:
                        for frag in fragments.values():
                            frag['content'].append(start.text)
                        content.append(start.text)
                    elif end is not None:
                        for frag in fragments.values():
                            frag['content'].append(end.tail)
                        content.append(end.tail)

                # in the end, add a section text.
                doc = add_part(snippets, header_index=position, header_type=header.tag,
                               content=fix_format(content))

                self.index.addDocument(doc)
        finally:
            snippets.close()


def log_exception_wrapper(f):
    def _wrap(*a):
        try:
            f(*a)
        except Exception as e:
            print("Error in indexing thread: %s" % e)
            traceback.print_exc()
    return _wrap


class ReusableIndex(Index):
    """
    Works like Index, but does not close/optimize the Lucene index
    until program exit (uses an atexit hook).
    This is useful for the importbooks command.

    If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
    """
    index = None

    def open(self, analyzer=None, threads=4):
        if ReusableIndex.index is not None:
            self.index = ReusableIndex.index
        else:
            print("opening index")
            Index.open(self, analyzer)
            ReusableIndex.index = self.index
            atexit.register(ReusableIndex.close_reusable)

    # def index_book(self, *args, **kw):
    #     job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
    #     ReusableIndex.pool_jobs.append(job)

    @staticmethod
    def close_reusable():
        if ReusableIndex.index is not None:
            ReusableIndex.index.optimize()
            ReusableIndex.index.close()
            ReusableIndex.index = None
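
    # Illustrative sketch (not part of the original module): a long import run
    # can keep reusing one writer across many books, e.g.:
    #
    #   index = ReusableIndex()
    #   index.open()
    #   for book in books:
    #       index.index_book(book)
    #   # the atexit hook (or an explicit call) optimizes and closes at the end:
    #   ReusableIndex.close_reusable()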


class JoinSearch(object):
    """
    This mixin can be used to handle block join queries.
    """
    def __init__(self, *args, **kw):
        super(JoinSearch, self).__init__(*args, **kw)

    def wrapjoins(self, query, fields=[]):
        """
        This function modifies the query recursively, so that contained
        Term and Phrase queries which match the provided fields are
        wrapped in a BlockJoinQuery and thus delegated to child documents.
        """
        if BooleanQuery.instance_(query):
            qs = BooleanQuery.cast_(query)
            for clause in qs:
                clause = BooleanClause.cast_(clause)
                clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
            return qs
        else:
            termset = HashSet()
            query.extractTerms(termset)
            for t in termset:
                if t.field() not in fields:
                    return query
            return BlockJoinQuery(query, self.parent_filter,
                                  BlockJoinQuery.ScoreMode.Total)

    def bsearch(self, query, max_results=50):
        q = self.query(query)
        bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)

        tops = self.searcher.search(bjq, max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)


class SearchResult(object):
    def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
        if tokens_cache is None: tokens_cache = {}

        if score:
            self._score = score
        else:
            self._score = scoreDocs.score

        self.boost = 1.0
        self._hits = []
        self._processed_hits = None  # processed hits

        stored = search.searcher.doc(scoreDocs.doc)
        self.book_id = int(stored.get("book_id"))

        pd = stored.get("published_date")
        if pd is None:
            pd = 0
        self.published_date = int(pd)

        header_type = stored.get("header_type")
        # we have a content hit in some header or fragment
        if header_type is not None:
            sec = (header_type, int(stored.get("header_index")))
            header_span = stored.get('header_span')
            header_span = header_span is not None and int(header_span) or 1

            fragment = stored.get("fragment_anchor")

            if snippets:
                snippets = snippets.replace("/\n", "\n")
            hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})

            self._hits.append(hit)

        self.search = search
        self.searched = searched
        self.tokens_cache = tokens_cache

    def get_score(self):
        return self._score * self.boost
    score = property(get_score)

    def merge(self, other):
        if self.book_id != other.book_id:
            raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
        self._hits += other._hits
        if other.score > self.score:
            self._score = other._score
        return self

    def get_book(self):
        return catalogue.models.Book.objects.get(id=self.book_id)

    book = property(get_book)

    def process_hits(self):
        if self._processed_hits is not None:
            return self._processed_hits

        # hit tuple layout:
        # ((header_type, header_index, header_span), fragment_anchor, score, other)
        POSITION = 0
        FRAGMENT = 1
        POSITION_INDEX = 1
        POSITION_SPAN = 2
        SCORE = 2
        OTHER = 3

        # to sections and fragments
        frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
        sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
        # skip sections already covered by a matching fragment
        sect = filter(lambda s: 0 == len(filter(
            lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
            and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
            frags)), sect)

        # remove duplicate fragments
        fragments = {}
        for f in frags:
            fid = f[FRAGMENT]
            if fid in fragments:
                if fragments[fid][SCORE] >= f[SCORE]:
                    continue
            fragments[fid] = f
        frags = fragments.values()

        # remove duplicate sections
        sections = {}
        for s in sect:
            si = s[POSITION][POSITION_INDEX]
            # skip an existing section if it already has a higher score
            if si in sections:
                if sections[si]['score'] >= s[SCORE]:
                    continue

            m = {'score': s[SCORE],
                 'section_number': s[POSITION][POSITION_INDEX] + 1,
                 }
            m.update(s[OTHER])
            sections[si] = m

        hits = sections.values()

        for f in frags:
            try:
                frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
            except catalogue.models.Fragment.DoesNotExist:
                # the fragment may be gone from the catalogue; skip it
                continue

            # Figure out if we were searching for a token matching some word in theme name.
            themes = frag.tags.filter(category='theme')
            themes_hit = []
            if self.searched is not None:
                tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
                for theme in themes:
                    name_tokens = self.search.get_tokens(theme.name, 'POLISH')
                    for t in tokens:
                        if t in name_tokens:
                            if theme not in themes_hit:
                                themes_hit.append(theme)

            m = {'score': f[SCORE],
                 'fragment': frag,
                 'section_number': f[POSITION][POSITION_INDEX] + 1,
                 'themes_hit': themes_hit,
                 }
            m.update(f[OTHER])
            hits.append(m)

        hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)

        self._processed_hits = hits
        return hits

    def __unicode__(self):
        return u'SearchResult(book_id=%d, score=%d)' % (self.book_id, self.score)

    @staticmethod
    def aggregate(*result_lists):
        books = {}
        for rl in result_lists:
            for r in rl:
                if r.book_id in books:
                    books[r.book_id].merge(r)
                    #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
                else:
                    books[r.book_id] = r
        return books.values()
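
    # Illustrative sketch (not part of the original module): results collected by
    # several search passes can be merged per book and then ranked, e.g.:
    #
    #   results = SearchResult.aggregate(phrase_results, term_results)
    #   results.sort(reverse=True)  # __cmp__ below: higher score, then earlier date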

    def __cmp__(self, other):
        c = cmp(self.score, other.score)
        if c == 0:
            # this is inverted, because an earlier date is better
            return cmp(other.published_date, self.published_date)
        return c


class Hint(object):
    """
    Given some hint information (information we already know about
    our search target) - like author, title (specific book), epoch, genre, kind -
    we can narrow down the search using filters.
    """
    def __init__(self, search):
        """
        Accepts a Searcher instance.
        """
        self.search = search
        self.book_tags = {}
        self.part_tags = []
        self._books = []

    def books(self, *books):
        """
        Give a hint that we search these books.
        """
        self._books = books

    def tags(self, tags):
        """
        Give a hint that these Tag objects (a list of them)
        describe what we are searching for.
        """
        for t in tags:
            if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
                lst = self.book_tags.get(t.category, [])
                lst.append(t)
                self.book_tags[t.category] = lst
            if t.category in ['theme', 'theme_pl']:
                self.part_tags.append(t)

    def tag_filter(self, tags, field='tags'):
        """
        Given a list of tags and an optional field (they normally live in the 'tags' field),
        returns a filter accepting only books with those specific tags.
        """
        q = BooleanQuery()

        for tag in tags:
            toks = self.search.get_tokens(tag.name, field=field)
            tag_phrase = PhraseQuery()
            for tok in toks:
                tag_phrase.add(Term(field, tok))
            q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))

        return QueryWrapperFilter(q)

    def book_filter(self):
        """
        Filters using book tags (all tag kinds except a theme).
        """
        tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])

        if tags:
            return self.tag_filter(tags)
        else:
            return None
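
    # Illustrative note (not part of the original module): tag_filter() builds one
    # MUST clause per tag, each being a phrase over the analyzed tag name, so a
    # hint with an author tag and an epoch tag only lets through books that carry
    # both of them.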
828 def part_filter(self):
830 This filter can be used to look for book parts.
831 It filters on book id and/or themes.
835 fs.append(self.tag_filter(self.part_tags, field='themes'))
837 if self._books != []:
839 for b in self._books:
840 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
841 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
844 return Search.chain_filters(fs)
846 def should_search_for_book(self):
847 return self._books == []

    def just_search_in(self, all):
        """Holds logic to figure out which indexes should be searched, when we already have some hints."""
        searched = []
        for field in all:
            if field == 'authors' and 'author' in self.book_tags:
                continue
            if field == 'title' and self._books != []:
                continue
            if (field == 'themes' or field == 'themes_pl') and self.part_tags:
                continue
            searched.append(field)
        return searched


class Search(IndexStore):
    def __init__(self, default_field="content"):
        IndexStore.__init__(self)
        self.analyzer = WLAnalyzer()  # PolishAnalyzer(Version.LUCENE_34)
        self.searcher = IndexSearcher(self.store, True)
        self.parser = QueryParser(Version.LUCENE_34, default_field,
                                  self.analyzer)

        self.parent_filter = TermsFilter()
        self.parent_filter.addTerm(Term("is_book", "true"))
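
    # Illustrative sketch (not part of the original module): the searching side
    # mirrors the indexing side and shares the same WLAnalyzer, e.g.:
    #
    #   search = Search()
    #   books, total = search.simple_search(u'pan tadeusz')
    #   results = search.search_perfect_book(u'Mickiewicz')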

    def query(self, query):
        """Parse query in default Lucene Syntax. (for humans)
        """
        return self.parser.parse(query)

    def simple_search(self, query, max_results=50):
        """Runs a query for books using lucene syntax. (for humans)
        Returns (books, total_hits)
        """
        tops = self.searcher.search(self.query(query), max_results)
        bks = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return (bks, tops.totalHits)

    def get_tokens(self, searched, field='content', cached=None):
        """Returns tokens analyzed by the proper (per-field) analyzer.
        The argument can be: StringReader, string/unicode, or tokens. In the last case
        they will just be returned (so we can reuse tokens, if we don't change the analyzer).
        """
        if cached is not None and field in cached:
            return cached[field]

        if isinstance(searched, str) or isinstance(searched, unicode):
            searched = StringReader(searched)
        elif isinstance(searched, list):
            return searched

        tokens = self.analyzer.reusableTokenStream(field, searched)
        toks = []
        while tokens.incrementToken():
            cta = tokens.getAttribute(CharTermAttribute.class_)
            toks.append(cta.toString())

        if cached is not None:
            cached[field] = toks

        return toks
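
    # Illustrative note (not part of the original module): the field name picks the
    # analyzer registered in WLAnalyzer, e.g. get_tokens(u'Pieśni Janusza', field='SIMPLE')
    # only lowercases and splits, while field='POLISH' also applies Polish stemming.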

    def fuzziness(self, fuzzy):
        """Helper method to sanitize fuzziness"""
        if not fuzzy:
            return False
        if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
            return fuzzy
        else:
            return 0.5

    def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
        """
        Return a PhraseQuery with a series of tokens.
        """
        if fuzzy:
            phrase = MultiPhraseQuery()
            for t in tokens:
                term = Term(field, t)
                fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
                fuzzterms = []

                while True:
                    ft = fuzzterm.term()
                    if ft:
                        # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
                        fuzzterms.append(ft)
                    if not fuzzterm.next():
                        break

                if fuzzterms:
                    phrase.add(JArray('object')(fuzzterms, Term))
                else:
                    phrase.add(term)
        else:
            phrase = PhraseQuery()
            phrase.setSlop(slop)
            for t in tokens:
                term = Term(field, t)
                phrase.add(term)
        return phrase

    def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
        """
        Returns term queries joined by a boolean query.
        modal - applies to the boolean query
        fuzzy - should the query be fuzzy.
        """
        q = BooleanQuery()
        for t in tokens:
            term = Term(field, t)
            if fuzzy:
                term = FuzzyQuery(term, self.fuzziness(fuzzy))
            else:
                term = TermQuery(term)
            q.add(BooleanClause(term, modal))
        return q
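
    # Illustrative note (not part of the original module): make_phrase() keeps token
    # order (a real phrase query, optionally fuzzy-expanded per position), while
    # make_term_query() just combines independent TermQuery objects, e.g.
    #
    #   q = self.make_term_query(self.get_tokens(u'pan tadeusz', 'SIMPLE'), field='title')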

    def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
                      filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        tokens = self.get_tokens(searched, field, cached=tokens_cache)

        query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))
        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]

    def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
                    filters=None, tokens_cache=None, boost=None, snippets=True):
        if filters is None: filters = []
        if tokens_cache is None: tokens_cache = {}

        if book:
            filters.append(self.term_filter(Term('is_book', 'true')))

        query = BooleanQuery()

        for fld in fields:
            tokens = self.get_tokens(searched, fld, cached=tokens_cache)

            query.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                         fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        top = self.searcher.search(query, self.chain_filters(filters), max_results)

        return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
                             snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]

    def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for perfect book matches. Just see if the query matches with some author or title,
        taking hints into account.
        """
        fields_to_search = ['authors', 'title']
        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, how_found="search_perfect_book"))
        return books

    def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
        fields_to_search = ['tags', 'authors', 'title']

        only_in = None
        if hint:
            if not hint.should_search_for_book():
                return []
            fields_to_search = hint.just_search_in(fields_to_search)
            only_in = hint.book_filter()

        tokens = self.get_tokens(searched, field='SIMPLE')

        q = BooleanQuery()

        for fld in fields_to_search:
            q.add(BooleanClause(self.make_term_query(tokens, field=fld,
                                                     fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        books = []
        top = self.searcher.search(q,
            self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
            max_results)
        for found in top.scoreDocs:
            books.append(SearchResult(self, found, how_found="search_book"))
        return books

    def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
        """
        Search for book parts which contain a phrase perfectly matching (with a slop of 2,
        the default for make_phrase()) some part/fragment of the book.
        """
        qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]

        flt = None
        if hint:
            flt = hint.part_filter()

        books = []
        for q in qrys:
            top = self.searcher.search(q,
                self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
                                    flt]),
                max_results)
            for found in top.scoreDocs:
                books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))

        return books

    def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
        """
        Tries to use search terms to match different fields of a book (or its parts).
        E.g. one word can be an author surname, another a part of the title, and the rest
        some words from the third chapter.
        """
        if tokens_cache is None: tokens_cache = {}

        books = []
        only_in = None

        if hint:
            only_in = hint.part_filter()

        # content only query : themes x content
        q = BooleanQuery()

        tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
        tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)

        # only search in themes when we do not already filter by themes
        if hint is None or hint.just_search_in(['themes']) != []:
            q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
                                                     fuzzy=fuzzy), BooleanClause.Occur.MUST))

        q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
                                                 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
            print("* %s theme x content: %s" % (searched, books[-1]._hits))

        # query themes/content x author/title/tags
        q = BooleanQuery()
        in_content = BooleanQuery()
        in_meta = BooleanQuery()

        for fld in ['themes_pl', 'content']:
            in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        for fld in ['tags', 'authors', 'title']:
            in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))

        q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
        q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))

        topDocs = self.searcher.search(q, only_in, max_results)
        for found in topDocs.scoreDocs:
            books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
            print("* %s scatter search: %s" % (searched, books[-1]._hits))

        return books

    # def multisearch(self, query, max_results=50):
    #     - (phrase) OR -> content
    #     - (keywords) -> authors
    #
    #     queryreader = StringReader(query)
    #     tokens = self.get_tokens(queryreader)
    #
    #     top_level = BooleanQuery()
    #     Should = BooleanClause.Occur.SHOULD
    #
    #     phrase_level = BooleanQuery()
    #     phrase_level.setBoost(1.3)
    #
    #     p_content = self.make_phrase(tokens, joined=True)
    #     p_title = self.make_phrase(tokens, 'title')
    #     p_author = self.make_phrase(tokens, 'author')
    #
    #     phrase_level.add(BooleanClause(p_content, Should))
    #     phrase_level.add(BooleanClause(p_title, Should))
    #     phrase_level.add(BooleanClause(p_author, Should))
    #
    #     kw_level = BooleanQuery()
    #
    #     kw_level.add(self.make_term_query(tokens, 'author'), Should)
    #     j_themes = self.make_term_query(tokens, 'themes', joined=True)
    #     kw_level.add(j_themes, Should)
    #     kw_level.add(self.make_term_query(tokens, 'tags'), Should)
    #     j_con = self.make_term_query(tokens, joined=True)
    #     kw_level.add(j_con, Should)
    #
    #     top_level.add(BooleanClause(phrase_level, Should))
    #     top_level.add(BooleanClause(kw_level, Should))

    def get_snippets(self, scoreDoc, query, field='content'):
        """
        Returns a snippet for the found scoreDoc.
        """
        htmlFormatter = SimpleHTMLFormatter()
        highlighter = Highlighter(htmlFormatter, QueryScorer(query))

        stored = self.searcher.doc(scoreDoc.doc)

        position = stored.get('snippets_position')
        length = stored.get('snippets_length')
        if position is None or length is None:
            return None

        snippets = Snippets(stored.get('book_id')).open()
        try:
            text = snippets.get((int(position),
                                 int(length)))
        finally:
            snippets.close()

        tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
        # highlighter.getBestTextFragments(tokenStream, text, False, 10)
        snip = highlighter.getBestFragments(tokenStream, text, 3, "...")

        return snip

    @staticmethod
    def enum_to_array(enum):
        """
        Converts a lucene TermEnum to an array of Terms, suitable for
        use in query construction.
        """
        terms = []

        while True:
            t = enum.term()
            if t:
                terms.append(t)
            if not enum.next():
                break

        if terms:
            return JArray('object')(terms, Term)

    def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
        """
        Search for Tag objects using query.
        """
        if not pdcounter:
            filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
        tops = self.searcher.search(query, filters, max_results)

        tags = []
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            is_pdcounter = doc.get('is_pdcounter')
            if is_pdcounter:
                tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
            else:
                tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
            # don't add the pdcounter tag if the same tag already exists
            if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
                tags.append(tag)
            # print "%s (%d) -> %f" % (tag, tag.id, found.score)
        print('returning %s' % tags)
        return tags

    def search_books(self, query, filter=None, max_results=10):
        """
        Searches for Book objects using query.
        """
        bks = []
        tops = self.searcher.search(query, filter, max_results)
        for found in tops.scoreDocs:
            doc = self.searcher.doc(found.doc)
            bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
        return bks

    def make_prefix_phrase(self, toks, field):
        q = MultiPhraseQuery()
        for i in range(len(toks)):
            t = Term(field, toks[i])
            if i == len(toks) - 1:
                pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
                if pterms:
                    q.add(pterms)
                else:
                    q.add(t)
            else:
                q.add(t)
        return q

    @staticmethod
    def term_filter(term, inverse=False):
        only_term = TermsFilter()
        only_term.addTerm(term)

        if inverse:
            neg = BooleanFilter()
            neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
            only_term = neg

        return only_term

    def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
        """
        Return auto-complete hints for tags
        using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')
        top = BooleanQuery()

        for field in ['tag_name', 'tag_name_pl']:
            if prefix:
                q = self.make_prefix_phrase(toks, field)
            else:
                q = self.make_term_query(toks, field)
            top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))

        no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)

        return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
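
    # Illustrative sketch (not part of the original module): the hint_* helpers back
    # the auto-complete box, e.g.:
    #
    #   search = Search()
    #   search.hint_tags(u'mick')    # tags whose last token starts with "mick"
    #   search.hint_books(u'pan t')  # book titles matching the prefix phrase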

    def hint_books(self, string, max_results=50, prefix=True):
        """
        Returns auto-complete hints for book titles
        (we do not index 'pseudo' title-tags), using prefix search.
        """
        toks = self.get_tokens(string, field='SIMPLE')

        if prefix:
            q = self.make_prefix_phrase(toks, 'title')
        else:
            q = self.make_term_query(toks, 'title')

        return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)

    @staticmethod
    def chain_filters(filters, op=ChainedFilter.AND):
        """
        Chains a filter list together.
        """
        filters = filter(lambda x: x is not None, filters)
        if not filters:
            return None
        chf = ChainedFilter(JArray('object')(filters, Filter), op)
        return chf
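
    # Illustrative note (not part of the original module): chain_filters() drops None
    # entries, so callers can pass optional filters unconditionally, e.g.
    # self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]) works
    # whether or not a hint produced an only_in filter.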

    def filtered_categories(self, tags):
        """
        Return a list of tag categories present in the tags list.
        """
        cats = {}
        for t in tags:
            cats[t.category] = True
        return cats.keys()