1 # -*- coding: utf-8 -*-
3 from django.conf import settings
4 from lucene import SimpleFSDirectory, IndexWriter, CheckIndex, \
5 File, Field, Integer, \
6 NumericField, Version, Document, JavaError, IndexSearcher, \
7 QueryParser, PerFieldAnalyzerWrapper, \
8 SimpleAnalyzer, PolishAnalyzer, ArrayList, \
9 KeywordAnalyzer, NumericRangeQuery, NumericRangeFilter, BooleanQuery, \
10 BlockJoinQuery, BlockJoinCollector, Filter, TermsFilter, ChainedFilter, \
11 HashSet, BooleanClause, Term, CharTermAttribute, \
12 PhraseQuery, MultiPhraseQuery, StringReader, TermQuery, \
13 FuzzyQuery, FuzzyTermEnum, PrefixTermEnum, Sort, \
14 SimpleHTMLFormatter, Highlighter, QueryScorer, TokenSources, TextFragment, \
15 BooleanFilter, FilterClause, QueryWrapperFilter, \
16 initVM, CLASSPATH, JArray
20 JVM = initVM(CLASSPATH)
26 from librarian import dcparser
27 from librarian.parser import WLDocument
28 from lxml import etree
29 import catalogue.models
30 from pdcounter.models import Author as PDCounterAuthor
31 from multiprocessing.pool import ThreadPool
32 from threading import current_thread
37 class WLAnalyzer(PerFieldAnalyzerWrapper):
39 polish = PolishAnalyzer(Version.LUCENE_34)
40 # polish_gap.setPositionIncrementGap(999)
42 simple = SimpleAnalyzer(Version.LUCENE_34)
43 # simple_gap.setPositionIncrementGap(999)
45 keyword = KeywordAnalyzer(Version.LUCENE_34)
47 # not sure if needed: there's NOT_ANALYZED meaning basically the same
49 PerFieldAnalyzerWrapper.__init__(self, polish)
51 self.addAnalyzer("tags", simple)
52 self.addAnalyzer("technical_editors", simple)
53 self.addAnalyzer("editors", simple)
54 self.addAnalyzer("url", keyword)
55 self.addAnalyzer("source_url", keyword)
56 self.addAnalyzer("source_name", simple)
57 self.addAnalyzer("publisher", simple)
58 self.addAnalyzer("authors", simple)
59 self.addAnalyzer("title", simple)
61 self.addAnalyzer("is_book", keyword)
62 # shouldn't the title have two forms? _pl and simple?
64 self.addAnalyzer("themes", simple)
65 self.addAnalyzer("themes_pl", polish)
67 self.addAnalyzer("tag_name", simple)
68 self.addAnalyzer("tag_name_pl", polish)
70 self.addAnalyzer("translators", simple)
72 self.addAnalyzer("KEYWORD", keyword)
73 self.addAnalyzer("SIMPLE", simple)
74 self.addAnalyzer("POLISH", polish)
77 class IndexStore(object):
79 Provides access to search index.
81 self.store - lucene index directory
85 self.store = SimpleFSDirectory(File(settings.SEARCH_INDEX))
87 def make_index_dir(self):
89 os.makedirs(settings.SEARCH_INDEX)
90 except OSError as exc:
91 if exc.errno == errno.EEXIST:
96 class IndexChecker(IndexStore):
98 IndexStore.__init__(self)
101 checker = CheckIndex(self.store)
102 status = checker.checkIndex()
106 class Snippets(object):
108 This class manages snippet files for an indexed object (book).
109 The snippets are concatenated together; their positions and
110 lengths are kept in lucene index fields.
112 SNIPPET_DIR = "snippets"
114 def __init__(self, book_id):
116 os.makedirs(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR))
117 except OSError as exc:
118 if exc.errno == errno.EEXIST:
121 self.book_id = book_id
124 def open(self, mode='r'):
126 Open the snippet file. Call .close() afterwards.
130 self.file = open(os.path.join(settings.SEARCH_INDEX, self.SNIPPET_DIR, str(self.book_id)), mode)
134 def add(self, snippet):
136 Append a snippet (unicode) to the snippet file.
137 Return a (position, length) tuple
139 txt = snippet.encode('utf-8')
142 pos = (self.position, l)
148 Given a (position, length) tuple, return the unicode
149 snippet stored there.
151 self.file.seek(pos[0], 0)
152 txt = self.file.read(pos[1]).decode('utf-8')
156 """Close snippet file"""
160 class BaseIndex(IndexStore):
163 Provides basic operations on index: opening, closing, optimizing.
165 def __init__(self, analyzer=None):
166 super(BaseIndex, self).__init__()
169 analyzer = WLAnalyzer()
170 self.analyzer = analyzer
172 def open(self, analyzer=None):
174 raise Exception("Index is already opened")
175 self.index = IndexWriter(self.store, self.analyzer,\
176 IndexWriter.MaxFieldLength.LIMITED)
180 self.index.optimize()
184 self.index.optimize()
185 except JavaError as je:
186 print("Error during optimize phase, check index: %s" % je)
195 def __exit__(self, type, value, tb):
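# BaseIndex is meant to work as a context manager (__exit__ above closes the
# index; __enter__, not shown here, presumably opens it). A rough sketch,
# assuming `book` is a catalogue.models.Book:
#
#   with Index() as index:
#       index.index_book(book)
#   # the writer is optimized and closed on exit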
199 class Index(BaseIndex):
201 Class indexing books.
203 def __init__(self, analyzer=None):
204 super(Index, self).__init__(analyzer)
206 def index_tags(self):
208 Re-index global tag list.
209 Removes all tags from the index, then indexes them again.
210 Indexed fields include: id, name (with and without polish stems), category
212 q = NumericRangeQuery.newIntRange("tag_id", 0, Integer.MAX_VALUE, True, True)
213 self.index.deleteDocuments(q)
215 for tag in catalogue.models.Tag.objects.all():
217 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(tag.id)))
218 doc.add(Field("tag_name", tag.name, Field.Store.NO, Field.Index.ANALYZED))
219 doc.add(Field("tag_name_pl", tag.name, Field.Store.NO, Field.Index.ANALYZED))
220 doc.add(Field("tag_category", tag.category, Field.Store.NO, Field.Index.NOT_ANALYZED))
221 self.index.addDocument(doc)
223 for pdtag in PDCounterAuthor.objects.all():
225 doc.add(NumericField("tag_id", Field.Store.YES, True).setIntValue(int(pdtag.id)))
226 doc.add(Field("tag_name", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
227 doc.add(Field("tag_name_pl", pdtag.name, Field.Store.NO, Field.Index.ANALYZED))
228 doc.add(Field("tag_category", 'pdcounter', Field.Store.NO, Field.Index.NOT_ANALYZED))
229 doc.add(Field("is_pdcounter", 'true', Field.Store.YES, Field.Index.NOT_ANALYZED))
230 self.index.addDocument(doc)
232 def create_book_doc(self, book):
234 Create a lucene document referring book id.
237 doc.add(NumericField("book_id", Field.Store.YES, True).setIntValue(int(book.id)))
238 if book.parent is not None:
239 doc.add(NumericField("parent_id", Field.Store.YES, True).setIntValue(int(book.parent.id)))
242 def remove_book(self, book):
243 """Removes a book from search index.
244 book - Book instance."""
245 q = NumericRangeQuery.newIntRange("book_id", book.id, book.id, True, True)
246 self.index.deleteDocuments(q)
248 def index_book(self, book, book_info=None, overwrite=True):
251 Creates a lucene document for extracted metadata
252 and calls self.index_content() to index the contents of the book.
255 self.remove_book(book)
257 book_doc = self.create_book_doc(book)
258 meta_fields = self.extract_metadata(book, book_info)
259 for f in meta_fields.values():
260 if isinstance(f, list) or isinstance(f, tuple):
266 self.index.addDocument(book_doc)
269 self.index_content(book, book_fields=[meta_fields['title'], meta_fields['authors'], meta_fields['published_date']])
274 'dramat_wierszowany_l',
275 'dramat_wierszowany_lp',
276 'dramat_wspolczesny', 'liryka_l', 'liryka_lp',
280 ignore_content_tags = [
282 'zastepnik_tekstu', 'sekcja_asterysk', 'separator_linia', 'zastepnik_wersu',
284 'naglowek_aktu', 'naglowek_sceny', 'naglowek_czesc',
287 footnote_tags = ['pa', 'pt', 'pr', 'pe']
289 skip_header_tags = ['autor_utworu', 'nazwa_utworu', 'dzielo_nadrzedne', '{http://www.w3.org/1999/02/22-rdf-syntax-ns#}RDF']
291 published_date_re = re.compile("([0-9]+)[\]. ]*$")
293 def extract_metadata(self, book, book_info=None):
295 Extracts metadata from the book and returns a map of fields keyed by field name.
299 if book_info is None:
300 book_info = dcparser.parse(open(book.xml_file.path))
302 fields['slug'] = Field("slug", book.slug, Field.Store.NO, Field.Index.ANALYZED_NO_NORMS)
303 fields['tags'] = self.add_gaps([Field("tags", t.name, Field.Store.NO, Field.Index.ANALYZED) for t in book.tags], 'tags')
304 fields['is_book'] = Field("is_book", 'true', Field.Store.NO, Field.Index.NOT_ANALYZED)
307 for field in dcparser.BookInfo.FIELDS:
308 if hasattr(book_info, field.name):
309 if not getattr(book_info, field.name):
311 # since no type information is available, we use validator
312 type_indicator = field.validator
313 if type_indicator == dcparser.as_unicode:
314 s = getattr(book_info, field.name)
318 fields[field.name] = Field(field.name, s, Field.Store.NO, Field.Index.ANALYZED)
319 except JavaError as je:
320 raise Exception("failed to add field: %s = '%s', %s(%s)" % (field.name, s, je.message, je.args))
321 elif type_indicator == dcparser.as_person:
322 p = getattr(book_info, field.name)
323 if isinstance(p, dcparser.Person):
326 persons = ', '.join(map(unicode, p))
327 fields[field.name] = Field(field.name, persons, Field.Store.NO, Field.Index.ANALYZED)
328 elif type_indicator == dcparser.as_date:
329 dt = getattr(book_info, field.name)
330 fields[field.name] = Field(field.name, "%04d%02d%02d" %\
331 (dt.year, dt.month, dt.day), Field.Store.NO, Field.Index.NOT_ANALYZED)
334 source = book_info.source_name
335 match = self.published_date_re.search(source)
336 if match is not None:
337 fields["published_date"] = Field("published_date", str(match.groups()[0]), Field.Store.YES, Field.Index.NOT_ANALYZED)
341 def add_gaps(self, fields, fieldname):
343 Interposes the given list of fields with gap fields (indexed single spaces) and returns the result.
344 This allows phrase queries that do not cross the gaps (when slop is 0).
348 yield Field(fieldname, ' ', Field.Store.NO, Field.Index.NOT_ANALYZED)
349 return reduce(lambda a, b: a + b, zip(fields, gap()))[0:-1]
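# For example (sketch): add_gaps([f1, f2, f3], 'tags') yields
# [f1, gap, f2, gap, f3], where each gap is a NOT_ANALYZED single-space field,
# so a slop-0 phrase query cannot match across two adjacent tag values.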
351 def get_master(self, root):
353 Returns the first master tag from an etree.
355 for master in root.iter():
356 if master.tag in self.master_tags:
359 def index_content(self, book, book_fields=[]):
361 Walks the book XML and extracts content from it.
362 Adds parts for each header tag and for each fragment.
364 wld = WLDocument.from_file(book.xml_file.path, parse_dublincore=False)
365 root = wld.edoc.getroot()
367 master = self.get_master(root)
371 def walker(node, ignore_tags=[]):
373 for child in filter(lambda n: n.tag not in ignore_tags, list(node)):
374 for b, e in walker(child):
379 def fix_format(text):
380 # separator = [u" ", u"\t", u".", u";", u","]
381 if isinstance(text, list):
382 # need to join it first
383 text = filter(lambda s: s is not None, text)  # filter the passed-in list, not the enclosing `content`
384 text = u' '.join(text)
385 # for i in range(len(text)):
387 # if text[i][0] not in separator\
388 # and text[i - 1][-1] not in separator:
389 # text.insert(i, u" ")
391 return re.sub("(?m)/$", "", text)
393 def add_part(snippets, **fields):
394 doc = self.create_book_doc(book)
395 for f in book_fields:
398 doc.add(NumericField('header_index', Field.Store.YES, True).setIntValue(fields["header_index"]))
399 doc.add(NumericField("header_span", Field.Store.YES, True)\
400 .setIntValue('header_span' in fields and fields['header_span'] or 1))
401 doc.add(Field('header_type', fields["header_type"], Field.Store.YES, Field.Index.NOT_ANALYZED))
403 doc.add(Field('content', fields["content"], Field.Store.NO, Field.Index.ANALYZED, \
404 Field.TermVector.WITH_POSITIONS_OFFSETS))
406 snip_pos = snippets.add(fields["content"])
407 doc.add(NumericField("snippets_position", Field.Store.YES, True).setIntValue(snip_pos[0]))
408 doc.add(NumericField("snippets_length", Field.Store.YES, True).setIntValue(snip_pos[1]))
410 if 'fragment_anchor' in fields:
411 doc.add(Field("fragment_anchor", fields['fragment_anchor'],
412 Field.Store.YES, Field.Index.NOT_ANALYZED))
414 if 'themes' in fields:
415 themes, themes_pl = zip(*[
416 (Field("themes", theme, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS),
417 Field("themes_pl", theme, Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS))
418 for theme in fields['themes']])
420 themes = self.add_gaps(themes, 'themes')
421 themes_pl = self.add_gaps(themes_pl, 'themes_pl')
431 if isinstance(s, unicode):
432 return s.encode('utf-8')
437 snippets = Snippets(book.id).open('w')
439 for header, position in zip(list(master), range(len(master))):
441 if header.tag in self.skip_header_tags:
443 if header.tag is etree.Comment:
450 for start, end in walker(header, ignore_tags=self.ignore_content_tags):
452 # if start is not None and start.tag in self.footnote_tags:
453 # footnote = ' '.join(start.itertext())
454 # elif end is not None and footnote is not None and end.tag in self.footnote_tags:
455 # doc = add_part(snippets, header_index=position, header_type=header.tag,
458 # self.index.addDocument(doc)
462 # handle fragments and themes.
463 if start is not None and start.tag == 'begin':
464 fid = start.attrib['id'][1:]
465 fragments[fid] = {'content': [], 'themes': [], 'start_section': position, 'start_header': header.tag}
467 elif start is not None and start.tag == 'motyw':
468 fid = start.attrib['id'][1:]
469 if start.text is not None:
470 fragments[fid]['themes'] += map(str.strip, map(give_me_utf8, start.text.split(',')))
472 elif start is not None and start.tag == 'end':
473 fid = start.attrib['id'][1:]
474 if fid not in fragments:
475 continue # a broken <end> node, skip it
476 # import pdb; pdb.set_trace()
477 frag = fragments[fid]
478 if frag['themes'] == []:
479 continue # empty themes list.
482 doc = add_part(snippets,
483 header_type=frag['start_header'],
484 header_index=frag['start_section'],
485 header_span=position - frag['start_section'] + 1,
487 content=fix_format(frag['content']),
488 themes=frag['themes'])
490 self.index.addDocument(doc)
493 elif start is not None:
494 for frag in fragments.values():
495 frag['content'].append(start.text)
496 content.append(start.text)
497 elif end is not None:
498 for frag in fragments.values():
499 frag['content'].append(end.tail)
500 content.append(end.tail)
502 # in the end, add a section text.
503 doc = add_part(snippets, header_index=position, header_type=header.tag,
504 content=fix_format(content))
506 self.index.addDocument(doc)
512 def log_exception_wrapper(f):
517 print("Error in indexing thread: %s" % e)
518 traceback.print_exc()
523 class ReusableIndex(Index):
525 Works like Index, but does not close/optimize the Lucene index
526 until program exit (uses an atexit hook).
527 This is useful for the importbooks command.
529 If you cannot rely on atexit, call ReusableIndex.close_reusable() yourself.
533 def open(self, analyzer=None, threads=4):
534 if ReusableIndex.index is not None:
535 self.index = ReusableIndex.index
537 print("opening index")
538 Index.open(self, analyzer)
539 ReusableIndex.index = self.index
540 atexit.register(ReusableIndex.close_reusable)
542 # def index_book(self, *args, **kw):
543 # job = ReusableIndex.pool.apply_async(log_exception_wrapper(Index.index_book), (self,) + args, kw)
544 # ReusableIndex.pool_jobs.append(job)
547 def close_reusable():
548 if ReusableIndex.index is not None:
549 ReusableIndex.index.optimize()
550 ReusableIndex.index.close()
551 ReusableIndex.index = None
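# Rough usage sketch (e.g. from the importbooks management command; `book` is
# an assumed catalogue.models.Book instance):
#
#   idx = ReusableIndex()
#   idx.open()            # reuses the shared writer if one is already open
#   idx.index_book(book)
#   # the shared writer is optimized and closed at interpreter exit (atexit),
#   # or explicitly via ReusableIndex.close_reusable()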
557 class JoinSearch(object):
559 This mixin could be used to handle block join queries.
562 def __init__(self, *args, **kw):
563 super(JoinSearch, self).__init__(*args, **kw)
565 def wrapjoins(self, query, fields=[]):
567 This function modifies the query recursively, so that contained
568 Term and Phrase queries which match the provided fields
569 are wrapped in a BlockJoinQuery,
570 and thus delegated to child documents.
572 if BooleanQuery.instance_(query):
573 qs = BooleanQuery.cast_(query)
575 clause = BooleanClause.cast_(clause)
576 clause.setQuery(self.wrapjoins(clause.getQuery(), fields))
580 query.extractTerms(termset)
583 if t.field() not in fields:
585 return BlockJoinQuery(query, self.parent_filter,
586 BlockJoinQuery.ScoreMode.Total)
588 def bsearch(self, query, max_results=50):
589 q = self.query(query)
590 bjq = BlockJoinQuery(q, self.parent_filter, BlockJoinQuery.ScoreMode.Avg)
592 tops = self.searcher.search(bjq, max_results)
594 for found in tops.scoreDocs:
595 doc = self.searcher.doc(found.doc)
596 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
597 return (bks, tops.totalHits)
600 class SearchResult(object):
601 def __init__(self, search, scoreDocs, score=None, how_found=None, snippets=None, searched=None, tokens_cache=None):
602 if tokens_cache is None: tokens_cache = {}
607 self._score = scoreDocs.score
612 self._processed_hits = None # processed hits
614 stored = search.searcher.doc(scoreDocs.doc)
615 self.book_id = int(stored.get("book_id"))
617 pd = stored.get("published_date")
620 self.published_date = int(pd)
622 header_type = stored.get("header_type")
623 # we have a content hit in some header or fragment
624 if header_type is not None:
625 sec = (header_type, int(stored.get("header_index")))
626 header_span = stored.get('header_span')
627 header_span = header_span is not None and int(header_span) or 1
629 fragment = stored.get("fragment_anchor")
632 snippets = snippets.replace("/\n", "\n")
633 hit = (sec + (header_span,), fragment, scoreDocs.score, {'how_found': how_found, 'snippets': snippets and [snippets] or []})
635 self._hits.append(hit)
638 self.searched = searched
639 self.tokens_cache = tokens_cache
643 return self._score * self.boost
645 def merge(self, other):
646 if self.book_id != other.book_id:
647 raise ValueError("this search result is for book %d; tried to merge with %d" % (self.book_id, other.book_id))
648 self._hits += other._hits
649 if other.score > self.score:
650 self._score = other._score
654 return catalogue.models.Book.objects.get(id=self.book_id)
656 book = property(get_book)
660 if self._processed_hits is not None:
661 return self._processed_hits
670 # to sections and fragments
671 frags = filter(lambda r: r[FRAGMENT] is not None, self._hits)
672 sect = filter(lambda r: r[FRAGMENT] is None, self._hits)
673 sect = filter(lambda s: 0 == len(filter(
674 lambda f: s[POSITION][POSITION_INDEX] >= f[POSITION][POSITION_INDEX]
675 and s[POSITION][POSITION_INDEX] < f[POSITION][POSITION_INDEX] + f[POSITION][POSITION_SPAN],
680 # remove duplicate fragments
685 if fragments[fid][SCORE] >= f[SCORE]:
688 frags = fragments.values()
690 # remove duplicate sections
694 si = s[POSITION][POSITION_INDEX]
697 if sections[si]['score'] >= s[SCORE]:
700 m = {'score': s[SCORE],
701 'section_number': s[POSITION][POSITION_INDEX] + 1,
706 hits = sections.values()
710 frag = catalogue.models.Fragment.objects.get(anchor=f[FRAGMENT])
711 except catalogue.models.Fragment.DoesNotExist:
715 # Figure out if we were searching for a token matching some word in theme name.
716 themes = frag.tags.filter(category='theme')
718 if self.searched is not None:
719 tokens = self.search.get_tokens(self.searched, 'POLISH', cached=self.tokens_cache)
721 name_tokens = self.search.get_tokens(theme.name, 'POLISH')
724 if not theme in themes_hit:
725 themes_hit.append(theme)
728 m = {'score': f[SCORE],
730 'section_number': f[POSITION][POSITION_INDEX] + 1,
732 'themes_hit': themes_hit
737 hits.sort(lambda a, b: cmp(a['score'], b['score']), reverse=True)
739 self._processed_hits = hits
743 def __unicode__(self):
744 return u'SearchResult(book_id=%d, score=%f)' % (self.book_id, self.score)
747 def aggregate(*result_lists):
749 for rl in result_lists:
751 if r.book_id in books:
752 books[r.book_id].merge(r)
753 #print(u"already have one with score %f, and this one has score %f" % (books[book.id][0], found.score))
756 return books.values()
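# Sketch of a typical aggregation (call sites assumed, not shown here): several
# search passes may return SearchResults for the same book; these are merged
# per book_id and then sorted via __cmp__ below (score, then earlier date):
#
#   results = SearchResult.aggregate(
#       search.search_perfect_book(query),
#       search.search_everywhere(query))
#   results.sort(reverse=True)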
758 def __cmp__(self, other):
759 c = cmp(self.score, other.score)
761 if not hasattr(other,'published_date') or not hasattr(self, 'published_date'):
762 import pdb; pdb.set_trace()
763 # this is inverted, because earlier date is better
764 return cmp(other.published_date, self.published_date)
771 Given some hint information (things we already know about
772 our search target - like author, title (specific book), epoch, genre, kind)
773 we can narrow down the search using filters.
775 def __init__(self, search):
777 Accepts a Searcher instance.
784 def books(self, *books):
786 Give a hint that we search these books.
790 def tags(self, tags):
792 Give a hint that these Tag objects (a list of)
796 if t.category in ['author', 'title', 'epoch', 'genre', 'kind']:
797 lst = self.book_tags.get(t.category, [])
799 self.book_tags[t.category] = lst
800 if t.category in ['theme', 'theme_pl']:
801 self.part_tags.append(t)
803 def tag_filter(self, tags, field='tags'):
805 Given a list of tags and an optional field (they are normally in the 'tags' field),
806 returns a filter accepting only books with those specific tags.
811 toks = self.search.get_tokens(tag.name, field=field)
812 tag_phrase = PhraseQuery()
814 tag_phrase.add(Term(field, tok))
815 q.add(BooleanClause(tag_phrase, BooleanClause.Occur.MUST))
817 return QueryWrapperFilter(q)
819 def book_filter(self):
821 Filters using book tags (all tag categories except themes)
823 tags = reduce(lambda a, b: a + b, self.book_tags.values(), [])
825 return self.tag_filter(tags)
829 def part_filter(self):
831 This filter can be used to look for book parts.
832 It filters on book id and/or themes.
836 fs.append(self.tag_filter(self.part_tags, field='themes'))
838 if self._books != []:
840 for b in self._books:
841 id_filter = NumericRangeFilter.newIntRange('book_id', b.id, b.id, True, True)
842 bf.add(FilterClause(id_filter, BooleanClause.Occur.SHOULD))
845 return Search.chain_filters(fs)
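# A rough sketch of using Hint (assuming `search` is a Search instance and
# `tag` is a catalogue.models.Tag):
#
#   hint = Hint(search)
#   hint.tags([tag])            # author/title/epoch/genre/kind narrow the book set,
#                               # theme/theme_pl narrow fragments
#   flt = hint.book_filter()    # filter for whole-book queries
#   # hint.part_filter() would filter content/fragment queries instead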
847 def should_search_for_book(self):
848 return self._books == []
850 def just_search_in(self, all):
851 """Holds logic to figure out which indexes should be search, when we have some hinst already"""
854 if field == 'authors' and 'author' in self.book_tags:
856 if field == 'title' and self._books != []:
858 if (field == 'themes' or field == 'themes_pl') and self.part_tags:
864 class Search(IndexStore):
868 def __init__(self, default_field="content"):
869 IndexStore.__init__(self)
870 self.analyzer = WLAnalyzer() # PolishAnalyzer(Version.LUCENE_34)
871 # self.analyzer = WLAnalyzer()
872 self.searcher = IndexSearcher(self.store, True)
873 self.parser = QueryParser(Version.LUCENE_34, default_field,
876 self.parent_filter = TermsFilter()
877 self.parent_filter.addTerm(Term("is_book", "true"))
879 def query(self, query):
880 """Parse query in default Lucene Syntax. (for humans)
882 return self.parser.parse(query)
884 def simple_search(self, query, max_results=50):
885 """Runs a query for books using lucene syntax. (for humans)
886 Returns (books, total_hits)
889 tops = self.searcher.search(self.query(query), max_results)
891 for found in tops.scoreDocs:
892 doc = self.searcher.doc(found.doc)
893 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
894 return (bks, tops.totalHits)
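# Example (sketch, assuming some books are already indexed):
#
#   search = Search()
#   books, total = search.simple_search(u"Tadeusz")
#   # `books` is a list of catalogue.models.Book, `total` the Lucene hit count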
896 def get_tokens(self, searched, field='content', cached=None):
897 """returns tokens analyzed by a proper (for a field) analyzer
898 argument can be: StringReader, string/unicode, or tokens. In the last case
899 they will just be returned (so we can reuse tokens, if we don't change the analyzer)
901 if cached is not None and field in cached:
904 if isinstance(searched, str) or isinstance(searched, unicode):
905 searched = StringReader(searched)
906 elif isinstance(searched, list):
910 tokens = self.analyzer.reusableTokenStream(field, searched)
912 while tokens.incrementToken():
913 cta = tokens.getAttribute(CharTermAttribute.class_)
914 toks.append(cta.toString())
916 if cached is not None:
921 def fuzziness(self, fuzzy):
922 """Helper method to sanitize fuzziness"""
925 if isinstance(fuzzy, float) and fuzzy > 0.0 and fuzzy <= 1.0:
930 def make_phrase(self, tokens, field='content', slop=2, fuzzy=False):
932 Return a PhraseQuery with a series of tokens.
935 phrase = MultiPhraseQuery()
937 term = Term(field, t)
938 fuzzterm = FuzzyTermEnum(self.searcher.getIndexReader(), term, self.fuzziness(fuzzy))
942 # print("fuzz %s" % unicode(fuzzterm.term()).encode('utf-8'))
946 if not fuzzterm.next(): break
948 phrase.add(JArray('object')(fuzzterms, Term))
952 phrase = PhraseQuery()
955 term = Term(field, t)
959 def make_term_query(self, tokens, field='content', modal=BooleanClause.Occur.SHOULD, fuzzy=False):
961 Returns term queries joined by boolean query.
962 modal - applies to boolean query
963 fuzzy - should the query be fuzzy.
967 term = Term(field, t)
969 term = FuzzyQuery(term, self.fuzziness(fuzzy))
971 term = TermQuery(term)
972 q.add(BooleanClause(term, modal))
975 def search_phrase(self, searched, field, book=True, max_results=20, fuzzy=False,
976 filters=None, tokens_cache=None, boost=None, snippets=False, slop=2):
977 if filters is None: filters = []
978 if tokens_cache is None: tokens_cache = {}
980 tokens = self.get_tokens(searched, field, cached=tokens_cache)
982 query = self.make_phrase(tokens, field=field, fuzzy=fuzzy, slop=slop)
984 filters.append(self.term_filter(Term('is_book', 'true')))
985 top = self.searcher.search(query, self.chain_filters(filters), max_results)
987 return [SearchResult(self, found, snippets=(snippets and self.get_snippets(found, query) or None), searched=searched) for found in top.scoreDocs]
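# Sketch of a phrase search over content with snippets (assuming an open,
# populated index; the query string is just an example):
#
#   results = search.search_phrase(u"litwo ojczyzno moja", 'content',
#                                  book=False, snippets=True)
#   # each element is a SearchResult built from a single matching scoreDoc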
989 def search_some(self, searched, fields, book=True, max_results=20, fuzzy=False,
990 filters=None, tokens_cache=None, boost=None, snippets=True):
991 if filters is None: filters = []
992 if tokens_cache is None: tokens_cache = {}
995 filters.append(self.term_filter(Term('is_book', 'true')))
997 query = BooleanQuery()
1000 tokens = self.get_tokens(searched, fld, cached=tokens_cache)
1002 query.add(BooleanClause(self.make_term_query(tokens, field=fld,
1003 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1005 top = self.searcher.search(query, self.chain_filters(filters), max_results)
1007 return [SearchResult(self, found, searched=searched, tokens_cache=tokens_cache,
1008 snippets=(snippets and self.get_snippets(found, query) or None)) for found in top.scoreDocs]
1010 def search_perfect_book(self, searched, max_results=20, fuzzy=False, hint=None):
1012 Search for perfect book matches. Just see if the query matches with some author or title,
1013 taking hints into account.
1015 fields_to_search = ['authors', 'title']
1018 if not hint.should_search_for_book():
1020 fields_to_search = hint.just_search_in(fields_to_search)
1021 only_in = hint.book_filter()
1023 qrys = [self.make_phrase(self.get_tokens(searched, field=fld), field=fld, fuzzy=fuzzy) for fld in fields_to_search]
1027 top = self.searcher.search(q,
1028 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1030 for found in top.scoreDocs:
1031 books.append(SearchResult(self, found, how_found="search_perfect_book"))
1034 def search_book(self, searched, max_results=20, fuzzy=False, hint=None):
1035 fields_to_search = ['tags', 'authors', 'title']
1039 if not hint.should_search_for_book():
1041 fields_to_search = hint.just_search_in(fields_to_search)
1042 only_in = hint.book_filter()
1044 tokens = self.get_tokens(searched, field='SIMPLE')
1048 for fld in fields_to_search:
1049 q.add(BooleanClause(self.make_term_query(tokens, field=fld,
1050 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1053 top = self.searcher.search(q,
1054 self.chain_filters([only_in, self.term_filter(Term('is_book', 'true'))]),
1056 for found in top.scoreDocs:
1057 books.append(SearchResult(self, found, how_found="search_book"))
1061 def search_perfect_parts(self, searched, max_results=20, fuzzy=False, hint=None):
1063 Search for book parts which contain a phrase perfectly matching (with a slop of 2, the default for make_phrase())
1064 some part/fragment of the book.
1066 qrys = [self.make_phrase(self.get_tokens(searched), field=fld, fuzzy=fuzzy) for fld in ['content']]
1070 flt = hint.part_filter()
1074 top = self.searcher.search(q,
1075 self.chain_filters([self.term_filter(Term('is_book', 'true'), inverse=True),
1078 for found in top.scoreDocs:
1079 books.append(SearchResult(self, found, snippets=self.get_snippets(found, q), how_found='search_perfect_parts'))
1083 def search_everywhere(self, searched, max_results=20, fuzzy=False, hint=None, tokens_cache=None):
1085 Tries to use search terms to match different fields of book (or its parts).
1086 E.g. one word can be an author's surname, another a part of the title, and the rest
1087 some words from the third chapter.
1089 if tokens_cache is None: tokens_cache = {}
1094 only_in = hint.part_filter()
1096 # content only query : themes x content
1099 tokens_pl = self.get_tokens(searched, field='content', cached=tokens_cache)
1100 tokens = self.get_tokens(searched, field='SIMPLE', cached=tokens_cache)
1102 # only search in themes when we do not already filter by themes
1103 if hint is None or hint.just_search_in(['themes']) != []:
1104 q.add(BooleanClause(self.make_term_query(tokens_pl, field='themes_pl',
1105 fuzzy=fuzzy), BooleanClause.Occur.MUST))
1107 q.add(BooleanClause(self.make_term_query(tokens_pl, field='content',
1108 fuzzy=fuzzy), BooleanClause.Occur.SHOULD))
1110 topDocs = self.searcher.search(q, only_in, max_results)
1111 for found in topDocs.scoreDocs:
1112 books.append(SearchResult(self, found, how_found='search_everywhere_themesXcontent', searched=searched))
1113 print "* %s theme x content: %s" % (searched, books[-1]._hits)
1115 # query themes/content x author/title/tags
1117 in_content = BooleanQuery()
1118 in_meta = BooleanQuery()
1120 for fld in ['themes_pl', 'content']:
1121 in_content.add(BooleanClause(self.make_term_query(tokens_pl, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1123 for fld in ['tags', 'authors', 'title']:
1124 in_meta.add(BooleanClause(self.make_term_query(tokens, field=fld, fuzzy=False), BooleanClause.Occur.SHOULD))
1126 q.add(BooleanClause(in_content, BooleanClause.Occur.MUST))
1127 q.add(BooleanClause(in_meta, BooleanClause.Occur.SHOULD))
1129 topDocs = self.searcher.search(q, only_in, max_results)
1130 for found in topDocs.scoreDocs:
1131 books.append(SearchResult(self, found, how_found='search_everywhere', searched=searched))
1132 print "* %s scatter search: %s" % (searched, books[-1]._hits)
1136 # def multisearch(self, query, max_results=50):
1139 # - (phrase) OR -> content
1142 # - (keywords) -> authors
1147 # queryreader = StringReader(query)
1148 # tokens = self.get_tokens(queryreader)
1150 # top_level = BooleanQuery()
1151 # Should = BooleanClause.Occur.SHOULD
1153 # phrase_level = BooleanQuery()
1154 # phrase_level.setBoost(1.3)
1156 # p_content = self.make_phrase(tokens, joined=True)
1157 # p_title = self.make_phrase(tokens, 'title')
1158 # p_author = self.make_phrase(tokens, 'author')
1160 # phrase_level.add(BooleanClause(p_content, Should))
1161 # phrase_level.add(BooleanClause(p_title, Should))
1162 # phrase_level.add(BooleanClause(p_author, Should))
1164 # kw_level = BooleanQuery()
1166 # kw_level.add(self.make_term_query(tokens, 'author'), Should)
1167 # j_themes = self.make_term_query(tokens, 'themes', joined=True)
1168 # kw_level.add(j_themes, Should)
1169 # kw_level.add(self.make_term_query(tokens, 'tags'), Should)
1170 # j_con = self.make_term_query(tokens, joined=True)
1171 # kw_level.add(j_con, Should)
1173 # top_level.add(BooleanClause(phrase_level, Should))
1174 # top_level.add(BooleanClause(kw_level, Should))
1178 def get_snippets(self, scoreDoc, query, field='content'):
1180 Returns a snippet for found scoreDoc.
1182 htmlFormatter = SimpleHTMLFormatter()
1183 highlighter = Highlighter(htmlFormatter, QueryScorer(query))
1185 stored = self.searcher.doc(scoreDoc.doc)
1187 position = stored.get('snippets_position')
1188 length = stored.get('snippets_length')
1189 if position is None or length is None:
1192 snippets = Snippets(stored.get('book_id')).open()
1194 text = snippets.get((int(position),
1199 tokenStream = TokenSources.getAnyTokenStream(self.searcher.getIndexReader(), scoreDoc.doc, field, self.analyzer)
1200 # highlighter.getBestTextFragments(tokenStream, text, False, 10)
1201 snip = highlighter.getBestFragments(tokenStream, text, 3, "...")
1206 def enum_to_array(enum):
1208 Converts a lucene TermEnum to array of Terms, suitable for
1217 if not enum.next(): break
1220 return JArray('object')(terms, Term)
1222 def search_tags(self, query, filters=None, max_results=40, pdcounter=False):
1224 Search for Tag objects using query.
1227 filters = self.chain_filters([filters, self.term_filter(Term('is_pdcounter', 'true'), inverse=True)])
1228 tops = self.searcher.search(query, filters, max_results)
1231 for found in tops.scoreDocs:
1232 doc = self.searcher.doc(found.doc)
1233 is_pdcounter = doc.get('is_pdcounter')
1235 tag = PDCounterAuthor.objects.get(id=doc.get('tag_id'))
1237 tag = catalogue.models.Tag.objects.get(id=doc.get("tag_id"))
1238 # don't add the pdcounter tag if same tag already exists
1239 if not (is_pdcounter and filter(lambda t: tag.slug == t.slug, tags)):
1241 # print "%s (%d) -> %f" % (tag, tag.id, found.score)
1242 print('returning %s' % tags)
1245 def search_books(self, query, filter=None, max_results=10):
1247 Searches for Book objects using query
1250 tops = self.searcher.search(query, filter, max_results)
1251 for found in tops.scoreDocs:
1252 doc = self.searcher.doc(found.doc)
1253 bks.append(catalogue.models.Book.objects.get(id=doc.get("book_id")))
1256 def make_prefix_phrase(self, toks, field):
1257 q = MultiPhraseQuery()
1258 for i in range(len(toks)):
1259 t = Term(field, toks[i])
1260 if i == len(toks) - 1:
1261 pterms = Search.enum_to_array(PrefixTermEnum(self.searcher.getIndexReader(), t))
1271 def term_filter(term, inverse=False):
1272 only_term = TermsFilter()
1273 only_term.addTerm(term)
1276 neg = BooleanFilter()
1277 neg.add(FilterClause(only_term, BooleanClause.Occur.MUST_NOT))
1282 def hint_tags(self, string, max_results=50, pdcounter=True, prefix=True):
1284 Return auto-complete hints for tags
1285 using prefix search.
1287 toks = self.get_tokens(string, field='SIMPLE')
1288 top = BooleanQuery()
1290 for field in ['tag_name', 'tag_name_pl']:
1292 q = self.make_prefix_phrase(toks, field)
1294 q = self.make_term_query(toks, field)
1295 top.add(BooleanClause(q, BooleanClause.Occur.SHOULD))
1297 no_book_cat = self.term_filter(Term("tag_category", "book"), inverse=True)
1299 return self.search_tags(top, no_book_cat, max_results=max_results, pdcounter=pdcounter)
1301 def hint_books(self, string, max_results=50, prefix=True):
1303 Returns auto-complete hints for book titles
1304 (because we do not index 'pseudo' title tags).
1307 toks = self.get_tokens(string, field='SIMPLE')
1310 q = self.make_prefix_phrase(toks, 'title')
1312 q = self.make_term_query(toks, 'title')
1314 return self.search_books(q, self.term_filter(Term("is_book", "true")), max_results=max_results)
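# Sketch of the auto-complete entry points (assuming `search = Search()`):
#
#   tags = search.hint_tags(u"mick", max_results=10)       # Tag objects
#   books = search.hint_books(u"pan tad", max_results=10)  # Book objects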
1317 def chain_filters(filters, op=ChainedFilter.AND):
1319 Chains a filter list together
1321 filters = filter(lambda x: x is not None, filters)
1322 if not filters:
1324 chf = ChainedFilter(JArray('object')(filters, Filter), op)
1327 def filtered_categories(self, tags):
1329 Return a list of tag categories present in the given tags list.
1333 cats[t.category] = True