samples/LuceneInAction/lia/advsearching/BooksLikeThis.py

   1 # ====================================================================
   2 #   Licensed under the Apache License, Version 2.0 (the "License");
   3 #   you may not use this file except in compliance with the License.
   4 #   You may obtain a copy of the License at
   5 #
   6 #       http://www.apache.org/licenses/LICENSE-2.0
   7 #
   8 #   Unless required by applicable law or agreed to in writing, software
   9 #   distributed under the License is distributed on an "AS IS" BASIS,
  10 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  11 #   See the License for the specific language governing permissions and
  12 #   limitations under the License.
  13 # ====================================================================
  14
  15 import os
  16
  17 from lucene import \
  18      Document, IndexReader, Term, BooleanQuery, IndexSearcher, TermQuery, \
  19      SimpleFSDirectory, File, System, BooleanClause
  20
  21
  22 class BooksLikeThis(object):
  23
  24     def main(cls, argv):
  25
  26         indexDir = System.getProperty("index.dir")
  27         directory = SimpleFSDirectory(File(indexDir))
  28
  29         reader = IndexReader.open(directory, True)
  30         blt = BooksLikeThis(reader)
  31
  32         for id in xrange(reader.maxDoc()):
  33             if reader.isDeleted(id):
  34                 continue
  35             doc = reader.document(id)
  36             print ''
  37             print doc.get("title").encode('utf-8')
  38
  39             docs = blt.docsLike(id, doc, 10)
  40             if not docs:
  41                 print "  None like this"
  42             else:
  43                 for doc in docs:
  44                     print " ->", doc.get("title").encode('utf-8')
  45
  46     def __init__(self, reader):
  47
  48         self.reader = reader
  49         self.searcher = IndexSearcher(reader)
  50
  51     def docsLike(self, id, doc, max):
  52
  53         authors = doc.getValues("author")
  54         authorQuery = BooleanQuery()
  55         for author in authors:
  56             authorQuery.add(TermQuery(Term("author", author)),
  57                             BooleanClause.Occur.SHOULD)
  58         authorQuery.setBoost(2.0)
  59
  60         vector = self.reader.getTermFreqVector(id, "subject")
  61
  62         subjectQuery = BooleanQuery()
  63         for term in vector.getTerms():
  64             tq = TermQuery(Term("subject", term))
  65             subjectQuery.add(tq, BooleanClause.Occur.SHOULD)
  66
  67         likeThisQuery = BooleanQuery()
  68         likeThisQuery.add(authorQuery, BooleanClause.Occur.SHOULD)
  69         likeThisQuery.add(subjectQuery, BooleanClause.Occur.SHOULD)
  70
  71         # exclude myself
  72         likeThisQuery.add(TermQuery(Term("isbn", doc.get("isbn"))),
  73                           BooleanClause.Occur.MUST_NOT)
  74
  75         print "  Query:", likeThisQuery.toString("contents")
  76         scoreDocs = self.searcher.search(likeThisQuery, 50).scoreDocs
  77
  78         docs = []
  79         for scoreDoc in scoreDocs:
  80             doc = self.searcher.doc(scoreDoc.doc)
  81             if len(docs) < max:
  82                 docs.append(doc)
  83             else:
  84                 break
  85
  86         return docs
  87
  88     main = classmethod(main)