lucene-java-3.4.0/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java

   1 /**
   2  * Copyright 2004 The Apache Software Foundation.
   3  *
   4  * Licensed under the Apache License, Version 2.0 (the "License");
   5  * you may not use this file except in compliance with the License.
   6  * You may obtain a copy of the License at
   7  *
   8  *     http://www.apache.org/licenses/LICENSE-2.0
   9  *
  10  * Unless required by applicable law or agreed to in writing, software
  11  * distributed under the License is distributed on an "AS IS" BASIS,
  12  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  13  * See the License for the specific language governing permissions and
  14  * limitations under the License.
  15  */
  16 package org.apache.lucene.search.similar;
  17
  18 import java.io.IOException;
  19 import java.io.StringReader;
  20 import java.util.HashSet;
  21 import java.util.Set;
  22
  23 import org.apache.lucene.analysis.Analyzer;
  24 import org.apache.lucene.analysis.TokenStream;
  25 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  26 import org.apache.lucene.index.Term;
  27 import org.apache.lucene.search.BooleanClause;
  28 import org.apache.lucene.search.BooleanQuery;
  29 import org.apache.lucene.search.IndexSearcher;
  30 import org.apache.lucene.search.Query;
  31 import org.apache.lucene.search.TermQuery;
  32
  33 /**
  34  * Simple similarity measures.
  35  *
  36  * @see MoreLikeThis
  37  */
  38 public final class SimilarityQueries
  39 {
  40         /**
  41          *
  42          */
  43         private SimilarityQueries()
  44         {
  45         }
  46
  47         /**
  48          * Simple similarity query generators.
  49          * Takes every unique word and forms a boolean query where all words are optional.
  50          * After you get this you'll use to to query your {@link IndexSearcher} for similar docs.
  51          * The only caveat is the first hit returned <b>should be</b> your source document - you'll
  52          * need to then ignore that.
  53          *
  54          * <p>
  55          * So, if you have a code fragment like this:
  56          * <br>
  57          * <code>
  58          * Query q = formSimilaryQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
  59          * </code>
  60          *
  61          * <p>
  62          * The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good')</code>.
  63          *
  64          * <p>
  65          * The philosophy behind this method is "two documents are similar if they share lots of words".
  66          * Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if the share more uncommon words.
  67          *
  68          * <P>
  69          * This method is fail-safe in that if a long 'body' is passed in and
  70          * {@link BooleanQuery#add BooleanQuery.add()} (used internally)
  71          * throws
  72          * {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
  73          * query as it is will be returned.
  74          *
  75          * @param body the body of the document you want to find similar documents to
  76          * @param a the analyzer to use to parse the body
  77          * @param field the field you want to search on, probably something like "contents" or "body"
  78          * @param stop optional set of stop words to ignore
  79          * @return a query with all unique words in 'body'
  80          * @throws IOException this can't happen...
  81          */
  82     public static Query formSimilarQuery( String body,
  83                                                                                   Analyzer a,
  84                                                                                   String field,
  85                                                                                   Set<?> stop)
  86                                                                                   throws IOException
  87         {
  88                 TokenStream ts = a.reusableTokenStream( field, new StringReader( body));
  89                 CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
  90
  91                 BooleanQuery tmp = new BooleanQuery();
  92                 Set<String> already = new HashSet<String>(); // ignore dups
  93                 while (ts.incrementToken()) {
  94                   String word = termAtt.toString();
  95                         // ignore opt stop words
  96                         if ( stop != null &&
  97                                  stop.contains( word)) continue;
  98                         // ignore dups
  99                         if ( ! already.add( word)) continue;
 100                         // add to query
 101                         TermQuery tq = new TermQuery( new Term( field, word));
 102                         try
 103                         {
 104                                 tmp.add( tq, BooleanClause.Occur.SHOULD);
 105                         }
 106                         catch( BooleanQuery.TooManyClauses too)
 107                         {
 108                                 // fail-safe, just return what we have, not the end of the world
 109                                 break;
 110                         }
 111                 }
 112                 return tmp;
 113         }
 114 }