2 * Copyright 2004 The Apache Software Foundation.
4 * Licensed under the Apache License, Version 2.0 (the "License");
5 * you may not use this file except in compliance with the License.
6 * You may obtain a copy of the License at
8 * http://www.apache.org/licenses/LICENSE-2.0
10 * Unless required by applicable law or agreed to in writing, software
11 * distributed under the License is distributed on an "AS IS" BASIS,
12 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13 * See the License for the specific language governing permissions and
14 * limitations under the License.
16 package org.apache.lucene.search.similar;
18 import java.io.IOException;
19 import java.io.StringReader;
20 import java.util.HashSet;
23 import org.apache.lucene.analysis.Analyzer;
24 import org.apache.lucene.analysis.TokenStream;
25 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
26 import org.apache.lucene.index.Term;
27 import org.apache.lucene.search.BooleanClause;
28 import org.apache.lucene.search.BooleanQuery;
29 import org.apache.lucene.search.IndexSearcher;
30 import org.apache.lucene.search.Query;
31 import org.apache.lucene.search.TermQuery;
34 * Simple similarity measures.
38 public final class SimilarityQueries
43 private SimilarityQueries()
48 * Simple similarity query generators.
49 * Takes every unique word and forms a boolean query where all words are optional.
50 * After you get this you'll use it to query your {@link IndexSearcher} for similar docs.
51 * The only caveat is the first hit returned <b>should be</b> your source document - you'll
52 * need to then ignore that.
55 * So, if you have a code fragment like this:
58 * Query q = formSimilarQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
62 * The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good)'</code>.
65 * The philosophy behind this method is "two documents are similar if they share lots of words".
66 * Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if they share more uncommon words.
69 * This method is fail-safe in that if a long 'body' is passed in and
70 * {@link BooleanQuery#add BooleanQuery.add()} (used internally)
72 * {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
73 * query as it is will be returned.
75 * @param body the body of the document you want to find similar documents to
76 * @param a the analyzer to use to parse the body
77 * @param field the field you want to search on, probably something like "contents" or "body"
78 * @param stop optional set of stop words to ignore
79 * @return a query with all unique words in 'body'
80 * @throws IOException this can't happen...
82 public static Query formSimilarQuery( String body,
88 TokenStream ts = a.reusableTokenStream( field, new StringReader( body));
89 CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
91 BooleanQuery tmp = new BooleanQuery();
92 Set<String> already = new HashSet<String>(); // ignore dups
93 while (ts.incrementToken()) {
94 String word = termAtt.toString();
95 // ignore opt stop words
97 stop.contains( word)) continue;
99 if ( ! already.add( word)) continue;
101 TermQuery tq = new TermQuery( new Term( field, word));
104 tmp.add( tq, BooleanClause.Occur.SHOULD);
106 catch( BooleanQuery.TooManyClauses too)
108 // fail-safe, just return what we have, not the end of the world