X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java?ds=inline diff --git a/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java b/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java new file mode 100644 index 0000000..30feb3b --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java @@ -0,0 +1,114 @@ +/** + * Copyright 2004 The Apache Software Foundation. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +package org.apache.lucene.search.similar; + +import java.io.IOException; +import java.io.StringReader; +import java.util.HashSet; +import java.util.Set; + +import org.apache.lucene.analysis.Analyzer; +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.index.Term; +import org.apache.lucene.search.BooleanClause; +import org.apache.lucene.search.BooleanQuery; +import org.apache.lucene.search.IndexSearcher; +import org.apache.lucene.search.Query; +import org.apache.lucene.search.TermQuery; + +/** + * Simple similarity measures. + * + * @see MoreLikeThis + */ +public final class SimilarityQueries +{ + /** + * + */ + private SimilarityQueries() + { + } + + /** + * Simple similarity query generators. + * Takes every unique word and forms a boolean query where all words are optional. + * After you get this you'll use to to query your {@link IndexSearcher} for similar docs. + * The only caveat is the first hit returned should be your source document - you'll + * need to then ignore that. + * + *

+ * So, if you have a code fragment like this: + *
+ * + * Query q = formSimilaryQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null); + * + * + *

+ * The query returned, in string form, will be '(i use lucene to search fast searchers are good'). + * + *

+ * The philosophy behind this method is "two documents are similar if they share lots of words". + * Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if the share more uncommon words. + * + *

+ * This method is fail-safe in that if a long 'body' is passed in and + * {@link BooleanQuery#add BooleanQuery.add()} (used internally) + * throws + * {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the + * query as it is will be returned. + * + * @param body the body of the document you want to find similar documents to + * @param a the analyzer to use to parse the body + * @param field the field you want to search on, probably something like "contents" or "body" + * @param stop optional set of stop words to ignore + * @return a query with all unique words in 'body' + * @throws IOException this can't happen... + */ + public static Query formSimilarQuery( String body, + Analyzer a, + String field, + Set stop) + throws IOException + { + TokenStream ts = a.reusableTokenStream( field, new StringReader( body)); + CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); + + BooleanQuery tmp = new BooleanQuery(); + Set already = new HashSet(); // ignore dups + while (ts.incrementToken()) { + String word = termAtt.toString(); + // ignore opt stop words + if ( stop != null && + stop.contains( word)) continue; + // ignore dups + if ( ! already.add( word)) continue; + // add to query + TermQuery tq = new TermQuery( new Term( field, word)); + try + { + tmp.add( tq, BooleanClause.Occur.SHOULD); + } + catch( BooleanQuery.TooManyClauses too) + { + // fail-safe, just return what we have, not the end of the world + break; + } + } + return tmp; + } +}