X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java?ds=inline

diff --git a/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java b/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java
new file mode 100644
index 0000000..30feb3b
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/similar/SimilarityQueries.java
@@ -0,0 +1,114 @@
+/**
+ * Copyright 2004 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+package org.apache.lucene.search.similar;
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.HashSet;
+import java.util.Set;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.search.BooleanClause;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+
+/**
+ * Simple similarity measures.
+ *
+ * @see MoreLikeThis
+ */
+public final class SimilarityQueries
+{
+	/**
+	 *
+	 */
+	private SimilarityQueries()
+	{
+	}
+	
+	/**
+	 * Simple similarity query generators.
+	 * Takes every unique word and forms a boolean query where all words are optional.
+	 * After you get this you'll use to to query your {@link IndexSearcher} for similar docs.
+	 * The only caveat is the first hit returned <b>should be</b> your source document - you'll
+	 * need to then ignore that.
+	 *
+	 * <p>
+	 * So, if you have a code fragment like this:
+	 * <br>
+	 * <code>
+	 * Query q = formSimilaryQuery( "I use Lucene to search fast. Fast searchers are good", new StandardAnalyzer(), "contents", null);
+	 * </code>
+	 *
+	 * <p>
+	 * The query returned, in string form, will be <code>'(i use lucene to search fast searchers are good')</code>.
+	 *
+	 * <p>
+	 * The philosophy behind this method is "two documents are similar if they share lots of words".
+	 * Note that behind the scenes, Lucene's scoring algorithm will tend to give two documents a higher similarity score if the share more uncommon words.
+	 *
+	 * <P>
+	 * This method is fail-safe in that if a long 'body' is passed in and
+	 * {@link BooleanQuery#add BooleanQuery.add()} (used internally)
+	 * throws
+	 * {@link org.apache.lucene.search.BooleanQuery.TooManyClauses BooleanQuery.TooManyClauses}, the
+	 * query as it is will be returned.
+	 *
+	 * @param body the body of the document you want to find similar documents to
+	 * @param a the analyzer to use to parse the body
+	 * @param field the field you want to search on, probably something like "contents" or "body"
+	 * @param stop optional set of stop words to ignore
+	 * @return a query with all unique words in 'body'
+	 * @throws IOException this can't happen...
+	 */
+    public static Query formSimilarQuery( String body,
+										  Analyzer a,
+										  String field,
+										  Set<?> stop)
+										  throws IOException
+	{	
+		TokenStream ts = a.reusableTokenStream( field, new StringReader( body));
+		CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+		
+		BooleanQuery tmp = new BooleanQuery();
+		Set<String> already = new HashSet<String>(); // ignore dups
+		while (ts.incrementToken()) {
+		  String word = termAtt.toString();
+			// ignore opt stop words
+			if ( stop != null &&
+				 stop.contains( word)) continue;
+			// ignore dups
+			if ( ! already.add( word)) continue;
+			// add to query
+			TermQuery tq = new TermQuery( new Term( field, word));
+			try
+			{
+				tmp.add( tq, BooleanClause.Occur.SHOULD);
+			}
+			catch( BooleanQuery.TooManyClauses too)
+			{
+				// fail-safe, just return what we have, not the end of the world
+				break;
+			}
+		}
+		return tmp;
+	}
+}