--- /dev/null
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
+ * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
+ * of fuzzy scoring factors.
+ * This generally produces good results for queries where users may provide details in a number of
+ * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
+ * a fast query.
+ *
+ * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
+ * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
+ * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
+ * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
+ * term) and this is factored into the variant's boost. If the source query term does not exist in the
+ * index the average IDF of the variants is used.
+ */
+public class FuzzyLikeThisQuery extends Query
+{
+ static Similarity sim=new DefaultSimilarity();
+ Query rewrittenQuery=null;
+ ArrayList<FieldVals> fieldVals=new ArrayList<FieldVals>();
+ Analyzer analyzer;
+
+ ScoreTermQueue q;
+ int MAX_VARIANTS_PER_TERM=50;
+ boolean ignoreTF=false;
+ private int maxNumTerms;
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result + ((analyzer == null) ? 0 : analyzer.hashCode());
+ result = prime * result
+ + ((fieldVals == null) ? 0 : fieldVals.hashCode());
+ result = prime * result + (ignoreTF ? 1231 : 1237);
+ result = prime * result + maxNumTerms;
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ FuzzyLikeThisQuery other = (FuzzyLikeThisQuery) obj;
+ if (analyzer == null) {
+ if (other.analyzer != null)
+ return false;
+ } else if (!analyzer.equals(other.analyzer))
+ return false;
+ if (fieldVals == null) {
+ if (other.fieldVals != null)
+ return false;
+ } else if (!fieldVals.equals(other.fieldVals))
+ return false;
+ if (ignoreTF != other.ignoreTF)
+ return false;
+ if (maxNumTerms != other.maxNumTerms)
+ return false;
+ return true;
+ }
+
+
+ /**
+ *
+ * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
+ * @param analyzer
+ */
+ public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
+ {
+ q=new ScoreTermQueue(maxNumTerms);
+ this.analyzer=analyzer;
+ this.maxNumTerms = maxNumTerms;
+ }
+
+ class FieldVals
+ {
+ String queryString;
+ String fieldName;
+ float minSimilarity;
+ int prefixLength;
+ public FieldVals(String name, float similarity, int length, String queryString)
+ {
+ fieldName = name;
+ minSimilarity = similarity;
+ prefixLength = length;
+ this.queryString = queryString;
+ }
+
+ @Override
+ public int hashCode() {
+ final int prime = 31;
+ int result = 1;
+ result = prime * result
+ + ((fieldName == null) ? 0 : fieldName.hashCode());
+ result = prime * result + Float.floatToIntBits(minSimilarity);
+ result = prime * result + prefixLength;
+ result = prime * result
+ + ((queryString == null) ? 0 : queryString.hashCode());
+ return result;
+ }
+
+ @Override
+ public boolean equals(Object obj) {
+ if (this == obj)
+ return true;
+ if (obj == null)
+ return false;
+ if (getClass() != obj.getClass())
+ return false;
+ FieldVals other = (FieldVals) obj;
+ if (fieldName == null) {
+ if (other.fieldName != null)
+ return false;
+ } else if (!fieldName.equals(other.fieldName))
+ return false;
+ if (Float.floatToIntBits(minSimilarity) != Float
+ .floatToIntBits(other.minSimilarity))
+ return false;
+ if (prefixLength != other.prefixLength)
+ return false;
+ if (queryString == null) {
+ if (other.queryString != null)
+ return false;
+ } else if (!queryString.equals(other.queryString))
+ return false;
+ return true;
+ }
+
+
+
+ }
+
+ /**
+ * Adds user input for "fuzzification"
+ * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
+ * @param fieldName
+ * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
+ * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
+ */
+ public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength)
+ {
+ fieldVals.add(new FieldVals(fieldName,minSimilarity,prefixLength,queryString));
+ }
+
+
+ private void addTerms(IndexReader reader,FieldVals f) throws IOException
+ {
+ if(f.queryString==null) return;
+ TokenStream ts=analyzer.reusableTokenStream(f.fieldName,new StringReader(f.queryString));
+ CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
+
+ int corpusNumDocs=reader.numDocs();
+ Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
+ HashSet<String> processedTerms=new HashSet<String>();
+ ts.reset();
+ while (ts.incrementToken())
+ {
+ String term = termAtt.toString();
+ if(!processedTerms.contains(term))
+ {
+ processedTerms.add(term);
+ ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
+ float minScore=0;
+ Term startTerm=internSavingTemplateTerm.createTerm(term);
+ FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
+ TermEnum origEnum = reader.terms(startTerm);
+ int df=0;
+ if(startTerm.equals(origEnum.term()))
+ {
+ df=origEnum.docFreq(); //store the df so all variants use same idf
+ }
+ int numVariants=0;
+ int totalVariantDocFreqs=0;
+ do
+ {
+ Term possibleMatch=fe.term();
+ if(possibleMatch!=null)
+ {
+ numVariants++;
+ totalVariantDocFreqs+=fe.docFreq();
+ float score=fe.difference();
+ if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
+ ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm);
+ variantsQ.insertWithOverflow(st);
+ minScore = variantsQ.top().score; // maintain minScore
+ }
+ }
+ }
+ while(fe.next());
+ if(numVariants>0)
+ {
+ int avgDf=totalVariantDocFreqs/numVariants;
+ if(df==0)//no direct match we can use as df for all variants
+ {
+ df=avgDf; //use avg df of all variants
+ }
+
+ // take the top variants (scored by edit distance) and reset the score
+ // to include an IDF factor then add to the global queue for ranking
+ // overall top query terms
+ int size = variantsQ.size();
+ for(int i = 0; i < size; i++)
+ {
+ ScoreTerm st = variantsQ.pop();
+ st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
+ q.insertWithOverflow(st);
+ }
+ }
+ }
+ }
+ ts.end();
+ ts.close();
+ }
+
+ @Override
+ public Query rewrite(IndexReader reader) throws IOException
+ {
+ if(rewrittenQuery!=null)
+ {
+ return rewrittenQuery;
+ }
+ //load up the list of possible terms
+ for (Iterator<FieldVals> iter = fieldVals.iterator(); iter.hasNext();)
+ {
+ FieldVals f = iter.next();
+ addTerms(reader,f);
+ }
+ //clear the list of fields
+ fieldVals.clear();
+
+ BooleanQuery bq=new BooleanQuery();
+
+
+ //create BooleanQueries to hold the variants for each token/field pair and ensure it
+ // has no coord factor
+ //Step 1: sort the termqueries by term/field
+ HashMap<Term,ArrayList<ScoreTerm>> variantQueries=new HashMap<Term,ArrayList<ScoreTerm>>();
+ int size = q.size();
+ for(int i = 0; i < size; i++)
+ {
+ ScoreTerm st = q.pop();
+ ArrayList<ScoreTerm> l= variantQueries.get(st.fuzziedSourceTerm);
+ if(l==null)
+ {
+ l=new ArrayList<ScoreTerm>();
+ variantQueries.put(st.fuzziedSourceTerm,l);
+ }
+ l.add(st);
+ }
+ //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
+ for (Iterator<ArrayList<ScoreTerm>> iter = variantQueries.values().iterator(); iter.hasNext();)
+ {
+ ArrayList<ScoreTerm> variants = iter.next();
+ if(variants.size()==1)
+ {
+ //optimize where only one selected variant
+ ScoreTerm st= variants.get(0);
+ TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);
+ tq.setBoost(st.score); // set the boost to a mix of IDF and score
+ bq.add(tq, BooleanClause.Occur.SHOULD);
+ }
+ else
+ {
+ BooleanQuery termVariants=new BooleanQuery(true); //disable coord and IDF for these term variants
+ for (Iterator<ScoreTerm> iterator2 = variants.iterator(); iterator2
+ .hasNext();)
+ {
+ ScoreTerm st = iterator2.next();
+ TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF); // found a match
+ tq.setBoost(st.score); // set the boost using the ScoreTerm's score
+ termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query
+ }
+ bq.add(termVariants, BooleanClause.Occur.SHOULD); // add to query
+ }
+ }
+ //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
+ // booleans with a minimum-should-match of NumFields-1?
+ bq.setBoost(getBoost());
+ this.rewrittenQuery=bq;
+ return bq;
+ }
+
+ //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
+ // term variants) then is reset with IDF for use in ranking against all other
+ // terms/fields
+ private static class ScoreTerm{
+ public Term term;
+ public float score;
+ Term fuzziedSourceTerm;
+
+ public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
+ this.term = term;
+ this.score = score;
+ this.fuzziedSourceTerm=fuzziedSourceTerm;
+ }
+ }
+
+ private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {
+ public ScoreTermQueue(int size){
+ initialize(size);
+ }
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
+ */
+ @Override
+ protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
+ if (termA.score== termB.score)
+ return termA.term.compareTo(termB.term) > 0;
+ else
+ return termA.score < termB.score;
+ }
+
+ }
+
+ //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
+ private static class FuzzyTermQuery extends TermQuery
+ {
+ boolean ignoreTF;
+ public FuzzyTermQuery(Term t, boolean ignoreTF)
+ {
+ super(t);
+ this.ignoreTF=ignoreTF;
+ }
+ @Override
+ public Similarity getSimilarity(Searcher searcher)
+ {
+ Similarity result = super.getSimilarity(searcher);
+ result = new SimilarityDelegator(result) {
+
+ @Override
+ public float tf(float freq)
+ {
+ if(ignoreTF)
+ {
+ return 1; //ignore tf
+ }
+ return super.tf(freq);
+ }
+ @Override
+ public float idf(int docFreq, int numDocs)
+ {
+ //IDF is already factored into individual term boosts
+ return 1;
+ }
+ };
+ return result;
+ }
+ }
+
+
+
+ /* (non-Javadoc)
+ * @see org.apache.lucene.search.Query#toString(java.lang.String)
+ */
+ @Override
+ public String toString(String field)
+ {
+ return null;
+ }
+
+
+ public boolean isIgnoreTF()
+ {
+ return ignoreTF;
+ }
+
+
+ public void setIgnoreTF(boolean ignoreTF)
+ {
+ this.ignoreTF = ignoreTF;
+ }
+
+}