1 package org.apache.lucene.search;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.StringReader;
22 import java.util.ArrayList;
23 import java.util.HashMap;
24 import java.util.HashSet;
25 import java.util.Iterator;
27 import org.apache.lucene.analysis.Analyzer;
28 import org.apache.lucene.analysis.TokenStream;
29 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
30 import org.apache.lucene.index.IndexReader;
31 import org.apache.lucene.index.Term;
32 import org.apache.lucene.index.TermEnum;
33 import org.apache.lucene.util.PriorityQueue;
36 * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
37 * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
38 * of fuzzy scoring factors.
39 * This generally produces good results for queries where users may provide details in a number of
40 * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
43 * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
44 * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
45 * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
46 * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
47 * term) and this is factored into the variant's boost. If the source query term does not exist in the
48 * index the average IDF of the variants is used.
50 public class FuzzyLikeThisQuery extends Query
52 static Similarity sim=new DefaultSimilarity();
53 Query rewrittenQuery=null;
54 ArrayList<FieldVals> fieldVals=new ArrayList<FieldVals>();
58 int MAX_VARIANTS_PER_TERM=50;
59 boolean ignoreTF=false;
60 private int maxNumTerms;
63 public int hashCode() {
66 result = prime * result + ((analyzer == null) ? 0 : analyzer.hashCode());
67 result = prime * result
68 + ((fieldVals == null) ? 0 : fieldVals.hashCode());
69 result = prime * result + (ignoreTF ? 1231 : 1237);
70 result = prime * result + maxNumTerms;
75 public boolean equals(Object obj) {
80 if (getClass() != obj.getClass())
82 FuzzyLikeThisQuery other = (FuzzyLikeThisQuery) obj;
83 if (analyzer == null) {
84 if (other.analyzer != null)
86 } else if (!analyzer.equals(other.analyzer))
88 if (fieldVals == null) {
89 if (other.fieldVals != null)
91 } else if (!fieldVals.equals(other.fieldVals))
93 if (ignoreTF != other.ignoreTF)
95 if (maxNumTerms != other.maxNumTerms)
103 * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
106 public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
108 q=new ScoreTermQueue(maxNumTerms);
109 this.analyzer=analyzer;
110 this.maxNumTerms = maxNumTerms;
119 public FieldVals(String name, float similarity, int length, String queryString)
122 minSimilarity = similarity;
123 prefixLength = length;
124 this.queryString = queryString;
128 public int hashCode() {
129 final int prime = 31;
131 result = prime * result
132 + ((fieldName == null) ? 0 : fieldName.hashCode());
133 result = prime * result + Float.floatToIntBits(minSimilarity);
134 result = prime * result + prefixLength;
135 result = prime * result
136 + ((queryString == null) ? 0 : queryString.hashCode());
141 public boolean equals(Object obj) {
146 if (getClass() != obj.getClass())
148 FieldVals other = (FieldVals) obj;
149 if (fieldName == null) {
150 if (other.fieldName != null)
152 } else if (!fieldName.equals(other.fieldName))
154 if (Float.floatToIntBits(minSimilarity) != Float
155 .floatToIntBits(other.minSimilarity))
157 if (prefixLength != other.prefixLength)
159 if (queryString == null) {
160 if (other.queryString != null)
162 } else if (!queryString.equals(other.queryString))
172 * Adds user input for "fuzzification"
173 * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
175 * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
176 * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
178 public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength)
180 fieldVals.add(new FieldVals(fieldName,minSimilarity,prefixLength,queryString));
184 private void addTerms(IndexReader reader,FieldVals f) throws IOException
186 if(f.queryString==null) return;
187 TokenStream ts=analyzer.reusableTokenStream(f.fieldName,new StringReader(f.queryString));
188 CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
190 int corpusNumDocs=reader.numDocs();
191 Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
192 HashSet<String> processedTerms=new HashSet<String>();
194 while (ts.incrementToken())
196 String term = termAtt.toString();
197 if(!processedTerms.contains(term))
199 processedTerms.add(term);
200 ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
202 Term startTerm=internSavingTemplateTerm.createTerm(term);
203 FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
204 TermEnum origEnum = reader.terms(startTerm);
206 if(startTerm.equals(origEnum.term()))
208 df=origEnum.docFreq(); //store the df so all variants use same idf
211 int totalVariantDocFreqs=0;
214 Term possibleMatch=fe.term();
215 if(possibleMatch!=null)
218 totalVariantDocFreqs+=fe.docFreq();
219 float score=fe.difference();
220 if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
221 ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm);
222 variantsQ.insertWithOverflow(st);
223 minScore = variantsQ.top().score; // maintain minScore
230 int avgDf=totalVariantDocFreqs/numVariants;
231 if(df==0)//no direct match we can use as df for all variants
233 df=avgDf; //use avg df of all variants
236 // take the top variants (scored by edit distance) and reset the score
237 // to include an IDF factor then add to the global queue for ranking
238 // overall top query terms
239 int size = variantsQ.size();
240 for(int i = 0; i < size; i++)
242 ScoreTerm st = variantsQ.pop();
243 st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
244 q.insertWithOverflow(st);
254 public Query rewrite(IndexReader reader) throws IOException
256 if(rewrittenQuery!=null)
258 return rewrittenQuery;
260 //load up the list of possible terms
261 for (Iterator<FieldVals> iter = fieldVals.iterator(); iter.hasNext();)
263 FieldVals f = iter.next();
266 //clear the list of fields
269 BooleanQuery bq=new BooleanQuery();
272 //create BooleanQueries to hold the variants for each token/field pair and ensure it
273 // has no coord factor
274 //Step 1: sort the termqueries by term/field
275 HashMap<Term,ArrayList<ScoreTerm>> variantQueries=new HashMap<Term,ArrayList<ScoreTerm>>();
277 for(int i = 0; i < size; i++)
279 ScoreTerm st = q.pop();
280 ArrayList<ScoreTerm> l= variantQueries.get(st.fuzziedSourceTerm);
283 l=new ArrayList<ScoreTerm>();
284 variantQueries.put(st.fuzziedSourceTerm,l);
288 //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
289 for (Iterator<ArrayList<ScoreTerm>> iter = variantQueries.values().iterator(); iter.hasNext();)
291 ArrayList<ScoreTerm> variants = iter.next();
292 if(variants.size()==1)
294 //optimize where only one selected variant
295 ScoreTerm st= variants.get(0);
296 TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);
297 tq.setBoost(st.score); // set the boost to a mix of IDF and score
298 bq.add(tq, BooleanClause.Occur.SHOULD);
302 BooleanQuery termVariants=new BooleanQuery(true); //disable coord and IDF for these term variants
303 for (Iterator<ScoreTerm> iterator2 = variants.iterator(); iterator2
306 ScoreTerm st = iterator2.next();
307 TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF); // found a match
308 tq.setBoost(st.score); // set the boost using the ScoreTerm's score
309 termVariants.add(tq, BooleanClause.Occur.SHOULD); // add to query
311 bq.add(termVariants, BooleanClause.Occur.SHOULD); // add to query
314 //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
315 // booleans with a minimum-should-match of NumFields-1?
316 bq.setBoost(getBoost());
317 this.rewrittenQuery=bq;
321 //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
322 // term variants) then is reset with IDF for use in ranking against all other
324 private static class ScoreTerm{
327 Term fuzziedSourceTerm;
329 public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
332 this.fuzziedSourceTerm=fuzziedSourceTerm;
336 private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {
337 public ScoreTermQueue(int size){
342 * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
345 protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
346 if (termA.score== termB.score)
347 return termA.term.compareTo(termB.term) > 0;
349 return termA.score < termB.score;
354 //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
355 private static class FuzzyTermQuery extends TermQuery
358 public FuzzyTermQuery(Term t, boolean ignoreTF)
361 this.ignoreTF=ignoreTF;
364 public Similarity getSimilarity(Searcher searcher)
366 Similarity result = super.getSimilarity(searcher);
367 result = new SimilarityDelegator(result) {
370 public float tf(float freq)
374 return 1; //ignore tf
376 return super.tf(freq);
379 public float idf(int docFreq, int numDocs)
381 //IDF is already factored into individual term boosts
392 * @see org.apache.lucene.search.Query#toString(java.lang.String)
395 public String toString(String field)
401 public boolean isIgnoreTF()
407 public void setIgnoreTF(boolean ignoreTF)
409 this.ignoreTF = ignoreTF;