lucene-java-3.5.0/lucene/contrib/queries/src/java/org/apache/lucene/search/FuzzyLikeThisQuery.java

   1 package org.apache.lucene.search;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.StringReader;
  22 import java.util.ArrayList;
  23 import java.util.HashMap;
  24 import java.util.HashSet;
  25 import java.util.Iterator;
  26
  27 import org.apache.lucene.analysis.Analyzer;
  28 import org.apache.lucene.analysis.TokenStream;
  29 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  30 import org.apache.lucene.index.IndexReader;
  31 import org.apache.lucene.index.Term;
  32 import org.apache.lucene.index.TermEnum;
  33 import org.apache.lucene.util.PriorityQueue;
  34
  35 /**
  36  * Fuzzifies ALL terms provided as strings and then picks the best n differentiating terms.
  37  * In effect this mixes the behaviour of FuzzyQuery and MoreLikeThis but with special consideration
  38  * of fuzzy scoring factors.
  39  * This generally produces good results for queries where users may provide details in a number of
  40  * fields and have no knowledge of boolean query syntax and also want a degree of fuzzy matching and
  41  * a fast query.
  42  *
  43  * For each source term the fuzzy variants are held in a BooleanQuery with no coord factor (because
  44  * we are not looking for matches on multiple variants in any one doc). Additionally, a specialized
  45  * TermQuery is used for variants and does not use that variant term's IDF because this would favour rarer
  46  * terms eg misspellings. Instead, all variants use the same IDF ranking (the one for the source query
  47  * term) and this is factored into the variant's boost. If the source query term does not exist in the
  48  * index the average IDF of the variants is used.
  49  */
  50 public class FuzzyLikeThisQuery extends Query
  51 {
  52     static Similarity sim=new DefaultSimilarity();
  53     Query rewrittenQuery=null;
  54     ArrayList<FieldVals> fieldVals=new ArrayList<FieldVals>();
  55     Analyzer analyzer;
  56
  57     ScoreTermQueue q;
  58     int MAX_VARIANTS_PER_TERM=50;
  59     boolean ignoreTF=false;
  60     private int maxNumTerms;
  61
  62     @Override
  63     public int hashCode() {
  64       final int prime = 31;
  65       int result = 1;
  66       result = prime * result + ((analyzer == null) ? 0 : analyzer.hashCode());
  67       result = prime * result
  68           + ((fieldVals == null) ? 0 : fieldVals.hashCode());
  69       result = prime * result + (ignoreTF ? 1231 : 1237);
  70       result = prime * result + maxNumTerms;
  71       return result;
  72     }
  73
  74     @Override
  75     public boolean equals(Object obj) {
  76       if (this == obj)
  77         return true;
  78       if (obj == null)
  79         return false;
  80       if (getClass() != obj.getClass())
  81         return false;
  82       FuzzyLikeThisQuery other = (FuzzyLikeThisQuery) obj;
  83       if (analyzer == null) {
  84         if (other.analyzer != null)
  85           return false;
  86       } else if (!analyzer.equals(other.analyzer))
  87         return false;
  88       if (fieldVals == null) {
  89         if (other.fieldVals != null)
  90           return false;
  91       } else if (!fieldVals.equals(other.fieldVals))
  92         return false;
  93       if (ignoreTF != other.ignoreTF)
  94         return false;
  95       if (maxNumTerms != other.maxNumTerms)
  96         return false;
  97       return true;
  98     }
  99
 100
 101     /**
 102      *
 103      * @param maxNumTerms The total number of terms clauses that will appear once rewritten as a BooleanQuery
 104      * @param analyzer
 105      */
 106     public FuzzyLikeThisQuery(int maxNumTerms, Analyzer analyzer)
 107     {
 108         q=new ScoreTermQueue(maxNumTerms);
 109         this.analyzer=analyzer;
 110         this.maxNumTerms = maxNumTerms;
 111     }
 112
 113     class FieldVals
 114     {
 115         String queryString;
 116         String fieldName;
 117         float minSimilarity;
 118         int prefixLength;
 119                 public FieldVals(String name, float similarity, int length, String queryString)
 120                 {
 121                         fieldName = name;
 122                         minSimilarity = similarity;
 123                         prefixLength = length;
 124                         this.queryString = queryString;
 125                 }
 126
 127     @Override
 128     public int hashCode() {
 129       final int prime = 31;
 130       int result = 1;
 131       result = prime * result
 132           + ((fieldName == null) ? 0 : fieldName.hashCode());
 133       result = prime * result + Float.floatToIntBits(minSimilarity);
 134       result = prime * result + prefixLength;
 135       result = prime * result
 136           + ((queryString == null) ? 0 : queryString.hashCode());
 137       return result;
 138     }
 139
 140     @Override
 141     public boolean equals(Object obj) {
 142       if (this == obj)
 143         return true;
 144       if (obj == null)
 145         return false;
 146       if (getClass() != obj.getClass())
 147         return false;
 148       FieldVals other = (FieldVals) obj;
 149       if (fieldName == null) {
 150         if (other.fieldName != null)
 151           return false;
 152       } else if (!fieldName.equals(other.fieldName))
 153         return false;
 154       if (Float.floatToIntBits(minSimilarity) != Float
 155           .floatToIntBits(other.minSimilarity))
 156         return false;
 157       if (prefixLength != other.prefixLength)
 158         return false;
 159       if (queryString == null) {
 160         if (other.queryString != null)
 161           return false;
 162       } else if (!queryString.equals(other.queryString))
 163         return false;
 164       return true;
 165     }
 166
 167
 168
 169     }
 170
 171     /**
 172      * Adds user input for "fuzzification"
 173      * @param queryString The string which will be parsed by the analyzer and for which fuzzy variants will be parsed
 174      * @param fieldName
 175      * @param minSimilarity The minimum similarity of the term variants (see FuzzyTermEnum)
 176      * @param prefixLength Length of required common prefix on variant terms (see FuzzyTermEnum)
 177      */
 178     public void addTerms(String queryString, String fieldName,float minSimilarity, int prefixLength)
 179     {
 180         fieldVals.add(new FieldVals(fieldName,minSimilarity,prefixLength,queryString));
 181     }
 182
 183
 184     private void addTerms(IndexReader reader,FieldVals f) throws IOException
 185     {
 186         if(f.queryString==null) return;
 187         TokenStream ts=analyzer.reusableTokenStream(f.fieldName,new StringReader(f.queryString));
 188         CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
 189
 190         int corpusNumDocs=reader.numDocs();
 191         Term internSavingTemplateTerm =new Term(f.fieldName); //optimization to avoid constructing new Term() objects
 192         HashSet<String> processedTerms=new HashSet<String>();
 193         ts.reset();
 194         while (ts.incrementToken())
 195         {
 196                 String term = termAtt.toString();
 197                 if(!processedTerms.contains(term))
 198                 {
 199                         processedTerms.add(term);
 200                 ScoreTermQueue variantsQ=new ScoreTermQueue(MAX_VARIANTS_PER_TERM); //maxNum variants considered for any one term
 201                 float minScore=0;
 202                 Term startTerm=internSavingTemplateTerm.createTerm(term);
 203                 FuzzyTermEnum fe=new FuzzyTermEnum(reader,startTerm,f.minSimilarity,f.prefixLength);
 204                 TermEnum origEnum = reader.terms(startTerm);
 205                 int df=0;
 206                 if(startTerm.equals(origEnum.term()))
 207                 {
 208                     df=origEnum.docFreq(); //store the df so all variants use same idf
 209                 }
 210                 int numVariants=0;
 211                 int totalVariantDocFreqs=0;
 212                 do
 213                 {
 214                     Term possibleMatch=fe.term();
 215                     if(possibleMatch!=null)
 216                     {
 217                         numVariants++;
 218                         totalVariantDocFreqs+=fe.docFreq();
 219                         float score=fe.difference();
 220                         if(variantsQ.size() < MAX_VARIANTS_PER_TERM || score > minScore){
 221                             ScoreTerm st=new ScoreTerm(possibleMatch,score,startTerm);
 222                             variantsQ.insertWithOverflow(st);
 223                             minScore = variantsQ.top().score; // maintain minScore
 224                         }
 225                     }
 226                 }
 227                 while(fe.next());
 228                 if(numVariants>0)
 229                 {
 230                         int avgDf=totalVariantDocFreqs/numVariants;
 231                         if(df==0)//no direct match we can use as df for all variants
 232                         {
 233                             df=avgDf; //use avg df of all variants
 234                         }
 235
 236                         // take the top variants (scored by edit distance) and reset the score
 237                         // to include an IDF factor then add to the global queue for ranking
 238                         // overall top query terms
 239                         int size = variantsQ.size();
 240                         for(int i = 0; i < size; i++)
 241                         {
 242                           ScoreTerm st = variantsQ.pop();
 243                           st.score=(st.score*st.score)*sim.idf(df,corpusNumDocs);
 244                           q.insertWithOverflow(st);
 245                         }
 246                 }
 247                 }
 248         }
 249         ts.end();
 250         ts.close();
 251     }
 252
 253     @Override
 254     public Query rewrite(IndexReader reader) throws IOException
 255     {
 256         if(rewrittenQuery!=null)
 257         {
 258             return rewrittenQuery;
 259         }
 260         //load up the list of possible terms
 261         for (Iterator<FieldVals> iter = fieldVals.iterator(); iter.hasNext();)
 262                 {
 263                         FieldVals f = iter.next();
 264                         addTerms(reader,f);
 265                 }
 266         //clear the list of fields
 267         fieldVals.clear();
 268
 269         BooleanQuery bq=new BooleanQuery();
 270
 271
 272         //create BooleanQueries to hold the variants for each token/field pair and ensure it
 273         // has no coord factor
 274         //Step 1: sort the termqueries by term/field
 275         HashMap<Term,ArrayList<ScoreTerm>> variantQueries=new HashMap<Term,ArrayList<ScoreTerm>>();
 276         int size = q.size();
 277         for(int i = 0; i < size; i++)
 278         {
 279           ScoreTerm st = q.pop();
 280           ArrayList<ScoreTerm> l= variantQueries.get(st.fuzziedSourceTerm);
 281           if(l==null)
 282           {
 283               l=new ArrayList<ScoreTerm>();
 284               variantQueries.put(st.fuzziedSourceTerm,l);
 285           }
 286           l.add(st);
 287         }
 288         //Step 2: Organize the sorted termqueries into zero-coord scoring boolean queries
 289         for (Iterator<ArrayList<ScoreTerm>> iter = variantQueries.values().iterator(); iter.hasNext();)
 290         {
 291             ArrayList<ScoreTerm> variants = iter.next();
 292             if(variants.size()==1)
 293             {
 294                 //optimize where only one selected variant
 295                 ScoreTerm st= variants.get(0);
 296                 TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);
 297                 tq.setBoost(st.score); // set the boost to a mix of IDF and score
 298                 bq.add(tq, BooleanClause.Occur.SHOULD);
 299             }
 300             else
 301             {
 302                 BooleanQuery termVariants=new BooleanQuery(true); //disable coord and IDF for these term variants
 303                 for (Iterator<ScoreTerm> iterator2 = variants.iterator(); iterator2
 304                         .hasNext();)
 305                 {
 306                     ScoreTerm st = iterator2.next();
 307                     TermQuery tq = new FuzzyTermQuery(st.term,ignoreTF);      // found a match
 308                     tq.setBoost(st.score); // set the boost using the ScoreTerm's score
 309                     termVariants.add(tq, BooleanClause.Occur.SHOULD);          // add to query
 310                 }
 311                 bq.add(termVariants, BooleanClause.Occur.SHOULD);          // add to query
 312             }
 313         }
 314         //TODO possible alternative step 3 - organize above booleans into a new layer of field-based
 315         // booleans with a minimum-should-match of NumFields-1?
 316         bq.setBoost(getBoost());
 317         this.rewrittenQuery=bq;
 318         return bq;
 319     }
 320
 321     //Holds info for a fuzzy term variant - initially score is set to edit distance (for ranking best
 322     // term variants) then is reset with IDF for use in ranking against all other
 323     // terms/fields
 324     private static class ScoreTerm{
 325         public Term term;
 326         public float score;
 327         Term fuzziedSourceTerm;
 328
 329         public ScoreTerm(Term term, float score, Term fuzziedSourceTerm){
 330           this.term = term;
 331           this.score = score;
 332           this.fuzziedSourceTerm=fuzziedSourceTerm;
 333         }
 334       }
 335
 336       private static class ScoreTermQueue extends PriorityQueue<ScoreTerm> {
 337         public ScoreTermQueue(int size){
 338           initialize(size);
 339         }
 340
 341         /* (non-Javadoc)
 342          * @see org.apache.lucene.util.PriorityQueue#lessThan(java.lang.Object, java.lang.Object)
 343          */
 344         @Override
 345         protected boolean lessThan(ScoreTerm termA, ScoreTerm termB) {
 346           if (termA.score== termB.score)
 347             return termA.term.compareTo(termB.term) > 0;
 348           else
 349             return termA.score < termB.score;
 350         }
 351
 352       }
 353
 354       //overrides basic TermQuery to negate effects of IDF (idf is factored into boost of containing BooleanQuery)
 355       private static class FuzzyTermQuery extends TermQuery
 356       {
 357           boolean ignoreTF;
 358           public FuzzyTermQuery(Term t, boolean ignoreTF)
 359           {
 360                   super(t);
 361                   this.ignoreTF=ignoreTF;
 362           }
 363           @Override
 364           public Similarity getSimilarity(Searcher searcher)
 365           {
 366               Similarity result = super.getSimilarity(searcher);
 367               result = new SimilarityDelegator(result) {
 368
 369                   @Override
 370                   public float tf(float freq)
 371                   {
 372                           if(ignoreTF)
 373                           {
 374                           return 1; //ignore tf
 375                           }
 376                           return super.tf(freq);
 377                   }
 378                   @Override
 379                   public float idf(int docFreq, int numDocs)
 380                   {
 381                       //IDF is already factored into individual term boosts
 382                       return 1;
 383                   }
 384               };
 385               return result;
 386           }
 387       }
 388
 389
 390
 391     /* (non-Javadoc)
 392      * @see org.apache.lucene.search.Query#toString(java.lang.String)
 393      */
 394     @Override
 395     public String toString(String field)
 396     {
 397         return null;
 398     }
 399
 400
 401         public boolean isIgnoreTF()
 402         {
 403                 return ignoreTF;
 404         }
 405
 406
 407         public void setIgnoreTF(boolean ignoreTF)
 408         {
 409                 this.ignoreTF = ignoreTF;
 410         }
 411
 412 }