lucene-java-3.4.0/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.misc;
  19
  20 import org.apache.lucene.search.DefaultSimilarity;
  21 import org.apache.lucene.index.FieldInvertState;
  22
  23 import java.util.Map;
  24 import java.util.HashMap;
  25
  26 /**
  27  * A similarity with a lengthNorm that provides for a "plateau" of
  28  * equally good lengths, and tf helper functions.
  29  *
  30  * <p>
  31  * For lengthNorm, A global min/max can be specified to define the
  32  * plateau of lengths that should all have a norm of 1.0.
  33  * Below the min, and above the max the lengthNorm drops off in a
  34  * sqrt function.
  35  * </p>
  36  * <p>
  37  * A per field min/max can be specified if different fields have
  38  * different sweet spots.
  39  * </p>
  40  *
  41  * <p>
  42  * For tf, baselineTf and hyperbolicTf functions are provided, which
  43  * subclasses can choose between.
  44  * </p>
  45  *
  46  */
  47 public class SweetSpotSimilarity extends DefaultSimilarity {
  48
  49   private int ln_min = 1;
  50   private int ln_max = 1;
  51   private float ln_steep = 0.5f;
  52
  53   private Map<String,Number> ln_maxs = new HashMap<String,Number>(7);
  54   private Map<String,Number> ln_mins = new HashMap<String,Number>(7);
  55   private Map<String,Float> ln_steeps = new HashMap<String,Float>(7);
  56   private Map<String,Boolean> ln_overlaps = new HashMap<String,Boolean>(7);
  57
  58   private float tf_base = 0.0f;
  59   private float tf_min = 0.0f;
  60
  61   private float tf_hyper_min = 0.0f;
  62   private float tf_hyper_max = 2.0f;
  63   private double tf_hyper_base = 1.3d;
  64   private float tf_hyper_xoffset = 10.0f;
  65
  66   public SweetSpotSimilarity() {
  67     super();
  68   }
  69
  70   /**
  71    * Sets the baseline and minimum function variables for baselineTf
  72    *
  73    * @see #baselineTf
  74    */
  75   public void setBaselineTfFactors(float base, float min) {
  76     tf_min = min;
  77     tf_base = base;
  78   }
  79
  80   /**
  81    * Sets the function variables for the hyperbolicTf functions
  82    *
  83    * @param min the minimum tf value to ever be returned (default: 0.0)
  84    * @param max the maximum tf value to ever be returned (default: 2.0)
  85    * @param base the base value to be used in the exponential for the hyperbolic function (default: e)
  86    * @param xoffset the midpoint of the hyperbolic function (default: 10.0)
  87    * @see #hyperbolicTf
  88    */
  89   public void setHyperbolicTfFactors(float min, float max,
  90                                      double base, float xoffset) {
  91     tf_hyper_min = min;
  92     tf_hyper_max = max;
  93     tf_hyper_base = base;
  94     tf_hyper_xoffset = xoffset;
  95   }
  96
  97   /**
  98    * Sets the default function variables used by lengthNorm when no field
  99    * specific variables have been set.
 100    *
 101    * @see #lengthNorm
 102    */
 103   public void setLengthNormFactors(int min, int max, float steepness) {
 104     this.ln_min = min;
 105     this.ln_max = max;
 106     this.ln_steep = steepness;
 107   }
 108
 109   /**
 110    * Sets the function variables used by lengthNorm for a specific named field.
 111    *
 112    * @param field field name
 113    * @param min minimum value
 114    * @param max maximum value
 115    * @param steepness steepness of the curve
 116    * @param discountOverlaps if true, <code>numOverlapTokens</code> will be
 117    * subtracted from <code>numTokens</code>; if false then
 118    * <code>numOverlapTokens</code> will be assumed to be 0 (see
 119    * {@link DefaultSimilarity#computeNorm(String, FieldInvertState)} for details).
 120    *
 121    * @see #lengthNorm
 122    */
 123   public void setLengthNormFactors(String field, int min, int max,
 124                                    float steepness, boolean discountOverlaps) {
 125     ln_mins.put(field, Integer.valueOf(min));
 126     ln_maxs.put(field, Integer.valueOf(max));
 127     ln_steeps.put(field, Float.valueOf(steepness));
 128     ln_overlaps.put(field, new Boolean(discountOverlaps));
 129   }
 130
 131   /**
 132    * Implemented as <code> state.getBoost() *
 133    * lengthNorm(fieldName, numTokens) </code> where
 134    * numTokens does not count overlap tokens if
 135    * discountOverlaps is true by default or true for this
 136    * specific field. */
 137   @Override
 138   public float computeNorm(String fieldName, FieldInvertState state) {
 139     final int numTokens;
 140     boolean overlaps = discountOverlaps;
 141     if (ln_overlaps.containsKey(fieldName)) {
 142       overlaps = ln_overlaps.get(fieldName).booleanValue();
 143     }
 144     if (overlaps)
 145       numTokens = state.getLength() - state.getNumOverlap();
 146     else
 147       numTokens = state.getLength();
 148
 149     return state.getBoost() * computeLengthNorm(fieldName, numTokens);
 150   }
 151
 152   /**
 153    * Implemented as:
 154    * <code>
 155    * 1/sqrt( steepness * (abs(x-min) + abs(x-max) - (max-min)) + 1 )
 156    * </code>.
 157    *
 158    * <p>
 159    * This degrades to <code>1/sqrt(x)</code> when min and max are both 1 and
 160    * steepness is 0.5
 161    * </p>
 162    *
 163    * <p>
 164    * :TODO: potential optimization is to just flat out return 1.0f if numTerms
 165    * is between min and max.
 166    * </p>
 167    *
 168    * @see #setLengthNormFactors
 169    */
 170   public float computeLengthNorm(String fieldName, int numTerms) {
 171     int l = ln_min;
 172     int h = ln_max;
 173     float s = ln_steep;
 174
 175     if (ln_mins.containsKey(fieldName)) {
 176       l = ln_mins.get(fieldName).intValue();
 177     }
 178     if (ln_maxs.containsKey(fieldName)) {
 179       h = ln_maxs.get(fieldName).intValue();
 180     }
 181     if (ln_steeps.containsKey(fieldName)) {
 182       s = ln_steeps.get(fieldName).floatValue();
 183     }
 184
 185     return (float)
 186       (1.0f /
 187        Math.sqrt
 188        (
 189         (
 190          s *
 191          (float)(Math.abs(numTerms - l) + Math.abs(numTerms - h) - (h-l))
 192          )
 193         + 1.0f
 194         )
 195        );
 196   }
 197
 198   /**
 199    * Delegates to baselineTf
 200    *
 201    * @see #baselineTf
 202    */
 203   @Override
 204   public float tf(int freq) {
 205     return baselineTf(freq);
 206   }
 207
 208   /**
 209    * Implemented as:
 210    * <code>
 211    *  (x &lt;= min) &#63; base : sqrt(x+(base**2)-min)
 212    * </code>
 213    * ...but with a special case check for 0.
 214    * <p>
 215    * This degrates to <code>sqrt(x)</code> when min and base are both 0
 216    * </p>
 217    *
 218    * @see #setBaselineTfFactors
 219    */
 220   public float baselineTf(float freq) {
 221
 222     if (0.0f == freq) return 0.0f;
 223
 224     return (freq <= tf_min)
 225       ? tf_base
 226       : (float)Math.sqrt(freq + (tf_base * tf_base) - tf_min);
 227   }
 228
 229   /**
 230    * Uses a hyperbolic tangent function that allows for a hard max...
 231    *
 232    * <code>
 233    * tf(x)=min+(max-min)/2*(((base**(x-xoffset)-base**-(x-xoffset))/(base**(x-xoffset)+base**-(x-xoffset)))+1)
 234    * </code>
 235    *
 236    * <p>
 237    * This code is provided as a convenience for subclasses that want
 238    * to use a hyperbolic tf function.
 239    * </p>
 240    *
 241    * @see #setHyperbolicTfFactors
 242    */
 243   public float hyperbolicTf(float freq) {
 244     if (0.0f == freq) return 0.0f;
 245
 246     final float min = tf_hyper_min;
 247     final float max = tf_hyper_max;
 248     final double base = tf_hyper_base;
 249     final float xoffset = tf_hyper_xoffset;
 250     final double x = (double)(freq - xoffset);
 251
 252     final float result = min +
 253       (float)(
 254               (max-min) / 2.0f
 255               *
 256               (
 257                ( ( Math.pow(base,x) - Math.pow(base,-x) )
 258                  / ( Math.pow(base,x) + Math.pow(base,-x) )
 259                  )
 260                + 1.0d
 261                )
 262               );
 263
 264     return Float.isNaN(result) ? max : result;
 265
 266   }
 267
 268 }