pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / misc / src / java / org / apache / lucene / misc / SweetSpotSimilarity.java
diff --git a/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java b/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java

new file mode 100644 (file)

index 0000000..cda2f07
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/misc/src/java/org/apache/lucene/misc/SweetSpotSimilarity.java
@@ -0,0 +1,268 @@
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.misc;
+
+import org.apache.lucene.search.DefaultSimilarity;
+import org.apache.lucene.index.FieldInvertState;
+
+import java.util.Map;
+import java.util.HashMap;
+
+/**
+ * A similarity with a lengthNorm that provides for a "plateau" of
+ * equally good lengths, and tf helper functions.
+ *
+ * <p>
+ * For lengthNorm, A global min/max can be specified to define the
+ * plateau of lengths that should all have a norm of 1.0.
+ * Below the min, and above the max the lengthNorm drops off in a
+ * sqrt function.
+ * </p>
+ * <p>
+ * A per field min/max can be specified if different fields have
+ * different sweet spots.
+ * </p>
+ *
+ * <p>
+ * For tf, baselineTf and hyperbolicTf functions are provided, which
+ * subclasses can choose between.
+ * </p>
+ *
+ */
+public class SweetSpotSimilarity extends DefaultSimilarity {
+
+  private int ln_min = 1;
+  private int ln_max = 1;
+  private float ln_steep = 0.5f;
+
+  private Map<String,Number> ln_maxs = new HashMap<String,Number>(7);
+  private Map<String,Number> ln_mins = new HashMap<String,Number>(7);
+  private Map<String,Float> ln_steeps = new HashMap<String,Float>(7);
+  private Map<String,Boolean> ln_overlaps = new HashMap<String,Boolean>(7);
+
+  private float tf_base = 0.0f;
+  private float tf_min = 0.0f;
+
+  private float tf_hyper_min = 0.0f;
+  private float tf_hyper_max = 2.0f;
+  private double tf_hyper_base = 1.3d;
+  private float tf_hyper_xoffset = 10.0f;
+    
+  public SweetSpotSimilarity() {
+    super();
+  }
+
+  /**
+   * Sets the baseline and minimum function variables for baselineTf
+   *
+   * @see #baselineTf
+   */
+  public void setBaselineTfFactors(float base, float min) {
+    tf_min = min;
+    tf_base = base;
+  }
+  
+  /**
+   * Sets the function variables for the hyperbolicTf functions
+   *
+   * @param min the minimum tf value to ever be returned (default: 0.0)
+   * @param max the maximum tf value to ever be returned (default: 2.0)
+   * @param base the base value to be used in the exponential for the hyperbolic function (default: e)
+   * @param xoffset the midpoint of the hyperbolic function (default: 10.0)
+   * @see #hyperbolicTf
+   */
+  public void setHyperbolicTfFactors(float min, float max,
+                                     double base, float xoffset) {
+    tf_hyper_min = min;
+    tf_hyper_max = max;
+    tf_hyper_base = base;
+    tf_hyper_xoffset = xoffset;
+  }
+    
+  /**
+   * Sets the default function variables used by lengthNorm when no field
+   * specific variables have been set.
+   *
+   * @see #lengthNorm
+   */
+  public void setLengthNormFactors(int min, int max, float steepness) {
+    this.ln_min = min;
+    this.ln_max = max;
+    this.ln_steep = steepness;
+  }
+
+  /**
+   * Sets the function variables used by lengthNorm for a specific named field.
+   * 
+   * @param field field name
+   * @param min minimum value
+   * @param max maximum value
+   * @param steepness steepness of the curve
+   * @param discountOverlaps if true, <code>numOverlapTokens</code> will be
+   * subtracted from <code>numTokens</code>; if false then
+   * <code>numOverlapTokens</code> will be assumed to be 0 (see
+   * {@link DefaultSimilarity#computeNorm(String, FieldInvertState)} for details).
+   *
+   * @see #lengthNorm
+   */
+  public void setLengthNormFactors(String field, int min, int max,
+                                   float steepness, boolean discountOverlaps) {
+    ln_mins.put(field, Integer.valueOf(min));
+    ln_maxs.put(field, Integer.valueOf(max));
+    ln_steeps.put(field, Float.valueOf(steepness));
+    ln_overlaps.put(field, new Boolean(discountOverlaps));
+  }
+    
+  /**
+   * Implemented as <code> state.getBoost() *
+   * lengthNorm(fieldName, numTokens) </code> where
+   * numTokens does not count overlap tokens if
+   * discountOverlaps is true by default or true for this
+   * specific field. */
+  @Override
+  public float computeNorm(String fieldName, FieldInvertState state) {
+    final int numTokens;
+    boolean overlaps = discountOverlaps;
+    if (ln_overlaps.containsKey(fieldName)) {
+      overlaps = ln_overlaps.get(fieldName).booleanValue();
+    }
+    if (overlaps)
+      numTokens = state.getLength() - state.getNumOverlap();
+    else
+      numTokens = state.getLength();
+
+    return state.getBoost() * computeLengthNorm(fieldName, numTokens);
+  }
+
+  /**
+   * Implemented as:
+   * <code>
+   * 1/sqrt( steepness * (abs(x-min) + abs(x-max) - (max-min)) + 1 )
+   * </code>.
+   *
+   * <p>
+   * This degrades to <code>1/sqrt(x)</code> when min and max are both 1 and
+   * steepness is 0.5
+   * </p>
+   *
+   * <p>
+   * :TODO: potential optimization is to just flat out return 1.0f if numTerms
+   * is between min and max.
+   * </p>
+   *
+   * @see #setLengthNormFactors
+   */
+  public float computeLengthNorm(String fieldName, int numTerms) {
+    int l = ln_min;
+    int h = ln_max;
+    float s = ln_steep;
+  
+    if (ln_mins.containsKey(fieldName)) {
+      l = ln_mins.get(fieldName).intValue();
+    }
+    if (ln_maxs.containsKey(fieldName)) {
+      h = ln_maxs.get(fieldName).intValue();
+    }
+    if (ln_steeps.containsKey(fieldName)) {
+      s = ln_steeps.get(fieldName).floatValue();
+    }
+  
+    return (float)
+      (1.0f /
+       Math.sqrt
+       (
+        (
+         s *
+         (float)(Math.abs(numTerms - l) + Math.abs(numTerms - h) - (h-l))
+         )
+        + 1.0f
+        )
+       );
+  }
+
+  /**
+   * Delegates to baselineTf
+   *
+   * @see #baselineTf
+   */
+  @Override
+  public float tf(int freq) {
+    return baselineTf(freq);
+  }
+  
+  /**
+   * Implemented as:
+   * <code>
+   *  (x &lt;= min) &#63; base : sqrt(x+(base**2)-min)
+   * </code>
+   * ...but with a special case check for 0.
+   * <p>
+   * This degrates to <code>sqrt(x)</code> when min and base are both 0
+   * </p>
+   *
+   * @see #setBaselineTfFactors
+   */
+  public float baselineTf(float freq) {
+
+    if (0.0f == freq) return 0.0f;
+  
+    return (freq <= tf_min)
+      ? tf_base
+      : (float)Math.sqrt(freq + (tf_base * tf_base) - tf_min);
+  }
+
+  /**
+   * Uses a hyperbolic tangent function that allows for a hard max...
+   *
+   * <code>
+   * tf(x)=min+(max-min)/2*(((base**(x-xoffset)-base**-(x-xoffset))/(base**(x-xoffset)+base**-(x-xoffset)))+1)
+   * </code>
+   *
+   * <p>
+   * This code is provided as a convenience for subclasses that want
+   * to use a hyperbolic tf function.
+   * </p>
+   *
+   * @see #setHyperbolicTfFactors
+   */
+  public float hyperbolicTf(float freq) {
+    if (0.0f == freq) return 0.0f;
+
+    final float min = tf_hyper_min;
+    final float max = tf_hyper_max;
+    final double base = tf_hyper_base;
+    final float xoffset = tf_hyper_xoffset;
+    final double x = (double)(freq - xoffset);
+  
+    final float result = min +
+      (float)(
+              (max-min) / 2.0f
+              *
+              (
+               ( ( Math.pow(base,x) - Math.pow(base,-x) )
+                 / ( Math.pow(base,x) + Math.pow(base,-x) )
+                 )
+               + 1.0d
+               )
+              );
+
+    return Float.isNaN(result) ? max : result;
+    
+  }
+
+}