lucene-java-3.5.0/lucene/src/java/org/apache/lucene/util/NumericUtils.java

   1 package org.apache.lucene.util;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.NumericTokenStream; // for javadocs
  21 import org.apache.lucene.document.NumericField; // for javadocs
  22 import org.apache.lucene.search.NumericRangeQuery; // for javadocs
  23 import org.apache.lucene.search.NumericRangeFilter; // for javadocs
  24
  25 /**
  26  * This is a helper class to generate prefix-encoded representations for numerical values
  27  * and supplies converters to represent float/double values as sortable integers/longs.
  28  *
  29  * <p>To quickly execute range queries in Apache Lucene, a range is divided recursively
  30  * into multiple intervals for searching: The center of the range is searched only with
  31  * the lowest possible precision in the trie, while the boundaries are matched
  32  * more exactly. This reduces the number of terms dramatically.
  33  *
  34  * <p>This class generates terms to achieve this: First the numerical integer values need to
  35  * be converted to strings. For that integer values (32 bit or 64 bit) are made unsigned
  36  * and the bits are converted to ASCII chars with each 7 bit. The resulting string is
  37  * sortable like the original integer value. Each value is also prefixed
  38  * (in the first char) by the <code>shift</code> value (number of bits removed) used
  39  * during encoding.
  40  *
  41  * <p>To also index floating point numbers, this class supplies two methods to convert them
  42  * to integer values by changing their bit layout: {@link #doubleToSortableLong},
  43  * {@link #floatToSortableInt}. You will have no precision loss by
  44  * converting floating point numbers to integers and back (only that the integer form
  45  * is not usable). Other data types like dates can easily converted to longs or ints (e.g.
  46  * date to long: {@link java.util.Date#getTime}).
  47  *
  48  * <p>For easy usage, the trie algorithm is implemented for indexing inside
  49  * {@link NumericTokenStream} that can index <code>int</code>, <code>long</code>,
  50  * <code>float</code>, and <code>double</code>. For querying,
  51  * {@link NumericRangeQuery} and {@link NumericRangeFilter} implement the query part
  52  * for the same data types.
  53  *
  54  * <p>This class can also be used, to generate lexicographically sortable (according
  55  * {@link String#compareTo(String)}) representations of numeric data types for other
  56  * usages (e.g. sorting).
  57  *
  58  * @lucene.internal
  59  *
  60  * @since 2.9
  61  */
  62 public final class NumericUtils {
  63
  64   private NumericUtils() {} // no instance!
  65
  66   /**
  67    * The default precision step used by {@link NumericField}, {@link NumericTokenStream},
  68    * {@link NumericRangeQuery}, and {@link NumericRangeFilter} as default
  69    */
  70   public static final int PRECISION_STEP_DEFAULT = 4;
  71
  72   /**
  73    * Expert: Longs are stored at lower precision by shifting off lower bits. The shift count is
  74    * stored as <code>SHIFT_START_LONG+shift</code> in the first character
  75    */
  76   public static final char SHIFT_START_LONG = (char)0x20;
  77
  78   /**
  79    * Expert: The maximum term length (used for <code>char[]</code> buffer size)
  80    * for encoding <code>long</code> values.
  81    * @see #longToPrefixCoded(long,int,char[])
  82    */
  83   public static final int BUF_SIZE_LONG = 63/7 + 2;
  84
  85   /**
  86    * Expert: Integers are stored at lower precision by shifting off lower bits. The shift count is
  87    * stored as <code>SHIFT_START_INT+shift</code> in the first character
  88    */
  89   public static final char SHIFT_START_INT  = (char)0x60;
  90
  91   /**
  92    * Expert: The maximum term length (used for <code>char[]</code> buffer size)
  93    * for encoding <code>int</code> values.
  94    * @see #intToPrefixCoded(int,int,char[])
  95    */
  96   public static final int BUF_SIZE_INT = 31/7 + 2;
  97
  98   /**
  99    * Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
 100    * This is method is used by {@link NumericTokenStream}.
 101    * @param val the numeric value
 102    * @param shift how many bits to strip from the right
 103    * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_LONG}
 104    * length
 105    * @return number of chars written to buffer
 106    */
 107   public static int longToPrefixCoded(final long val, final int shift, final char[] buffer) {
 108     if (shift>63 || shift<0)
 109       throw new IllegalArgumentException("Illegal shift value, must be 0..63");
 110     int nChars = (63-shift)/7 + 1, len = nChars+1;
 111     buffer[0] = (char)(SHIFT_START_LONG + shift);
 112     long sortableBits = val ^ 0x8000000000000000L;
 113     sortableBits >>>= shift;
 114     while (nChars>=1) {
 115       // Store 7 bits per character for good efficiency when UTF-8 encoding.
 116       // The whole number is right-justified so that lucene can prefix-encode
 117       // the terms more efficiently.
 118       buffer[nChars--] = (char)(sortableBits & 0x7f);
 119       sortableBits >>>= 7;
 120     }
 121     return len;
 122   }
 123
 124   /*
 125    * Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
 126    * This is method is used by {@link LongRangeBuilder}.
 127    * @param val the numeric value
 128    * @param shift how many bits to strip from the right
 129    */
 130   public static String longToPrefixCoded(final long val, final int shift) {
 131     final char[] buffer = new char[BUF_SIZE_LONG];
 132     final int len = longToPrefixCoded(val, shift, buffer);
 133     return new String(buffer, 0, len);
 134   }
 135
 136   /*
 137    * This is a convenience method, that returns prefix coded bits of a long without
 138    * reducing the precision. It can be used to store the full precision value as a
 139    * stored field in index.
 140    * <p>To decode, use {@link #prefixCodedToLong}.
 141    */
 142   public static String longToPrefixCoded(final long val) {
 143     return longToPrefixCoded(val, 0);
 144   }
 145
 146   /**
 147    * Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
 148    * This is method is used by {@link NumericTokenStream}.
 149    * @param val the numeric value
 150    * @param shift how many bits to strip from the right
 151    * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_INT}
 152    * length
 153    * @return number of chars written to buffer
 154    */
 155   public static int intToPrefixCoded(final int val, final int shift, final char[] buffer) {
 156     if (shift>31 || shift<0)
 157       throw new IllegalArgumentException("Illegal shift value, must be 0..31");
 158     int nChars = (31-shift)/7 + 1, len = nChars+1;
 159     buffer[0] = (char)(SHIFT_START_INT + shift);
 160     int sortableBits = val ^ 0x80000000;
 161     sortableBits >>>= shift;
 162     while (nChars>=1) {
 163       // Store 7 bits per character for good efficiency when UTF-8 encoding.
 164       // The whole number is right-justified so that lucene can prefix-encode
 165       // the terms more efficiently.
 166       buffer[nChars--] = (char)(sortableBits & 0x7f);
 167       sortableBits >>>= 7;
 168     }
 169     return len;
 170   }
 171
 172   /*
 173    * Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
 174    * This is method is used by {@link IntRangeBuilder}.
 175    * @param val the numeric value
 176    * @param shift how many bits to strip from the right
 177    */
 178   public static String intToPrefixCoded(final int val, final int shift) {
 179     final char[] buffer = new char[BUF_SIZE_INT];
 180     final int len = intToPrefixCoded(val, shift, buffer);
 181     return new String(buffer, 0, len);
 182   }
 183
 184   /*
 185    * This is a convenience method, that returns prefix coded bits of an int without
 186    * reducing the precision. It can be used to store the full precision value as a
 187    * stored field in index.
 188    * <p>To decode, use {@link #prefixCodedToInt}.
 189    */
 190   public static String intToPrefixCoded(final int val) {
 191     return intToPrefixCoded(val, 0);
 192   }
 193
 194   /*
 195    * Returns a long from prefixCoded characters.
 196    * Rightmost bits will be zero for lower precision codes.
 197    * This method can be used to decode e.g. a stored field.
 198    * @throws NumberFormatException if the supplied string is
 199    * not correctly prefix encoded.
 200    * @see #longToPrefixCoded(long)
 201    */
 202   public static long prefixCodedToLong(final String prefixCoded) {
 203     final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG;
 204     if (shift>63 || shift<0)
 205       throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)");
 206     long sortableBits = 0L;
 207     for (int i=1, len=prefixCoded.length(); i<len; i++) {
 208       sortableBits <<= 7;
 209       final char ch = prefixCoded.charAt(i);
 210       if (ch>0x7f) {
 211         throw new NumberFormatException(
 212           "Invalid prefixCoded numerical value representation (char "+
 213           Integer.toHexString(ch)+" at position "+i+" is invalid)"
 214         );
 215       }
 216       sortableBits |= ch;
 217     }
 218     return (sortableBits << shift) ^ 0x8000000000000000L;
 219   }
 220
 221   /*
 222    * Returns an int from prefixCoded characters.
 223    * Rightmost bits will be zero for lower precision codes.
 224    * This method can be used to decode e.g. a stored field.
 225    * @throws NumberFormatException if the supplied string is
 226    * not correctly prefix encoded.
 227    * @see #intToPrefixCoded(int)
 228    */
 229   public static int prefixCodedToInt(final String prefixCoded) {
 230     final int shift = prefixCoded.charAt(0)-SHIFT_START_INT;
 231     if (shift>31 || shift<0)
 232       throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really an INT?)");
 233     int sortableBits = 0;
 234     for (int i=1, len=prefixCoded.length(); i<len; i++) {
 235       sortableBits <<= 7;
 236       final char ch = prefixCoded.charAt(i);
 237       if (ch>0x7f) {
 238         throw new NumberFormatException(
 239           "Invalid prefixCoded numerical value representation (char "+
 240           Integer.toHexString(ch)+" at position "+i+" is invalid)"
 241         );
 242       }
 243       sortableBits |= ch;
 244     }
 245     return (sortableBits << shift) ^ 0x80000000;
 246   }
 247
 248   /**
 249    * Converts a <code>double</code> value to a sortable signed <code>long</code>.
 250    * The value is converted by getting their IEEE 754 floating-point &quot;double format&quot;
 251    * bit layout and then some bits are swapped, to be able to compare the result as long.
 252    * By this the precision is not reduced, but the value can easily used as a long.
 253    * The sort order (including {@link Double#NaN}) is defined by
 254    * {@link Double#compareTo}; {@code NaN} is greater than positive infinity.
 255    * @see #sortableLongToDouble
 256    */
 257   public static long doubleToSortableLong(double val) {
 258     long f = Double.doubleToLongBits(val);
 259     if (f<0) f ^= 0x7fffffffffffffffL;
 260     return f;
 261   }
 262
 263   /*
 264    * Convenience method: this just returns:
 265    *   longToPrefixCoded(doubleToSortableLong(val))
 266    */
 267   public static String doubleToPrefixCoded(double val) {
 268     return longToPrefixCoded(doubleToSortableLong(val));
 269   }
 270
 271   /**
 272    * Converts a sortable <code>long</code> back to a <code>double</code>.
 273    * @see #doubleToSortableLong
 274    */
 275   public static double sortableLongToDouble(long val) {
 276     if (val<0) val ^= 0x7fffffffffffffffL;
 277     return Double.longBitsToDouble(val);
 278   }
 279
 280   /*
 281    * Convenience method: this just returns:
 282    *    sortableLongToDouble(prefixCodedToLong(val))
 283    */
 284   public static double prefixCodedToDouble(String val) {
 285     return sortableLongToDouble(prefixCodedToLong(val));
 286   }
 287
 288   /**
 289    * Converts a <code>float</code> value to a sortable signed <code>int</code>.
 290    * The value is converted by getting their IEEE 754 floating-point &quot;float format&quot;
 291    * bit layout and then some bits are swapped, to be able to compare the result as int.
 292    * By this the precision is not reduced, but the value can easily used as an int.
 293    * The sort order (including {@link Float#NaN}) is defined by
 294    * {@link Float#compareTo}; {@code NaN} is greater than positive infinity.
 295    * @see #sortableIntToFloat
 296    */
 297   public static int floatToSortableInt(float val) {
 298     int f = Float.floatToIntBits(val);
 299     if (f<0) f ^= 0x7fffffff;
 300     return f;
 301   }
 302
 303   /*
 304    * Convenience method: this just returns:
 305    *   intToPrefixCoded(floatToSortableInt(val))
 306    */
 307   public static String floatToPrefixCoded(float val) {
 308     return intToPrefixCoded(floatToSortableInt(val));
 309   }
 310
 311   /**
 312    * Converts a sortable <code>int</code> back to a <code>float</code>.
 313    * @see #floatToSortableInt
 314    */
 315   public static float sortableIntToFloat(int val) {
 316     if (val<0) val ^= 0x7fffffff;
 317     return Float.intBitsToFloat(val);
 318   }
 319
 320   /*
 321    * Convenience method: this just returns:
 322    *    sortableIntToFloat(prefixCodedToInt(val))
 323    */
 324   public static float prefixCodedToFloat(String val) {
 325     return sortableIntToFloat(prefixCodedToInt(val));
 326   }
 327
 328   /**
 329    * Expert: Splits a long range recursively.
 330    * You may implement a builder that adds clauses to a
 331    * {@link org.apache.lucene.search.BooleanQuery} for each call to its
 332    * {@link LongRangeBuilder#addRange(String,String)}
 333    * method.
 334    * <p>This method is used by {@link NumericRangeQuery}.
 335    */
 336   public static void splitLongRange(final LongRangeBuilder builder,
 337     final int precisionStep,  final long minBound, final long maxBound
 338   ) {
 339     splitRange(builder, 64, precisionStep, minBound, maxBound);
 340   }
 341
 342   /**
 343    * Expert: Splits an int range recursively.
 344    * You may implement a builder that adds clauses to a
 345    * {@link org.apache.lucene.search.BooleanQuery} for each call to its
 346    * {@link IntRangeBuilder#addRange(String,String)}
 347    * method.
 348    * <p>This method is used by {@link NumericRangeQuery}.
 349    */
 350   public static void splitIntRange(final IntRangeBuilder builder,
 351     final int precisionStep,  final int minBound, final int maxBound
 352   ) {
 353     splitRange(builder, 32, precisionStep, minBound, maxBound);
 354   }
 355
 356   /** This helper does the splitting for both 32 and 64 bit. */
 357   private static void splitRange(
 358     final Object builder, final int valSize,
 359     final int precisionStep, long minBound, long maxBound
 360   ) {
 361     if (precisionStep < 1)
 362       throw new IllegalArgumentException("precisionStep must be >=1");
 363     if (minBound > maxBound) return;
 364     for (int shift=0; ; shift += precisionStep) {
 365       // calculate new bounds for inner precision
 366       final long diff = 1L << (shift+precisionStep),
 367         mask = ((1L<<precisionStep) - 1L) << shift;
 368       final boolean
 369         hasLower = (minBound & mask) != 0L,
 370         hasUpper = (maxBound & mask) != mask;
 371       final long
 372         nextMinBound = (hasLower ? (minBound + diff) : minBound) & ~mask,
 373         nextMaxBound = (hasUpper ? (maxBound - diff) : maxBound) & ~mask;
 374       final boolean
 375         lowerWrapped = nextMinBound < minBound,
 376         upperWrapped = nextMaxBound > maxBound;
 377
 378       if (shift+precisionStep>=valSize || nextMinBound>nextMaxBound || lowerWrapped || upperWrapped) {
 379         // We are in the lowest precision or the next precision is not available.
 380         addRange(builder, valSize, minBound, maxBound, shift);
 381         // exit the split recursion loop
 382         break;
 383       }
 384
 385       if (hasLower)
 386         addRange(builder, valSize, minBound, minBound | mask, shift);
 387       if (hasUpper)
 388         addRange(builder, valSize, maxBound & ~mask, maxBound, shift);
 389
 390       // recurse to next precision
 391       minBound = nextMinBound;
 392       maxBound = nextMaxBound;
 393     }
 394   }
 395
 396   /** Helper that delegates to correct range builder */
 397   private static void addRange(
 398     final Object builder, final int valSize,
 399     long minBound, long maxBound,
 400     final int shift
 401   ) {
 402     // for the max bound set all lower bits (that were shifted away):
 403     // this is important for testing or other usages of the splitted range
 404     // (e.g. to reconstruct the full range). The prefixEncoding will remove
 405     // the bits anyway, so they do not hurt!
 406     maxBound |= (1L << shift) - 1L;
 407     // delegate to correct range builder
 408     switch(valSize) {
 409       case 64:
 410         ((LongRangeBuilder)builder).addRange(minBound, maxBound, shift);
 411         break;
 412       case 32:
 413         ((IntRangeBuilder)builder).addRange((int)minBound, (int)maxBound, shift);
 414         break;
 415       default:
 416         // Should not happen!
 417         throw new IllegalArgumentException("valSize must be 32 or 64.");
 418     }
 419   }
 420
 421   /**
 422    * Expert: Callback for {@link #splitLongRange}.
 423    * You need to overwrite only one of the methods.
 424    * <p><font color="red"><b>NOTE:</b> This is a very low-level interface,
 425    * the method signatures may change in later versions.</font>
 426    */
 427   public static abstract class LongRangeBuilder {
 428
 429     /**
 430      * Overwrite this method, if you like to receive the already prefix encoded range bounds.
 431      * You can directly build classical (inclusive) range queries from them.
 432      */
 433     public void addRange(String minPrefixCoded, String maxPrefixCoded) {
 434       throw new UnsupportedOperationException();
 435     }
 436
 437     /**
 438      * Overwrite this method, if you like to receive the raw long range bounds.
 439      * You can use this for e.g. debugging purposes (print out range bounds).
 440      */
 441     public void addRange(final long min, final long max, final int shift) {
 442       addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift));
 443     }
 444
 445   }
 446
 447   /**
 448    * Expert: Callback for {@link #splitIntRange}.
 449    * You need to overwrite only one of the methods.
 450    * <p><font color="red"><b>NOTE:</b> This is a very low-level interface,
 451    * the method signatures may change in later versions.</font>
 452    */
 453   public static abstract class IntRangeBuilder {
 454
 455     /**
 456      * Overwrite this method, if you like to receive the already prefix encoded range bounds.
 457      * You can directly build classical range (inclusive) queries from them.
 458      */
 459     public void addRange(String minPrefixCoded, String maxPrefixCoded) {
 460       throw new UnsupportedOperationException();
 461     }
 462
 463     /**
 464      * Overwrite this method, if you like to receive the raw int range bounds.
 465      * You can use this for e.g. debugging purposes (print out range bounds).
 466      */
 467     public void addRange(final int min, final int max, final int shift) {
 468       addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift));
 469     }
 470
 471   }
 472
 473 }