1 package org.apache.lucene.util;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.analysis.NumericTokenStream; // for javadocs
21 import org.apache.lucene.document.NumericField; // for javadocs
22 import org.apache.lucene.search.NumericRangeQuery; // for javadocs
23 import org.apache.lucene.search.NumericRangeFilter; // for javadocs
26 * This is a helper class to generate prefix-encoded representations for numerical values
27 * and supplies converters to represent float/double values as sortable integers/longs.
29 * <p>To quickly execute range queries in Apache Lucene, a range is divided recursively
30 * into multiple intervals for searching: The center of the range is searched only with
31 * the lowest possible precision in the trie, while the boundaries are matched
32 * more exactly. This reduces the number of terms dramatically.
34 * <p>This class generates terms to achieve this: First the numerical integer values need to
35 * be converted to strings. For that integer values (32 bit or 64 bit) are made unsigned
36 * and the bits are converted to ASCII chars with each 7 bit. The resulting string is
37 * sortable like the original integer value. Each value is also prefixed
38 * (in the first char) by the <code>shift</code> value (number of bits removed) used
41 * <p>To also index floating point numbers, this class supplies two methods to convert them
42 * to integer values by changing their bit layout: {@link #doubleToSortableLong},
43 * {@link #floatToSortableInt}. You will have no precision loss by
44 * converting floating point numbers to integers and back (only that the integer form
45 * is not usable). Other data types like dates can easily converted to longs or ints (e.g.
46 * date to long: {@link java.util.Date#getTime}).
48 * <p>For easy usage, the trie algorithm is implemented for indexing inside
49 * {@link NumericTokenStream} that can index <code>int</code>, <code>long</code>,
50 * <code>float</code>, and <code>double</code>. For querying,
51 * {@link NumericRangeQuery} and {@link NumericRangeFilter} implement the query part
52 * for the same data types.
54 * <p>This class can also be used, to generate lexicographically sortable (according
55 * {@link String#compareTo(String)}) representations of numeric data types for other
56 * usages (e.g. sorting).
62 public final class NumericUtils {
64 private NumericUtils() {} // no instance!
67 * The default precision step used by {@link NumericField}, {@link NumericTokenStream},
68 * {@link NumericRangeQuery}, and {@link NumericRangeFilter} as default
70 public static final int PRECISION_STEP_DEFAULT = 4;
73 * Expert: Longs are stored at lower precision by shifting off lower bits. The shift count is
74 * stored as <code>SHIFT_START_LONG+shift</code> in the first character
76 public static final char SHIFT_START_LONG = (char)0x20;
79 * Expert: The maximum term length (used for <code>char[]</code> buffer size)
80 * for encoding <code>long</code> values.
81 * @see #longToPrefixCoded(long,int,char[])
83 public static final int BUF_SIZE_LONG = 63/7 + 2;
86 * Expert: Integers are stored at lower precision by shifting off lower bits. The shift count is
87 * stored as <code>SHIFT_START_INT+shift</code> in the first character
89 public static final char SHIFT_START_INT = (char)0x60;
92 * Expert: The maximum term length (used for <code>char[]</code> buffer size)
93 * for encoding <code>int</code> values.
94 * @see #intToPrefixCoded(int,int,char[])
96 public static final int BUF_SIZE_INT = 31/7 + 2;
99 * Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
100 * This is method is used by {@link NumericTokenStream}.
101 * @param val the numeric value
102 * @param shift how many bits to strip from the right
103 * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_LONG}
105 * @return number of chars written to buffer
107 public static int longToPrefixCoded(final long val, final int shift, final char[] buffer) {
108 if (shift>63 || shift<0)
109 throw new IllegalArgumentException("Illegal shift value, must be 0..63");
110 int nChars = (63-shift)/7 + 1, len = nChars+1;
111 buffer[0] = (char)(SHIFT_START_LONG + shift);
112 long sortableBits = val ^ 0x8000000000000000L;
113 sortableBits >>>= shift;
115 // Store 7 bits per character for good efficiency when UTF-8 encoding.
116 // The whole number is right-justified so that lucene can prefix-encode
117 // the terms more efficiently.
118 buffer[nChars--] = (char)(sortableBits & 0x7f);
125 * Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
126 * This is method is used by {@link LongRangeBuilder}.
127 * @param val the numeric value
128 * @param shift how many bits to strip from the right
130 public static String longToPrefixCoded(final long val, final int shift) {
131 final char[] buffer = new char[BUF_SIZE_LONG];
132 final int len = longToPrefixCoded(val, shift, buffer);
133 return new String(buffer, 0, len);
137 * This is a convenience method, that returns prefix coded bits of a long without
138 * reducing the precision. It can be used to store the full precision value as a
139 * stored field in index.
140 * <p>To decode, use {@link #prefixCodedToLong}.
142 public static String longToPrefixCoded(final long val) {
143 return longToPrefixCoded(val, 0);
147 * Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
148 * This is method is used by {@link NumericTokenStream}.
149 * @param val the numeric value
150 * @param shift how many bits to strip from the right
151 * @param buffer that will contain the encoded chars, must be at least of {@link #BUF_SIZE_INT}
153 * @return number of chars written to buffer
155 public static int intToPrefixCoded(final int val, final int shift, final char[] buffer) {
156 if (shift>31 || shift<0)
157 throw new IllegalArgumentException("Illegal shift value, must be 0..31");
158 int nChars = (31-shift)/7 + 1, len = nChars+1;
159 buffer[0] = (char)(SHIFT_START_INT + shift);
160 int sortableBits = val ^ 0x80000000;
161 sortableBits >>>= shift;
163 // Store 7 bits per character for good efficiency when UTF-8 encoding.
164 // The whole number is right-justified so that lucene can prefix-encode
165 // the terms more efficiently.
166 buffer[nChars--] = (char)(sortableBits & 0x7f);
173 * Expert: Returns prefix coded bits after reducing the precision by <code>shift</code> bits.
174 * This is method is used by {@link IntRangeBuilder}.
175 * @param val the numeric value
176 * @param shift how many bits to strip from the right
178 public static String intToPrefixCoded(final int val, final int shift) {
179 final char[] buffer = new char[BUF_SIZE_INT];
180 final int len = intToPrefixCoded(val, shift, buffer);
181 return new String(buffer, 0, len);
185 * This is a convenience method, that returns prefix coded bits of an int without
186 * reducing the precision. It can be used to store the full precision value as a
187 * stored field in index.
188 * <p>To decode, use {@link #prefixCodedToInt}.
190 public static String intToPrefixCoded(final int val) {
191 return intToPrefixCoded(val, 0);
195 * Returns a long from prefixCoded characters.
196 * Rightmost bits will be zero for lower precision codes.
197 * This method can be used to decode e.g. a stored field.
198 * @throws NumberFormatException if the supplied string is
199 * not correctly prefix encoded.
200 * @see #longToPrefixCoded(long)
202 public static long prefixCodedToLong(final String prefixCoded) {
203 final int shift = prefixCoded.charAt(0)-SHIFT_START_LONG;
204 if (shift>63 || shift<0)
205 throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really a LONG?)");
206 long sortableBits = 0L;
207 for (int i=1, len=prefixCoded.length(); i<len; i++) {
209 final char ch = prefixCoded.charAt(i);
211 throw new NumberFormatException(
212 "Invalid prefixCoded numerical value representation (char "+
213 Integer.toHexString(ch)+" at position "+i+" is invalid)"
218 return (sortableBits << shift) ^ 0x8000000000000000L;
222 * Returns an int from prefixCoded characters.
223 * Rightmost bits will be zero for lower precision codes.
224 * This method can be used to decode e.g. a stored field.
225 * @throws NumberFormatException if the supplied string is
226 * not correctly prefix encoded.
227 * @see #intToPrefixCoded(int)
229 public static int prefixCodedToInt(final String prefixCoded) {
230 final int shift = prefixCoded.charAt(0)-SHIFT_START_INT;
231 if (shift>31 || shift<0)
232 throw new NumberFormatException("Invalid shift value in prefixCoded string (is encoded value really an INT?)");
233 int sortableBits = 0;
234 for (int i=1, len=prefixCoded.length(); i<len; i++) {
236 final char ch = prefixCoded.charAt(i);
238 throw new NumberFormatException(
239 "Invalid prefixCoded numerical value representation (char "+
240 Integer.toHexString(ch)+" at position "+i+" is invalid)"
245 return (sortableBits << shift) ^ 0x80000000;
249 * Converts a <code>double</code> value to a sortable signed <code>long</code>.
250 * The value is converted by getting their IEEE 754 floating-point "double format"
251 * bit layout and then some bits are swapped, to be able to compare the result as long.
252 * By this the precision is not reduced, but the value can easily used as a long.
253 * The sort order (including {@link Double#NaN}) is defined by
254 * {@link Double#compareTo}; {@code NaN} is greater than positive infinity.
255 * @see #sortableLongToDouble
257 public static long doubleToSortableLong(double val) {
258 long f = Double.doubleToLongBits(val);
259 if (f<0) f ^= 0x7fffffffffffffffL;
264 * Convenience method: this just returns:
265 * longToPrefixCoded(doubleToSortableLong(val))
267 public static String doubleToPrefixCoded(double val) {
268 return longToPrefixCoded(doubleToSortableLong(val));
272 * Converts a sortable <code>long</code> back to a <code>double</code>.
273 * @see #doubleToSortableLong
275 public static double sortableLongToDouble(long val) {
276 if (val<0) val ^= 0x7fffffffffffffffL;
277 return Double.longBitsToDouble(val);
281 * Convenience method: this just returns:
282 * sortableLongToDouble(prefixCodedToLong(val))
284 public static double prefixCodedToDouble(String val) {
285 return sortableLongToDouble(prefixCodedToLong(val));
289 * Converts a <code>float</code> value to a sortable signed <code>int</code>.
290 * The value is converted by getting their IEEE 754 floating-point "float format"
291 * bit layout and then some bits are swapped, to be able to compare the result as int.
292 * By this the precision is not reduced, but the value can easily used as an int.
293 * The sort order (including {@link Float#NaN}) is defined by
294 * {@link Float#compareTo}; {@code NaN} is greater than positive infinity.
295 * @see #sortableIntToFloat
297 public static int floatToSortableInt(float val) {
298 int f = Float.floatToIntBits(val);
299 if (f<0) f ^= 0x7fffffff;
304 * Convenience method: this just returns:
305 * intToPrefixCoded(floatToSortableInt(val))
307 public static String floatToPrefixCoded(float val) {
308 return intToPrefixCoded(floatToSortableInt(val));
312 * Converts a sortable <code>int</code> back to a <code>float</code>.
313 * @see #floatToSortableInt
315 public static float sortableIntToFloat(int val) {
316 if (val<0) val ^= 0x7fffffff;
317 return Float.intBitsToFloat(val);
321 * Convenience method: this just returns:
322 * sortableIntToFloat(prefixCodedToInt(val))
324 public static float prefixCodedToFloat(String val) {
325 return sortableIntToFloat(prefixCodedToInt(val));
329 * Expert: Splits a long range recursively.
330 * You may implement a builder that adds clauses to a
331 * {@link org.apache.lucene.search.BooleanQuery} for each call to its
332 * {@link LongRangeBuilder#addRange(String,String)}
334 * <p>This method is used by {@link NumericRangeQuery}.
336 public static void splitLongRange(final LongRangeBuilder builder,
337 final int precisionStep, final long minBound, final long maxBound
339 splitRange(builder, 64, precisionStep, minBound, maxBound);
343 * Expert: Splits an int range recursively.
344 * You may implement a builder that adds clauses to a
345 * {@link org.apache.lucene.search.BooleanQuery} for each call to its
346 * {@link IntRangeBuilder#addRange(String,String)}
348 * <p>This method is used by {@link NumericRangeQuery}.
350 public static void splitIntRange(final IntRangeBuilder builder,
351 final int precisionStep, final int minBound, final int maxBound
353 splitRange(builder, 32, precisionStep, minBound, maxBound);
356 /** This helper does the splitting for both 32 and 64 bit. */
357 private static void splitRange(
358 final Object builder, final int valSize,
359 final int precisionStep, long minBound, long maxBound
361 if (precisionStep < 1)
362 throw new IllegalArgumentException("precisionStep must be >=1");
363 if (minBound > maxBound) return;
364 for (int shift=0; ; shift += precisionStep) {
365 // calculate new bounds for inner precision
366 final long diff = 1L << (shift+precisionStep),
367 mask = ((1L<<precisionStep) - 1L) << shift;
369 hasLower = (minBound & mask) != 0L,
370 hasUpper = (maxBound & mask) != mask;
372 nextMinBound = (hasLower ? (minBound + diff) : minBound) & ~mask,
373 nextMaxBound = (hasUpper ? (maxBound - diff) : maxBound) & ~mask;
375 lowerWrapped = nextMinBound < minBound,
376 upperWrapped = nextMaxBound > maxBound;
378 if (shift+precisionStep>=valSize || nextMinBound>nextMaxBound || lowerWrapped || upperWrapped) {
379 // We are in the lowest precision or the next precision is not available.
380 addRange(builder, valSize, minBound, maxBound, shift);
381 // exit the split recursion loop
386 addRange(builder, valSize, minBound, minBound | mask, shift);
388 addRange(builder, valSize, maxBound & ~mask, maxBound, shift);
390 // recurse to next precision
391 minBound = nextMinBound;
392 maxBound = nextMaxBound;
396 /** Helper that delegates to correct range builder */
397 private static void addRange(
398 final Object builder, final int valSize,
399 long minBound, long maxBound,
402 // for the max bound set all lower bits (that were shifted away):
403 // this is important for testing or other usages of the splitted range
404 // (e.g. to reconstruct the full range). The prefixEncoding will remove
405 // the bits anyway, so they do not hurt!
406 maxBound |= (1L << shift) - 1L;
407 // delegate to correct range builder
410 ((LongRangeBuilder)builder).addRange(minBound, maxBound, shift);
413 ((IntRangeBuilder)builder).addRange((int)minBound, (int)maxBound, shift);
416 // Should not happen!
417 throw new IllegalArgumentException("valSize must be 32 or 64.");
422 * Expert: Callback for {@link #splitLongRange}.
423 * You need to overwrite only one of the methods.
424 * <p><font color="red"><b>NOTE:</b> This is a very low-level interface,
425 * the method signatures may change in later versions.</font>
427 public static abstract class LongRangeBuilder {
430 * Overwrite this method, if you like to receive the already prefix encoded range bounds.
431 * You can directly build classical (inclusive) range queries from them.
433 public void addRange(String minPrefixCoded, String maxPrefixCoded) {
434 throw new UnsupportedOperationException();
438 * Overwrite this method, if you like to receive the raw long range bounds.
439 * You can use this for e.g. debugging purposes (print out range bounds).
441 public void addRange(final long min, final long max, final int shift) {
442 addRange(longToPrefixCoded(min, shift), longToPrefixCoded(max, shift));
448 * Expert: Callback for {@link #splitIntRange}.
449 * You need to overwrite only one of the methods.
450 * <p><font color="red"><b>NOTE:</b> This is a very low-level interface,
451 * the method signatures may change in later versions.</font>
453 public static abstract class IntRangeBuilder {
456 * Overwrite this method, if you like to receive the already prefix encoded range bounds.
457 * You can directly build classical range (inclusive) queries from them.
459 public void addRange(String minPrefixCoded, String maxPrefixCoded) {
460 throw new UnsupportedOperationException();
464 * Overwrite this method, if you like to receive the raw int range bounds.
465 * You can use this for e.g. debugging purposes (print out range bounds).
467 public void addRange(final int min, final int max, final int shift) {
468 addRange(intToPrefixCoded(min, shift), intToPrefixCoded(max, shift));