1 package org.apache.lucene.search;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.util.LinkedList;
23 import org.apache.lucene.analysis.NumericTokenStream; // for javadocs
24 import org.apache.lucene.document.NumericField; // for javadocs
25 import org.apache.lucene.document.NumericField.DataType;
26 import org.apache.lucene.util.NumericUtils;
27 import org.apache.lucene.util.ToStringUtils;
28 import org.apache.lucene.util.StringHelper;
29 import org.apache.lucene.index.IndexReader;
30 import org.apache.lucene.index.Term;
31 import org.apache.lucene.index.TermEnum;
34 * <p>A {@link Query} that matches numeric values within a
35 * specified range. To use this, you must first index the
36 * numeric values using {@link NumericField} (expert: {@link
37 * NumericTokenStream}). If your terms are instead textual,
38 * you should use {@link TermRangeQuery}. {@link
39 * NumericRangeFilter} is the filter equivalent of this
42 * <p>You create a new NumericRangeQuery with the static
43 * factory methods, eg:
46 * Query q = NumericRangeQuery.newFloatRange("weight", 0.03f, 0.10f, true, true);
49 * matches all documents whose float valued "weight" field
50 * ranges from 0.03 to 0.10, inclusive.
52 * <p>The performance of NumericRangeQuery is much better
53 * than the corresponding {@link TermRangeQuery} because the
54 * number of terms that must be searched is usually far
55 * fewer, thanks to trie indexing, described below.</p>
57 * <p>You can optionally specify a <a
58 * href="#precisionStepDesc"><code>precisionStep</code></a>
59 * when creating this query. This is necessary if you've
60 * changed this configuration from its default (4) during
61 * indexing. Lower values consume more disk space but speed
62 * up searching. Suitable values are between <b>1</b> and
63 * <b>8</b>. A good starting point to test is <b>4</b>,
64 * which is the default value for all <code>Numeric*</code>
65 * classes. See <a href="#precisionStepDesc">below</a> for
68 * <p>This query defaults to {@linkplain
69 * MultiTermQuery#CONSTANT_SCORE_AUTO_REWRITE_DEFAULT} for
70 * 32 bit (int/float) ranges with precisionStep ≤8 and 64
71 * bit (long/double) ranges with precisionStep ≤6.
72 * Otherwise it uses {@linkplain
73 * MultiTermQuery#CONSTANT_SCORE_FILTER_REWRITE} as the
74 * number of terms is likely to be high. With precision
75 * steps of ≤4, this query can be run with one of the
76 * BooleanQuery rewrite methods without changing
77 * BooleanQuery's default max clause count.
79 * <br><h3>How it works</h3>
81 * <p>See the publication about <a target="_blank" href="http://www.panfmp.org">panFMP</a>,
82 * where this algorithm was described (referred to as <code>TrieRangeQuery</code>):
84 * <blockquote><strong>Schindler, U, Diepenbroek, M</strong>, 2008.
85 * <em>Generic XML-based Framework for Metadata Portals.</em>
86 * Computers & Geosciences 34 (12), 1947-1955.
87 * <a href="http://dx.doi.org/10.1016/j.cageo.2008.02.023"
88 * target="_blank">doi:10.1016/j.cageo.2008.02.023</a></blockquote>
90 * <p><em>A quote from this paper:</em> Because Apache Lucene is a full-text
91 * search engine and not a conventional database, it cannot handle numerical ranges
92 * (e.g., field value is inside user defined bounds, even dates are numerical values).
93 * We have developed an extension to Apache Lucene that stores
94 * the numerical values in a special string-encoded format with variable precision
95 * (all numerical values like doubles, longs, floats, and ints are converted to
96 * lexicographic sortable string representations and stored with different precisions
97 * (for a more detailed description of how the values are stored,
98 * see {@link NumericUtils}). A range is then divided recursively into multiple intervals for searching:
99 * The center of the range is searched only with the lowest possible precision in the <em>trie</em>,
100 * while the boundaries are matched more exactly. This reduces the number of terms dramatically.</p>
102 * <p>For the variant that stores long values in 8 different precisions (each reduced by 8 bits) that
103 * uses a lowest precision of 1 byte, the index contains only a maximum of 256 distinct values in the
104 * lowest precision. Overall, a range could consist of a theoretical maximum of
105 * <code>7*255*2 + 255 = 3825</code> distinct terms (when there is a term for every distinct value of an
106 * 8-byte-number in the index and the range covers almost all of them; a maximum of 255 distinct values is used
107 * because it would always be possible to reduce the full 256 values to one term with degraded precision).
108 * In practice, we have seen up to 300 terms in most cases (index with 500,000 metadata records
109 * and a uniform value distribution).</p>
111 * <a name="precisionStepDesc"><h3>Precision Step</h3>
112 * <p>You can choose any <code>precisionStep</code> when encoding values.
113 * Lower step values mean more precisions and so more terms in index (and index gets larger).
114 * On the other hand, the maximum number of terms to match reduces, which optimized query speed.
115 * The formula to calculate the maximum term count is:
117 * n = [ (bitsPerValue/precisionStep - 1) * (2^precisionStep - 1 ) * 2 ] + (2^precisionStep - 1 )
119 * <p><em>(this formula is only correct, when <code>bitsPerValue/precisionStep</code> is an integer;
120 * in other cases, the value must be rounded up and the last summand must contain the modulo of the division as
121 * precision step)</em>.
122 * For longs stored using a precision step of 4, <code>n = 15*15*2 + 15 = 465</code>, and for a precision
123 * step of 2, <code>n = 31*3*2 + 3 = 189</code>. But the faster search speed is reduced by more seeking
124 * in the term enum of the index. Because of this, the ideal <code>precisionStep</code> value can only
125 * be found out by testing. <b>Important:</b> You can index with a lower precision step value and test search speed
126 * using a multiple of the original step value.</p>
128 * <p>Good values for <code>precisionStep</code> are depending on usage and data type:
130 * <li>The default for all data types is <b>4</b>, which is used, when no <code>precisionStep</code> is given.
131 * <li>Ideal value in most cases for <em>64 bit</em> data types <em>(long, double)</em> is <b>6</b> or <b>8</b>.
132 * <li>Ideal value in most cases for <em>32 bit</em> data types <em>(int, float)</em> is <b>4</b>.
133 * <li>For low cardinality fields larger precision steps are good. If the cardinality is < 100, it is
134 * fair to use {@link Integer#MAX_VALUE} (see below).
135 * <li>Steps <b>≥64</b> for <em>long/double</em> and <b>≥32</b> for <em>int/float</em> produces one token
136 * per value in the index and querying is as slow as a conventional {@link TermRangeQuery}. But it can be used
137 * to produce fields, that are solely used for sorting (in this case simply use {@link Integer#MAX_VALUE} as
138 * <code>precisionStep</code>). Using {@link NumericField NumericFields} for sorting
139 * is ideal, because building the field cache is much faster than with text-only numbers.
140 * These fields have one term per value and therefore also work with term enumeration for building distinct lists
141 * (e.g. facets / preselected values to search for).
142 * Sorting is also possible with range query optimized fields using one of the above <code>precisionSteps</code>.
145 * <p>Comparisons of the different types of RangeQueries on an index with about 500,000 docs showed
146 * that {@link TermRangeQuery} in boolean rewrite mode (with raised {@link BooleanQuery} clause count)
147 * took about 30-40 secs to complete, {@link TermRangeQuery} in constant score filter rewrite mode took 5 secs
148 * and executing this class took <100ms to complete (on an Opteron64 machine, Java 1.5, 8 bit
149 * precision step). This query type was developed for a geographic portal, where the performance for
150 * e.g. bounding boxes or exact date/time stamps is important.</p>
154 public final class NumericRangeQuery<T extends Number> extends MultiTermQuery {
156 private NumericRangeQuery(final String field, final int precisionStep, final DataType dataType,
157 T min, T max, final boolean minInclusive, final boolean maxInclusive
159 if (precisionStep < 1)
160 throw new IllegalArgumentException("precisionStep must be >=1");
161 this.field = StringHelper.intern(field);
162 this.precisionStep = precisionStep;
163 this.dataType = dataType;
166 this.minInclusive = minInclusive;
167 this.maxInclusive = maxInclusive;
169 // For bigger precisionSteps this query likely
170 // hits too many terms, so set to CONSTANT_SCORE_FILTER right off
171 // (especially as the FilteredTermEnum is costly if wasted only for AUTO tests because it
172 // creates new enums from IndexReader for each sub-range)
176 setRewriteMethod( (precisionStep > 6) ?
177 CONSTANT_SCORE_FILTER_REWRITE :
178 CONSTANT_SCORE_AUTO_REWRITE_DEFAULT
183 setRewriteMethod( (precisionStep > 8) ?
184 CONSTANT_SCORE_FILTER_REWRITE :
185 CONSTANT_SCORE_AUTO_REWRITE_DEFAULT
189 // should never happen
190 throw new IllegalArgumentException("Invalid numeric DataType");
193 // shortcut if upper bound == lower bound
194 if (min != null && min.equals(max)) {
195 setRewriteMethod(CONSTANT_SCORE_BOOLEAN_QUERY_REWRITE);
200 * Factory that creates a <code>NumericRangeQuery</code>, that queries a <code>long</code>
201 * range using the given <a href="#precisionStepDesc"><code>precisionStep</code></a>.
202 * You can have half-open ranges (which are in fact </≤ or >/≥ queries)
203 * by setting the min or max value to <code>null</code>. By setting inclusive to false, it will
204 * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too.
206 public static NumericRangeQuery<Long> newLongRange(final String field, final int precisionStep,
207 Long min, Long max, final boolean minInclusive, final boolean maxInclusive
209 return new NumericRangeQuery<Long>(field, precisionStep, DataType.LONG, min, max, minInclusive, maxInclusive);
213 * Factory that creates a <code>NumericRangeQuery</code>, that queries a <code>long</code>
214 * range using the default <code>precisionStep</code> {@link NumericUtils#PRECISION_STEP_DEFAULT} (4).
215 * You can have half-open ranges (which are in fact </≤ or >/≥ queries)
216 * by setting the min or max value to <code>null</code>. By setting inclusive to false, it will
217 * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too.
219 public static NumericRangeQuery<Long> newLongRange(final String field,
220 Long min, Long max, final boolean minInclusive, final boolean maxInclusive
222 return new NumericRangeQuery<Long>(field, NumericUtils.PRECISION_STEP_DEFAULT, DataType.LONG, min, max, minInclusive, maxInclusive);
226 * Factory that creates a <code>NumericRangeQuery</code>, that queries a <code>int</code>
227 * range using the given <a href="#precisionStepDesc"><code>precisionStep</code></a>.
228 * You can have half-open ranges (which are in fact </≤ or >/≥ queries)
229 * by setting the min or max value to <code>null</code>. By setting inclusive to false, it will
230 * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too.
232 public static NumericRangeQuery<Integer> newIntRange(final String field, final int precisionStep,
233 Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive
235 return new NumericRangeQuery<Integer>(field, precisionStep, DataType.INT, min, max, minInclusive, maxInclusive);
239 * Factory that creates a <code>NumericRangeQuery</code>, that queries a <code>int</code>
240 * range using the default <code>precisionStep</code> {@link NumericUtils#PRECISION_STEP_DEFAULT} (4).
241 * You can have half-open ranges (which are in fact </≤ or >/≥ queries)
242 * by setting the min or max value to <code>null</code>. By setting inclusive to false, it will
243 * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too.
245 public static NumericRangeQuery<Integer> newIntRange(final String field,
246 Integer min, Integer max, final boolean minInclusive, final boolean maxInclusive
248 return new NumericRangeQuery<Integer>(field, NumericUtils.PRECISION_STEP_DEFAULT, DataType.INT, min, max, minInclusive, maxInclusive);
252 * Factory that creates a <code>NumericRangeQuery</code>, that queries a <code>double</code>
253 * range using the given <a href="#precisionStepDesc"><code>precisionStep</code></a>.
254 * You can have half-open ranges (which are in fact </≤ or >/≥ queries)
255 * by setting the min or max value to <code>null</code>.
256 * {@link Double#NaN} will never match a half-open range, to hit {@code NaN} use a query
257 * with {@code min == max == Double.NaN}. By setting inclusive to false, it will
258 * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too.
260 public static NumericRangeQuery<Double> newDoubleRange(final String field, final int precisionStep,
261 Double min, Double max, final boolean minInclusive, final boolean maxInclusive
263 return new NumericRangeQuery<Double>(field, precisionStep, DataType.DOUBLE, min, max, minInclusive, maxInclusive);
267 * Factory that creates a <code>NumericRangeQuery</code>, that queries a <code>double</code>
268 * range using the default <code>precisionStep</code> {@link NumericUtils#PRECISION_STEP_DEFAULT} (4).
269 * You can have half-open ranges (which are in fact </≤ or >/≥ queries)
270 * by setting the min or max value to <code>null</code>.
271 * {@link Double#NaN} will never match a half-open range, to hit {@code NaN} use a query
272 * with {@code min == max == Double.NaN}. By setting inclusive to false, it will
273 * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too.
275 public static NumericRangeQuery<Double> newDoubleRange(final String field,
276 Double min, Double max, final boolean minInclusive, final boolean maxInclusive
278 return new NumericRangeQuery<Double>(field, NumericUtils.PRECISION_STEP_DEFAULT, DataType.DOUBLE, min, max, minInclusive, maxInclusive);
282 * Factory that creates a <code>NumericRangeQuery</code>, that queries a <code>float</code>
283 * range using the given <a href="#precisionStepDesc"><code>precisionStep</code></a>.
284 * You can have half-open ranges (which are in fact </≤ or >/≥ queries)
285 * by setting the min or max value to <code>null</code>.
286 * {@link Float#NaN} will never match a half-open range, to hit {@code NaN} use a query
287 * with {@code min == max == Float.NaN}. By setting inclusive to false, it will
288 * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too.
290 public static NumericRangeQuery<Float> newFloatRange(final String field, final int precisionStep,
291 Float min, Float max, final boolean minInclusive, final boolean maxInclusive
293 return new NumericRangeQuery<Float>(field, precisionStep, DataType.FLOAT, min, max, minInclusive, maxInclusive);
297 * Factory that creates a <code>NumericRangeQuery</code>, that queries a <code>float</code>
298 * range using the default <code>precisionStep</code> {@link NumericUtils#PRECISION_STEP_DEFAULT} (4).
299 * You can have half-open ranges (which are in fact </≤ or >/≥ queries)
300 * by setting the min or max value to <code>null</code>.
301 * {@link Float#NaN} will never match a half-open range, to hit {@code NaN} use a query
302 * with {@code min == max == Float.NaN}. By setting inclusive to false, it will
303 * match all documents excluding the bounds, with inclusive on, the boundaries are hits, too.
305 public static NumericRangeQuery<Float> newFloatRange(final String field,
306 Float min, Float max, final boolean minInclusive, final boolean maxInclusive
308 return new NumericRangeQuery<Float>(field, NumericUtils.PRECISION_STEP_DEFAULT, DataType.FLOAT, min, max, minInclusive, maxInclusive);
312 protected FilteredTermEnum getEnum(final IndexReader reader) throws IOException {
313 return new NumericRangeTermEnum(reader);
316 /** Returns the field name for this query */
317 public String getField() { return field; }
319 /** Returns <code>true</code> if the lower endpoint is inclusive */
320 public boolean includesMin() { return minInclusive; }
322 /** Returns <code>true</code> if the upper endpoint is inclusive */
323 public boolean includesMax() { return maxInclusive; }
325 /** Returns the lower value of this range query */
326 public T getMin() { return min; }
328 /** Returns the upper value of this range query */
329 public T getMax() { return max; }
331 /** Returns the precision step. */
332 public int getPrecisionStep() { return precisionStep; }
335 public String toString(final String field) {
336 final StringBuilder sb = new StringBuilder();
337 if (!this.field.equals(field)) sb.append(this.field).append(':');
338 return sb.append(minInclusive ? '[' : '{')
339 .append((min == null) ? "*" : min.toString())
341 .append((max == null) ? "*" : max.toString())
342 .append(maxInclusive ? ']' : '}')
343 .append(ToStringUtils.boost(getBoost()))
348 public final boolean equals(final Object o) {
349 if (o==this) return true;
350 if (!super.equals(o))
352 if (o instanceof NumericRangeQuery) {
353 final NumericRangeQuery q=(NumericRangeQuery)o;
356 (q.min == null ? min == null : q.min.equals(min)) &&
357 (q.max == null ? max == null : q.max.equals(max)) &&
358 minInclusive == q.minInclusive &&
359 maxInclusive == q.maxInclusive &&
360 precisionStep == q.precisionStep
367 public final int hashCode() {
368 int hash = super.hashCode();
369 hash += field.hashCode()^0x4565fd66 + precisionStep^0x64365465;
370 if (min != null) hash += min.hashCode()^0x14fa55fb;
371 if (max != null) hash += max.hashCode()^0x733fa5fe;
373 (Boolean.valueOf(minInclusive).hashCode()^0x14fa55fb)+
374 (Boolean.valueOf(maxInclusive).hashCode()^0x733fa5fe);
377 // field must be interned after reading from stream
378 private void readObject(java.io.ObjectInputStream in) throws java.io.IOException, ClassNotFoundException {
379 in.defaultReadObject();
380 field = StringHelper.intern(field);
383 // members (package private, to be also fast accessible by NumericRangeTermEnum)
385 final int precisionStep;
386 final DataType dataType;
388 final boolean minInclusive,maxInclusive;
390 // used to handle float/double infinity correcty
391 static final long LONG_NEGATIVE_INFINITY =
392 NumericUtils.doubleToSortableLong(Double.NEGATIVE_INFINITY);
393 static final long LONG_POSITIVE_INFINITY =
394 NumericUtils.doubleToSortableLong(Double.POSITIVE_INFINITY);
395 static final int INT_NEGATIVE_INFINITY =
396 NumericUtils.floatToSortableInt(Float.NEGATIVE_INFINITY);
397 static final int INT_POSITIVE_INFINITY =
398 NumericUtils.floatToSortableInt(Float.POSITIVE_INFINITY);
401 * Subclass of FilteredTermEnum for enumerating all terms that match the
402 * sub-ranges for trie range queries.
404 * WARNING: This term enumeration is not guaranteed to be always ordered by
405 * {@link Term#compareTo}.
406 * The ordering depends on how {@link NumericUtils#splitLongRange} and
407 * {@link NumericUtils#splitIntRange} generates the sub-ranges. For
408 * {@link MultiTermQuery} ordering is not relevant.
410 private final class NumericRangeTermEnum extends FilteredTermEnum {
412 private final IndexReader reader;
413 private final LinkedList<String> rangeBounds = new LinkedList<String>();
414 private final Term termTemplate = new Term(field);
415 private String currentUpperBound = null;
417 NumericRangeTermEnum(final IndexReader reader) throws IOException {
418 this.reader = reader;
425 if (dataType == DataType.LONG) {
426 minBound = (min == null) ? Long.MIN_VALUE : min.longValue();
428 assert dataType == DataType.DOUBLE;
429 minBound = (min == null) ? LONG_NEGATIVE_INFINITY
430 : NumericUtils.doubleToSortableLong(min.doubleValue());
432 if (!minInclusive && min != null) {
433 if (minBound == Long.MAX_VALUE) break;
439 if (dataType == DataType.LONG) {
440 maxBound = (max == null) ? Long.MAX_VALUE : max.longValue();
442 assert dataType == DataType.DOUBLE;
443 maxBound = (max == null) ? LONG_POSITIVE_INFINITY
444 : NumericUtils.doubleToSortableLong(max.doubleValue());
446 if (!maxInclusive && max != null) {
447 if (maxBound == Long.MIN_VALUE) break;
451 NumericUtils.splitLongRange(new NumericUtils.LongRangeBuilder() {
453 public final void addRange(String minPrefixCoded, String maxPrefixCoded) {
454 rangeBounds.add(minPrefixCoded);
455 rangeBounds.add(maxPrefixCoded);
457 }, precisionStep, minBound, maxBound);
465 if (dataType == DataType.INT) {
466 minBound = (min == null) ? Integer.MIN_VALUE : min.intValue();
468 assert dataType == DataType.FLOAT;
469 minBound = (min == null) ? INT_NEGATIVE_INFINITY
470 : NumericUtils.floatToSortableInt(min.floatValue());
472 if (!minInclusive && min != null) {
473 if (minBound == Integer.MAX_VALUE) break;
479 if (dataType == DataType.INT) {
480 maxBound = (max == null) ? Integer.MAX_VALUE : max.intValue();
482 assert dataType == DataType.FLOAT;
483 maxBound = (max == null) ? INT_POSITIVE_INFINITY
484 : NumericUtils.floatToSortableInt(max.floatValue());
486 if (!maxInclusive && max != null) {
487 if (maxBound == Integer.MIN_VALUE) break;
491 NumericUtils.splitIntRange(new NumericUtils.IntRangeBuilder() {
493 public final void addRange(String minPrefixCoded, String maxPrefixCoded) {
494 rangeBounds.add(minPrefixCoded);
495 rangeBounds.add(maxPrefixCoded);
497 }, precisionStep, minBound, maxBound);
502 // should never happen
503 throw new IllegalArgumentException("Invalid numeric DataType");
506 // seek to first term
511 public float difference() {
515 /** this is a dummy, it is not used by this class. */
517 protected boolean endEnum() {
518 throw new UnsupportedOperationException("not implemented");
521 /** this is a dummy, it is not used by this class. */
523 protected void setEnum(TermEnum tenum) {
524 throw new UnsupportedOperationException("not implemented");
528 * Compares if current upper bound is reached.
529 * In contrast to {@link FilteredTermEnum}, a return value
530 * of <code>false</code> ends iterating the current enum
531 * and forwards to the next sub-range.
534 protected boolean termCompare(Term term) {
535 return (term.field() == field && term.text().compareTo(currentUpperBound) <= 0);
538 /** Increments the enumeration to the next element. True if one exists. */
540 public boolean next() throws IOException {
541 // if a current term exists, the actual enum is initialized:
542 // try change to next term, if no such term exists, fall-through
543 if (currentTerm != null) {
544 assert actualEnum != null;
545 if (actualEnum.next()) {
546 currentTerm = actualEnum.term();
547 if (termCompare(currentTerm))
552 // if all above fails, we go forward to the next enum,
553 // if one is available
555 while (rangeBounds.size() >= 2) {
556 assert rangeBounds.size() % 2 == 0;
557 // close the current enum and read next bounds
558 if (actualEnum != null) {
562 final String lowerBound = rangeBounds.removeFirst();
563 this.currentUpperBound = rangeBounds.removeFirst();
565 actualEnum = reader.terms(termTemplate.createTerm(lowerBound));
566 currentTerm = actualEnum.term();
567 if (currentTerm != null && termCompare(currentTerm))
569 // clear the current term for next iteration
573 // no more sub-range enums available
574 assert rangeBounds.size() == 0 && currentTerm == null;
578 /** Closes the enumeration to further activity, freeing resources. */
580 public void close() throws IOException {
582 currentUpperBound = null;