1 package org.apache.lucene.collation;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import org.apache.lucene.analysis.TokenFilter;
22 import org.apache.lucene.analysis.TokenStream;
23 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
24 import org.apache.lucene.util.IndexableBinaryStringTools;
26 import java.io.IOException;
27 import java.text.Collator;
32 * Converts each token into its {@link java.text.CollationKey}, and then
33 * encodes the CollationKey with {@link IndexableBinaryStringTools}, to allow
34 * it to be stored as an index term.
37 * <strong>WARNING:</strong> Make sure you use exactly the same Collator at
38 * index and query time -- CollationKeys are only comparable when produced by
39 * the same Collator. Since {@link java.text.RuleBasedCollator}s are not
40 * independently versioned, it is unsafe to search against stored
41 * CollationKeys unless the following are exactly the same (best practice is
42 * to store this information with the index and check that they remain the
43 * same at query time):
47 * <li>JVM version, including patch version</li>
49 * The language (and country and variant, if specified) of the Locale
50 * used when constructing the collator via
51 * {@link Collator#getInstance(java.util.Locale)}.
54 * The collation strength used - see {@link Collator#setStrength(int)}
58 * The <code>ICUCollationKeyFilter</code> in the icu package of Lucene's
59 * contrib area uses ICU4J's Collator, which makes its
60 * version available, thus allowing collation to be versioned independently
61 * from the JVM. ICUCollationKeyFilter is also significantly faster and
62 * generates significantly shorter keys than CollationKeyFilter. See
63 * <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
64 * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
65 * generation timing and key length comparisons between ICU4J and
66 * java.text.Collator over several languages.
69 * CollationKeys generated by java.text.Collators are not compatible
70 * with those generated by ICU Collators. Specifically, if you use
71 * CollationKeyFilter to generate index terms, do not use
72 * ICUCollationKeyFilter on the query side, or vice versa.
75 public final class CollationKeyFilter extends TokenFilter {
76 private final Collator collator;
77 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
80 * @param input Source token stream
81 * @param collator CollationKey generator
83 public CollationKeyFilter(TokenStream input, Collator collator) {
85 // clone in case the JRE doesn't properly sync,
86 // or to reduce contention in case it does
87 this.collator = (Collator) collator.clone();
91 public boolean incrementToken() throws IOException {
92 if (input.incrementToken()) {
93 byte[] collationKey = collator.getCollationKey(termAtt.toString()).toByteArray();
94 int encodedLength = IndexableBinaryStringTools.getEncodedLength(
95 collationKey, 0, collationKey.length);
96 termAtt.resizeBuffer(encodedLength);
97 termAtt.setLength(encodedLength);
98 IndexableBinaryStringTools.encode(collationKey, 0, collationKey.length,
99 termAtt.buffer(), 0, encodedLength);