1 package org.apache.lucene.collation;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import com.ibm.icu.text.Collator;
22 import com.ibm.icu.text.RawCollationKey;
24 import org.apache.lucene.analysis.TokenFilter;
25 import org.apache.lucene.analysis.TokenStream;
26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27 import org.apache.lucene.util.IndexableBinaryStringTools;
29 import org.apache.lucene.collation.CollationKeyFilter; // javadocs
31 import java.io.IOException;
36 * Converts each token into its {@link com.ibm.icu.text.CollationKey}, and
37 * then encodes the CollationKey with {@link IndexableBinaryStringTools}, to
38 * allow it to be stored as an index term.
41 * <strong>WARNING:</strong> Make sure you use exactly the same Collator at
42 * index and query time -- CollationKeys are only comparable when produced by
43 * the same Collator. {@link com.ibm.icu.text.RuleBasedCollator}s are
44 * independently versioned, so it is safe to search against stored
45 * CollationKeys if the following are exactly the same (best practice is
46 * to store this information with the index and check that they remain the
47 * same at query time):
51 * Collator version - see {@link Collator#getVersion()}
54 * The collation strength used - see {@link Collator#setStrength(int)}
58 * CollationKeys generated by ICU Collators are not compatible with those
59 * generated by java.text.Collators. Specifically, if you use
60 * ICUCollationKeyFilter to generate index terms, do not use
61 * {@link CollationKeyFilter} on the query side, or vice versa.
64 * ICUCollationKeyFilter is significantly faster and generates significantly
65 * shorter keys than CollationKeyFilter. See
66 * <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
67 * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
68 * generation timing and key length comparisons between ICU4J and
69 * java.text.Collator over several languages.
72 public final class ICUCollationKeyFilter extends TokenFilter {
// Collator used to build the collation keys; the constructor assigns it a clone
// of the collator it is given.
73 private Collator collator = null;
// Scratch key passed to collator.getRawCollationKey(...) for every token, so the
// same RawCollationKey object is handed back to the collator each time.
74 private RawCollationKey reusableKey = new RawCollationKey();
// Term attribute of this stream: incrementToken() reads the token text from it
// and writes the encoded collation key back into its buffer.
75 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
79 * @param input Source token stream
80 * @param collator CollationKey generator; the filter stores a clone of it, not the instance passed in
82 public ICUCollationKeyFilter(TokenStream input, Collator collator) {
// The filter keeps its own copy of the supplied collator rather than the
// caller's instance; the ICU architecture page linked below is the cited reason.
84 // clone the collator: see http://userguide.icu-project.org/collation/architecture
86 this.collator = (Collator) collator.clone();
87 } catch (CloneNotSupportedException e) {
// Rethrown unchecked, with the original exception preserved as the cause, so
// this constructor does not declare a checked exception.
88 throw new RuntimeException(e);
93 public boolean incrementToken() throws IOException {
94 if (input.incrementToken()) {
95 char[] termBuffer = termAtt.buffer();
96 String termText = new String(termBuffer, 0, termAtt.length());
97 collator.getRawCollationKey(termText, reusableKey);
98 int encodedLength = IndexableBinaryStringTools.getEncodedLength(
99 reusableKey.bytes, 0, reusableKey.size);
100 if (encodedLength > termBuffer.length) {
101 termAtt.resizeBuffer(encodedLength);
103 termAtt.setLength(encodedLength);
104 IndexableBinaryStringTools.encode(reusableKey.bytes, 0, reusableKey.size,
105 termAtt.buffer(), 0, encodedLength);