1 package org.apache.lucene.collation;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
21 import org.apache.lucene.analysis.Analyzer;
22 import org.apache.lucene.analysis.TokenStream;
23 import org.apache.lucene.analysis.KeywordTokenizer;
24 import org.apache.lucene.analysis.Tokenizer;
26 import java.text.Collator;
27 import java.io.Reader;
28 import java.io.IOException;
32 * Filters {@link KeywordTokenizer} with {@link CollationKeyFilter}.
35 * Converts the token into its {@link java.text.CollationKey}, and then
36 * encodes the CollationKey with
37 * {@link org.apache.lucene.util.IndexableBinaryStringTools}, to allow
38 * it to be stored as an index term.
41 * <strong>WARNING:</strong> Make sure you use exactly the same Collator at
42 * index and query time -- CollationKeys are only comparable when produced by
43 * the same Collator. Since {@link java.text.RuleBasedCollator}s are not
44 * independently versioned, it is unsafe to search against stored
45 * CollationKeys unless the following are exactly the same (best practice is
46 * to store this information with the index and check that they remain the
47 * same at query time):
51 * <li>JVM version, including patch version</li>
53 * The language (and country and variant, if specified) of the Locale
54 * used when constructing the collator via
55 * {@link Collator#getInstance(java.util.Locale)}.
58 * The collation strength used - see {@link Collator#setStrength(int)}
62 * The <code>ICUCollationKeyAnalyzer</code> in the icu package of Lucene's
63 * contrib area uses ICU4J's Collator, which makes its
64 * version available, thus allowing collation to be versioned
65 * independently from the JVM. ICUCollationKeyAnalyzer is also significantly
66 * faster and generates significantly shorter keys than CollationKeyAnalyzer.
67 * See <a href="http://site.icu-project.org/charts/collation-icu4j-sun"
68 * >http://site.icu-project.org/charts/collation-icu4j-sun</a> for key
69 * generation timing and key length comparisons between ICU4J and
70 * java.text.Collator over several languages.
73 * CollationKeys generated by java.text.Collators are not compatible
74 * with those generated by ICU Collators. Specifically, if you use
75 * CollationKeyAnalyzer to generate index terms, do not use
76 * ICUCollationKeyAnalyzer on the query side, or vice versa.
79 public final class CollationKeyAnalyzer extends Analyzer {
80 private Collator collator;
82 public CollationKeyAnalyzer(Collator collator) {
83 this.collator = collator;
87 public TokenStream tokenStream(String fieldName, Reader reader) {
88 TokenStream result = new KeywordTokenizer(reader);
89 result = new CollationKeyFilter(result, collator);
93 private class SavedStreams {
99 public TokenStream reusableTokenStream(String fieldName, Reader reader)
102 SavedStreams streams = (SavedStreams)getPreviousTokenStream();
103 if (streams == null) {
104 streams = new SavedStreams();
105 streams.source = new KeywordTokenizer(reader);
106 streams.result = new CollationKeyFilter(streams.source, collator);
107 setPreviousTokenStream(streams);
109 streams.source.reset(reader);
111 return streams.result;