1 package org.apache.lucene.analysis;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
23 * A filter that replaces accented characters in the ISO Latin 1 character set
24 * (ISO-8859-1) with their unaccented equivalents. The case will not be altered.
26 * For instance, 'à' will be replaced by 'a'.
29 * @deprecated If you build a new index, use {@link ASCIIFoldingFilter}
30 * which covers a superset of Latin 1.
31 * This class is included for use with existing
32 * indexes and will be removed in a future release (possibly Lucene 4.0).
35 public final class ISOLatin1AccentFilter extends TokenFilter {
36 public ISOLatin1AccentFilter(TokenStream input) {
// Scratch buffer that removeAccents fills with the rewritten term; it is
// replaced by a larger array when 2*length would not fit.
40 private char[] output = new char[256];
// Write cursor into output; incrementToken copies output[0..outputPos) back
// into the term attribute after a rewrite.
41 private int outputPos;
// Term attribute of this stream; incrementToken reads its buffer and length
// and overwrites it via copyBuffer when a rewrite happens.
42 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
// Advances the wrapped input stream and, when it produces a token, rewrites
// the term text if it contains a character that may carry an accent.
// Declared to throw java.io.IOException, matching input.incrementToken().
45 public final boolean incrementToken() throws java.io.IOException {
// Everything below runs only when the wrapped stream delivered a token.
46 if (input.incrementToken()) {
// Term characters and term length as reported by the term attribute.
47 final char[] buffer = termAtt.buffer();
48 final int length = termAtt.length();
49 // If no characters actually require rewriting then we
50 // just return token as-is:
51 for(int i=0;i<length;i++) {
52 final char c = buffer[i];
// Only characters in U+00C0 through U+FB06 are candidates for replacement;
// this is the same range that removeAccents tests per character.
53 if (c >= '\u00c0' && c <= '\uFB06') {
// Rewrite the whole term into the output buffer (sets outputPos) ...
54 removeAccents(buffer, length);
// ... then replace the term text with the first outputPos chars of output.
55 termAtt.copyBuffer(output, 0, outputPos);
65 * Replaces accented characters in the given char[] with unaccented equivalents, writing the result to the internal output buffer.
67 public final void removeAccents(char[] input, int length) {
69 // Worst-case length required:
70 final int maxSizeNeeded = 2*length;
72 int size = output.length;
73 while (size < maxSizeNeeded)
76 if (size != output.length)
77 output = new char[size];
83 for (int i=0; i<length; i++, pos++) {
84 final char c = input[pos];
86 // Quick test: if it's not in range then just keep
88 if (c < '\u00c0' || c > '\uFB06')
89 output[outputPos++] = c;
98 output[outputPos++] = 'A';
101 output[outputPos++] = 'A';
102 output[outputPos++] = 'E';
105 output[outputPos++] = 'C';
111 output[outputPos++] = 'E';
117 output[outputPos++] = 'I';
120 output[outputPos++] = 'I';
121 output[outputPos++] = 'J';
124 output[outputPos++] = 'D';
127 output[outputPos++] = 'N';
135 output[outputPos++] = 'O';
138 output[outputPos++] = 'O';
139 output[outputPos++] = 'E';
142 output[outputPos++] = 'T';
143 output[outputPos++] = 'H';
149 output[outputPos++] = 'U';
153 output[outputPos++] = 'Y';
161 output[outputPos++] = 'a';
164 output[outputPos++] = 'a';
165 output[outputPos++] = 'e';
168 output[outputPos++] = 'c';
174 output[outputPos++] = 'e';
180 output[outputPos++] = 'i';
183 output[outputPos++] = 'i';
184 output[outputPos++] = 'j';
187 output[outputPos++] = 'd';
190 output[outputPos++] = 'n';
198 output[outputPos++] = 'o';
201 output[outputPos++] = 'o';
202 output[outputPos++] = 'e';
205 output[outputPos++] = 's';
206 output[outputPos++] = 's';
209 output[outputPos++] = 't';
210 output[outputPos++] = 'h';
216 output[outputPos++] = 'u';
220 output[outputPos++] = 'y';
223 output[outputPos++] = 'f';
224 output[outputPos++] = 'f';
227 output[outputPos++] = 'f';
228 output[outputPos++] = 'i';
231 output[outputPos++] = 'f';
232 output[outputPos++] = 'l';
234 // The following two cases are commented out because a three-character expansion can exceed maxSizeNeeded (2*length), and sizing for 3*length could be expensive.
235 // case '\uFB03': // ffi
236 // output[outputPos++] = 'f';
237 // output[outputPos++] = 'f';
238 // output[outputPos++] = 'i';
240 // case '\uFB04': // ffl
241 // output[outputPos++] = 'f';
242 // output[outputPos++] = 'f';
243 // output[outputPos++] = 'l';
246 output[outputPos++] = 'f';
247 output[outputPos++] = 't';
250 output[outputPos++] = 's';
251 output[outputPos++] = 't';
254 output[outputPos++] = c;