lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/ISOLatin1AccentFilter.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  21
  22 /**
  23  * A filter that replaces accented characters in the ISO Latin 1 character set
  24  * (ISO-8859-1) by their unaccented equivalent. The case will not be altered.
  25  * <p>
  26  * For instance, '&agrave;' will be replaced by 'a'.
  27  * <p>
  28  *
  29  * @deprecated If you build a new index, use {@link ASCIIFoldingFilter}
  30  * which covers a superset of Latin 1.
  31  * This class is included for use with existing
  32  * indexes and will be removed in a future release (possibly Lucene 4.0).
  33  */
  34 @Deprecated
  35 public final class ISOLatin1AccentFilter extends TokenFilter {
  36   public ISOLatin1AccentFilter(TokenStream input) {
  37     super(input);
  38   }
  39
  40   private char[] output = new char[256];
  41   private int outputPos;
  42   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  43
  44   @Override
  45   public final boolean incrementToken() throws java.io.IOException {
  46     if (input.incrementToken()) {
  47       final char[] buffer = termAtt.buffer();
  48       final int length = termAtt.length();
  49       // If no characters actually require rewriting then we
  50       // just return token as-is:
  51       for(int i=0;i<length;i++) {
  52         final char c = buffer[i];
  53         if (c >= '\u00c0' && c <= '\uFB06') {
  54           removeAccents(buffer, length);
  55           termAtt.copyBuffer(output, 0, outputPos);
  56           break;
  57         }
  58       }
  59       return true;
  60     } else
  61       return false;
  62   }
  63
  64   /**
  65    * To replace accented characters in a String by unaccented equivalents.
  66    */
  67   public final void removeAccents(char[] input, int length) {
  68
  69     // Worst-case length required:
  70     final int maxSizeNeeded = 2*length;
  71
  72     int size = output.length;
  73     while (size < maxSizeNeeded)
  74       size *= 2;
  75
  76     if (size != output.length)
  77       output = new char[size];
  78
  79     outputPos = 0;
  80
  81     int pos = 0;
  82
  83     for (int i=0; i<length; i++, pos++) {
  84       final char c = input[pos];
  85
  86       // Quick test: if it's not in range then just keep
  87       // current character
  88       if (c < '\u00c0' || c > '\uFB06')
  89         output[outputPos++] = c;
  90       else {
  91         switch (c) {
  92         case '\u00C0' : // À
  93         case '\u00C1' : // Á
  94         case '\u00C2' : // Â
  95         case '\u00C3' : // Ã
  96         case '\u00C4' : // Ä
  97         case '\u00C5' : // Å
  98           output[outputPos++] = 'A';
  99           break;
 100         case '\u00C6' : // Æ
 101           output[outputPos++] = 'A';
 102           output[outputPos++] = 'E';
 103           break;
 104         case '\u00C7' : // Ç
 105           output[outputPos++] = 'C';
 106           break;
 107         case '\u00C8' : // È
 108         case '\u00C9' : // É
 109         case '\u00CA' : // Ê
 110         case '\u00CB' : // Ë
 111           output[outputPos++] = 'E';
 112           break;
 113         case '\u00CC' : // Ì
 114         case '\u00CD' : // Í
 115         case '\u00CE' : // Î
 116         case '\u00CF' : // Ï
 117           output[outputPos++] = 'I';
 118           break;
 119         case '\u0132' : // Ĳ
 120             output[outputPos++] = 'I';
 121             output[outputPos++] = 'J';
 122             break;
 123         case '\u00D0' : // Ð
 124           output[outputPos++] = 'D';
 125           break;
 126         case '\u00D1' : // Ñ
 127           output[outputPos++] = 'N';
 128           break;
 129         case '\u00D2' : // Ò
 130         case '\u00D3' : // Ó
 131         case '\u00D4' : // Ô
 132         case '\u00D5' : // Õ
 133         case '\u00D6' : // Ö
 134         case '\u00D8' : // Ø
 135           output[outputPos++] = 'O';
 136           break;
 137         case '\u0152' : // Œ
 138           output[outputPos++] = 'O';
 139           output[outputPos++] = 'E';
 140           break;
 141         case '\u00DE' : // Þ
 142           output[outputPos++] = 'T';
 143           output[outputPos++] = 'H';
 144           break;
 145         case '\u00D9' : // Ù
 146         case '\u00DA' : // Ú
 147         case '\u00DB' : // Û
 148         case '\u00DC' : // Ü
 149           output[outputPos++] = 'U';
 150           break;
 151         case '\u00DD' : // Ý
 152         case '\u0178' : // Ÿ
 153           output[outputPos++] = 'Y';
 154           break;
 155         case '\u00E0' : // à
 156         case '\u00E1' : // á
 157         case '\u00E2' : // â
 158         case '\u00E3' : // ã
 159         case '\u00E4' : // ä
 160         case '\u00E5' : // å
 161           output[outputPos++] = 'a';
 162           break;
 163         case '\u00E6' : // æ
 164           output[outputPos++] = 'a';
 165           output[outputPos++] = 'e';
 166           break;
 167         case '\u00E7' : // ç
 168           output[outputPos++] = 'c';
 169           break;
 170         case '\u00E8' : // è
 171         case '\u00E9' : // é
 172         case '\u00EA' : // ê
 173         case '\u00EB' : // ë
 174           output[outputPos++] = 'e';
 175           break;
 176         case '\u00EC' : // ì
 177         case '\u00ED' : // í
 178         case '\u00EE' : // î
 179         case '\u00EF' : // ï
 180           output[outputPos++] = 'i';
 181           break;
 182         case '\u0133' : // ĳ
 183             output[outputPos++] = 'i';
 184             output[outputPos++] = 'j';
 185             break;
 186         case '\u00F0' : // ð
 187           output[outputPos++] = 'd';
 188           break;
 189         case '\u00F1' : // ñ
 190           output[outputPos++] = 'n';
 191           break;
 192         case '\u00F2' : // ò
 193         case '\u00F3' : // ó
 194         case '\u00F4' : // ô
 195         case '\u00F5' : // õ
 196         case '\u00F6' : // ö
 197         case '\u00F8' : // ø
 198           output[outputPos++] = 'o';
 199           break;
 200         case '\u0153' : // œ
 201           output[outputPos++] = 'o';
 202           output[outputPos++] = 'e';
 203           break;
 204         case '\u00DF' : // ß
 205           output[outputPos++] = 's';
 206           output[outputPos++] = 's';
 207           break;
 208         case '\u00FE' : // þ
 209           output[outputPos++] = 't';
 210           output[outputPos++] = 'h';
 211           break;
 212         case '\u00F9' : // ù
 213         case '\u00FA' : // ú
 214         case '\u00FB' : // û
 215         case '\u00FC' : // ü
 216           output[outputPos++] = 'u';
 217           break;
 218         case '\u00FD' : // ý
 219         case '\u00FF' : // ÿ
 220           output[outputPos++] = 'y';
 221           break;
 222         case '\uFB00': // ﬀ
 223             output[outputPos++] = 'f';
 224             output[outputPos++] = 'f';
 225             break;
 226         case '\uFB01': // ﬁ
 227             output[outputPos++] = 'f';
 228             output[outputPos++] = 'i';
 229             break;
 230         case '\uFB02': // ﬂ
 231             output[outputPos++] = 'f';
 232             output[outputPos++] = 'l';
 233             break;
 234         // following 2 are commented as they can break the maxSizeNeeded (and doing *3 could be expensive)
 235 //        case '\uFB03': // ﬃ
 236 //            output[outputPos++] = 'f';
 237 //            output[outputPos++] = 'f';
 238 //            output[outputPos++] = 'i';
 239 //            break;
 240 //        case '\uFB04': // ﬄ
 241 //            output[outputPos++] = 'f';
 242 //            output[outputPos++] = 'f';
 243 //            output[outputPos++] = 'l';
 244 //            break;
 245         case '\uFB05': // ﬅ
 246             output[outputPos++] = 'f';
 247             output[outputPos++] = 't';
 248             break;
 249         case '\uFB06': // ﬆ
 250             output[outputPos++] = 's';
 251             output[outputPos++] = 't';
 252           break;
 253         default :
 254           output[outputPos++] = c;
 255           break;
 256         }
 257       }
 258     }
 259   }
 260 }