lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/tr/TurkishLowerCaseFilter.java

   1 package org.apache.lucene.analysis.tr;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21
  22 import org.apache.lucene.analysis.TokenFilter;
  23 import org.apache.lucene.analysis.TokenStream;
  24 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  25
  26 /**
  27  * Normalizes Turkish token text to lower case.
  28  * <p>
  29  * Turkish and Azeri have unique casing behavior for some characters. This
  30  * filter applies Turkish lowercase rules. For more information, see <a
  31  * href="http://en.wikipedia.org/wiki/Turkish_dotted_and_dotless_I"
  32  * >http://en.wikipedia.org/wiki/Turkish_dotted_and_dotless_I</a>
  33  * </p>
  34  */
  35 public final class TurkishLowerCaseFilter extends TokenFilter {
  36   private static final int LATIN_CAPITAL_LETTER_I = '\u0049';
  37   private static final int LATIN_SMALL_LETTER_I = '\u0069';
  38   private static final int LATIN_SMALL_LETTER_DOTLESS_I = '\u0131';
  39   private static final int COMBINING_DOT_ABOVE = '\u0307';
  40   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  41
  42   /**
  43    * Create a new TurkishLowerCaseFilter, that normalizes Turkish token text
  44    * to lower case.
  45    *
  46    * @param in TokenStream to filter
  47    */
  48   public TurkishLowerCaseFilter(TokenStream in) {
  49     super(in);
  50   }
  51
  52   @Override
  53   public final boolean incrementToken() throws IOException {
  54     boolean iOrAfter = false;
  55
  56     if (input.incrementToken()) {
  57       final char[] buffer = termAtt.buffer();
  58       int length = termAtt.length();
  59       for (int i = 0; i < length;) {
  60         final int ch = Character.codePointAt(buffer, i);
  61
  62         iOrAfter = (ch == LATIN_CAPITAL_LETTER_I ||
  63             (iOrAfter && Character.getType(ch) == Character.NON_SPACING_MARK));
  64
  65         if (iOrAfter) { // all the special I turkish handling happens here.
  66           switch(ch) {
  67             // remove COMBINING_DOT_ABOVE to mimic composed lowercase
  68             case COMBINING_DOT_ABOVE:
  69               length = delete(buffer, i, length);
  70               continue;
  71             // i itself, it depends if it is followed by COMBINING_DOT_ABOVE
  72             // if it is, we will make it small i and later remove the dot
  73             case LATIN_CAPITAL_LETTER_I:
  74               if (isBeforeDot(buffer, i + 1, length)) {
  75                 buffer[i] = LATIN_SMALL_LETTER_I;
  76               } else {
  77                 buffer[i] = LATIN_SMALL_LETTER_DOTLESS_I;
  78                 // below is an optimization. no COMBINING_DOT_ABOVE follows,
  79                 // so don't waste time calculating Character.getType(), etc
  80                 iOrAfter = false;
  81               }
  82               i++;
  83               continue;
  84           }
  85         }
  86
  87         i += Character.toChars(Character.toLowerCase(ch), buffer, i);
  88       }
  89
  90       termAtt.setLength(length);
  91       return true;
  92     } else
  93       return false;
  94   }
  95
  96
  97   /**
  98    * lookahead for a combining dot above.
  99    * other NSMs may be in between.
 100    */
 101   private boolean isBeforeDot(char s[], int pos, int len) {
 102     for (int i = pos; i < len;) {
 103       final int ch = Character.codePointAt(s, i);
 104       if (Character.getType(ch) != Character.NON_SPACING_MARK)
 105         return false;
 106       if (ch == COMBINING_DOT_ABOVE)
 107         return true;
 108       i += Character.charCount(ch);
 109     }
 110
 111     return false;
 112   }
 113
 114   /**
 115    * delete a character in-place.
 116    * rarely happens, only if COMBINING_DOT_ABOVE is found after an i
 117    */
 118   private int delete(char s[], int pos, int len) {
 119     if (pos < len)
 120       System.arraycopy(s, pos + 1, s, pos, len - pos - 1);
 121
 122     return len - 1;
 123   }
 124 }