lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/el/GreekLowerCaseFilter.java

   1 package org.apache.lucene.analysis.el;
   2
   3 /**
   4  * Copyright 2005 The Apache Software Foundation
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  * you may not use this file except in compliance with the License.
   8  * You may obtain a copy of the License at
   9  *
  10  *     http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  * Unless required by applicable law or agreed to in writing, software
  13  * distributed under the License is distributed on an "AS IS" BASIS,
  14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15  * See the License for the specific language governing permissions and
  16  * limitations under the License.
  17  */
  18
  19 import java.io.IOException;
  20
  21 import org.apache.lucene.analysis.TokenFilter;
  22 import org.apache.lucene.analysis.TokenStream;
  23 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  24 import org.apache.lucene.util.CharacterUtils;
  25 import org.apache.lucene.util.Version;
  26
  27 /**
  28  * Normalizes token text to lower case, removes some Greek diacritics,
  29  * and standardizes final sigma to sigma.
  30  * <a name="version"/>
  31  * <p>You must specify the required {@link Version}
  32  * compatibility when creating GreekLowerCaseFilter:
  33  * <ul>
  34  *   <li> As of 3.1, supplementary characters are properly lowercased.
  35  * </ul>
  36  */
  37 public final class GreekLowerCaseFilter extends TokenFilter {
  38   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  39   private final CharacterUtils charUtils;
  40
  41   /** @deprecated Use {@link #GreekLowerCaseFilter(Version, TokenStream)} instead. */
  42   @Deprecated
  43   public GreekLowerCaseFilter(TokenStream in) {
  44     this(Version.LUCENE_30, in);
  45   }
  46
  47   /**
  48    * Create a GreekLowerCaseFilter that normalizes Greek token text.
  49    *
  50    * @param matchVersion Lucene compatibility version,
  51    *   See <a href="#version">above</a>
  52    * @param in TokenStream to filter
  53    */
  54   public GreekLowerCaseFilter(Version matchVersion, TokenStream in) {
  55     super(in);
  56     this.charUtils = CharacterUtils.getInstance(matchVersion);
  57   }
  58
  59   @Override
  60   public boolean incrementToken() throws IOException {
  61     if (input.incrementToken()) {
  62       char[] chArray = termAtt.buffer();
  63       int chLen = termAtt.length();
  64       for (int i = 0; i < chLen;) {
  65         i += Character.toChars(
  66             lowerCase(charUtils.codePointAt(chArray, i)), chArray, i);
  67        }
  68       return true;
  69     } else {
  70       return false;
  71     }
  72   }
  73
  74   private int lowerCase(int codepoint) {
  75     switch(codepoint) {
  76       /* There are two lowercase forms of sigma:
  77        *   U+03C2: small final sigma (end of word)
  78        *   U+03C3: small sigma (otherwise)
  79        *
  80        * Standardize both to U+03C3
  81        */
  82       case '\u03C2': /* small final sigma */
  83         return '\u03C3'; /* small sigma */
  84
  85       /* Some greek characters contain diacritics.
  86        * This filter removes these, converting to the lowercase base form.
  87        */
  88
  89       case '\u0386': /* capital alpha with tonos */
  90       case '\u03AC': /* small alpha with tonos */
  91         return '\u03B1'; /* small alpha */
  92
  93       case '\u0388': /* capital epsilon with tonos */
  94       case '\u03AD': /* small epsilon with tonos */
  95         return '\u03B5'; /* small epsilon */
  96
  97       case '\u0389': /* capital eta with tonos */
  98       case '\u03AE': /* small eta with tonos */
  99         return '\u03B7'; /* small eta */
 100
 101       case '\u038A': /* capital iota with tonos */
 102       case '\u03AA': /* capital iota with dialytika */
 103       case '\u03AF': /* small iota with tonos */
 104       case '\u03CA': /* small iota with dialytika */
 105       case '\u0390': /* small iota with dialytika and tonos */
 106         return '\u03B9'; /* small iota */
 107
 108       case '\u038E': /* capital upsilon with tonos */
 109       case '\u03AB': /* capital upsilon with dialytika */
 110       case '\u03CD': /* small upsilon with tonos */
 111       case '\u03CB': /* small upsilon with dialytika */
 112       case '\u03B0': /* small upsilon with dialytika and tonos */
 113         return '\u03C5'; /* small upsilon */
 114
 115       case '\u038C': /* capital omicron with tonos */
 116       case '\u03CC': /* small omicron with tonos */
 117         return '\u03BF'; /* small omicron */
 118
 119       case '\u038F': /* capital omega with tonos */
 120       case '\u03CE': /* small omega with tonos */
 121         return '\u03C9'; /* small omega */
 122
 123       /* The previous implementation did the conversion below.
 124        * Only implemented for backwards compatibility with old indexes.
 125        */
 126
 127       case '\u03A2': /* reserved */
 128         return '\u03C2'; /* small final sigma */
 129
 130       default:
 131         return Character.toLowerCase(codepoint);
 132     }
 133   }
 134 }