lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ar/ArabicNormalizer.java

   1 package org.apache.lucene.analysis.ar;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import static org.apache.lucene.analysis.util.StemmerUtil.*;
  21
  22 /**
  23  *  Normalizer for Arabic.
  24  *  <p>
  25  *  Normalization is done in-place for efficiency, operating on a termbuffer.
  26  *  <p>
  27  *  Normalization is defined as:
  28  *  <ul>
  29  *  <li> Normalization of hamza with alef seat to a bare alef.
  30  *  <li> Normalization of teh marbuta to heh
  31  *  <li> Normalization of dotless yeh (alef maksura) to yeh.
  32  *  <li> Removal of Arabic diacritics (the harakat)
  33  *  <li> Removal of tatweel (stretching character).
  34  * </ul>
  35  *
  36  */
  37 public class ArabicNormalizer {
  38   public static final char ALEF = '\u0627';
  39   public static final char ALEF_MADDA = '\u0622';
  40   public static final char ALEF_HAMZA_ABOVE = '\u0623';
  41   public static final char ALEF_HAMZA_BELOW = '\u0625';
  42
  43   public static final char YEH = '\u064A';
  44   public static final char DOTLESS_YEH = '\u0649';
  45
  46   public static final char TEH_MARBUTA = '\u0629';
  47   public static final char HEH = '\u0647';
  48
  49   public static final char TATWEEL = '\u0640';
  50
  51   public static final char FATHATAN = '\u064B';
  52   public static final char DAMMATAN = '\u064C';
  53   public static final char KASRATAN = '\u064D';
  54   public static final char FATHA = '\u064E';
  55   public static final char DAMMA = '\u064F';
  56   public static final char KASRA = '\u0650';
  57   public static final char SHADDA = '\u0651';
  58   public static final char SUKUN = '\u0652';
  59
  60   /**
  61    * Normalize an input buffer of Arabic text
  62    *
  63    * @param s input buffer
  64    * @param len length of input buffer
  65    * @return length of input buffer after normalization
  66    */
  67   public int normalize(char s[], int len) {
  68
  69     for (int i = 0; i < len; i++) {
  70       switch (s[i]) {
  71       case ALEF_MADDA:
  72       case ALEF_HAMZA_ABOVE:
  73       case ALEF_HAMZA_BELOW:
  74         s[i] = ALEF;
  75         break;
  76       case DOTLESS_YEH:
  77         s[i] = YEH;
  78         break;
  79       case TEH_MARBUTA:
  80         s[i] = HEH;
  81         break;
  82       case TATWEEL:
  83       case KASRATAN:
  84       case DAMMATAN:
  85       case FATHATAN:
  86       case FATHA:
  87       case DAMMA:
  88       case KASRA:
  89       case SHADDA:
  90       case SUKUN:
  91         len = delete(s, i, len);
  92         i--;
  93         break;
  94       default:
  95         break;
  96       }
  97     }
  98
  99     return len;
 100   }
 101 }