lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hi/HindiNormalizer.java

   1 package org.apache.lucene.analysis.hi;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import static org.apache.lucene.analysis.util.StemmerUtil.*;
  21
  22 /**
  23  * Normalizer for Hindi.
  24  * <p>
  25  * Normalizes text to remove some differences in spelling variations.
  26  * <p>
  27  * Implements the Hindi-language specific algorithm specified in:
  28  * <i>Word normalization in Indian languages</i>
  29  * Prasad Pingali and Vasudeva Varma.
  30  * http://web2py.iiit.ac.in/publications/default/download/inproceedings.pdf.3fe5b38c-02ee-41ce-9a8f-3e745670be32.pdf
  31  * <p>
  32  * with the following additions from <i>Hindi CLIR in Thirty Days</i>
  33  * Leah S. Larkey, Margaret E. Connell, and Nasreen AbdulJaleel.
  34  * http://maroo.cs.umass.edu/pub/web/getpdf.php?id=454:
  35  * <ul>
  36  *  <li>Internal Zero-width joiner and Zero-width non-joiners are removed
  37  *  <li>In addition to chandrabindu, NA+halant is normalized to anusvara
  38  * </ul>
  39  *
  40  */
  41 public class HindiNormalizer {
  42   /**
  43    * Normalize an input buffer of Hindi text
  44    *
  45    * @param s input buffer
  46    * @param len length of input buffer
  47    * @return length of input buffer after normalization
  48    */
  49   public int normalize(char s[], int len) {
  50
  51     for (int i = 0; i < len; i++) {
  52       switch (s[i]) {
  53         // dead n -> bindu
  54       case '\u0928':
  55         if (i + 1 < len && s[i + 1] == '\u094D') {
  56           s[i] = '\u0902';
  57           len = delete(s, i + 1, len);
  58         }
  59         break;
  60       // candrabindu -> bindu
  61       case '\u0901':
  62         s[i] = '\u0902';
  63         break;
  64       // nukta deletions
  65       case '\u093C':
  66         len = delete(s, i, len);
  67         i--;
  68         break;
  69       case '\u0929':
  70         s[i] = '\u0928';
  71         break;
  72       case '\u0931':
  73         s[i] = '\u0930';
  74         break;
  75       case '\u0934':
  76         s[i] = '\u0933';
  77         break;
  78       case '\u0958':
  79         s[i] = '\u0915';
  80         break;
  81       case '\u0959':
  82         s[i] = '\u0916';
  83         break;
  84       case '\u095A':
  85         s[i] = '\u0917';
  86         break;
  87       case '\u095B':
  88         s[i] = '\u091C';
  89         break;
  90       case '\u095C':
  91         s[i] = '\u0921';
  92         break;
  93       case '\u095D':
  94         s[i] = '\u0922';
  95         break;
  96       case '\u095E':
  97         s[i] = '\u092B';
  98         break;
  99       case '\u095F':
 100         s[i] = '\u092F';
 101         break;
 102         // zwj/zwnj -> delete
 103       case '\u200D':
 104       case '\u200C':
 105         len = delete(s, i, len);
 106         i--;
 107         break;
 108         // virama -> delete
 109       case '\u094D':
 110         len = delete(s, i, len);
 111         i--;
 112         break;
 113         // chandra/short -> replace
 114       case '\u0945':
 115       case '\u0946':
 116         s[i] = '\u0947';
 117         break;
 118       case '\u0949':
 119       case '\u094A':
 120         s[i] = '\u094B';
 121         break;
 122       case '\u090D':
 123       case '\u090E':
 124         s[i] = '\u090F';
 125         break;
 126       case '\u0911':
 127       case '\u0912':
 128         s[i] = '\u0913';
 129         break;
 130       case '\u0972':
 131         s[i] = '\u0905';
 132         break;
 133         // long -> short ind. vowels
 134       case '\u0906':
 135         s[i] = '\u0905';
 136         break;
 137       case '\u0908':
 138         s[i] = '\u0907';
 139         break;
 140       case '\u090A':
 141         s[i] = '\u0909';
 142         break;
 143       case '\u0960':
 144         s[i] = '\u090B';
 145         break;
 146       case '\u0961':
 147         s[i] = '\u090C';
 148         break;
 149       case '\u0910':
 150         s[i] = '\u090F';
 151         break;
 152       case '\u0914':
 153         s[i] = '\u0913';
 154         break;
 155         // long -> short dep. vowels
 156       case '\u0940':
 157         s[i] = '\u093F';
 158         break;
 159       case '\u0942':
 160         s[i] = '\u0941';
 161         break;
 162       case '\u0944':
 163         s[i] = '\u0943';
 164         break;
 165       case '\u0963':
 166         s[i] = '\u0962';
 167         break;
 168       case '\u0948':
 169         s[i] = '\u0947';
 170         break;
 171       case '\u094C':
 172         s[i] = '\u094B';
 173         break;
 174       default:
 175         break;
 176       }
 177     }
 178
 179     return len;
 180   }
 181 }