1 package org.apache.lucene.analysis.hi;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import static org.apache.lucene.analysis.util.StemmerUtil.*;
23 * Normalizer for Hindi.
25 * Normalizes text to reduce differences between spelling variants of the same word.
27 * Implements the Hindi-language specific algorithm specified in:
28 * <i>Word normalization in Indian languages</i>
29 * Prasad Pingali and Vasudeva Varma.
30 * http://web2py.iiit.ac.in/publications/default/download/inproceedings.pdf.3fe5b38c-02ee-41ce-9a8f-3e745670be32.pdf
32 * with the following additions from <i>Hindi CLIR in Thirty Days</i>
33 * Leah S. Larkey, Margaret E. Connell, and Nasreen AbdulJaleel.
34 * http://maroo.cs.umass.edu/pub/web/getpdf.php?id=454:
36 * <li>Internal zero-width joiners and zero-width non-joiners are removed
37 * <li>In addition to chandrabindu, NA+halant is normalized to anusvara
41 public class HindiNormalizer {
43 * Normalize an input buffer of Hindi text
45 * @param s input buffer
46 * @param len length of input buffer
47 * @return length of input buffer after normalization
49 public int normalize(char s[], int len) {
51 for (int i = 0; i < len; i++) {
55 if (i + 1 < len && s[i + 1] == '\u094D') {
57 len = delete(s, i + 1, len);
60 // candrabindu -> bindu
66 len = delete(s, i, len);
102 // zwj/zwnj -> delete
105 len = delete(s, i, len);
110 len = delete(s, i, len);
113 // chandra/short -> replace
133 // long -> short ind. vowels
155 // long -> short dep. vowels