1 package org.apache.lucene.analysis.in;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.util.BitSet;
21 import java.util.IdentityHashMap;
22 import static java.lang.Character.UnicodeBlock.*;
23 import static org.apache.lucene.analysis.util.StemmerUtil.*;
26 * Normalizes the Unicode representation of text in Indian languages.
28 * Follows guidelines from Unicode 5.2, chapter 6, South Asian Scripts I
29 * and graphical decompositions from http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
32 public class IndicNormalizer {
/*
 * Per-script record. Elsewhere in this file its members are read as:
 *   flag       - the bit identifying the script in the flags column of the decompositions table
 *   base       - the value subtracted from a char to get its offset within the script's block
 *   decompMask - a BitSet assigned after construction by the static initializer
 * NOTE(review): the field declarations are not visible in this listing - confirm their modifiers.
 */
34 private static class ScriptData {
/* Takes the script's flag bit and block base; decompMask is not a constructor argument. */
39 ScriptData(int flag, int base) {
/*
 * Registry of the supported scripts, keyed by Unicode block.
 * An IdentityHashMap compares keys by reference (==) rather than equals().
 * The initial capacity of 9 matches the nine scripts registered below.
 */
45 private static final IdentityHashMap<Character.UnicodeBlock,ScriptData> scripts =
46 new IdentityHashMap<Character.UnicodeBlock,ScriptData>(9);
/*
 * Returns the flag bit registered for the given Unicode block.
 *
 * scripts.get() returns null for a block that was never registered, so calling this
 * with such a block throws NullPointerException. The decompositions table initializer
 * calls this method, so the scripts map must already be populated when that table is built.
 */
48 private static int flag(Character.UnicodeBlock ub) {
49 return scripts.get(ub).flag;
/*
 * Register the nine supported scripts. Each entry pairs a distinct power-of-two
 * flag (so flags can be OR-ed together in the decompositions table) with the
 * base value that normalize()/compose() subtract from a char to get its table offset.
 */
53 scripts.put(DEVANAGARI, new ScriptData(1, 0x0900));
54 scripts.put(BENGALI, new ScriptData(2, 0x0980));
55 scripts.put(GURMUKHI, new ScriptData(4, 0x0A00));
56 scripts.put(GUJARATI, new ScriptData(8, 0x0A80));
57 scripts.put(ORIYA, new ScriptData(16, 0x0B00));
58 scripts.put(TAMIL, new ScriptData(32, 0x0B80));
59 scripts.put(TELUGU, new ScriptData(64, 0x0C00));
60 scripts.put(KANNADA, new ScriptData(128, 0x0C80));
61 scripts.put(MALAYALAM, new ScriptData(256, 0x0D00));
65 * Decompositions according to Unicode 5.2,
66 * and http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
68 * Most of these are not handled by unicode normalization anyway.
70 * The numbers here are offsets from the first code point of each script's Unicode block,
71 * with -1 meaning "no third character" and 0xFF standing for the zero-width joiner (U+200D).
73 * the columns are: ch1, ch2, ch3, res, flags
74 * ch1, ch2, and ch3 are the decomposition
75 * res is the composition, and flags are the scripts to which it applies.
/*
 * Each row is { ch1, ch2, ch3, res, flags }: the sequence ch1 ch2 [ch3] is replaced by res
 * for every script whose flag bit is set in flags. All values are offsets from the script's base.
 * NOTE(review): rows with a third character are listed before the two-character row that
 * shares the same ch1/ch2 prefix (e.g. 0x46,0x42,0x55 before 0x46,0x42,-1). compose() scans
 * the table in order, so this ordering looks significant - confirm before reordering rows.
 */
77 private static final int decompositions[][] = {
78 /* devanagari, gujarati vowel candra O */
79 { 0x05, 0x3E, 0x45, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
80 /* devanagari short O */
81 { 0x05, 0x3E, 0x46, 0x12, flag(DEVANAGARI) },
82 /* devanagari, gujarati letter O */
83 { 0x05, 0x3E, 0x47, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
84 /* devanagari, gujarati letter AU */
85 { 0x05, 0x3E, 0x48, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
86 /* devanagari, bengali, gurmukhi, gujarati, oriya AA */
87 { 0x05, 0x3E, -1, 0x06, flag(DEVANAGARI) | flag(BENGALI) | flag(GURMUKHI) | flag(GUJARATI) | flag(ORIYA) },
88 /* devanagari letter candra A */
89 { 0x05, 0x45, -1, 0x72, flag(DEVANAGARI) },
90 /* gujarati vowel candra E */
91 { 0x05, 0x45, -1, 0x0D, flag(GUJARATI) },
92 /* devanagari letter short A */
93 { 0x05, 0x46, -1, 0x04, flag(DEVANAGARI) },
94 /* gujarati letter E */
95 { 0x05, 0x47, -1, 0x0F, flag(GUJARATI) },
96 /* gurmukhi, gujarati letter AI */
97 { 0x05, 0x48, -1, 0x10, flag(GURMUKHI) | flag(GUJARATI) },
98 /* devanagari, gujarati vowel candra O */
99 { 0x05, 0x49, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
100 /* devanagari short O */
101 { 0x05, 0x4A, -1, 0x12, flag(DEVANAGARI) },
102 /* devanagari, gujarati letter O */
103 { 0x05, 0x4B, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
104 /* devanagari, gurmukhi, gujarati letter AU */
105 { 0x05, 0x4C, -1, 0x14, flag(DEVANAGARI) | flag(GURMUKHI) | flag(GUJARATI) },
106 /* devanagari, gujarati vowel candra O */
107 { 0x06, 0x45, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
108 /* devanagari short O */
109 { 0x06, 0x46, -1, 0x12, flag(DEVANAGARI) },
110 /* devanagari, gujarati letter O */
111 { 0x06, 0x47, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
112 /* devanagari, gujarati letter AU */
113 { 0x06, 0x48, -1, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
114 /* malayalam letter II */
115 { 0x07, 0x57, -1, 0x08, flag(MALAYALAM) },
116 /* devanagari letter UU */
117 { 0x09, 0x41, -1, 0x0A, flag(DEVANAGARI) },
118 /* tamil, malayalam letter UU (some styles) */
119 { 0x09, 0x57, -1, 0x0A, flag(TAMIL) | flag(MALAYALAM) },
120 /* malayalam letter AI */
121 { 0x0E, 0x46, -1, 0x10, flag(MALAYALAM) },
122 /* devanagari candra E */
123 { 0x0F, 0x45, -1, 0x0D, flag(DEVANAGARI) },
124 /* devanagari short E */
125 { 0x0F, 0x46, -1, 0x0E, flag(DEVANAGARI) },
/* devanagari letter AI */
127 { 0x0F, 0x47, -1, 0x10, flag(DEVANAGARI) },
/* oriya letter AI */
129 { 0x0F, 0x57, -1, 0x10, flag(ORIYA) },
130 /* malayalam letter OO */
131 { 0x12, 0x3E, -1, 0x13, flag(MALAYALAM) },
132 /* telugu, kannada letter AU */
133 { 0x12, 0x4C, -1, 0x14, flag(TELUGU) | flag(KANNADA) },
134 /* telugu letter OO */
135 { 0x12, 0x55, -1, 0x13, flag(TELUGU) },
136 /* tamil, malayalam letter AU */
137 { 0x12, 0x57, -1, 0x14, flag(TAMIL) | flag(MALAYALAM) },
138 /* oriya letter AU */
139 { 0x13, 0x57, -1, 0x14, flag(ORIYA) },
/* devanagari qa */
141 { 0x15, 0x3C, -1, 0x58, flag(DEVANAGARI) },
142 /* devanagari, gurmukhi khha */
143 { 0x16, 0x3C, -1, 0x59, flag(DEVANAGARI) | flag(GURMUKHI) },
144 /* devanagari, gurmukhi ghha */
145 { 0x17, 0x3C, -1, 0x5A, flag(DEVANAGARI) | flag(GURMUKHI) },
146 /* devanagari, gurmukhi za */
147 { 0x1C, 0x3C, -1, 0x5B, flag(DEVANAGARI) | flag(GURMUKHI) },
148 /* devanagari dddha, bengali, oriya rra */
149 { 0x21, 0x3C, -1, 0x5C, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
150 /* devanagari, bengali, oriya rha */
151 { 0x22, 0x3C, -1, 0x5D, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
152 /* malayalam chillu nn */
153 { 0x23, 0x4D, 0xFF, 0x7A, flag(MALAYALAM) },
154 /* bengali khanda ta */
155 { 0x24, 0x4D, 0xFF, 0x4E, flag(BENGALI) },
156 /* devanagari nnna */
157 { 0x28, 0x3C, -1, 0x29, flag(DEVANAGARI) },
158 /* malayalam chillu n */
159 { 0x28, 0x4D, 0xFF, 0x7B, flag(MALAYALAM) },
160 /* devanagari, gurmukhi fa */
161 { 0x2B, 0x3C, -1, 0x5E, flag(DEVANAGARI) | flag(GURMUKHI) },
162 /* devanagari, bengali yya */
163 { 0x2F, 0x3C, -1, 0x5F, flag(DEVANAGARI) | flag(BENGALI) },
164 /* telugu letter vocalic R */
165 { 0x2C, 0x41, 0x41, 0x0B, flag(TELUGU) },
/* devanagari rra */
167 { 0x30, 0x3C, -1, 0x31, flag(DEVANAGARI) },
168 /* malayalam chillu rr */
169 { 0x30, 0x4D, 0xFF, 0x7C, flag(MALAYALAM) },
170 /* malayalam chillu l */
171 { 0x32, 0x4D, 0xFF, 0x7D, flag(MALAYALAM) },
172 /* devanagari llla */
173 { 0x33, 0x3C, -1, 0x34, flag(DEVANAGARI) },
174 /* malayalam chillu ll */
175 { 0x33, 0x4D, 0xFF, 0x7E, flag(MALAYALAM) },
176 /* telugu letter MA */
177 { 0x35, 0x41, -1, 0x2E, flag(TELUGU) },
178 /* devanagari, gujarati vowel sign candra O */
179 { 0x3E, 0x45, -1, 0x49, flag(DEVANAGARI) | flag(GUJARATI) },
180 /* devanagari vowel sign short O */
181 { 0x3E, 0x46, -1, 0x4A, flag(DEVANAGARI) },
182 /* devanagari, gujarati vowel sign O */
183 { 0x3E, 0x47, -1, 0x4B, flag(DEVANAGARI) | flag(GUJARATI) },
184 /* devanagari, gujarati vowel sign AU */
185 { 0x3E, 0x48, -1, 0x4C, flag(DEVANAGARI) | flag(GUJARATI) },
186 /* kannada vowel sign II */
187 { 0x3F, 0x55, -1, 0x40, flag(KANNADA) },
188 /* gurmukhi vowel sign UU (when stacking) */
189 { 0x41, 0x41, -1, 0x42, flag(GURMUKHI) },
190 /* tamil, malayalam vowel sign O */
191 { 0x46, 0x3E, -1, 0x4A, flag(TAMIL) | flag(MALAYALAM) },
192 /* kannada vowel sign OO */
193 { 0x46, 0x42, 0x55, 0x4B, flag(KANNADA) },
194 /* kannada vowel sign O */
195 { 0x46, 0x42, -1, 0x4A, flag(KANNADA) },
196 /* malayalam vowel sign AI (if reordered twice) */
197 { 0x46, 0x46, -1, 0x48, flag(MALAYALAM) },
198 /* telugu, kannada vowel sign EE */
199 { 0x46, 0x55, -1, 0x47, flag(TELUGU) | flag(KANNADA) },
200 /* telugu, kannada vowel sign AI */
201 { 0x46, 0x56, -1, 0x48, flag(TELUGU) | flag(KANNADA) },
202 /* tamil, malayalam vowel sign AU */
203 { 0x46, 0x57, -1, 0x4C, flag(TAMIL) | flag(MALAYALAM) },
204 /* bengali, oriya vowel sign O, tamil, malayalam vowel sign OO */
205 { 0x47, 0x3E, -1, 0x4B, flag(BENGALI) | flag(ORIYA) | flag(TAMIL) | flag(MALAYALAM) },
206 /* bengali, oriya vowel sign AU */
207 { 0x47, 0x57, -1, 0x4C, flag(BENGALI) | flag(ORIYA) },
208 /* kannada vowel sign OO */
209 { 0x4A, 0x55, -1, 0x4B, flag(KANNADA) },
210 /* gurmukhi letter I */
211 { 0x72, 0x3F, -1, 0x07, flag(GURMUKHI) },
212 /* gurmukhi letter II */
213 { 0x72, 0x40, -1, 0x08, flag(GURMUKHI) },
214 /* gurmukhi letter EE */
215 { 0x72, 0x47, -1, 0x0F, flag(GURMUKHI) },
216 /* gurmukhi letter U */
217 { 0x73, 0x41, -1, 0x09, flag(GURMUKHI) },
218 /* gurmukhi letter UU */
219 { 0x73, 0x42, -1, 0x0A, flag(GURMUKHI) },
220 /* gurmukhi letter OO */
221 { 0x73, 0x4B, -1, 0x13, flag(GURMUKHI) },
/*
 * Build, for every registered script, a bit mask of the first-character offsets
 * (column 0 of the table) that begin at least one decomposition applicable to that script.
 * normalize() tests this mask before calling compose(), so chars that cannot start a
 * decomposition skip the table scan.
 */
225 for (ScriptData sd : scripts.values()) {
// 0x7F is only the BitSet's initial size in bits; the offsets set below are table column-0 values.
226 sd.decompMask = new BitSet(0x7F);
227 for (int i = 0; i < decompositions.length; i++) {
228 final int ch = decompositions[i][0];
229 final int flags = decompositions[i][4];
// Only rows whose flags include this script contribute to its mask.
230 if ((flags & sd.flag) != 0)
231 sd.decompMask.set(ch);
237 * Normalizes input text, and returns the new length.
238 * The length will always be less than or equal to the existing length.
240 * @param text input text
241 * @param len valid length
242 * @return normalized length
/*
 * Walks text[0..len) and, wherever a char can start a decomposition for its script,
 * asks compose() to collapse the sequence in place. len is reassigned from compose()'s
 * result, so the loop bound tracks the current length as chars are removed.
 */
244 public int normalize(char text[], int len) {
245 for (int i = 0; i < len; i++) {
246 final Character.UnicodeBlock block = Character.UnicodeBlock.of(text[i]);
// scripts.get() yields null for any block other than the nine registered ones.
// NOTE(review): sd is dereferenced below; a null guard is expected between these
// lines but is not visible in this listing - confirm against the full source.
247 final ScriptData sd = scripts.get(block);
// Offset of this char from the script's base; this is the unit the table and mask use.
249 final int ch = text[i] - sd.base;
// Cheap pre-check: only offsets that start some decomposition for this script reach compose().
250 if (sd.decompMask.get(ch))
251 len = compose(ch, block, sd, text, i, len);
258 * Compose into standard form any compositions in the decompositions table.
/*
 * Tries to replace the sequence that starts at text[pos] with a single composed char.
 *
 * ch0    - text[pos] as an offset from sd.base (as computed by normalize())
 * block0 - Unicode block of text[pos]
 * sd     - script data for block0
 * text   - buffer, modified in place on a match
 * pos    - index of the first char of the candidate sequence
 * len    - current valid length of text
 *
 * The caller assigns the result back to its length, so this returns the length after any deletion.
 * NOTE(review): the return statements and the declaration of ch2 are not visible in this
 * listing - confirm the early-exit paths against the full source.
 */
260 private int compose(int ch0, Character.UnicodeBlock block0, ScriptData sd,
261 char text[], int pos, int len) {
262 if (pos + 1 >= len) /* need at least 2 chars! */
// Second char, expressed as an offset from the same script base as ch0.
265 final int ch1 = text[pos + 1] - sd.base;
266 final Character.UnicodeBlock block1 = Character.UnicodeBlock.of(text[pos + 1]);
267 if (block1 != block0) /* needs to be the same writing system */
// Optional third char, used only by the three-column rows of the table.
273 ch2 = text[pos + 2] - sd.base;
274 Character.UnicodeBlock block2 = Character.UnicodeBlock.of(text[pos + 2]);
// U+200D is the zero-width joiner; presumably it is mapped here to the 0xFF value the table uses for it - confirm.
275 if (text[pos + 2] == '\u200D') // ZWJ
277 else if (block2 != block1) // still allow a 2-char match
// Linear scan of the table, in row order, for rows that start with ch0 and apply to this script.
281 for (int i = 0; i < decompositions.length; i++)
282 if (decompositions[i][0] == ch0 && (decompositions[i][4] & sd.flag) != 0) {
// A negative third column means the row is a two-char sequence, so ch2 is not compared.
283 if (decompositions[i][1] == ch1 && (decompositions[i][2] < 0 || decompositions[i][2] == ch2)) {
// Overwrite the first char with the composed form (script base + res column).
284 text[pos] = (char) (sd.base + decompositions[i][3]);
// Remove the second char. delete() is not defined in this class - presumably StemmerUtil.delete via the static import.
285 len = delete(text, pos + 1, len);
// For a three-char row also remove the third char, which now sits at pos + 1.
286 if (decompositions[i][2] >= 0)
287 len = delete(text, pos + 1, len);