--- /dev/null
+package org.apache.lucene.analysis.in;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.util.BitSet;
+import java.util.IdentityHashMap;
+import static java.lang.Character.UnicodeBlock.*;
+import static org.apache.lucene.analysis.util.StemmerUtil.*;
+
+/**
+ * Normalizes the Unicode representation of text in Indian languages.
+ * <p>
+ * Follows guidelines from Unicode 5.2, chapter 6, South Asian Scripts I
+ * and graphical decompositions from http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
+ * </p>
+ */
+public class IndicNormalizer {
+
+ private static class ScriptData {
+ final int flag;
+ final int base;
+ BitSet decompMask;
+
+ ScriptData(int flag, int base) {
+ this.flag = flag;
+ this.base = base;
+ }
+ }
+
+ private static final IdentityHashMap<Character.UnicodeBlock,ScriptData> scripts =
+ new IdentityHashMap<Character.UnicodeBlock,ScriptData>(9);
+
+ private static int flag(Character.UnicodeBlock ub) {
+ return scripts.get(ub).flag;
+ }
+
+ static {
+ scripts.put(DEVANAGARI, new ScriptData(1, 0x0900));
+ scripts.put(BENGALI, new ScriptData(2, 0x0980));
+ scripts.put(GURMUKHI, new ScriptData(4, 0x0A00));
+ scripts.put(GUJARATI, new ScriptData(8, 0x0A80));
+ scripts.put(ORIYA, new ScriptData(16, 0x0B00));
+ scripts.put(TAMIL, new ScriptData(32, 0x0B80));
+ scripts.put(TELUGU, new ScriptData(64, 0x0C00));
+ scripts.put(KANNADA, new ScriptData(128, 0x0C80));
+ scripts.put(MALAYALAM, new ScriptData(256, 0x0D00));
+ }
+
+ /**
+ * Decompositions according to Unicode 5.2,
+ * and http://ldc.upenn.edu/myl/IndianScriptsUnicode.html
+ *
+ * Most of these are not handled by unicode normalization anyway.
+ *
+ * The numbers here represent offsets into the respective codepages,
+ * with -1 representing null and 0xFF representing zero-width joiner.
+ *
+ * the columns are: ch1, ch2, ch3, res, flags
+ * ch1, ch2, and ch3 are the decomposition
+ * res is the composition, and flags are the scripts to which it applies.
+ */
+ private static final int decompositions[][] = {
+ /* devanagari, gujarati vowel candra O */
+ { 0x05, 0x3E, 0x45, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari short O */
+ { 0x05, 0x3E, 0x46, 0x12, flag(DEVANAGARI) },
+ /* devanagari, gujarati letter O */
+ { 0x05, 0x3E, 0x47, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari letter AI, gujarati letter AU */
+ { 0x05, 0x3E, 0x48, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari, bengali, gurmukhi, gujarati, oriya AA */
+ { 0x05, 0x3E, -1, 0x06, flag(DEVANAGARI) | flag(BENGALI) | flag(GURMUKHI) | flag(GUJARATI) | flag(ORIYA) },
+ /* devanagari letter candra A */
+ { 0x05, 0x45, -1, 0x72, flag(DEVANAGARI) },
+ /* gujarati vowel candra E */
+ { 0x05, 0x45, -1, 0x0D, flag(GUJARATI) },
+ /* devanagari letter short A */
+ { 0x05, 0x46, -1, 0x04, flag(DEVANAGARI) },
+ /* gujarati letter E */
+ { 0x05, 0x47, -1, 0x0F, flag(GUJARATI) },
+ /* gurmukhi, gujarati letter AI */
+ { 0x05, 0x48, -1, 0x10, flag(GURMUKHI) | flag(GUJARATI) },
+ /* devanagari, gujarati vowel candra O */
+ { 0x05, 0x49, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari short O */
+ { 0x05, 0x4A, -1, 0x12, flag(DEVANAGARI) },
+ /* devanagari, gujarati letter O */
+ { 0x05, 0x4B, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari letter AI, gurmukhi letter AU, gujarati letter AU */
+ { 0x05, 0x4C, -1, 0x14, flag(DEVANAGARI) | flag(GURMUKHI) | flag(GUJARATI) },
+ /* devanagari, gujarati vowel candra O */
+ { 0x06, 0x45, -1, 0x11, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari short O */
+ { 0x06, 0x46, -1, 0x12, flag(DEVANAGARI) },
+ /* devanagari, gujarati letter O */
+ { 0x06, 0x47, -1, 0x13, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari letter AI, gujarati letter AU */
+ { 0x06, 0x48, -1, 0x14, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* malayalam letter II */
+ { 0x07, 0x57, -1, 0x08, flag(MALAYALAM) },
+ /* devanagari letter UU */
+ { 0x09, 0x41, -1, 0x0A, flag(DEVANAGARI) },
+ /* tamil, malayalam letter UU (some styles) */
+ { 0x09, 0x57, -1, 0x0A, flag(TAMIL) | flag(MALAYALAM) },
+ /* malayalam letter AI */
+ { 0x0E, 0x46, -1, 0x10, flag(MALAYALAM) },
+ /* devanagari candra E */
+ { 0x0F, 0x45, -1, 0x0D, flag(DEVANAGARI) },
+ /* devanagari short E */
+ { 0x0F, 0x46, -1, 0x0E, flag(DEVANAGARI) },
+ /* devanagari AI */
+ { 0x0F, 0x47, -1, 0x10, flag(DEVANAGARI) },
+ /* oriya AI */
+ { 0x0F, 0x57, -1, 0x10, flag(ORIYA) },
+ /* malayalam letter OO */
+ { 0x12, 0x3E, -1, 0x13, flag(MALAYALAM) },
+ /* telugu, kannada letter AU */
+ { 0x12, 0x4C, -1, 0x14, flag(TELUGU) | flag(KANNADA) },
+ /* telugu letter OO */
+ { 0x12, 0x55, -1, 0x13, flag(TELUGU) },
+ /* tamil, malayalam letter AU */
+ { 0x12, 0x57, -1, 0x14, flag(TAMIL) | flag(MALAYALAM) },
+ /* oriya letter AU */
+ { 0x13, 0x57, -1, 0x14, flag(ORIYA) },
+ /* devanagari qa */
+ { 0x15, 0x3C, -1, 0x58, flag(DEVANAGARI) },
+ /* devanagari, gurmukhi khha */
+ { 0x16, 0x3C, -1, 0x59, flag(DEVANAGARI) | flag(GURMUKHI) },
+ /* devanagari, gurmukhi ghha */
+ { 0x17, 0x3C, -1, 0x5A, flag(DEVANAGARI) | flag(GURMUKHI) },
+ /* devanagari, gurmukhi za */
+ { 0x1C, 0x3C, -1, 0x5B, flag(DEVANAGARI) | flag(GURMUKHI) },
+ /* devanagari dddha, bengali, oriya rra */
+ { 0x21, 0x3C, -1, 0x5C, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
+ /* devanagari, bengali, oriya rha */
+ { 0x22, 0x3C, -1, 0x5D, flag(DEVANAGARI) | flag(BENGALI) | flag(ORIYA) },
+ /* malayalam chillu nn */
+ { 0x23, 0x4D, 0xFF, 0x7A, flag(MALAYALAM) },
+ /* bengali khanda ta */
+ { 0x24, 0x4D, 0xFF, 0x4E, flag(BENGALI) },
+ /* devanagari nnna */
+ { 0x28, 0x3C, -1, 0x29, flag(DEVANAGARI) },
+ /* malayalam chillu n */
+ { 0x28, 0x4D, 0xFF, 0x7B, flag(MALAYALAM) },
+ /* devanagari, gurmukhi fa */
+ { 0x2B, 0x3C, -1, 0x5E, flag(DEVANAGARI) | flag(GURMUKHI) },
+ /* devanagari, bengali yya */
+ { 0x2F, 0x3C, -1, 0x5F, flag(DEVANAGARI) | flag(BENGALI) },
+ /* telugu letter vocalic R */
+ { 0x2C, 0x41, 0x41, 0x0B, flag(TELUGU) },
+ /* devanagari rra */
+ { 0x30, 0x3C, -1, 0x31, flag(DEVANAGARI) },
+ /* malayalam chillu rr */
+ { 0x30, 0x4D, 0xFF, 0x7C, flag(MALAYALAM) },
+ /* malayalam chillu l */
+ { 0x32, 0x4D, 0xFF, 0x7D, flag(MALAYALAM) },
+ /* devanagari llla */
+ { 0x33, 0x3C, -1, 0x34, flag(DEVANAGARI) },
+ /* malayalam chillu ll */
+ { 0x33, 0x4D, 0xFF, 0x7E, flag(MALAYALAM) },
+ /* telugu letter MA */
+ { 0x35, 0x41, -1, 0x2E, flag(TELUGU) },
+ /* devanagari, gujarati vowel sign candra O */
+ { 0x3E, 0x45, -1, 0x49, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari vowel sign short O */
+ { 0x3E, 0x46, -1, 0x4A, flag(DEVANAGARI) },
+ /* devanagari, gujarati vowel sign O */
+ { 0x3E, 0x47, -1, 0x4B, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* devanagari, gujarati vowel sign AU */
+ { 0x3E, 0x48, -1, 0x4C, flag(DEVANAGARI) | flag(GUJARATI) },
+ /* kannada vowel sign II */
+ { 0x3F, 0x55, -1, 0x40, flag(KANNADA) },
+ /* gurmukhi vowel sign UU (when stacking) */
+ { 0x41, 0x41, -1, 0x42, flag(GURMUKHI) },
+ /* tamil, malayalam vowel sign O */
+ { 0x46, 0x3E, -1, 0x4A, flag(TAMIL) | flag(MALAYALAM) },
+ /* kannada vowel sign OO */
+ { 0x46, 0x42, 0x55, 0x4B, flag(KANNADA) },
+ /* kannada vowel sign O */
+ { 0x46, 0x42, -1, 0x4A, flag(KANNADA) },
+ /* malayalam vowel sign AI (if reordered twice) */
+ { 0x46, 0x46, -1, 0x48, flag(MALAYALAM) },
+ /* telugu, kannada vowel sign EE */
+ { 0x46, 0x55, -1, 0x47, flag(TELUGU) | flag(KANNADA) },
+ /* telugu, kannada vowel sign AI */
+ { 0x46, 0x56, -1, 0x48, flag(TELUGU) | flag(KANNADA) },
+ /* tamil, malayalam vowel sign AU */
+ { 0x46, 0x57, -1, 0x4C, flag(TAMIL) | flag(MALAYALAM) },
+ /* bengali, oriya vowel sign O, tamil, malayalam vowel sign OO */
+ { 0x47, 0x3E, -1, 0x4B, flag(BENGALI) | flag(ORIYA) | flag(TAMIL) | flag(MALAYALAM) },
+ /* bengali, oriya vowel sign AU */
+ { 0x47, 0x57, -1, 0x4C, flag(BENGALI) | flag(ORIYA) },
+ /* kannada vowel sign OO */
+ { 0x4A, 0x55, -1, 0x4B, flag(KANNADA) },
+ /* gurmukhi letter I */
+ { 0x72, 0x3F, -1, 0x07, flag(GURMUKHI) },
+ /* gurmukhi letter II */
+ { 0x72, 0x40, -1, 0x08, flag(GURMUKHI) },
+ /* gurmukhi letter EE */
+ { 0x72, 0x47, -1, 0x0F, flag(GURMUKHI) },
+ /* gurmukhi letter U */
+ { 0x73, 0x41, -1, 0x09, flag(GURMUKHI) },
+ /* gurmukhi letter UU */
+ { 0x73, 0x42, -1, 0x0A, flag(GURMUKHI) },
+ /* gurmukhi letter OO */
+ { 0x73, 0x4B, -1, 0x13, flag(GURMUKHI) },
+ };
+
+ static {
+ for (ScriptData sd : scripts.values()) {
+ sd.decompMask = new BitSet(0x7F);
+ for (int i = 0; i < decompositions.length; i++) {
+ final int ch = decompositions[i][0];
+ final int flags = decompositions[i][4];
+ if ((flags & sd.flag) != 0)
+ sd.decompMask.set(ch);
+ }
+ }
+ }
+
+ /**
+ * Normalizes input text, and returns the new length.
+ * The length will always be less than or equal to the existing length.
+ *
+ * @param text input text
+ * @param len valid length
+ * @return normalized length
+ */
+ public int normalize(char text[], int len) {
+ for (int i = 0; i < len; i++) {
+ final Character.UnicodeBlock block = Character.UnicodeBlock.of(text[i]);
+ final ScriptData sd = scripts.get(block);
+ if (sd != null) {
+ final int ch = text[i] - sd.base;
+ if (sd.decompMask.get(ch))
+ len = compose(ch, block, sd, text, i, len);
+ }
+ }
+ return len;
+ }
+
+ /**
+ * Compose into standard form any compositions in the decompositions table.
+ */
+ private int compose(int ch0, Character.UnicodeBlock block0, ScriptData sd,
+ char text[], int pos, int len) {
+ if (pos + 1 >= len) /* need at least 2 chars! */
+ return len;
+
+ final int ch1 = text[pos + 1] - sd.base;
+ final Character.UnicodeBlock block1 = Character.UnicodeBlock.of(text[pos + 1]);
+ if (block1 != block0) /* needs to be the same writing system */
+ return len;
+
+ int ch2 = -1;
+
+ if (pos + 2 < len) {
+ ch2 = text[pos + 2] - sd.base;
+ Character.UnicodeBlock block2 = Character.UnicodeBlock.of(text[pos + 2]);
+ if (text[pos + 2] == '\u200D') // ZWJ
+ ch2 = 0xFF;
+ else if (block2 != block1) // still allow a 2-char match
+ ch2 = -1;
+ }
+
+ for (int i = 0; i < decompositions.length; i++)
+ if (decompositions[i][0] == ch0 && (decompositions[i][4] & sd.flag) != 0) {
+ if (decompositions[i][1] == ch1 && (decompositions[i][2] < 0 || decompositions[i][2] == ch2)) {
+ text[pos] = (char) (sd.base + decompositions[i][3]);
+ len = delete(text, pos + 1, len);
+ if (decompositions[i][2] >= 0)
+ len = delete(text, pos + 1, len);
+ return len;
+ }
+ }
+
+ return len;
+ }
+}