--- /dev/null
+package org.apache.lucene.analysis.ru;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for a detailed description).
+ * @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead,
+ * which has the same functionality. This class will be removed in Lucene 4.0.
+ */
+@Deprecated
+class RussianStemmer
+{
+ // Offsets into the word currently being stemmed, assigned by markPositions().
+ // Both are reset to 0 for every word and stay 0 when the region is not found.
+ // R1 is not tracked because nothing in this class reads it.
+ private int RV;
+ private int R2;
+
+ // Lower-case Cyrillic letters referenced by the vowel list and the ending tables.
+ // Letters that no table needs yet are kept commented out for reference.
+ private static final char A = '\u0430';        // CYRILLIC SMALL LETTER A
+ //private static final char B = '\u0431';      // CYRILLIC SMALL LETTER BE
+ private static final char V = '\u0432';        // CYRILLIC SMALL LETTER VE
+ private static final char G = '\u0433';        // CYRILLIC SMALL LETTER GHE
+ //private static final char D = '\u0434';      // CYRILLIC SMALL LETTER DE
+ private static final char E = '\u0435';        // CYRILLIC SMALL LETTER IE
+ //private static final char ZH = '\u0436';     // CYRILLIC SMALL LETTER ZHE
+ //private static final char Z = '\u0437';      // CYRILLIC SMALL LETTER ZE
+ private static final char I = '\u0438';        // CYRILLIC SMALL LETTER I
+ private static final char I_ = '\u0439';       // CYRILLIC SMALL LETTER SHORT I
+ //private static final char K = '\u043A';      // CYRILLIC SMALL LETTER KA
+ private static final char L = '\u043B';        // CYRILLIC SMALL LETTER EL
+ private static final char M = '\u043C';        // CYRILLIC SMALL LETTER EM
+ private static final char N = '\u043D';        // CYRILLIC SMALL LETTER EN
+ private static final char O = '\u043E';        // CYRILLIC SMALL LETTER O
+ //private static final char P = '\u043F';      // CYRILLIC SMALL LETTER PE
+ //private static final char R = '\u0440';      // CYRILLIC SMALL LETTER ER
+ private static final char S = '\u0441';        // CYRILLIC SMALL LETTER ES
+ private static final char T = '\u0442';        // CYRILLIC SMALL LETTER TE
+ private static final char U = '\u0443';        // CYRILLIC SMALL LETTER U
+ //private static final char F = '\u0444';      // CYRILLIC SMALL LETTER EF
+ private static final char X = '\u0445';        // CYRILLIC SMALL LETTER HA
+ //private static final char TS = '\u0446';     // CYRILLIC SMALL LETTER TSE
+ //private static final char CH = '\u0447';     // CYRILLIC SMALL LETTER CHE
+ private static final char SH = '\u0448';       // CYRILLIC SMALL LETTER SHA
+ private static final char SHCH = '\u0449';     // CYRILLIC SMALL LETTER SHCHA
+ //private static final char HARD = '\u044A';   // CYRILLIC SMALL LETTER HARD SIGN
+ private static final char Y = '\u044B';        // CYRILLIC SMALL LETTER YERU
+ private static final char SOFT = '\u044C';     // CYRILLIC SMALL LETTER SOFT SIGN
+ private static final char AE = '\u044D';       // CYRILLIC SMALL LETTER E
+ private static final char IU = '\u044E';       // CYRILLIC SMALL LETTER YU
+ private static final char IA = '\u044F';       // CYRILLIC SMALL LETTER YA
+
+ // stem definitions
+ private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
+
+ private static char[][] perfectiveGerundEndings1 = {
+ { V },
+ { V, SH, I },
+ { V, SH, I, S, SOFT }
+ };
+
+ private static char[][] perfectiveGerund1Predessors = {
+ { A },
+ { IA }
+ };
+
+ private static char[][] perfectiveGerundEndings2 = { { I, V }, {
+ Y, V }, {
+ I, V, SH, I }, {
+ Y, V, SH, I }, {
+ I, V, SH, I, S, SOFT }, {
+ Y, V, SH, I, S, SOFT }
+ };
+
+ private static char[][] adjectiveEndings = {
+ { E, E },
+ { I, E },
+ { Y, E },
+ { O, E },
+ { E, I_ },
+ { I, I_ },
+ { Y, I_ },
+ { O, I_ },
+ { E, M },
+ { I, M },
+ { Y, M },
+ { O, M },
+ { I, X },
+ { Y, X },
+ { U, IU },
+ { IU, IU },
+ { A, IA },
+ { IA, IA },
+ { O, IU },
+ { E, IU },
+ { I, M, I },
+ { Y, M, I },
+ { E, G, O },
+ { O, G, O },
+ { E, M, U },
+ {O, M, U }
+ };
+
+ private static char[][] participleEndings1 = {
+ { SHCH },
+ { E, M },
+ { N, N },
+ { V, SH },
+ { IU, SHCH }
+ };
+
+ private static char[][] participleEndings2 = {
+ { I, V, SH },
+ { Y, V, SH },
+ { U, IU, SHCH }
+ };
+
+ private static char[][] participle1Predessors = {
+ { A },
+ { IA }
+ };
+
+ private static char[][] reflexiveEndings = {
+ { S, IA },
+ { S, SOFT }
+ };
+
+ private static char[][] verbEndings1 = {
+ { I_ },
+ { L },
+ { N },
+ { L, O },
+ { N, O },
+ { E, T },
+ { IU, T },
+ { L, A },
+ { N, A },
+ { L, I },
+ { E, M },
+ { N, Y },
+ { E, T, E },
+ { I_, T, E },
+ { T, SOFT },
+ { E, SH, SOFT },
+ { N, N, O }
+ };
+
+ private static char[][] verbEndings2 = {
+ { IU },
+ { U, IU },
+ { E, N },
+ { E, I_ },
+ { IA, T },
+ { U, I_ },
+ { I, L },
+ { Y, L },
+ { I, M },
+ { Y, M },
+ { I, T },
+ { Y, T },
+ { I, L, A },
+ { Y, L, A },
+ { E, N, A },
+ { I, T, E },
+ { I, L, I },
+ { Y, L, I },
+ { I, L, O },
+ { Y, L, O },
+ { E, N, O },
+ { U, E, T },
+ { U, IU, T },
+ { E, N, Y },
+ { I, T, SOFT },
+ { Y, T, SOFT },
+ { I, SH, SOFT },
+ { E, I_, T, E },
+ { U, I_, T, E }
+ };
+
+ private static char[][] verb1Predessors = {
+ { A },
+ { IA }
+ };
+
+ private static char[][] nounEndings = {
+ { A },
+ { U },
+ { I_ },
+ { O },
+ { U },
+ { E },
+ { Y },
+ { I },
+ { SOFT },
+ { IA },
+ { E, V },
+ { O, V },
+ { I, E },
+ { SOFT, E },
+ { IA, X },
+ { I, IU },
+ { E, I },
+ { I, I },
+ { E, I_ },
+ { O, I_ },
+ { E, M },
+ { A, M },
+ { O, M },
+ { A, X },
+ { SOFT, IU },
+ { I, IA },
+ { SOFT, IA },
+ { I, I_ },
+ { IA, M },
+ { IA, M, I },
+ { A, M, I },
+ { I, E, I_ },
+ { I, IA, M },
+ { I, E, M },
+ { I, IA, X },
+ { I, IA, M, I }
+ };
+
+ private static char[][] superlativeEndings = {
+ { E, I_, SH },
+ { E, I_, SH, E }
+ };
+
+ private static char[][] derivationalEndings = {
+ { O, S, T },
+ { O, S, T, SOFT }
+ };
+
+ /**
+  * Creates a stemmer. No word has been analysed yet, so the region
+  * offsets hold their default value of 0.
+  */
+ public RussianStemmer()
+ {
+ }
+
+ /**
+  * Removes an adjectival ending: an adjective ending, optionally preceded
+  * by a participle ending.
+  * <p>
+  * The participle ending is only looked for after an adjective ending has
+  * been cut. Group-1 participle endings are tried first; group-2 endings
+  * are tried only if no group-1 ending was removed.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @return {@code true} if an adjective ending was removed, whether or not
+  *         a participle ending was also removed
+  */
+ private boolean adjectival(StringBuilder stemmingZone)
+ {
+     boolean adjectiveRemoved = findAndRemoveEnding(stemmingZone, adjectiveEndings);
+     if (adjectiveRemoved)
+     {
+         boolean participleRemoved =
+             findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors);
+         if (!participleRemoved)
+         {
+             findAndRemoveEnding(stemmingZone, participleEndings2);
+         }
+     }
+     return adjectiveRemoved;
+ }
+
+ /**
+  * Removes a derivational ending, but only when the whole ending lies
+  * inside the R2 region of the word.
+  * <p>
+  * {@code R2} and {@code RV} are offsets into the full word, while
+  * {@code stemmingZone} starts at {@code RV}; the R2 boundary expressed in
+  * stemming-zone coordinates is therefore {@code R2 - RV}.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @return {@code true} if a derivational ending was removed
+  */
+ private boolean derivational(StringBuilder stemmingZone)
+ {
+     int endingLength = findEnding(stemmingZone, derivationalEndings);
+     if (endingLength == 0)
+     {
+         // no derivational ending found
+         return false;
+     }
+     // markPositions() leaves R2 at 0 when the word has no R2 region. Without
+     // this guard R2 - RV would be negative and every ending would wrongly be
+     // accepted as lying inside R2.
+     if (R2 == 0)
+     {
+         return false;
+     }
+     int endingStart = stemmingZone.length() - endingLength;
+     if (endingStart < R2 - RV)
+     {
+         // the ending starts before R2
+         return false;
+     }
+     stemmingZone.setLength(endingStart);
+     return true;
+ }
+
+ /**
+  * Looks for an ending from the given class that finishes exactly at
+  * {@code startIndex} in the stemming zone. Nothing is modified.
+  * <p>
+  * The class is scanned from its last entry to its first, and the scan
+  * stops at the first entry that matches.
+  *
+  * @param stemmingZone the text to inspect
+  * @param startIndex index of the last character to compare; when it is -1
+  *        no non-empty ending can match
+  * @param theEndingClass the candidate endings
+  * @return the length of the matching ending, or 0 if none matches
+  */
+ private int findEnding(StringBuilder stemmingZone, int startIndex, char[][] theEndingClass)
+ {
+     for (int candidate = theEndingClass.length - 1; candidate >= 0; candidate--)
+     {
+         char[] ending = theEndingClass[candidate];
+         // an ending longer than the text up to startIndex cannot match
+         if (ending.length - 1 > startIndex)
+         {
+             continue;
+         }
+         // compare right to left until a mismatch or the ending is used up
+         int remaining = ending.length;
+         int zoneIndex = startIndex;
+         while (remaining > 0 && stemmingZone.charAt(zoneIndex) == ending[remaining - 1])
+         {
+             remaining--;
+             zoneIndex--;
+         }
+         if (remaining == 0)
+         {
+             return ending.length;
+         }
+     }
+     return 0;
+ }
+
+ /**
+  * Looks for an ending from the given class at the very end of the
+  * stemming zone. Nothing is modified.
+  *
+  * @param stemmingZone the text to inspect
+  * @param theEndingClass the candidate endings
+  * @return the length of the matching ending, or 0 if none matches
+  */
+ private int findEnding(StringBuilder stemmingZone, char[][] theEndingClass)
+ {
+     int lastIndex = stemmingZone.length() - 1;
+     return findEnding(stemmingZone, lastIndex, theEndingClass);
+ }
+
+ /**
+  * Cuts an ending of the given class off the end of the stemming zone,
+  * if one is present.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @param theEndingClass the candidate endings
+  * @return {@code true} if an ending was found and removed
+  */
+ private boolean findAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass)
+ {
+     int endingLength = findEnding(stemmingZone, theEndingClass);
+     boolean found = endingLength != 0;
+     if (found)
+     {
+         // cut the ending found
+         stemmingZone.setLength(stemmingZone.length() - endingLength);
+     }
+     return found;
+ }
+
+ /**
+  * Cuts an ending of the given class off the end of the stemming zone, but
+  * only when the text directly before that ending finishes with one of the
+  * given predecessors. The predecessor itself is kept.
+  * <p>
+  * Only the single ending reported by {@code findEnding} is considered;
+  * if its predecessor check fails, no other ending is tried.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @param theEndingClass the candidate endings
+  * @param thePredessors the sequences allowed directly before the ending
+  * @return {@code true} if an ending with a valid predecessor was removed
+  */
+ private boolean findAndRemoveEnding(StringBuilder stemmingZone,
+     char[][] theEndingClass, char[][] thePredessors)
+ {
+     int endingLength = findEnding(stemmingZone, theEndingClass);
+     if (endingLength == 0)
+     {
+         // not found
+         return false;
+     }
+     // index of the character just before the ending (-1 if the ending is the whole zone)
+     int beforeEnding = stemmingZone.length() - endingLength - 1;
+     if (findEnding(stemmingZone, beforeEnding, thePredessors) == 0)
+     {
+         // ending present, but not preceded by an allowed sequence
+         return false;
+     }
+     // cut the ending found
+     stemmingZone.setLength(stemmingZone.length() - endingLength);
+     return true;
+ }
+
+
+ /**
+ * Marks positions of RV, R1 and R2 in a given word.
+ * Creation date: (16/03/2002 3:40:11 PM)
+ */
+ private void markPositions(String word)
+ {
+ RV = 0;
+// R1 = 0;
+ R2 = 0;
+ int i = 0;
+ // find RV
+ while (word.length() > i && !isVowel(word.charAt(i)))
+ {
+ i++;
+ }
+ if (word.length() - 1 < ++i)
+ return; // RV zone is empty
+ RV = i;
+ // find R1
+ while (word.length() > i && isVowel(word.charAt(i)))
+ {
+ i++;
+ }
+ if (word.length() - 1 < ++i)
+ return; // R1 zone is empty
+// R1 = i;
+ // find R2
+ while (word.length() > i && !isVowel(word.charAt(i)))
+ {
+ i++;
+ }
+ if (word.length() - 1 < ++i)
+ return; // R2 zone is empty
+ while (word.length() > i && isVowel(word.charAt(i)))
+ {
+ i++;
+ }
+ if (word.length() - 1 < ++i)
+ return; // R2 zone is empty
+ R2 = i;
+ }
+
+ /**
+  * Tells whether a character is one of the letters listed in {@code vowels}.
+  *
+  * @param letter the character to test
+  * @return {@code true} if the character is a listed vowel
+  */
+ private boolean isVowel(char letter)
+ {
+     for (char vowel : vowels)
+     {
+         if (vowel == letter)
+         {
+             return true;
+         }
+     }
+     return false;
+ }
+
+ /**
+  * Removes a noun ending from the end of the stemming zone.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @return {@code true} if a noun ending was removed
+  */
+ private boolean noun(StringBuilder stemmingZone)
+ {
+     boolean removed = findAndRemoveEnding(stemmingZone, nounEndings);
+     return removed;
+ }
+
+ /**
+  * Removes a perfective gerund ending from the end of the stemming zone.
+  * <p>
+  * Group-1 endings are tried first and are only removed when preceded by
+  * one of {@code perfectiveGerund1Predessors}. Group-2 endings are tried
+  * only if no group-1 ending was removed.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @return {@code true} if a perfective gerund ending was removed
+  */
+ private boolean perfectiveGerund(StringBuilder stemmingZone)
+ {
+     if (findAndRemoveEnding(stemmingZone, perfectiveGerundEndings1, perfectiveGerund1Predessors))
+     {
+         return true;
+     }
+     return findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
+ }
+
+ /**
+  * Removes a reflexive ending from the end of the stemming zone.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @return {@code true} if a reflexive ending was removed
+  */
+ private boolean reflexive(StringBuilder stemmingZone)
+ {
+     boolean removed = findAndRemoveEnding(stemmingZone, reflexiveEndings);
+     return removed;
+ }
+
+ /**
+  * Removes a trailing letter {@code I} from the stemming zone, if present.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @return {@code true} if the last character was {@code I} and was removed
+  */
+ private boolean removeI(StringBuilder stemmingZone)
+ {
+     int lastIndex = stemmingZone.length() - 1;
+     if (lastIndex < 0 || stemmingZone.charAt(lastIndex) != I)
+     {
+         return false;
+     }
+     stemmingZone.setLength(lastIndex);
+     return true;
+ }
+
+ /**
+  * Removes a trailing soft sign ({@code SOFT}) from the stemming zone, if present.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @return {@code true} if the last character was {@code SOFT} and was removed
+  */
+ private boolean removeSoft(StringBuilder stemmingZone)
+ {
+     int lastIndex = stemmingZone.length() - 1;
+     if (lastIndex < 0 || stemmingZone.charAt(lastIndex) != SOFT)
+     {
+         return false;
+     }
+     stemmingZone.setLength(lastIndex);
+     return true;
+ }
+
+ /**
+  * Finds the stem of a Russian word.
+  * <p>
+  * Characters are compared against lower-case Cyrillic letters only, so any
+  * other character is never treated as a vowel or as part of an ending.
+  * The region offsets are kept in instance fields while a word is
+  * processed, so one instance should not be used by several threads at once.
+  *
+  * @param input the word to stem
+  * @return the stem, or {@code input} itself when no RV region is found
+  */
+ public String stem(String input)
+ {
+     markPositions(input);
+     if (RV == 0)
+     {
+         // RV wasn't detected, nothing to stem
+         return input;
+     }
+     // everything before RV is kept as is; stemming happens inside RV only
+     String prefix = input.substring(0, RV);
+     StringBuilder stemmingZone = new StringBuilder(input.substring(RV));
+
+     // Step 1: a perfective gerund ending ends the step; otherwise drop a
+     // reflexive ending, then the first of adjectival / verb / noun that applies
+     if (!perfectiveGerund(stemmingZone))
+     {
+         reflexive(stemmingZone);
+         if (!adjectival(stemmingZone) && !verb(stemmingZone))
+         {
+             noun(stemmingZone);
+         }
+     }
+     // Step 2
+     removeI(stemmingZone);
+     // Step 3
+     derivational(stemmingZone);
+     // Step 4
+     superlative(stemmingZone);
+     undoubleN(stemmingZone);
+     removeSoft(stemmingZone);
+
+     return prefix + stemmingZone;
+ }
+
+ /**
+  * Removes a superlative ending from the end of the stemming zone.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @return {@code true} if a superlative ending was removed
+  */
+ private boolean superlative(StringBuilder stemmingZone)
+ {
+     boolean removed = findAndRemoveEnding(stemmingZone, superlativeEndings);
+     return removed;
+ }
+
+ /**
+  * Undoubles a trailing double {@code N}: when the stemming zone ends with
+  * two {@code N} letters, the last one is removed.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @return {@code true} if a trailing {@code N} was removed
+  */
+ private boolean undoubleN(StringBuilder stemmingZone)
+ {
+     int length = stemmingZone.length();
+     boolean endsWithDoubleN = length >= 2
+         && stemmingZone.charAt(length - 1) == N
+         && stemmingZone.charAt(length - 2) == N;
+     if (!endsWithDoubleN)
+     {
+         return false;
+     }
+     // drop only one of the two letters
+     stemmingZone.setLength(length - 1);
+     return true;
+ }
+
+ /**
+  * Removes a verb ending from the end of the stemming zone.
+  * <p>
+  * Group-1 endings are tried first and are only removed when preceded by
+  * one of {@code verb1Predessors}. Group-2 endings are tried only if no
+  * group-1 ending was removed.
+  *
+  * @param stemmingZone the part of the word being stemmed; shortened in place
+  * @return {@code true} if a verb ending was removed
+  */
+ private boolean verb(StringBuilder stemmingZone)
+ {
+     if (findAndRemoveEnding(stemmingZone, verbEndings1, verb1Predessors))
+     {
+         return true;
+     }
+     return findAndRemoveEnding(stemmingZone, verbEndings2);
+ }
+
+ /**
+  * Stems a single word using a freshly created stemmer instance.
+  *
+  * @param theWord the word to stem
+  * @return the result of {@link #stem(String)} for that word
+  */
+ public static String stemWord(String theWord)
+ {
+     return new RussianStemmer().stem(theWord);
+ }
+}