pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / analyzers / common / src / java / org / apache / lucene / analysis / ru / RussianStemmer.java
diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java

new file mode 100644 (file)

index 0000000..fea9e21
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/ru/RussianStemmer.java
@@ -0,0 +1,600 @@
+package org.apache.lucene.analysis.ru;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/**
+ * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for detailed description).
+ * @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead, 
+ * which has the same functionality. This filter will be removed in Lucene 4.0
+ */
+@Deprecated
+class RussianStemmer
+{
+    // positions of RV, R1 and R2 respectively
+    private int RV, /*R1,*/ R2;
+
+    // letters (currently unused letters are commented out)
+    private final static char A = '\u0430';
+    //private final static char B = '\u0431';
+    private final static char V = '\u0432';
+    private final static char G = '\u0433';
+    //private final static char D = '\u0434';
+    private final static char E = '\u0435';
+    //private final static char ZH = '\u0436';
+    //private final static char Z = '\u0437';
+    private final static char I = '\u0438';
+    private final static char I_ = '\u0439';
+    //private final static char K = '\u043A';
+    private final static char L = '\u043B';
+    private final static char M = '\u043C';
+    private final static char N = '\u043D';
+    private final static char O = '\u043E';
+    //private final static char P = '\u043F';
+    //private final static char R = '\u0440';
+    private final static char S = '\u0441';
+    private final static char T = '\u0442';
+    private final static char U = '\u0443';
+    //private final static char F = '\u0444';
+    private final static char X = '\u0445';
+    //private final static char TS = '\u0446';
+    //private final static char CH = '\u0447';
+    private final static char SH = '\u0448';
+    private final static char SHCH = '\u0449';
+    //private final static char HARD = '\u044A';
+    private final static char Y = '\u044B';
+    private final static char SOFT = '\u044C';
+    private final static char AE = '\u044D';
+    private final static char IU = '\u044E';
+    private final static char IA = '\u044F';
+
+    // stem definitions
+    private static char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
+
+    private static char[][] perfectiveGerundEndings1 = {
+        { V },
+        { V, SH, I },
+        { V, SH, I, S, SOFT }
+    };
+
+    private static char[][] perfectiveGerund1Predessors = {
+        { A },
+        { IA }
+    };
+
+    private static char[][] perfectiveGerundEndings2 = { { I, V }, {
+        Y, V }, {
+            I, V, SH, I }, {
+                Y, V, SH, I }, {
+                    I, V, SH, I, S, SOFT }, {
+                        Y, V, SH, I, S, SOFT }
+    };
+
+    private static char[][] adjectiveEndings = {
+        { E, E },
+        { I, E },
+        { Y, E },
+        { O, E },
+        { E, I_ },
+        { I, I_ },
+        { Y, I_ },
+        { O, I_ },
+        { E, M },
+        { I, M },
+        { Y, M },
+        { O, M },
+        { I, X },
+        { Y, X },
+        { U, IU },
+        { IU, IU },
+        { A, IA },
+        { IA, IA },
+        { O, IU },
+        { E, IU },
+        { I, M, I },
+        { Y, M, I },
+        { E, G, O },
+        { O, G, O },
+        { E, M, U },
+        {O, M, U }
+    };
+
+    private static char[][] participleEndings1 = {
+        { SHCH },
+        { E, M },
+        { N, N },
+        { V, SH },
+        { IU, SHCH }
+    };
+
+    private static char[][] participleEndings2 = {
+        { I, V, SH },
+        { Y, V, SH },
+        { U, IU, SHCH }
+    };
+
+    private static char[][] participle1Predessors = {
+        { A },
+        { IA }
+    };
+
+    private static char[][] reflexiveEndings = {
+        { S, IA },
+        { S, SOFT }
+    };
+
+    private static char[][] verbEndings1 = {
+        { I_ },
+        { L },
+        { N },
+        { L, O },
+        { N, O },
+        { E, T },
+        { IU, T },
+        { L, A },
+        { N, A },
+        { L, I },
+        { E, M },
+        { N, Y },
+        { E, T, E },
+        { I_, T, E },
+        { T, SOFT },
+        { E, SH, SOFT },
+        { N, N, O }
+    };
+
+    private static char[][] verbEndings2 = {
+        { IU },
+        { U, IU },
+        { E, N },
+        { E, I_ },
+        { IA, T },
+        { U, I_ },
+        { I, L },
+        { Y, L },
+        { I, M },
+        { Y, M },
+        { I, T },
+        { Y, T },
+        { I, L, A },
+        { Y, L, A },
+        { E, N, A },
+        { I, T, E },
+        { I, L, I },
+        { Y, L, I },
+        { I, L, O },
+        { Y, L, O },
+        { E, N, O },
+        { U, E, T },
+        { U, IU, T },
+        { E, N, Y },
+        { I, T, SOFT },
+        { Y, T, SOFT },
+        { I, SH, SOFT },
+        { E, I_, T, E },
+        { U, I_, T, E }
+    };
+
+    private static char[][] verb1Predessors = {
+        { A },
+        { IA }
+    };
+
+    private static char[][] nounEndings = {
+        { A },
+        { U },
+        { I_ },
+        { O },
+        { U },
+        { E },
+        { Y },
+        { I },
+        { SOFT },
+        { IA },
+        { E, V },
+        { O, V },
+        { I, E },
+        { SOFT, E },
+        { IA, X },
+        { I, IU },
+        { E, I },
+        { I, I },
+        { E, I_ },
+        { O, I_ },
+        { E, M },
+        { A, M },
+        { O, M },
+        { A, X },
+        { SOFT, IU },
+        { I, IA },
+        { SOFT, IA },
+        { I, I_ },
+        { IA, M },
+        { IA, M, I },
+        { A, M, I },
+        { I, E, I_ },
+        { I, IA, M },
+        { I, E, M },
+        { I, IA, X },
+        { I, IA, M, I }
+    };
+
+    private static char[][] superlativeEndings = {
+        { E, I_, SH },
+        { E, I_, SH, E }
+    };
+
+    private static char[][] derivationalEndings = {
+        { O, S, T },
+        { O, S, T, SOFT }
+    };
+
+    /**
+     * RussianStemmer constructor comment.
+     */
+    public RussianStemmer()
+    {
+        super();
+    }
+
+    /**
+     * Adjectival ending is an adjective ending,
+     * optionally preceded by participle ending.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone java.lang.StringBuilder
+     */
+    private boolean adjectival(StringBuilder stemmingZone)
+    {
+        // look for adjective ending in a stemming zone
+        if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
+            return false;
+        // if adjective ending was found, try for participle ending.
+        if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
+            findAndRemoveEnding(stemmingZone, participleEndings2);
+        return true;
+    }
+
+    /**
+     * Derivational endings
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone java.lang.StringBuilder
+     */
+    private boolean derivational(StringBuilder stemmingZone)
+    {
+        int endingLength = findEnding(stemmingZone, derivationalEndings);
+        if (endingLength == 0)
+             // no derivational ending found
+            return false;
+        else
+        {
+            // Ensure that the ending locates in R2
+            if (R2 - RV <= stemmingZone.length() - endingLength)
+            {
+                stemmingZone.setLength(stemmingZone.length() - endingLength);
+                return true;
+            }
+            else
+            {
+                return false;
+            }
+        }
+    }
+
+    /**
+     * Finds ending among given ending class and returns the length of ending found(0, if not found).
+     * Creation date: (17/03/2002 8:18:34 PM)
+     */
+    private int findEnding(StringBuilder stemmingZone, int startIndex, char[][] theEndingClass)
+    {
+        boolean match = false;
+        for (int i = theEndingClass.length - 1; i >= 0; i--)
+        {
+            char[] theEnding = theEndingClass[i];
+            // check if the ending is bigger than stemming zone
+            if (startIndex < theEnding.length - 1)
+            {
+                match = false;
+                continue;
+            }
+            match = true;
+            int stemmingIndex = startIndex;
+            for (int j = theEnding.length - 1; j >= 0; j--)
+            {
+                if (stemmingZone.charAt(stemmingIndex--) != theEnding[j])
+                {
+                    match = false;
+                    break;
+                }
+            }
+            // check if ending was found
+            if (match)
+            {
+                return theEndingClass[i].length; // cut ending
+            }
+        }
+        return 0;
+    }
+
+    private int findEnding(StringBuilder stemmingZone, char[][] theEndingClass)
+    {
+        return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
+    }
+
+    /**
+     * Finds the ending among the given class of endings and removes it from stemming zone.
+     * Creation date: (17/03/2002 8:18:34 PM)
+     */
+    private boolean findAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass)
+    {
+        int endingLength = findEnding(stemmingZone, theEndingClass);
+        if (endingLength == 0)
+            // not found
+            return false;
+        else {
+            stemmingZone.setLength(stemmingZone.length() - endingLength);
+            // cut the ending found
+            return true;
+        }
+    }
+
+    /**
+     * Finds the ending among the given class of endings, then checks if this ending was
+     * preceded by any of given predecessors, and if so, removes it from stemming zone.
+     * Creation date: (17/03/2002 8:18:34 PM)
+     */
+    private boolean findAndRemoveEnding(StringBuilder stemmingZone,
+        char[][] theEndingClass, char[][] thePredessors)
+    {
+        int endingLength = findEnding(stemmingZone, theEndingClass);
+        if (endingLength == 0)
+            // not found
+            return false;
+        else
+        {
+            int predessorLength =
+                findEnding(stemmingZone,
+                    stemmingZone.length() - endingLength - 1,
+                    thePredessors);
+            if (predessorLength == 0)
+                return false;
+            else {
+                stemmingZone.setLength(stemmingZone.length() - endingLength);
+                // cut the ending found
+                return true;
+            }
+        }
+
+    }
+
+    /**
+     * Marks positions of RV, R1 and R2 in a given word.
+     * Creation date: (16/03/2002 3:40:11 PM)
+     */
+    private void markPositions(String word)
+    {
+        RV = 0;
+//        R1 = 0;
+        R2 = 0;
+        int i = 0;
+        // find RV
+        while (word.length() > i && !isVowel(word.charAt(i)))
+        {
+            i++;
+        }
+        if (word.length() - 1 < ++i)
+            return; // RV zone is empty
+        RV = i;
+        // find R1
+        while (word.length() > i && isVowel(word.charAt(i)))
+        {
+            i++;
+        }
+        if (word.length() - 1 < ++i)
+            return; // R1 zone is empty
+//        R1 = i;
+        // find R2
+        while (word.length() > i && !isVowel(word.charAt(i)))
+        {
+            i++;
+        }
+        if (word.length() - 1 < ++i)
+            return; // R2 zone is empty
+        while (word.length() > i && isVowel(word.charAt(i)))
+        {
+            i++;
+        }
+        if (word.length() - 1 < ++i)
+            return; // R2 zone is empty
+        R2 = i;
+    }
+
+    /**
+     * Checks if character is a vowel..
+     * Creation date: (16/03/2002 10:47:03 PM)
+     * @return boolean
+     * @param letter char
+     */
+    private boolean isVowel(char letter)
+    {
+        for (int i = 0; i < vowels.length; i++)
+        {
+            if (letter == vowels[i])
+                return true;
+        }
+        return false;
+    }
+
+    /**
+     * Noun endings.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone java.lang.StringBuilder
+     */
+    private boolean noun(StringBuilder stemmingZone)
+    {
+        return findAndRemoveEnding(stemmingZone, nounEndings);
+    }
+
+    /**
+     * Perfective gerund endings.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone java.lang.StringBuilder
+     */
+    private boolean perfectiveGerund(StringBuilder stemmingZone)
+    {
+        return findAndRemoveEnding(
+            stemmingZone,
+            perfectiveGerundEndings1,
+            perfectiveGerund1Predessors)
+            || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
+    }
+
+    /**
+     * Reflexive endings.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone java.lang.StringBuilder
+     */
+    private boolean reflexive(StringBuilder stemmingZone)
+    {
+        return findAndRemoveEnding(stemmingZone, reflexiveEndings);
+    }
+
+    /**
+     * Insert the method's description here.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone java.lang.StringBuilder
+     */
+    private boolean removeI(StringBuilder stemmingZone)
+    {
+        if (stemmingZone.length() > 0
+            && stemmingZone.charAt(stemmingZone.length() - 1) == I)
+        {
+            stemmingZone.setLength(stemmingZone.length() - 1);
+            return true;
+        }
+        else
+        {
+            return false;
+        }
+    }
+
+    /**
+     * Insert the method's description here.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone java.lang.StringBuilder
+     */
+    private boolean removeSoft(StringBuilder stemmingZone)
+    {
+        if (stemmingZone.length() > 0
+            && stemmingZone.charAt(stemmingZone.length() - 1) == SOFT)
+        {
+            stemmingZone.setLength(stemmingZone.length() - 1);
+            return true;
+        }
+        else
+        {
+            return false;
+        }
+    }
+
+    /**
+     * Finds the stem for given Russian word.
+     * Creation date: (16/03/2002 3:36:48 PM)
+     * @return java.lang.String
+     * @param input java.lang.String
+     */
+    public String stem(String input)
+    {
+        markPositions(input);
+        if (RV == 0)
+            return input; //RV wasn't detected, nothing to stem
+        StringBuilder stemmingZone = new StringBuilder(input.substring(RV));
+        // stemming goes on in RV
+        // Step 1
+
+        if (!perfectiveGerund(stemmingZone))
+        {
+            reflexive(stemmingZone);
+            if (!adjectival(stemmingZone))
+              if (!verb(stemmingZone))
+                noun(stemmingZone);
+        }
+        // Step 2
+        removeI(stemmingZone);
+        // Step 3
+        derivational(stemmingZone);
+        // Step 4
+        superlative(stemmingZone);
+        undoubleN(stemmingZone);
+        removeSoft(stemmingZone);
+        // return result
+        return input.substring(0, RV) + stemmingZone.toString();
+    }
+
+    /**
+     * Superlative endings.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone java.lang.StringBuilder
+     */
+    private boolean superlative(StringBuilder stemmingZone)
+    {
+        return findAndRemoveEnding(stemmingZone, superlativeEndings);
+    }
+
+    /**
+     * Undoubles N.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone java.lang.StringBuilder
+     */
+    private boolean undoubleN(StringBuilder stemmingZone)
+    {
+        char[][] doubleN = {
+            { N, N }
+        };
+        if (findEnding(stemmingZone, doubleN) != 0)
+        {
+            stemmingZone.setLength(stemmingZone.length() - 1);
+            return true;
+        }
+        else
+        {
+            return false;
+        }
+    }
+
+    /**
+     * Verb endings.
+     * Creation date: (17/03/2002 12:14:58 AM)
+     * @param stemmingZone java.lang.StringBuilder
+     */
+    private boolean verb(StringBuilder stemmingZone)
+    {
+        return findAndRemoveEnding(
+            stemmingZone,
+            verbEndings1,
+            verb1Predessors)
+            || findAndRemoveEnding(stemmingZone, verbEndings2);
+    }
+   
+    /**
+     * Static method for stemming.
+     */
+    public static String stemWord(String theWord)
+    {
+        RussianStemmer stemmer = new RussianStemmer();
+        return stemmer.stem(theWord);
+    }
+}