1 package org.apache.lucene.analysis.ru;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * Russian stemming algorithm implementation (see http://snowball.sourceforge.net for a detailed description).
 * @deprecated Use {@link org.tartarus.snowball.ext.RussianStemmer} instead,
 * which has the same functionality. This filter will be removed in Lucene 4.0
 */
// Start positions of the RV and R2 regions in the word being stemmed.
// R1 is not used by this implementation and stays commented out.
private int RV, /*R1,*/ R2;

// letters (currently unused letters are commented out)
private static final char A = '\u0430';
//private static final char B = '\u0431';
private static final char V = '\u0432';
private static final char G = '\u0433';
//private static final char D = '\u0434';
private static final char E = '\u0435';
//private static final char ZH = '\u0436';
//private static final char Z = '\u0437';
private static final char I = '\u0438';
private static final char I_ = '\u0439';
//private static final char K = '\u043A';
private static final char L = '\u043B';
private static final char M = '\u043C';
private static final char N = '\u043D';
private static final char O = '\u043E';
//private static final char P = '\u043F';
//private static final char R = '\u0440';
private static final char S = '\u0441';
private static final char T = '\u0442';
private static final char U = '\u0443';
//private static final char F = '\u0444';
private static final char X = '\u0445';
//private static final char TS = '\u0446';
//private static final char CH = '\u0447';
private static final char SH = '\u0448';
private static final char SHCH = '\u0449';
//private static final char HARD = '\u044A';
private static final char Y = '\u044B';
private static final char SOFT = '\u044C';
private static final char AE = '\u044D';
private static final char IU = '\u044E';
private static final char IA = '\u044F';

// stem definitions
// Letters treated as vowels by isVowel(); the reference is final because it is never reassigned.
private static final char[] vowels = { A, E, I, O, U, Y, AE, IU, IA };
68 private static char[][] perfectiveGerundEndings1 = {
74 private static char[][] perfectiveGerund1Predessors = {
79 private static char[][] perfectiveGerundEndings2 = { { I, V }, {
83 I, V, SH, I, S, SOFT }, {
84 Y, V, SH, I, S, SOFT }
87 private static char[][] adjectiveEndings = {
116 private static char[][] participleEndings1 = {
124 private static char[][] participleEndings2 = {
130 private static char[][] participle1Predessors = {
135 private static char[][] reflexiveEndings = {
140 private static char[][] verbEndings1 = {
160 private static char[][] verbEndings2 = {
192 private static char[][] verb1Predessors = {
197 private static char[][] nounEndings = {
236 private static char[][] superlativeEndings = {
241 private static char[][] derivationalEndings = {
/**
 * Creates a new stemmer instance.
 */
public RussianStemmer()
{
    super();
}
255 * Adjectival ending is an adjective ending,
256 * optionally preceded by participle ending.
257 * Creation date: (17/03/2002 12:14:58 AM)
258 * @param stemmingZone java.lang.StringBuilder
260 private boolean adjectival(StringBuilder stemmingZone)
262 // look for adjective ending in a stemming zone
263 if (!findAndRemoveEnding(stemmingZone, adjectiveEndings))
265 // if adjective ending was found, try for participle ending.
266 if (!findAndRemoveEnding(stemmingZone, participleEndings1, participle1Predessors))
267 findAndRemoveEnding(stemmingZone, participleEndings2);
272 * Derivational endings
273 * Creation date: (17/03/2002 12:14:58 AM)
274 * @param stemmingZone java.lang.StringBuilder
276 private boolean derivational(StringBuilder stemmingZone)
278 int endingLength = findEnding(stemmingZone, derivationalEndings);
279 if (endingLength == 0)
280 // no derivational ending found
284 // Ensure that the ending locates in R2
285 if (R2 - RV <= stemmingZone.length() - endingLength)
287 stemmingZone.setLength(stemmingZone.length() - endingLength);
298 * Finds ending among given ending class and returns the length of ending found(0, if not found).
299 * Creation date: (17/03/2002 8:18:34 PM)
301 private int findEnding(StringBuilder stemmingZone, int startIndex, char[][] theEndingClass)
303 boolean match = false;
304 for (int i = theEndingClass.length - 1; i >= 0; i--)
306 char[] theEnding = theEndingClass[i];
307 // check if the ending is bigger than stemming zone
308 if (startIndex < theEnding.length - 1)
314 int stemmingIndex = startIndex;
315 for (int j = theEnding.length - 1; j >= 0; j--)
317 if (stemmingZone.charAt(stemmingIndex--) != theEnding[j])
323 // check if ending was found
326 return theEndingClass[i].length; // cut ending
332 private int findEnding(StringBuilder stemmingZone, char[][] theEndingClass)
334 return findEnding(stemmingZone, stemmingZone.length() - 1, theEndingClass);
338 * Finds the ending among the given class of endings and removes it from stemming zone.
339 * Creation date: (17/03/2002 8:18:34 PM)
341 private boolean findAndRemoveEnding(StringBuilder stemmingZone, char[][] theEndingClass)
343 int endingLength = findEnding(stemmingZone, theEndingClass);
344 if (endingLength == 0)
348 stemmingZone.setLength(stemmingZone.length() - endingLength);
349 // cut the ending found
355 * Finds the ending among the given class of endings, then checks if this ending was
356 * preceded by any of given predecessors, and if so, removes it from stemming zone.
357 * Creation date: (17/03/2002 8:18:34 PM)
359 private boolean findAndRemoveEnding(StringBuilder stemmingZone,
360 char[][] theEndingClass, char[][] thePredessors)
362 int endingLength = findEnding(stemmingZone, theEndingClass);
363 if (endingLength == 0)
368 int predessorLength =
369 findEnding(stemmingZone,
370 stemmingZone.length() - endingLength - 1,
372 if (predessorLength == 0)
375 stemmingZone.setLength(stemmingZone.length() - endingLength);
376 // cut the ending found
384 * Marks positions of RV, R1 and R2 in a given word.
385 * Creation date: (16/03/2002 3:40:11 PM)
387 private void markPositions(String word)
394 while (word.length() > i && !isVowel(word.charAt(i)))
398 if (word.length() - 1 < ++i)
399 return; // RV zone is empty
402 while (word.length() > i && isVowel(word.charAt(i)))
406 if (word.length() - 1 < ++i)
407 return; // R1 zone is empty
410 while (word.length() > i && !isVowel(word.charAt(i)))
414 if (word.length() - 1 < ++i)
415 return; // R2 zone is empty
416 while (word.length() > i && isVowel(word.charAt(i)))
420 if (word.length() - 1 < ++i)
421 return; // R2 zone is empty
426 * Checks if character is a vowel..
427 * Creation date: (16/03/2002 10:47:03 PM)
431 private boolean isVowel(char letter)
433 for (int i = 0; i < vowels.length; i++)
435 if (letter == vowels[i])
443 * Creation date: (17/03/2002 12:14:58 AM)
444 * @param stemmingZone java.lang.StringBuilder
446 private boolean noun(StringBuilder stemmingZone)
448 return findAndRemoveEnding(stemmingZone, nounEndings);
452 * Perfective gerund endings.
453 * Creation date: (17/03/2002 12:14:58 AM)
454 * @param stemmingZone java.lang.StringBuilder
456 private boolean perfectiveGerund(StringBuilder stemmingZone)
458 return findAndRemoveEnding(
460 perfectiveGerundEndings1,
461 perfectiveGerund1Predessors)
462 || findAndRemoveEnding(stemmingZone, perfectiveGerundEndings2);
467 * Creation date: (17/03/2002 12:14:58 AM)
468 * @param stemmingZone java.lang.StringBuilder
470 private boolean reflexive(StringBuilder stemmingZone)
472 return findAndRemoveEnding(stemmingZone, reflexiveEndings);
476 * Insert the method's description here.
477 * Creation date: (17/03/2002 12:14:58 AM)
478 * @param stemmingZone java.lang.StringBuilder
480 private boolean removeI(StringBuilder stemmingZone)
482 if (stemmingZone.length() > 0
483 && stemmingZone.charAt(stemmingZone.length() - 1) == I)
485 stemmingZone.setLength(stemmingZone.length() - 1);
495 * Insert the method's description here.
496 * Creation date: (17/03/2002 12:14:58 AM)
497 * @param stemmingZone java.lang.StringBuilder
499 private boolean removeSoft(StringBuilder stemmingZone)
501 if (stemmingZone.length() > 0
502 && stemmingZone.charAt(stemmingZone.length() - 1) == SOFT)
504 stemmingZone.setLength(stemmingZone.length() - 1);
514 * Finds the stem for given Russian word.
515 * Creation date: (16/03/2002 3:36:48 PM)
516 * @return java.lang.String
517 * @param input java.lang.String
519 public String stem(String input)
521 markPositions(input);
523 return input; //RV wasn't detected, nothing to stem
524 StringBuilder stemmingZone = new StringBuilder(input.substring(RV));
525 // stemming goes on in RV
528 if (!perfectiveGerund(stemmingZone))
530 reflexive(stemmingZone);
531 if (!adjectival(stemmingZone))
532 if (!verb(stemmingZone))
536 removeI(stemmingZone);
538 derivational(stemmingZone);
540 superlative(stemmingZone);
541 undoubleN(stemmingZone);
542 removeSoft(stemmingZone);
544 return input.substring(0, RV) + stemmingZone.toString();
548 * Superlative endings.
549 * Creation date: (17/03/2002 12:14:58 AM)
550 * @param stemmingZone java.lang.StringBuilder
552 private boolean superlative(StringBuilder stemmingZone)
554 return findAndRemoveEnding(stemmingZone, superlativeEndings);
559 * Creation date: (17/03/2002 12:14:58 AM)
560 * @param stemmingZone java.lang.StringBuilder
562 private boolean undoubleN(StringBuilder stemmingZone)
567 if (findEnding(stemmingZone, doubleN) != 0)
569 stemmingZone.setLength(stemmingZone.length() - 1);
580 * Creation date: (17/03/2002 12:14:58 AM)
581 * @param stemmingZone java.lang.StringBuilder
583 private boolean verb(StringBuilder stemmingZone)
585 return findAndRemoveEnding(
589 || findAndRemoveEnding(stemmingZone, verbEndings2);
593 * Static method for stemming.
595 public static String stemWord(String theWord)
597 RussianStemmer stemmer = new RussianStemmer();
598 return stemmer.stem(theWord);