1 package org.apache.lucene.analysis.bg;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import static org.apache.lucene.analysis.util.StemmerUtil.*;
23 * Light Stemmer for Bulgarian.
25 * Implements the algorithm described in:
27 * Searching Strategies for the Bulgarian Language
29 * http://members.unine.ch/jacques.savoy/Papers/BUIR.pdf
31 public class BulgarianStemmer {
34 * Stem an input buffer of Bulgarian text.
36 * @param s input buffer
37 * @param len length of input buffer
38 * @return length of input buffer after normalization
40 public int stem(final char s[], int len) {
41 if (len < 4) // do not stem
44 if (len > 5 && endsWith(s, len, "ища"))
47 len = removeArticle(s, len);
48 len = removePlural(s, len);
51 if (endsWith(s, len, "я"))
53 if (endsWith(s, len, "а") ||
54 endsWith(s, len, "о") ||
55 endsWith(s, len, "е"))
59 // the rule to rewrite ен -> н is duplicated in the paper.
60 // in the perl implementation referenced by the paper, this is fixed.
61 // (it is fixed here as well)
62 if (len > 4 && endsWith(s, len, "ен")) {
63 s[len - 2] = 'н'; // replace with н
67 if (len > 5 && s[len - 2] == 'ъ') {
68 s[len - 2] = s[len - 1]; // replace ъN with N
76 * Mainly remove the definite article
77 * @param s input buffer
78 * @param len length of input buffer
79 * @return new stemmed length
81 private int removeArticle(final char s[], final int len) {
82 if (len > 6 && endsWith(s, len, "ият"))
86 if (endsWith(s, len, "ът") ||
87 endsWith(s, len, "то") ||
88 endsWith(s, len, "те") ||
89 endsWith(s, len, "та") ||
90 endsWith(s, len, "ия"))
94 if (len > 4 && endsWith(s, len, "ят"))
100 private int removePlural(final char s[], final int len) {
102 if (endsWith(s, len, "овци"))
103 return len - 3; // replace with о
104 if (endsWith(s, len, "ове"))
106 if (endsWith(s, len, "еве")) {
107 s[len - 3] = 'й'; // replace with й
113 if (endsWith(s, len, "ища"))
115 if (endsWith(s, len, "та"))
117 if (endsWith(s, len, "ци")) {
118 s[len - 2] = 'к'; // replace with к
121 if (endsWith(s, len, "зи")) {
122 s[len - 2] = 'г'; // replace with г
126 if (s[len - 3] == 'е' && s[len - 1] == 'и') {
127 s[len - 3] = 'я'; // replace е with я, remove и
133 if (endsWith(s, len, "си")) {
134 s[len - 2] = 'х'; // replace with х
137 if (endsWith(s, len, "и"))