1 package org.apache.lucene.analysis.id;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import static org.apache.lucene.analysis.util.StemmerUtil.*;
23 * Stemmer for Indonesian.
25 * Stems Indonesian words with the algorithm presented in:
26 * <i>A Study of Stemming Effects on Information Retrieval in
27 * Bahasa Indonesia</i>, Fadillah Z Tala.
28 * http://www.illc.uva.nl/Publications/ResearchReports/MoL-2003-02.text.pdf
30 public class IndonesianStemmer {
// Counter that every removal step in this class compares against 2 before running.
// Presumably the syllable count of the term being stemmed — TODO confirm how it is
// initialised and updated (those statements are not part of this listing).
31 private int numSyllables;
// Single-bit markers (1, 2, 4, ... 64), one per kind of removed prefix. The
// prefix-removal methods OR them into `flags`, and removeSuffix tests them with
// `(flags & X) == 0` before it accepts a suffix.
// NOTE(review): no declaration of `flags` appears in this listing (embedded line 32 is
// absent), and no visible line sets REMOVED_KE, REMOVED_DI or REMOVED_PE — confirm
// both against the full file.
33 private static final int REMOVED_KE = 1;
34 private static final int REMOVED_PENG = 2;
35 private static final int REMOVED_DI = 4;
36 private static final int REMOVED_MENG = 8;
37 private static final int REMOVED_TER = 16;
38 private static final int REMOVED_BER = 32;
39 private static final int REMOVED_PE = 64;
42 * Stem a term (returning its new length).
44 * Use <code>stemDerivational</code> to control whether full stemming
45 * or only light inflectional stemming is done.
// Public entry point. Takes the term as text[0..length) and passes it through particle
// removal, possessive-pronoun removal and the derivational pass, each of which hands
// back an updated length.
//   text             - character buffer holding the term; passed on to every helper
//   length           - number of valid characters in text
//   stemDerivational - documented as selecting full vs. light inflectional stemming
// NOTE(review): this listing skips embedded lines 48-49, 51-53, 56-57 and 59-61, so the
// loop body, any guard on stemDerivational and the return statement are not visible.
47 public int stem(char text[], int length, boolean stemDerivational) {
// Walks every character of the term. The loop body is absent from this listing;
// presumably it counts vowels into numSyllables — TODO confirm.
50 for (int i = 0; i < length; i++)
// Each step below runs only while numSyllables is still greater than 2.
54 if (numSyllables > 2) length = removeParticle(text, length);
55 if (numSyllables > 2) length = removePossessivePronoun(text, length);
// Derivational pass. NOTE(review): as shown this call is unconditional; embedded line 57
// is absent and presumably tests the stemDerivational parameter — confirm.
58 length = stemDerivational(text, length);
// Derivational pass: tries first-order prefix removal, then decides the order of the
// suffix and second-order prefix steps by checking whether the length changed.
// Every step is skipped once numSyllables is no longer greater than 2.
// NOTE(review): embedded lines 66, 70 and 73-76 are absent from this listing, so the
// branch structure is incomplete as shown (presumably oldLength is refreshed, an else
// separates the two groups below, and the method returns length) — confirm.
62 private int stemDerivational(char text[], int length) {
// Remember the incoming length so a fired rule shows up as a length change.
63 int oldLength = length;
64 if (numSyllables > 2) length = removeFirstOrderPrefix(text, length);
65 if (oldLength != length) { // a rule is fired
// First group: suffix removal, then second-order prefix removal only when the
// length differs from oldLength again.
67 if (numSyllables > 2) length = removeSuffix(text, length);
68 if (oldLength != length) // a rule is fired
69 if (numSyllables > 2) length = removeSecondOrderPrefix(text, length);
// Second group: the same two steps in the opposite order, second-order prefix first.
71 if (numSyllables > 2) length = removeSecondOrderPrefix(text, length);
72 if (numSyllables > 2) length = removeSuffix(text, length);
// Predicate on a single character. The prefix-removal methods in this class call it on
// the character that follows a candidate prefix (e.g. text[4] after "meny").
// NOTE(review): the body (embedded lines 78-89) is absent from this listing, so the exact
// set of characters accepted cannot be stated here.
77 private boolean isVowel(char ch) {
// Tests whether the term in text[0..length) ends with one of the three-letter endings
// "kah", "lah" or "pun" that this method treats as particles.
// NOTE(review): embedded lines 94-100 are absent from this listing, so what happens on a
// match and the fall-through return are not visible (presumably the length is reduced
// by 3 on a match and returned unchanged otherwise) — confirm.
90 private int removeParticle(char text[], int length) {
91 if (endsWith(text, length, "kah") ||
92 endsWith(text, length, "lah") ||
93 endsWith(text, length, "pun")) {
// Tests the term in text[0..length) for the endings this method treats as possessive
// pronouns: the two-letter "ku" / "mu" first, then the three-letter "nya".
// NOTE(review): the bodies of both branches and the final return (embedded lines
// 103-106 and 108-114) are absent from this listing — confirm against the full file.
101 private int removePossessivePronoun(char text[], int length) {
102 if (endsWith(text, length, "ku") || endsWith(text, length, "mu")) {
107 if (endsWith(text, length, "nya")) {
// Strips at most one first-order prefix from the start of text[0..length). Branches are
// tried top to bottom and the first match returns, so longer spellings ("meng",
// "peng") are tested before the shorter ones they contain ("men", "me", "pen").
// Most branches record the prefix family in `flags` before returning.
// deleteN comes from the StemmerUtil static import; presumably it drops the given
// number of characters at position 0 and returns the new length — TODO confirm.
// NOTE(review): the embedded numbering skips one or two lines inside every branch (and
// the closing lines of the method), so statements are missing from this listing.
115 private int removeFirstOrderPrefix(char text[], int length) {
// --- "me*" family: all visible branches set REMOVED_MENG ---
116 if (startsWith(text, length, "meng")) {
117 flags |= REMOVED_MENG;
119 return deleteN(text, 0, length, 4);
// "meny" followed by a vowel: only 3 of the 4 prefix characters are deleted.
// Two lines are absent between the flag update and the return (presumably the
// remaining character is rewritten — TODO confirm).
122 if (startsWith(text, length, "meny") && length > 4 && isVowel(text[4])) {
123 flags |= REMOVED_MENG;
126 return deleteN(text, 0, length, 3);
129 if (startsWith(text, length, "men")) {
130 flags |= REMOVED_MENG;
132 return deleteN(text, 0, length, 3);
135 if (startsWith(text, length, "mem")) {
136 flags |= REMOVED_MENG;
138 return deleteN(text, 0, length, 3);
141 if (startsWith(text, length, "me")) {
142 flags |= REMOVED_MENG;
144 return deleteN(text, 0, length, 2);
// --- "pe*" family: all visible branches set REMOVED_PENG ---
147 if (startsWith(text, length, "peng")) {
148 flags |= REMOVED_PENG;
150 return deleteN(text, 0, length, 4);
// "peny" followed by a vowel deletes 3 characters; plain "peny" below deletes 4.
153 if (startsWith(text, length, "peny") && length > 4 && isVowel(text[4])) {
154 flags |= REMOVED_PENG;
157 return deleteN(text, 0, length, 3);
160 if (startsWith(text, length, "peny")) {
161 flags |= REMOVED_PENG;
163 return deleteN(text, 0, length, 4);
// "pen" followed by a vowel deletes 2 characters; plain "pen" below deletes 3.
166 if (startsWith(text, length, "pen") && length > 3 && isVowel(text[3])) {
167 flags |= REMOVED_PENG;
170 return deleteN(text, 0, length, 2);
173 if (startsWith(text, length, "pen")) {
174 flags |= REMOVED_PENG;
176 return deleteN(text, 0, length, 3);
179 if (startsWith(text, length, "pem")) {
180 flags |= REMOVED_PENG;
182 return deleteN(text, 0, length, 3);
// NOTE(review): no flag update is visible for "di" (two lines are absent here);
// presumably REMOVED_DI is set — confirm.
185 if (startsWith(text, length, "di")) {
188 return deleteN(text, 0, length, 2);
191 if (startsWith(text, length, "ter")) {
192 flags |= REMOVED_TER;
194 return deleteN(text, 0, length, 3);
// NOTE(review): no flag update is visible for "ke" (two lines are absent here);
// presumably REMOVED_KE is set — confirm.
197 if (startsWith(text, length, "ke")) {
200 return deleteN(text, 0, length, 2);
// Strips at most one second-order prefix from the start of text[0..length). Branches
// are tried top to bottom and the first match returns.
// deleteN comes from the StemmerUtil static import; presumably it drops the given
// number of characters at position 0 and returns the new length — TODO confirm.
// NOTE(review): the embedded numbering skips lines inside every branch and at the end of
// the method, so statements are missing from this listing.
206 private int removeSecondOrderPrefix(char text[], int length) {
// --- "be*" family: all visible branches set REMOVED_BER ---
207 if (startsWith(text, length, "ber")) {
208 flags |= REMOVED_BER;
210 return deleteN(text, 0, length, 3);
// Whole-word special case: exactly "belajar" (length 7) loses its first 3 characters.
213 if (length == 7 && startsWith(text, length, "belajar")) {
214 flags |= REMOVED_BER;
216 return deleteN(text, 0, length, 3);
// "be" + non-vowel + "er": only the 2-character "be" is deleted.
219 if (startsWith(text, length, "be") && length > 4
220 && !isVowel(text[2]) && text[3] == 'e' && text[4] == 'r') {
221 flags |= REMOVED_BER;
223 return deleteN(text, 0, length, 2);
// --- "pe*" family: no flag update is visible in any of these branches ---
226 if (startsWith(text, length, "per")) {
228 return deleteN(text, 0, length, 3);
// Whole-word special case: exactly "pelajar" (length 7) loses its first 3 characters.
231 if (length == 7 && startsWith(text, length, "pelajar")) {
233 return deleteN(text, 0, length, 3);
// NOTE(review): two lines are absent inside this branch; presumably one of them sets
// REMOVED_PE, which removeSuffix tests — confirm.
236 if (startsWith(text, length, "pe")) {
239 return deleteN(text, 0, length, 2);
// Tests the end of text[0..length) for the suffixes "kan", "an" and "i", in that order.
// Each suffix is accepted only when none of the listed prefix flags is set in `flags`,
// i.e. only when the corresponding prefixes were not removed earlier.
// NOTE(review): the body of every branch and the end of the method are absent from this
// listing, so the actual removal and the return values are not visible — confirm.
245 private int removeSuffix(char text[], int length) {
// "kan" is blocked by the KE, PENG and PE flags.
246 if (endsWith(text, length, "kan")
247 && (flags & REMOVED_KE) == 0
248 && (flags & REMOVED_PENG) == 0
249 && (flags & REMOVED_PE) == 0) {
// "an" is blocked by the DI, MENG and TER flags.
254 if (endsWith(text, length, "an")
255 && (flags & REMOVED_DI) == 0
256 && (flags & REMOVED_MENG) == 0
257 && (flags & REMOVED_TER) == 0) {
// "i" is blocked by the BER, KE and PENG flags, and also when the term ends in "si".
262 if (endsWith(text, length, "i")
263 && !endsWith(text, length, "si")
264 && (flags & REMOVED_BER) == 0
265 && (flags & REMOVED_KE) == 0
266 && (flags & REMOVED_PENG) == 0) {