1 package org.apache.lucene.analysis.de;
2 // This file is encoded in UTF-8
5 * Licensed to the Apache Software Foundation (ASF) under one or more
6 * contributor license agreements. See the NOTICE file distributed with
7 * this work for additional information regarding copyright ownership.
8 * The ASF licenses this file to You under the Apache License, Version 2.0
9 * (the "License"); you may not use this file except in compliance with
10 * the License. You may obtain a copy of the License at
12 * http://www.apache.org/licenses/LICENSE-2.0
14 * Unless required by applicable law or agreed to in writing, software
15 * distributed under the License is distributed on an "AS IS" BASIS,
16 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17 * See the License for the specific language governing permissions and
18 * limitations under the License.
22 * A stemmer for German words.
24 * The algorithm is based on the report
25 * "A Fast and Simple Stemming Algorithm for German Words" by Jörg
26 * Caumanns (joerg.caumanns at isst.fhg.de).
29 public class GermanStemmer
32 * Buffer for the terms while stemming them.
// Scratch buffer reused across calls: stem() empties it before each term and the helper methods
// edit it in place.
// NOTE(review): this is shared mutable instance state with no visible synchronization, so one
// stemmer instance does not look safe to use from several threads at once — confirm with callers.
34 private StringBuilder sb = new StringBuilder();
37 * Number of characters that are removed by <tt>substitute()</tt> while stemming.
// Read by strip(), which adds it to the current buffer length in the minimum-length checks for
// the "nd", "em" and "er" suffixes.
39 private int substCount = 0;
42 * Stems the given term to a unique <tt>discriminator</tt>.
44 * @param term The term that should be stemmed.
45 * @return Discriminator for <tt>term</tt>
// Entry point of the stemmer: lower-cases the term, tests it with isStemmable(), then clears the
// shared buffer sb, which the rewriting steps operate on.
47 protected String stem( String term )
49 // Use lowercase for medium stemming.
// NOTE(review): toLowerCase() without a Locale argument uses the JVM default locale, so the same
// term can be lower-cased differently on differently configured machines (a Turkish default
// locale is the usual example). Consider passing an explicit Locale.
50 term = term.toLowerCase();
51 if ( !isStemmable( term ) )
53 // Reset the StringBuilder.
54 sb.delete( 0, sb.length() );
56 // Stemming starts here...
61 removeParticleDenotion( sb );
66 * Checks if a term could be stemmed.
68 * @return true if, and only if, the given term consists only of letters.
// Walks the term one char at a time and tests each with Character.isLetter( char ).
// NOTE(review): charAt() yields UTF-16 code units, so a supplementary character is seen as two
// surrogates, and Character.isLetter( char ) does not handle supplementary characters.
70 private boolean isStemmable( String term )
72 for ( int c = 0; c < term.length(); c++ ) {
73 if ( !Character.isLetter( term.charAt( c ) ) )
80 * Performs suffix stripping (stemming) on the current term. The stripping is reduced
81 * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
82 * from which all regular suffixes are built. The simplification causes
83 * some overstemming, and way more irregular stems, but still provides unique
84 * discriminators in most of those cases.
85 * The algorithm is context free, except for the length restrictions.
// Removes suffixes from the end of the buffer, one per loop iteration, for as long as doMore is
// set and the buffer is longer than three characters. The two-letter suffixes ("nd", "em", "er")
// are tested before the one-letter ones ('e', 's', 'n', 't').
87 private void strip( StringBuilder buffer )
89 boolean doMore = true;
90 while ( doMore && buffer.length() > 3 ) {
// The two-letter suffixes have an extra minimum-length condition. It is evaluated on the current
// buffer length plus substCount — presumably so that characters removed by substitute() still
// count towards the word length; verify against substitute().
91 if ( ( buffer.length() + substCount > 5 ) &&
92 buffer.substring( buffer.length() - 2, buffer.length() ).equals( "nd" ) )
94 buffer.delete( buffer.length() - 2, buffer.length() );
96 else if ( ( buffer.length() + substCount > 4 ) &&
97 buffer.substring( buffer.length() - 2, buffer.length() ).equals( "em" ) ) {
98 buffer.delete( buffer.length() - 2, buffer.length() );
100 else if ( ( buffer.length() + substCount > 4 ) &&
101 buffer.substring( buffer.length() - 2, buffer.length() ).equals( "er" ) ) {
102 buffer.delete( buffer.length() - 2, buffer.length() );
// The one-letter suffixes have no extra length condition beyond the loop guard.
104 else if ( buffer.charAt( buffer.length() - 1 ) == 'e' ) {
105 buffer.deleteCharAt( buffer.length() - 1 );
107 else if ( buffer.charAt( buffer.length() - 1 ) == 's' ) {
108 buffer.deleteCharAt( buffer.length() - 1 );
110 else if ( buffer.charAt( buffer.length() - 1 ) == 'n' ) {
111 buffer.deleteCharAt( buffer.length() - 1 );
113 // "t" occurs only as suffix of verbs.
114 else if ( buffer.charAt( buffer.length() - 1 ) == 't' ) {
115 buffer.deleteCharAt( buffer.length() - 1 );
124 * Does some optimizations on the term. These optimizations are
// Applies two checks to the end of the buffer: a trailing "erin*" and a trailing 'z'.
127 private void optimize( StringBuilder buffer )
129 // Additional step for female plurals of professions and inhabitants.
// '*' is the marker substitute() writes for the second of two equal characters, so "erin*"
// stands for "erinn". Only that trailing marker character is deleted on the next line.
130 if ( buffer.length() > 5 && buffer.substring( buffer.length() - 5, buffer.length() ).equals( "erin*" ) ) {
131 buffer.deleteCharAt( buffer.length() -1 );
134 // Additional step for irregular plural nouns like "Matrizen -> Matrix".
135 // NOTE: this length constraint is probably not a great value; it is only there to avoid an out-of-bounds charAt() on empty terms
136 if ( buffer.length() > 0 && buffer.charAt( buffer.length() - 1 ) == ( 'z' ) ) {
137 buffer.setCharAt( buffer.length() - 1, 'x' );
142 * Removes a particle denotion ("ge") from a term.
// Searches buffers longer than four characters for the sequence "gege" and deletes the first two
// characters of a match, which leaves a single "ge" at that position.
144 private void removeParticleDenotion( StringBuilder buffer )
146 if ( buffer.length() > 4 ) {
// c stops at length - 4 so that substring( c, c + 4 ) never reads past the end of the buffer.
147 for ( int c = 0; c < buffer.length() - 3; c++ ) {
148 if ( buffer.substring( c, c + 4 ).equals( "gege" ) ) {
149 buffer.delete( c, c + 2 );
157 * Do some substitutions for the term to reduce overstemming:
159 * - Substitute Umlauts with their corresponding vowel: äöü -> aou,
160 * "ß" is substituted by "ss"
161 * - Substitute a second char of a pair of equal characters with
162 * an asterisk: ?? -> ?*
163 * - Substitute some common character combinations with a token:
164 * sch/ch/ei/ie/ig/st -> $/§/%/&/#/!
// Rewrites the buffer in one left-to-right pass. Each position first goes through the
// pair / umlaut / 'ß' chain and then, if another character follows, through the chain that masks
// multi-character combinations with a single token character. The buffer changes length while
// the loop runs ('ß' inserts a character, the masking steps delete characters); the loop bound
// buffer.length() is re-read on every iteration.
166 private void substitute( StringBuilder buffer )
169 for ( int c = 0; c < buffer.length(); c++ ) {
170 // Replace the second char of a pair of equal characters with an asterisk
171 if ( c > 0 && buffer.charAt( c ) == buffer.charAt ( c - 1 ) ) {
172 buffer.setCharAt( c, '*' );
174 // Substitute Umlauts.
175 else if ( buffer.charAt( c ) == 'ä' ) {
176 buffer.setCharAt( c, 'a' );
178 else if ( buffer.charAt( c ) == 'ö' ) {
179 buffer.setCharAt( c, 'o' );
181 else if ( buffer.charAt( c ) == 'ü' ) {
182 buffer.setCharAt( c, 'u' );
184 // Fix bug so that 'ß' at the end of a word is replaced.
185 else if ( buffer.charAt( c ) == 'ß' ) {
186 buffer.setCharAt( c, 's' );
187 buffer.insert( c + 1, 's' );
190 // Make sure that at least one character follows the current one
191 if ( c < buffer.length() - 1 ) {
192 // Mask several common character combinations with a token
// "sch" needs two following characters, hence the extra bound check; it is tested before "ch".
193 if ( ( c < buffer.length() - 2 ) && buffer.charAt( c ) == 's' &&
194 buffer.charAt( c + 1 ) == 'c' && buffer.charAt( c + 2 ) == 'h' )
196 buffer.setCharAt( c, '$' );
197 buffer.delete( c + 1, c + 3 );
200 else if ( buffer.charAt( c ) == 'c' && buffer.charAt( c + 1 ) == 'h' ) {
201 buffer.setCharAt( c, '§' );
202 buffer.deleteCharAt( c + 1 );
205 else if ( buffer.charAt( c ) == 'e' && buffer.charAt( c + 1 ) == 'i' ) {
206 buffer.setCharAt( c, '%' );
207 buffer.deleteCharAt( c + 1 );
210 else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'e' ) {
211 buffer.setCharAt( c, '&' );
212 buffer.deleteCharAt( c + 1 );
215 else if ( buffer.charAt( c ) == 'i' && buffer.charAt( c + 1 ) == 'g' ) {
216 buffer.setCharAt( c, '#' );
217 buffer.deleteCharAt( c + 1 );
220 else if ( buffer.charAt( c ) == 's' && buffer.charAt( c + 1 ) == 't' ) {
221 buffer.setCharAt( c, '!' );
222 buffer.deleteCharAt( c + 1 );
230 * Undoes the changes made by substitute(), that is, the masking of character pairs and
231 * character combinations. Umlauts will remain as their corresponding vowel,
232 * and "ß" remains as "ss".
// Walks the buffer and expands every marker character back into the text it stands for:
// '*' -> copy of the preceding character, '$' -> "sch", '§' -> "ch", '%' -> "ei", '&' -> "ie",
// '#' -> "ig", '!' -> "st". The inserted characters are visited by later iterations, but none of
// them is a marker, so they are left alone.
234 private void resubstitute( StringBuilder buffer )
236 for ( int c = 0; c < buffer.length(); c++ ) {
237 if ( buffer.charAt( c ) == '*' ) {
// Reads index c - 1, so this relies on '*' never being the first character; substitute() only
// writes '*' when c > 0.
238 char x = buffer.charAt( c - 1 );
239 buffer.setCharAt( c, x );
241 else if ( buffer.charAt( c ) == '$' ) {
242 buffer.setCharAt( c, 's' );
243 buffer.insert( c + 1, new char[]{'c', 'h'}, 0, 2 );
245 else if ( buffer.charAt( c ) == '§' ) {
246 buffer.setCharAt( c, 'c' );
247 buffer.insert( c + 1, 'h' );
249 else if ( buffer.charAt( c ) == '%' ) {
250 buffer.setCharAt( c, 'e' );
251 buffer.insert( c + 1, 'i' );
253 else if ( buffer.charAt( c ) == '&' ) {
254 buffer.setCharAt( c, 'i' );
255 buffer.insert( c + 1, 'e' );
257 else if ( buffer.charAt( c ) == '#' ) {
258 buffer.setCharAt( c, 'i' );
259 buffer.insert( c + 1, 'g' );
261 else if ( buffer.charAt( c ) == '!' ) {
262 buffer.setCharAt( c, 's' );
263 buffer.insert( c + 1, 't' );