lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/de/GermanStemmer.java

   1 package org.apache.lucene.analysis.de;
   2 // This file is encoded in UTF-8
   3
   4 /**
   5  * Licensed to the Apache Software Foundation (ASF) under one or more
   6  * contributor license agreements.  See the NOTICE file distributed with
   7  * this work for additional information regarding copyright ownership.
   8  * The ASF licenses this file to You under the Apache License, Version 2.0
   9  * (the "License"); you may not use this file except in compliance with
  10  * the License.  You may obtain a copy of the License at
  11  *
  12  *     http://www.apache.org/licenses/LICENSE-2.0
  13  *
  14  * Unless required by applicable law or agreed to in writing, software
  15  * distributed under the License is distributed on an "AS IS" BASIS,
  16  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  17  * See the License for the specific language governing permissions and
  18  * limitations under the License.
  19  */
  20
  /**
   * A stemmer for German words.
   * <p>
   * The algorithm is based on the report
   * "A Fast and Simple Stemming Algorithm for German Words" by J&ouml;rg
   * Caumanns (joerg.caumanns at isst.fhg.de).
   * </p>
   * <p>
   * An instance keeps its working buffer and its substitution counter in
   * unsynchronized fields, so one instance must not be used by several threads
   * at the same time.
   * </p>
   */
  public class GermanStemmer
  {
    /**
     * Locale used for lowercasing. It is fixed so that the result does not
     * depend on the JVM default locale (under a Turkish default locale "I"
     * would otherwise be lowercased to a dotless i).
     */
    private static final java.util.Locale LOWERCASE_LOCALE = java.util.Locale.GERMAN;

    /**
     * Buffer for the terms while stemming them.
     */
    private final StringBuilder sb = new StringBuilder();

    /**
     * Number of characters that were removed by {@code substitute()} while
     * stemming the current term.
     */
    private int substCount = 0;

    /**
     * Stems the given term to a unique <tt>discriminator</tt>.
     *
     * @param term  The term that should be stemmed.
     * @return      Discriminator for <tt>term</tt>. If the term contains any
     *              non-letter character, the lowercased term is returned
     *              unchanged.
     */
    protected String stem( String term )
    {
      // Use lowercase for medium stemming.
      term = term.toLowerCase( LOWERCASE_LOCALE );
      if ( !isStemmable( term ) )
        return term;
      // Reset the StringBuilder.
      sb.setLength( 0 );
      sb.append( term );
      // Stemming starts here...
      substitute( sb );
      strip( sb );
      optimize( sb );
      resubstitute( sb );
      removeParticleDenotion( sb );
      return sb.toString();
    }

    /**
     * Checks if a term could be stemmed.
     *
     * @param term  The term to check.
     * @return  true if, and only if, the given term consists of letters only.
     */
    private boolean isStemmable( String term )
    {
      for ( int c = 0; c < term.length(); c++ ) {
        if ( !Character.isLetter( term.charAt( c ) ) )
          return false;
      }
      return true;
    }

    /**
     * Tells whether the buffer currently ends with the given suffix, without
     * allocating a substring.
     *
     * @param buffer  The buffer to inspect.
     * @param suffix  The suffix to look for.
     * @return  true if the last characters of the buffer equal the suffix.
     */
    private static boolean endsWith( StringBuilder buffer, String suffix )
    {
      int offset = buffer.length() - suffix.length();
      if ( offset < 0 )
        return false;
      for ( int i = 0; i < suffix.length(); i++ ) {
        if ( buffer.charAt( offset + i ) != suffix.charAt( i ) )
          return false;
      }
      return true;
    }

    /**
     * Suffix stripping (stemming) on the current term. The stripping is reduced
     * to the seven "base" suffixes "e", "s", "n", "t", "em", "er" and "nd",
     * from which all regular suffixes are built. The simplification causes
     * some overstemming, and way more irregular stems, but still provides
     * unique discriminators in most of those cases.
     * The algorithm is context free, except for the length restrictions.
     *
     * @param buffer  The (substituted) term; suffixes are removed in place.
     */
    private void strip( StringBuilder buffer )
    {
      boolean doMore = true;
      // Never shorten a term to fewer than three characters.
      while ( doMore && buffer.length() > 3 ) {
        int length = buffer.length();
        char last = buffer.charAt( length - 1 );
        // The length checks add substCount, i.e. the characters masked away
        // by substitute().
        if ( ( length + substCount > 5 ) && endsWith( buffer, "nd" ) ) {
          buffer.setLength( length - 2 );
        }
        else if ( ( length + substCount > 4 )
          && ( endsWith( buffer, "em" ) || endsWith( buffer, "er" ) ) ) {
          buffer.setLength( length - 2 );
        }
        // "t" occurs only as suffix of verbs.
        else if ( last == 'e' || last == 's' || last == 'n' || last == 't' ) {
          buffer.setLength( length - 1 );
        }
        else {
          doMore = false;
        }
      }
    }

    /**
     * Does some optimizations on the term. These optimizations are
     * contextual.
     *
     * @param buffer  The stripped term; modified in place.
     */
    private void optimize( StringBuilder buffer )
    {
      // Additional step for female plurals of professions and inhabitants.
      // The trailing asterisk is the masked second "n" of a doubled "nn".
      if ( buffer.length() > 5 && endsWith( buffer, "erin*" ) ) {
        buffer.deleteCharAt( buffer.length() - 1 );
        strip( buffer );
      }
      // Additional step for irregular plural nouns like "Matrizen -> Matrix".
      // The length check only guards against reading from an empty buffer.
      if ( buffer.length() > 0 && buffer.charAt( buffer.length() - 1 ) == 'z' ) {
        buffer.setCharAt( buffer.length() - 1, 'x' );
      }
    }

    /**
     * Removes a particle denotion ("ge") from a term: the first occurrence of
     * "gege" is shortened to "ge". Terms of four or fewer characters are left
     * untouched.
     *
     * @param buffer  The term; modified in place.
     */
    private void removeParticleDenotion( StringBuilder buffer )
    {
      if ( buffer.length() > 4 ) {
        int at = buffer.indexOf( "gege" );
        if ( at >= 0 ) {
          buffer.delete( at, at + 2 );
        }
      }
    }

    /**
     * Replaces the two characters starting at <tt>index</tt> with a single
     * token character and counts the removed character in <tt>substCount</tt>.
     *
     * @param buffer  The term; modified in place.
     * @param index   Position of the first character of the pair.
     * @param token   The token that stands for the pair.
     */
    private void maskPair( StringBuilder buffer, int index, char token )
    {
      buffer.setCharAt( index, token );
      buffer.deleteCharAt( index + 1 );
      substCount++;
    }

    /**
     * Do some substitutions for the term to reduce overstemming:
     *
     * - Substitute Umlauts with their corresponding vowel:
     *   &auml;&ouml;&uuml; -> aou, "&szlig;" is substituted by "ss"
     * - Substitute a second char of a pair of equal characters with
     *   an asterisk: ?? -> ?*
     * - Substitute some common character combinations with a token:
     *   sch/ch/ei/ie/ig/st -> $/&sect;/%/&amp;/#/!
     *
     * The non-ASCII characters are written as Unicode escapes so that the
     * compiled result does not depend on the encoding of this source file.
     *
     * @param buffer  The lowercased term; modified in place.
     */
    private void substitute( StringBuilder buffer )
    {
      substCount = 0;
      // The buffer changes length inside the loop, so the bound is re-read
      // on every iteration.
      for ( int c = 0; c < buffer.length(); c++ ) {
        char current = buffer.charAt( c );
        // Replace the second char of a pair of equal characters with an asterisk
        if ( c > 0 && current == buffer.charAt( c - 1 ) ) {
          buffer.setCharAt( c, '*' );
        }
        // Substitute Umlauts.
        else if ( current == '\u00e4' ) {
          buffer.setCharAt( c, 'a' );
        }
        else if ( current == '\u00f6' ) {
          buffer.setCharAt( c, 'o' );
        }
        else if ( current == '\u00fc' ) {
          buffer.setCharAt( c, 'u' );
        }
        // Sharp s becomes "ss"; handled here so that it is also replaced at
        // the end of a word.
        else if ( current == '\u00df' ) {
          buffer.setCharAt( c, 's' );
          buffer.insert( c + 1, 's' );
          substCount++;
        }
        // Take care that at least one character is left to the right of the
        // current one
        if ( c < buffer.length() - 1 ) {
          // Re-read both characters: the step above may have changed them.
          char first = buffer.charAt( c );
          char second = buffer.charAt( c + 1 );
          // Masking several common character combinations with a token
          if ( ( c < buffer.length() - 2 ) && first == 's' && second == 'c'
            && buffer.charAt( c + 2 ) == 'h' )
          {
            buffer.setCharAt( c, '$' );
            buffer.delete( c + 1, c + 3 );
            // Two characters were removed; add them to the running count
            // instead of overwriting it.
            substCount += 2;
          }
          else if ( first == 'c' && second == 'h' ) {
            maskPair( buffer, c, '\u00a7' );
          }
          else if ( first == 'e' && second == 'i' ) {
            maskPair( buffer, c, '%' );
          }
          else if ( first == 'i' && second == 'e' ) {
            maskPair( buffer, c, '&' );
          }
          else if ( first == 'i' && second == 'g' ) {
            maskPair( buffer, c, '#' );
          }
          else if ( first == 's' && second == 't' ) {
            maskPair( buffer, c, '!' );
          }
        }
      }
    }

    /**
     * Undoes the changes made by substitute(). That are character pairs and
     * character combinations. Umlauts will remain as their corresponding vowel,
     * as the sharp s remains as "ss".
     *
     * @param buffer  The stemmed term still containing tokens; modified in place.
     */
    private void resubstitute( StringBuilder buffer )
    {
      for ( int c = 0; c < buffer.length(); c++ ) {
        switch ( buffer.charAt( c ) ) {
          case '*':
            // substitute() only writes an asterisk at an index greater than
            // zero, so there is always a preceding character to repeat.
            buffer.setCharAt( c, buffer.charAt( c - 1 ) );
            break;
          case '$':
            buffer.replace( c, c + 1, "sch" );
            break;
          case '\u00a7':
            buffer.replace( c, c + 1, "ch" );
            break;
          case '%':
            buffer.replace( c, c + 1, "ei" );
            break;
          case '&':
            buffer.replace( c, c + 1, "ie" );
            break;
          case '#':
            buffer.replace( c, c + 1, "ig" );
            break;
          case '!':
            buffer.replace( c, c + 1, "st" );
            break;
          default:
            break;
        }
      }
    }

  }