lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/PorterStemmer.java

   1 package org.apache.lucene.analysis;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 /*
  21
  22    Porter stemmer in Java. The original paper is in
  23
  24        Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14,
  25        no. 3, pp 130-137,
  26
  27    See also http://www.tartarus.org/~martin/PorterStemmer/index.html
  28
  29    Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below.
  30    The words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1]
  31    is then outside the bounds of b.
  32
  33    Similarly,
  34
  35    Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below.
  36    'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and
  37    b[j] is then outside the bounds of b.
  38
  39    Release 3.
  40
  41    [ This version is derived from Release 3, modified by Brian Goetz to
  42      optimize for fewer object creations.  ]
  43
  44 */
  45
  46
  47 import java.io.IOException;
  48 import java.io.InputStream;
  49 import java.io.FileInputStream;
  50
  51 import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR;
  52 import org.apache.lucene.util.ArrayUtil;
  53
  54 /**
  55  *
  56  * Stemmer, implementing the Porter Stemming Algorithm
  57  *
  58  * The Stemmer class transforms a word into its root form.  The input
  59  * word can be provided a character at a time (by calling add()), or at once
  60  * by calling one of the various stem(something) methods.
  61  */
  62
  63 class PorterStemmer
  64 {
  65   private char[] b;
  66   private int i,    /* offset into b */
  67     j, k, k0;
  68   private boolean dirty = false;
  69   private static final int INITIAL_SIZE = 50;
  70
  71   public PorterStemmer() {
  72     b = new char[INITIAL_SIZE];
  73     i = 0;
  74   }
  75
  76   /**
  77    * reset() resets the stemmer so it can stem another word.  If you invoke
  78    * the stemmer by calling add(char) and then stem(), you must call reset()
  79    * before starting another word.
  80    */
  81   public void reset() { i = 0; dirty = false; }
  82
  83   /**
  84    * Add a character to the word being stemmed.  When you are finished
  85    * adding characters, you can call stem(void) to process the word.
  86    */
  87   public void add(char ch) {
  88     if (b.length <= i) {
  89       b = ArrayUtil.grow(b, i+1);
  90     }
  91     b[i++] = ch;
  92   }
  93
  94   /**
  95    * After a word has been stemmed, it can be retrieved by toString(),
  96    * or a reference to the internal buffer can be retrieved by getResultBuffer
  97    * and getResultLength (which is generally more efficient.)
  98    */
  99   @Override
 100   public String toString() { return new String(b,0,i); }
 101
 102   /**
 103    * Returns the length of the word resulting from the stemming process.
 104    */
 105   public int getResultLength() { return i; }
 106
 107   /**
 108    * Returns a reference to a character buffer containing the results of
 109    * the stemming process.  You also need to consult getResultLength()
 110    * to determine the length of the result.
 111    */
 112   public char[] getResultBuffer() { return b; }
 113
 114   /* cons(i) is true <=> b[i] is a consonant. */
 115
 116   private final boolean cons(int i) {
 117     switch (b[i]) {
 118     case 'a': case 'e': case 'i': case 'o': case 'u':
 119       return false;
 120     case 'y':
 121       return (i==k0) ? true : !cons(i-1);
 122     default:
 123       return true;
 124     }
 125   }
 126
 127   /* m() measures the number of consonant sequences between k0 and j. if c is
 128      a consonant sequence and v a vowel sequence, and <..> indicates arbitrary
 129      presence,
 130
 131           <c><v>       gives 0
 132           <c>vc<v>     gives 1
 133           <c>vcvc<v>   gives 2
 134           <c>vcvcvc<v> gives 3
 135           ....
 136   */
 137
 138   private final int m() {
 139     int n = 0;
 140     int i = k0;
 141     while(true) {
 142       if (i > j)
 143         return n;
 144       if (! cons(i))
 145         break;
 146       i++;
 147     }
 148     i++;
 149     while(true) {
 150       while(true) {
 151         if (i > j)
 152           return n;
 153         if (cons(i))
 154           break;
 155         i++;
 156       }
 157       i++;
 158       n++;
 159       while(true) {
 160         if (i > j)
 161           return n;
 162         if (! cons(i))
 163           break;
 164         i++;
 165       }
 166       i++;
 167     }
 168   }
 169
 170   /* vowelinstem() is true <=> k0,...j contains a vowel */
 171
 172   private final boolean vowelinstem() {
 173     int i;
 174     for (i = k0; i <= j; i++)
 175       if (! cons(i))
 176         return true;
 177     return false;
 178   }
 179
 180   /* doublec(j) is true <=> j,(j-1) contain a double consonant. */
 181
 182   private final boolean doublec(int j) {
 183     if (j < k0+1)
 184       return false;
 185     if (b[j] != b[j-1])
 186       return false;
 187     return cons(j);
 188   }
 189
 190   /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant
 191      and also if the second c is not w,x or y. this is used when trying to
 192      restore an e at the end of a short word. e.g.
 193
 194           cav(e), lov(e), hop(e), crim(e), but
 195           snow, box, tray.
 196
 197   */
 198
 199   private final boolean cvc(int i) {
 200     if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2))
 201       return false;
 202     else {
 203       int ch = b[i];
 204       if (ch == 'w' || ch == 'x' || ch == 'y') return false;
 205     }
 206     return true;
 207   }
 208
 209   private final boolean ends(String s) {
 210     int l = s.length();
 211     int o = k-l+1;
 212     if (o < k0)
 213       return false;
 214     for (int i = 0; i < l; i++)
 215       if (b[o+i] != s.charAt(i))
 216         return false;
 217     j = k-l;
 218     return true;
 219   }
 220
 221   /* setto(s) sets (j+1),...k to the characters in the string s, readjusting
 222      k. */
 223
 224   void setto(String s) {
 225     int l = s.length();
 226     int o = j+1;
 227     for (int i = 0; i < l; i++)
 228       b[o+i] = s.charAt(i);
 229     k = j+l;
 230     dirty = true;
 231   }
 232
 233   /* r(s) is used further down. */
 234
 235   void r(String s) { if (m() > 0) setto(s); }
 236
 237   /* step1() gets rid of plurals and -ed or -ing. e.g.
 238
 239            caresses  ->  caress
 240            ponies    ->  poni
 241            ties      ->  ti
 242            caress    ->  caress
 243            cats      ->  cat
 244
 245            feed      ->  feed
 246            agreed    ->  agree
 247            disabled  ->  disable
 248
 249            matting   ->  mat
 250            mating    ->  mate
 251            meeting   ->  meet
 252            milling   ->  mill
 253            messing   ->  mess
 254
 255            meetings  ->  meet
 256
 257   */
 258
 259   private final void step1() {
 260     if (b[k] == 's') {
 261       if (ends("sses")) k -= 2;
 262       else if (ends("ies")) setto("i");
 263       else if (b[k-1] != 's') k--;
 264     }
 265     if (ends("eed")) {
 266       if (m() > 0)
 267         k--;
 268     }
 269     else if ((ends("ed") || ends("ing")) && vowelinstem()) {
 270       k = j;
 271       if (ends("at")) setto("ate");
 272       else if (ends("bl")) setto("ble");
 273       else if (ends("iz")) setto("ize");
 274       else if (doublec(k)) {
 275         int ch = b[k--];
 276         if (ch == 'l' || ch == 's' || ch == 'z')
 277           k++;
 278       }
 279       else if (m() == 1 && cvc(k))
 280         setto("e");
 281     }
 282   }
 283
 284   /* step2() turns terminal y to i when there is another vowel in the stem. */
 285
 286   private final void step2() {
 287     if (ends("y") && vowelinstem()) {
 288       b[k] = 'i';
 289       dirty = true;
 290     }
 291   }
 292
 293   /* step3() maps double suffices to single ones. so -ization ( = -ize plus
 294      -ation) maps to -ize etc. note that the string before the suffix must give
 295      m() > 0. */
 296
 297   private final void step3() {
 298     if (k == k0) return; /* For Bug 1 */
 299     switch (b[k-1]) {
 300     case 'a':
 301       if (ends("ational")) { r("ate"); break; }
 302       if (ends("tional")) { r("tion"); break; }
 303       break;
 304     case 'c':
 305       if (ends("enci")) { r("ence"); break; }
 306       if (ends("anci")) { r("ance"); break; }
 307       break;
 308     case 'e':
 309       if (ends("izer")) { r("ize"); break; }
 310       break;
 311     case 'l':
 312       if (ends("bli")) { r("ble"); break; }
 313       if (ends("alli")) { r("al"); break; }
 314       if (ends("entli")) { r("ent"); break; }
 315       if (ends("eli")) { r("e"); break; }
 316       if (ends("ousli")) { r("ous"); break; }
 317       break;
 318     case 'o':
 319       if (ends("ization")) { r("ize"); break; }
 320       if (ends("ation")) { r("ate"); break; }
 321       if (ends("ator")) { r("ate"); break; }
 322       break;
 323     case 's':
 324       if (ends("alism")) { r("al"); break; }
 325       if (ends("iveness")) { r("ive"); break; }
 326       if (ends("fulness")) { r("ful"); break; }
 327       if (ends("ousness")) { r("ous"); break; }
 328       break;
 329     case 't':
 330       if (ends("aliti")) { r("al"); break; }
 331       if (ends("iviti")) { r("ive"); break; }
 332       if (ends("biliti")) { r("ble"); break; }
 333       break;
 334     case 'g':
 335       if (ends("logi")) { r("log"); break; }
 336     }
 337   }
 338
 339   /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */
 340
 341   private final void step4() {
 342     switch (b[k]) {
 343     case 'e':
 344       if (ends("icate")) { r("ic"); break; }
 345       if (ends("ative")) { r(""); break; }
 346       if (ends("alize")) { r("al"); break; }
 347       break;
 348     case 'i':
 349       if (ends("iciti")) { r("ic"); break; }
 350       break;
 351     case 'l':
 352       if (ends("ical")) { r("ic"); break; }
 353       if (ends("ful")) { r(""); break; }
 354       break;
 355     case 's':
 356       if (ends("ness")) { r(""); break; }
 357       break;
 358     }
 359   }
 360
 361   /* step5() takes off -ant, -ence etc., in context <c>vcvc<v>. */
 362
 363   private final void step5() {
 364     if (k == k0) return; /* for Bug 1 */
 365     switch (b[k-1]) {
 366     case 'a':
 367       if (ends("al")) break;
 368       return;
 369     case 'c':
 370       if (ends("ance")) break;
 371       if (ends("ence")) break;
 372       return;
 373     case 'e':
 374       if (ends("er")) break; return;
 375     case 'i':
 376       if (ends("ic")) break; return;
 377     case 'l':
 378       if (ends("able")) break;
 379       if (ends("ible")) break; return;
 380     case 'n':
 381       if (ends("ant")) break;
 382       if (ends("ement")) break;
 383       if (ends("ment")) break;
 384       /* element etc. not stripped before the m */
 385       if (ends("ent")) break;
 386       return;
 387     case 'o':
 388       if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break;
 389       /* j >= 0 fixes Bug 2 */
 390       if (ends("ou")) break;
 391       return;
 392       /* takes care of -ous */
 393     case 's':
 394       if (ends("ism")) break;
 395       return;
 396     case 't':
 397       if (ends("ate")) break;
 398       if (ends("iti")) break;
 399       return;
 400     case 'u':
 401       if (ends("ous")) break;
 402       return;
 403     case 'v':
 404       if (ends("ive")) break;
 405       return;
 406     case 'z':
 407       if (ends("ize")) break;
 408       return;
 409     default:
 410       return;
 411     }
 412     if (m() > 1)
 413       k = j;
 414   }
 415
 416   /* step6() removes a final -e if m() > 1. */
 417
 418   private final void step6() {
 419     j = k;
 420     if (b[k] == 'e') {
 421       int a = m();
 422       if (a > 1 || a == 1 && !cvc(k-1))
 423         k--;
 424     }
 425     if (b[k] == 'l' && doublec(k) && m() > 1)
 426       k--;
 427   }
 428
 429
 430   /**
 431    * Stem a word provided as a String.  Returns the result as a String.
 432    */
 433   public String stem(String s) {
 434     if (stem(s.toCharArray(), s.length()))
 435       return toString();
 436     else
 437       return s;
 438   }
 439
 440   /** Stem a word contained in a char[].  Returns true if the stemming process
 441    * resulted in a word different from the input.  You can retrieve the
 442    * result with getResultLength()/getResultBuffer() or toString().
 443    */
 444   public boolean stem(char[] word) {
 445     return stem(word, word.length);
 446   }
 447
 448   /** Stem a word contained in a portion of a char[] array.  Returns
 449    * true if the stemming process resulted in a word different from
 450    * the input.  You can retrieve the result with
 451    * getResultLength()/getResultBuffer() or toString().
 452    */
 453   public boolean stem(char[] wordBuffer, int offset, int wordLen) {
 454     reset();
 455     if (b.length < wordLen) {
 456       b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)];
 457     }
 458     System.arraycopy(wordBuffer, offset, b, 0, wordLen);
 459     i = wordLen;
 460     return stem(0);
 461   }
 462
 463   /** Stem a word contained in a leading portion of a char[] array.
 464    * Returns true if the stemming process resulted in a word different
 465    * from the input.  You can retrieve the result with
 466    * getResultLength()/getResultBuffer() or toString().
 467    */
 468   public boolean stem(char[] word, int wordLen) {
 469     return stem(word, 0, wordLen);
 470   }
 471
 472   /** Stem the word placed into the Stemmer buffer through calls to add().
 473    * Returns true if the stemming process resulted in a word different
 474    * from the input.  You can retrieve the result with
 475    * getResultLength()/getResultBuffer() or toString().
 476    */
 477   public boolean stem() {
 478     return stem(0);
 479   }
 480
 481   public boolean stem(int i0) {
 482     k = i - 1;
 483     k0 = i0;
 484     if (k > k0+1) {
 485       step1(); step2(); step3(); step4(); step5(); step6();
 486     }
 487     // Also, a word is considered dirty if we lopped off letters
 488     // Thanks to Ifigenia Vairelles for pointing this out.
 489     if (i != k+1)
 490       dirty = true;
 491     i = k+1;
 492     return dirty;
 493   }
 494
 495   /** Test program for demonstrating the Stemmer.  It reads a file and
 496    * stems each word, writing the result to standard out.
 497    * Usage: Stemmer file-name
 498    */
 499   public static void main(String[] args) {
 500     PorterStemmer s = new PorterStemmer();
 501
 502     for (int i = 0; i < args.length; i++) {
 503       try {
 504         InputStream in = new FileInputStream(args[i]);
 505         byte[] buffer = new byte[1024];
 506         int bufferLen, offset, ch;
 507
 508         bufferLen = in.read(buffer);
 509         offset = 0;
 510         s.reset();
 511
 512         while(true) {
 513           if (offset < bufferLen)
 514             ch = buffer[offset++];
 515           else {
 516             bufferLen = in.read(buffer);
 517             offset = 0;
 518             if (bufferLen < 0)
 519               ch = -1;
 520             else
 521               ch = buffer[offset++];
 522           }
 523
 524           if (Character.isLetter((char) ch)) {
 525             s.add(Character.toLowerCase((char) ch));
 526           }
 527           else {
 528              s.stem();
 529              System.out.print(s.toString());
 530              s.reset();
 531              if (ch < 0)
 532                break;
 533              else {
 534                System.out.print((char) ch);
 535              }
 536            }
 537         }
 538
 539         in.close();
 540       }
 541       catch (IOException e) {
 542         System.out.println("error reading " + args[i]);
 543       }
 544     }
 545   }
 546 }
 547