lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/en/KStemmer.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 /*
  19 This file was partially derived from the
  20 original CIIR University of Massachusetts Amherst version of KStemmer.java (license for
  21 the original shown below)
  22  */
  23
  24 /*
  25  Copyright © 2003,
  26  Center for Intelligent Information Retrieval,
  27  University of Massachusetts, Amherst.
  28  All rights reserved.
  29
  30  Redistribution and use in source and binary forms, with or without modification,
  31  are permitted provided that the following conditions are met:
  32
  33  1. Redistributions of source code must retain the above copyright notice, this
  34  list of conditions and the following disclaimer.
  35
  36  2. Redistributions in binary form must reproduce the above copyright notice,
  37  this list of conditions and the following disclaimer in the documentation
  38  and/or other materials provided with the distribution.
  39
  40  3. The names "Center for Intelligent Information Retrieval" and
  41  "University of Massachusetts" must not be used to endorse or promote products
  42  derived from this software without prior written permission. To obtain
  43  permission, contact info@ciir.cs.umass.edu.
  44
  45  THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS
  46  "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
  47  THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
  48  ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
  49  LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
  50  CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
  51  GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
  52  HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
  53  LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
  54  OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
  55  SUCH DAMAGE.
  56  */
  57 package org.apache.lucene.analysis.en;
  58
  59 import org.apache.lucene.analysis.CharArrayMap;
  60 import org.apache.lucene.analysis.util.OpenStringBuilder;
  61 /**
  62  * <p>Title: Kstemmer</p>
  63  * <p>Description: This is a java version of Bob Krovetz' kstem stemmer</p>
  64  * <p>Copyright: Copyright 2008, Luicid Imagination, Inc. </p>
  65  * <p>Copyright: Copyright 2003, CIIR University of Massachusetts Amherst (http://ciir.cs.umass.edu) </p>
  66  */
  67 import org.apache.lucene.util.Version;
  68
  69 /**
  70  * This class implements the Kstem algorithm
  71  */
  72 public class KStemmer {
  73   static private final int MaxWordLen = 50;
  74
  75   static private final String[] exceptionWords = {"aide", "bathe", "caste",
  76       "cute", "dame", "dime", "doge", "done", "dune", "envelope", "gage",
  77       "grille", "grippe", "lobe", "mane", "mare", "nape", "node", "pane",
  78       "pate", "plane", "pope", "programme", "quite", "ripe", "rote", "rune",
  79       "sage", "severe", "shoppe", "sine", "slime", "snipe", "steppe", "suite",
  80       "swinge", "tare", "tine", "tope", "tripe", "twine"};
  81
  82   static private final String[][] directConflations = { {"aging", "age"},
  83       {"going", "go"}, {"goes", "go"}, {"lying", "lie"}, {"using", "use"},
  84       {"owing", "owe"}, {"suing", "sue"}, {"dying", "die"}, {"tying", "tie"},
  85       {"vying", "vie"}, {"aged", "age"}, {"used", "use"}, {"vied", "vie"},
  86       {"cued", "cue"}, {"died", "die"}, {"eyed", "eye"}, {"hued", "hue"},
  87       {"iced", "ice"}, {"lied", "lie"}, {"owed", "owe"}, {"sued", "sue"},
  88       {"toed", "toe"}, {"tied", "tie"}, {"does", "do"}, {"doing", "do"},
  89       {"aeronautical", "aeronautics"}, {"mathematical", "mathematics"},
  90       {"political", "politics"}, {"metaphysical", "metaphysics"},
  91       {"cylindrical", "cylinder"}, {"nazism", "nazi"},
  92       {"ambiguity", "ambiguous"}, {"barbarity", "barbarous"},
  93       {"credulity", "credulous"}, {"generosity", "generous"},
  94       {"spontaneity", "spontaneous"}, {"unanimity", "unanimous"},
  95       {"voracity", "voracious"}, {"fled", "flee"}, {"miscarriage", "miscarry"}};
  96
  97   static private final String[][] countryNationality = {
  98       {"afghan", "afghanistan"}, {"african", "africa"},
  99       {"albanian", "albania"}, {"algerian", "algeria"},
 100       {"american", "america"}, {"andorran", "andorra"}, {"angolan", "angola"},
 101       {"arabian", "arabia"}, {"argentine", "argentina"},
 102       {"armenian", "armenia"}, {"asian", "asia"}, {"australian", "australia"},
 103       {"austrian", "austria"}, {"azerbaijani", "azerbaijan"},
 104       {"azeri", "azerbaijan"}, {"bangladeshi", "bangladesh"},
 105       {"belgian", "belgium"}, {"bermudan", "bermuda"}, {"bolivian", "bolivia"},
 106       {"bosnian", "bosnia"}, {"botswanan", "botswana"},
 107       {"brazilian", "brazil"}, {"british", "britain"},
 108       {"bulgarian", "bulgaria"}, {"burmese", "burma"},
 109       {"californian", "california"}, {"cambodian", "cambodia"},
 110       {"canadian", "canada"}, {"chadian", "chad"}, {"chilean", "chile"},
 111       {"chinese", "china"}, {"colombian", "colombia"}, {"croat", "croatia"},
 112       {"croatian", "croatia"}, {"cuban", "cuba"}, {"cypriot", "cyprus"},
 113       {"czechoslovakian", "czechoslovakia"}, {"danish", "denmark"},
 114       {"egyptian", "egypt"}, {"equadorian", "equador"},
 115       {"eritrean", "eritrea"}, {"estonian", "estonia"},
 116       {"ethiopian", "ethiopia"}, {"european", "europe"}, {"fijian", "fiji"},
 117       {"filipino", "philippines"}, {"finnish", "finland"},
 118       {"french", "france"}, {"gambian", "gambia"}, {"georgian", "georgia"},
 119       {"german", "germany"}, {"ghanian", "ghana"}, {"greek", "greece"},
 120       {"grenadan", "grenada"}, {"guamian", "guam"},
 121       {"guatemalan", "guatemala"}, {"guinean", "guinea"},
 122       {"guyanan", "guyana"}, {"haitian", "haiti"}, {"hawaiian", "hawaii"},
 123       {"holland", "dutch"}, {"honduran", "honduras"}, {"hungarian", "hungary"},
 124       {"icelandic", "iceland"}, {"indonesian", "indonesia"},
 125       {"iranian", "iran"}, {"iraqi", "iraq"}, {"iraqui", "iraq"},
 126       {"irish", "ireland"}, {"israeli", "israel"},
 127       {"italian", "italy"},
 128       {"jamaican", "jamaica"},
 129       {"japanese", "japan"},
 130       {"jordanian", "jordan"},
 131       {"kampuchean", "cambodia"},
 132       {"kenyan", "kenya"},
 133       {"korean", "korea"},
 134       {"kuwaiti", "kuwait"},
 135       {"lankan", "lanka"},
 136       {"laotian", "laos"},
 137       {"latvian", "latvia"},
 138       {"lebanese", "lebanon"},
 139       {"liberian", "liberia"},
 140       {"libyan", "libya"},
 141       {"lithuanian", "lithuania"},
 142       {"macedonian", "macedonia"},
 143       {"madagascan", "madagascar"},
 144       {"malaysian", "malaysia"},
 145       {"maltese", "malta"},
 146       {"mauritanian", "mauritania"},
 147       {"mexican", "mexico"},
 148       {"micronesian", "micronesia"},
 149       {"moldovan", "moldova"},
 150       {"monacan", "monaco"},
 151       {"mongolian", "mongolia"},
 152       {"montenegran", "montenegro"},
 153       {"moroccan", "morocco"},
 154       {"myanmar", "burma"},
 155       {"namibian", "namibia"},
 156       {"nepalese", "nepal"},
 157       // {"netherlands", "dutch"},
 158       {"nicaraguan", "nicaragua"}, {"nigerian", "nigeria"},
 159       {"norwegian", "norway"}, {"omani", "oman"}, {"pakistani", "pakistan"},
 160       {"panamanian", "panama"}, {"papuan", "papua"},
 161       {"paraguayan", "paraguay"}, {"peruvian", "peru"},
 162       {"portuguese", "portugal"}, {"romanian", "romania"},
 163       {"rumania", "romania"}, {"rumanian", "romania"}, {"russian", "russia"},
 164       {"rwandan", "rwanda"}, {"samoan", "samoa"}, {"scottish", "scotland"},
 165       {"serb", "serbia"}, {"serbian", "serbia"}, {"siam", "thailand"},
 166       {"siamese", "thailand"}, {"slovakia", "slovak"}, {"slovakian", "slovak"},
 167       {"slovenian", "slovenia"}, {"somali", "somalia"},
 168       {"somalian", "somalia"}, {"spanish", "spain"}, {"swedish", "sweden"},
 169       {"swiss", "switzerland"}, {"syrian", "syria"}, {"taiwanese", "taiwan"},
 170       {"tanzanian", "tanzania"}, {"texan", "texas"}, {"thai", "thailand"},
 171       {"tunisian", "tunisia"}, {"turkish", "turkey"}, {"ugandan", "uganda"},
 172       {"ukrainian", "ukraine"}, {"uruguayan", "uruguay"},
 173       {"uzbek", "uzbekistan"}, {"venezuelan", "venezuela"},
 174       {"vietnamese", "viet"}, {"virginian", "virginia"}, {"yemeni", "yemen"},
 175       {"yugoslav", "yugoslavia"}, {"yugoslavian", "yugoslavia"},
 176       {"zambian", "zambia"}, {"zealander", "zealand"},
 177       {"zimbabwean", "zimbabwe"}};
 178
 179   static private final String[] supplementDict = {"aids", "applicator",
 180       "capacitor", "digitize", "electromagnet", "ellipsoid", "exosphere",
 181       "extensible", "ferromagnet", "graphics", "hydromagnet", "polygraph",
 182       "toroid", "superconduct", "backscatter", "connectionism"};
 183
 184   static private final String[] properNouns = {"abrams", "achilles",
 185       "acropolis", "adams", "agnes", "aires", "alexander", "alexis", "alfred",
 186       "algiers", "alps", "amadeus", "ames", "amos", "andes", "angeles",
 187       "annapolis", "antilles", "aquarius", "archimedes", "arkansas", "asher",
 188       "ashly", "athens", "atkins", "atlantis", "avis", "bahamas", "bangor",
 189       "barbados", "barger", "bering", "brahms", "brandeis", "brussels",
 190       "bruxelles", "cairns", "camoros", "camus", "carlos", "celts", "chalker",
 191       "charles", "cheops", "ching", "christmas", "cocos", "collins",
 192       "columbus", "confucius", "conners", "connolly", "copernicus", "cramer",
 193       "cyclops", "cygnus", "cyprus", "dallas", "damascus", "daniels", "davies",
 194       "davis", "decker", "denning", "dennis", "descartes", "dickens", "doris",
 195       "douglas", "downs", "dreyfus", "dukakis", "dulles", "dumfries",
 196       "ecclesiastes", "edwards", "emily", "erasmus", "euphrates", "evans",
 197       "everglades", "fairbanks", "federales", "fisher", "fitzsimmons",
 198       "fleming", "forbes", "fowler", "france", "francis", "goering",
 199       "goodling", "goths", "grenadines", "guiness", "hades", "harding",
 200       "harris", "hastings", "hawkes", "hawking", "hayes", "heights",
 201       "hercules", "himalayas", "hippocrates", "hobbs", "holmes", "honduras",
 202       "hopkins", "hughes", "humphreys", "illinois", "indianapolis",
 203       "inverness", "iris", "iroquois", "irving", "isaacs", "italy", "james",
 204       "jarvis", "jeffreys", "jesus", "jones", "josephus", "judas", "julius",
 205       "kansas", "keynes", "kipling", "kiwanis", "lansing", "laos", "leeds",
 206       "levis", "leviticus", "lewis", "louis", "maccabees", "madras",
 207       "maimonides", "maldive", "massachusetts", "matthews", "mauritius",
 208       "memphis", "mercedes", "midas", "mingus", "minneapolis", "mohammed",
 209       "moines", "morris", "moses", "myers", "myknos", "nablus", "nanjing",
 210       "nantes", "naples", "neal", "netherlands", "nevis", "nostradamus",
 211       "oedipus", "olympus", "orleans", "orly", "papas", "paris", "parker",
 212       "pauling", "peking", "pershing", "peter", "peters", "philippines",
 213       "phineas", "pisces", "pryor", "pythagoras", "queens", "rabelais",
 214       "ramses", "reynolds", "rhesus", "rhodes", "richards", "robins",
 215       "rodgers", "rogers", "rubens", "sagittarius", "seychelles", "socrates",
 216       "texas", "thames", "thomas", "tiberias", "tunis", "venus", "vilnius",
 217       "wales", "warner", "wilkins", "williams", "wyoming", "xmas", "yonkers",
 218       "zeus", "frances", "aarhus", "adonis", "andrews", "angus", "antares",
 219       "aquinas", "arcturus", "ares", "artemis", "augustus", "ayers",
 220       "barnabas", "barnes", "becker", "bejing", "biggs", "billings", "boeing",
 221       "boris", "borroughs", "briggs", "buenos", "calais", "caracas", "cassius",
 222       "cerberus", "ceres", "cervantes", "chantilly", "chartres", "chester",
 223       "connally", "conner", "coors", "cummings", "curtis", "daedalus",
 224       "dionysus", "dobbs", "dolores", "edmonds"};
 225
 226   static class DictEntry {
 227     boolean exception;
 228     String root;
 229
 230     DictEntry(String root, boolean isException) {
 231       this.root = root;
 232       this.exception = isException;
 233     }
 234   }
 235
 236   private static final CharArrayMap<DictEntry> dict_ht = initializeDictHash();
 237
 238   /***
 239    * caching off private int maxCacheSize; private CharArrayMap<String> cache =
 240    * null; private static final String SAME = "SAME"; // use if stemmed form is
 241    * the same
 242    ***/
 243
 244   private final OpenStringBuilder word = new OpenStringBuilder();
 245   private int j; /* index of final letter in stem (within word) */
 246   private int k; /*
 247                   * INDEX of final letter in word. You must add 1 to k to get
 248                   * the current length of word. When you want the length of
 249                   * word, use the method wordLength, which returns (k+1).
 250                   */
 251
 252   /***
 253    * private void initializeStemHash() { if (maxCacheSize > 0) cache = new
 254    * CharArrayMap<String>(maxCacheSize,false); }
 255    ***/
 256
 257   private char finalChar() {
 258     return word.charAt(k);
 259   }
 260
 261   private char penultChar() {
 262     return word.charAt(k - 1);
 263   }
 264
 265   private boolean isVowel(int index) {
 266     return !isCons(index);
 267   }
 268
 269   private boolean isCons(int index) {
 270     char ch;
 271
 272     ch = word.charAt(index);
 273
 274     if ((ch == 'a') || (ch == 'e') || (ch == 'i') || (ch == 'o') || (ch == 'u')) return false;
 275     if ((ch != 'y') || (index == 0)) return true;
 276     else return (!isCons(index - 1));
 277   }
 278
 279   private static CharArrayMap<DictEntry> initializeDictHash() {
 280     DictEntry defaultEntry;
 281     DictEntry entry;
 282
 283     CharArrayMap<DictEntry> d = new CharArrayMap<DictEntry>(
 284         Version.LUCENE_31, 1000, false);
 285
 286     d = new CharArrayMap<DictEntry>(Version.LUCENE_31, 1000, false);
 287     for (int i = 0; i < exceptionWords.length; i++) {
 288       if (!d.containsKey(exceptionWords[i])) {
 289         entry = new DictEntry(exceptionWords[i], true);
 290         d.put(exceptionWords[i], entry);
 291       } else {
 292         System.out.println("Warning: Entry [" + exceptionWords[i]
 293             + "] already in dictionary 1");
 294       }
 295     }
 296
 297     for (int i = 0; i < directConflations.length; i++) {
 298       if (!d.containsKey(directConflations[i][0])) {
 299         entry = new DictEntry(directConflations[i][1], false);
 300         d.put(directConflations[i][0], entry);
 301       } else {
 302         System.out.println("Warning: Entry [" + directConflations[i][0]
 303             + "] already in dictionary 2");
 304       }
 305     }
 306
 307     for (int i = 0; i < countryNationality.length; i++) {
 308       if (!d.containsKey(countryNationality[i][0])) {
 309         entry = new DictEntry(countryNationality[i][1], false);
 310         d.put(countryNationality[i][0], entry);
 311       } else {
 312         System.out.println("Warning: Entry [" + countryNationality[i][0]
 313             + "] already in dictionary 3");
 314       }
 315     }
 316
 317     defaultEntry = new DictEntry(null, false);
 318
 319     String[] array;
 320     array = KStemData1.data;
 321
 322     for (int i = 0; i < array.length; i++) {
 323       if (!d.containsKey(array[i])) {
 324         d.put(array[i], defaultEntry);
 325       } else {
 326         System.out.println("Warning: Entry [" + array[i]
 327             + "] already in dictionary 4");
 328       }
 329     }
 330
 331     array = KStemData2.data;
 332     for (int i = 0; i < array.length; i++) {
 333       if (!d.containsKey(array[i])) {
 334         d.put(array[i], defaultEntry);
 335       } else {
 336         System.out.println("Warning: Entry [" + array[i]
 337             + "] already in dictionary 4");
 338       }
 339     }
 340
 341     array = KStemData3.data;
 342     for (int i = 0; i < array.length; i++) {
 343       if (!d.containsKey(array[i])) {
 344         d.put(array[i], defaultEntry);
 345       } else {
 346         System.out.println("Warning: Entry [" + array[i]
 347             + "] already in dictionary 4");
 348       }
 349     }
 350
 351     array = KStemData4.data;
 352     for (int i = 0; i < array.length; i++) {
 353       if (!d.containsKey(array[i])) {
 354         d.put(array[i], defaultEntry);
 355       } else {
 356         System.out.println("Warning: Entry [" + array[i]
 357             + "] already in dictionary 4");
 358       }
 359     }
 360
 361     array = KStemData5.data;
 362     for (int i = 0; i < array.length; i++) {
 363       if (!d.containsKey(array[i])) {
 364         d.put(array[i], defaultEntry);
 365       } else {
 366         System.out.println("Warning: Entry [" + array[i]
 367             + "] already in dictionary 4");
 368       }
 369     }
 370
 371     array = KStemData6.data;
 372     for (int i = 0; i < array.length; i++) {
 373       if (!d.containsKey(array[i])) {
 374         d.put(array[i], defaultEntry);
 375       } else {
 376         System.out.println("Warning: Entry [" + array[i]
 377             + "] already in dictionary 4");
 378       }
 379     }
 380
 381     array = KStemData7.data;
 382     for (int i = 0; i < array.length; i++) {
 383       if (!d.containsKey(array[i])) {
 384         d.put(array[i], defaultEntry);
 385       } else {
 386         System.out.println("Warning: Entry [" + array[i]
 387             + "] already in dictionary 4");
 388       }
 389     }
 390
 391     for (int i = 0; i < KStemData8.data.length; i++) {
 392       if (!d.containsKey(KStemData8.data[i])) {
 393         d.put(KStemData8.data[i], defaultEntry);
 394       } else {
 395         System.out.println("Warning: Entry [" + KStemData8.data[i]
 396             + "] already in dictionary 4");
 397       }
 398     }
 399
 400     for (int i = 0; i < supplementDict.length; i++) {
 401       if (!d.containsKey(supplementDict[i])) {
 402         d.put(supplementDict[i], defaultEntry);
 403       } else {
 404         System.out.println("Warning: Entry [" + supplementDict[i]
 405             + "] already in dictionary 5");
 406       }
 407     }
 408
 409     for (int i = 0; i < properNouns.length; i++) {
 410       if (!d.containsKey(properNouns[i])) {
 411         d.put(properNouns[i], defaultEntry);
 412       } else {
 413         System.out.println("Warning: Entry [" + properNouns[i]
 414             + "] already in dictionary 6");
 415       }
 416     }
 417
 418     return d;
 419   }
 420
 421   private boolean isAlpha(char ch) {
 422     return ch >= 'a' && ch <= 'z'; // terms must be lowercased already
 423   }
 424
 425   /* length of stem within word */
 426   private int stemLength() {
 427     return j + 1;
 428   };
 429
 430   private boolean endsIn(char[] s) {
 431     if (s.length > k) return false;
 432
 433     int r = word.length() - s.length; /* length of word before this suffix */
 434     j = k;
 435     for (int r1 = r, i = 0; i < s.length; i++, r1++) {
 436       if (s[i] != word.charAt(r1)) return false;
 437     }
 438     j = r - 1; /* index of the character BEFORE the posfix */
 439     return true;
 440   }
 441
 442   private boolean endsIn(char a, char b) {
 443     if (2 > k) return false;
 444     // check left to right since the endings have often already matched
 445     if (word.charAt(k - 1) == a && word.charAt(k) == b) {
 446       j = k - 2;
 447       return true;
 448     }
 449     return false;
 450   }
 451
 452   private boolean endsIn(char a, char b, char c) {
 453     if (3 > k) return false;
 454     if (word.charAt(k - 2) == a && word.charAt(k - 1) == b
 455         && word.charAt(k) == c) {
 456       j = k - 3;
 457       return true;
 458     }
 459     return false;
 460   }
 461
 462   private boolean endsIn(char a, char b, char c, char d) {
 463     if (4 > k) return false;
 464     if (word.charAt(k - 3) == a && word.charAt(k - 2) == b
 465         && word.charAt(k - 1) == c && word.charAt(k) == d) {
 466       j = k - 4;
 467       return true;
 468     }
 469     return false;
 470   }
 471
 472   private DictEntry wordInDict() {
 473     /***
 474      * if (matchedEntry != null) { if (dict_ht.get(word.getArray(), 0,
 475      * word.size()) != matchedEntry) {
 476      * System.out.println("Uh oh... cached entry doesn't match"); } return
 477      * matchedEntry; }
 478      ***/
 479     if (matchedEntry != null) return matchedEntry;
 480     DictEntry e = dict_ht.get(word.getArray(), 0, word.length());
 481     if (e != null && !e.exception) {
 482       matchedEntry = e; // only cache if it's not an exception.
 483     }
 484     // lookups.add(word.toString());
 485     return e;
 486   }
 487
 488   /* Convert plurals to singular form, and '-ies' to 'y' */
 489   private void plural() {
 490     if (word.charAt(k) == 's') {
 491       if (endsIn('i', 'e', 's')) {
 492         word.setLength(j + 3);
 493         k--;
 494         if (lookup()) /* ensure calories -> calorie */
 495         return;
 496         k++;
 497         word.unsafeWrite('s');
 498         setSuffix("y");
 499         lookup();
 500       } else if (endsIn('e', 's')) {
 501         /* try just removing the "s" */
 502         word.setLength(j + 2);
 503         k--;
 504
 505         /*
 506          * note: don't check for exceptions here. So, `aides' -> `aide', but
 507          * `aided' -> `aid'. The exception for double s is used to prevent
 508          * crosses -> crosse. This is actually correct if crosses is a plural
 509          * noun (a type of racket used in lacrosse), but the verb is much more
 510          * common
 511          */
 512
 513         /****
 514          * YCS: this was the one place where lookup was not followed by return.
 515          * So restructure it. if ((j>0)&&(lookup(word.toString())) &&
 516          * !((word.charAt(j) == 's') && (word.charAt(j-1) == 's'))) return;
 517          *****/
 518         boolean tryE = j > 0
 519             && !((word.charAt(j) == 's') && (word.charAt(j - 1) == 's'));
 520         if (tryE && lookup()) return;
 521
 522         /* try removing the "es" */
 523
 524         word.setLength(j + 1);
 525         k--;
 526         if (lookup()) return;
 527
 528         /* the default is to retain the "e" */
 529         word.unsafeWrite('e');
 530         k++;
 531
 532         if (!tryE) lookup(); // if we didn't try the "e" ending before
 533         return;
 534       } else {
 535         if (word.length() > 3 && penultChar() != 's' && !endsIn('o', 'u', 's')) {
 536           /* unless the word ends in "ous" or a double "s", remove the final "s" */
 537
 538           word.setLength(k);
 539           k--;
 540           lookup();
 541         }
 542       }
 543     }
 544   }
 545
 546   private void setSuffix(String s) {
 547     setSuff(s, s.length());
 548   }
 549
 550   /* replace old suffix with s */
 551   private void setSuff(String s, int len) {
 552     word.setLength(j + 1);
 553     for (int l = 0; l < len; l++) {
 554       word.unsafeWrite(s.charAt(l));
 555     }
 556     k = j + len;
 557   }
 558
 559   /* Returns true if the word is found in the dictionary */
 560   // almost all uses of lookup() return immediately and are
 561   // followed by another lookup in the dict. Store the match
 562   // to avoid this double lookup.
 563   DictEntry matchedEntry = null;
 564
 565   private boolean lookup() {
 566     /******
 567      * debugging code String thisLookup = word.toString(); boolean added =
 568      * lookups.add(thisLookup); if (!added) {
 569      * System.out.println("######extra lookup:" + thisLookup); // occaasional
 570      * extra lookups aren't necessarily errors... could happen by diff
 571      * manipulations // throw new RuntimeException("######extra lookup:" +
 572      * thisLookup); } else { // System.out.println("new lookup:" + thisLookup);
 573      * }
 574      ******/
 575
 576     matchedEntry = dict_ht.get(word.getArray(), 0, word.size());
 577     return matchedEntry != null;
 578   }
 579
 580   // Set<String> lookups = new HashSet<String>();
 581
 582   /* convert past tense (-ed) to present, and `-ied' to `y' */
 583   private void pastTense() {
 584     /*
 585      * Handle words less than 5 letters with a direct mapping This prevents
 586      * (fled -> fl).
 587      */
 588     if (word.length() <= 4) return;
 589
 590     if (endsIn('i', 'e', 'd')) {
 591       word.setLength(j + 3);
 592       k--;
 593       if (lookup()) /* we almost always want to convert -ied to -y, but */
 594       return; /* this isn't true for short words (died->die) */
 595       k++; /* I don't know any long words that this applies to, */
 596       word.unsafeWrite('d'); /* but just in case... */
 597       setSuffix("y");
 598       lookup();
 599       return;
 600     }
 601
 602     /* the vowelInStem() is necessary so we don't stem acronyms */
 603     if (endsIn('e', 'd') && vowelInStem()) {
 604       /* see if the root ends in `e' */
 605       word.setLength(j + 2);
 606       k = j + 1;
 607
 608       DictEntry entry = wordInDict();
 609       if (entry != null) if (!entry.exception) /*
 610                                                 * if it's in the dictionary and
 611                                                 * not an exception
 612                                                 */
 613       return;
 614
 615       /* try removing the "ed" */
 616       word.setLength(j + 1);
 617       k = j;
 618       if (lookup()) return;
 619
 620       /*
 621        * try removing a doubled consonant. if the root isn't found in the
 622        * dictionary, the default is to leave it doubled. This will correctly
 623        * capture `backfilled' -> `backfill' instead of `backfill' ->
 624        * `backfille', and seems correct most of the time
 625        */
 626
 627       if (doubleC(k)) {
 628         word.setLength(k);
 629         k--;
 630         if (lookup()) return;
 631         word.unsafeWrite(word.charAt(k));
 632         k++;
 633         lookup();
 634         return;
 635       }
 636
 637       /* if we have a `un-' prefix, then leave the word alone */
 638       /* (this will sometimes screw up with `under-', but we */
 639       /* will take care of that later) */
 640
 641       if ((word.charAt(0) == 'u') && (word.charAt(1) == 'n')) {
 642         word.unsafeWrite('e');
 643         word.unsafeWrite('d');
 644         k = k + 2;
 645         // nolookup()
 646         return;
 647       }
 648
 649       /*
 650        * it wasn't found by just removing the `d' or the `ed', so prefer to end
 651        * with an `e' (e.g., `microcoded' -> `microcode').
 652        */
 653
 654       word.setLength(j + 1);
 655       word.unsafeWrite('e');
 656       k = j + 1;
 657       // nolookup() - we already tried the "e" ending
 658       return;
 659     }
 660   }
 661
 662   /* return TRUE if word ends with a double consonant */
 663   private boolean doubleC(int i) {
 664     if (i < 1) return false;
 665
 666     if (word.charAt(i) != word.charAt(i - 1)) return false;
 667     return (isCons(i));
 668   }
 669
 670   private boolean vowelInStem() {
 671     for (int i = 0; i < stemLength(); i++) {
 672       if (isVowel(i)) return true;
 673     }
 674     return false;
 675   }
 676
 677   /* handle `-ing' endings */
 678   private void aspect() {
 679     /*
 680      * handle short words (aging -> age) via a direct mapping. This prevents
 681      * (thing -> the) in the version of this routine that ignores inflectional
 682      * variants that are mentioned in the dictionary (when the root is also
 683      * present)
 684      */
 685
 686     if (word.length() <= 5) return;
 687
 688     /* the vowelinstem() is necessary so we don't stem acronyms */
 689     if (endsIn('i', 'n', 'g') && vowelInStem()) {
 690
 691       /* try adding an `e' to the stem and check against the dictionary */
 692       word.setCharAt(j + 1, 'e');
 693       word.setLength(j + 2);
 694       k = j + 1;
 695
 696       DictEntry entry = wordInDict();
 697       if (entry != null) {
 698         if (!entry.exception) /* if it's in the dictionary and not an exception */
 699         return;
 700       }
 701
 702       /* adding on the `e' didn't work, so remove it */
 703       word.setLength(k);
 704       k--; /* note that `ing' has also been removed */
 705
 706       if (lookup()) return;
 707
 708       /* if I can remove a doubled consonant and get a word, then do so */
 709       if (doubleC(k)) {
 710         k--;
 711         word.setLength(k + 1);
 712         if (lookup()) return;
 713         word.unsafeWrite(word.charAt(k)); /* restore the doubled consonant */
 714
 715         /* the default is to leave the consonant doubled */
 716         /* (e.g.,`fingerspelling' -> `fingerspell'). Unfortunately */
 717         /* `bookselling' -> `booksell' and `mislabelling' -> `mislabell'). */
 718         /* Without making the algorithm significantly more complicated, this */
 719         /* is the best I can do */
 720         k++;
 721         lookup();
 722         return;
 723       }
 724
 725       /*
 726        * the word wasn't in the dictionary after removing the stem, and then
 727        * checking with and without a final `e'. The default is to add an `e'
 728        * unless the word ends in two consonants, so `microcoding' ->
 729        * `microcode'. The two consonants restriction wouldn't normally be
 730        * necessary, but is needed because we don't try to deal with prefixes and
 731        * compounds, and most of the time it is correct (e.g., footstamping ->
 732        * footstamp, not footstampe; however, decoupled -> decoupl). We can
 733        * prevent almost all of the incorrect stems if we try to do some prefix
 734        * analysis first
 735        */
 736
 737       if ((j > 0) && isCons(j) && isCons(j - 1)) {
 738         k = j;
 739         word.setLength(k + 1);
 740         // nolookup() because we already did according to the comment
 741         return;
 742       }
 743
 744       word.setLength(j + 1);
 745       word.unsafeWrite('e');
 746       k = j + 1;
 747       // nolookup(); we already tried an 'e' ending
 748       return;
 749     }
 750   }
 751
 752   /*
 753    * this routine deals with -ity endings. It accepts -ability, -ibility, and
 754    * -ality, even without checking the dictionary because they are so
 755    * productive. The first two are mapped to -ble, and the -ity is remove for
 756    * the latter
 757    */
 758   private void ityEndings() {
 759     int old_k = k;
 760
 761     if (endsIn('i', 't', 'y')) {
 762       word.setLength(j + 1); /* try just removing -ity */
 763       k = j;
 764       if (lookup()) return;
 765       word.unsafeWrite('e'); /* try removing -ity and adding -e */
 766       k = j + 1;
 767       if (lookup()) return;
 768       word.setCharAt(j + 1, 'i');
 769       word.append("ty");
 770       k = old_k;
 771       /*
 772        * the -ability and -ibility endings are highly productive, so just accept
 773        * them
 774        */
 775       if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'l')) {
 776         word.setLength(j - 1);
 777         word.append("le"); /* convert to -ble */
 778         k = j;
 779         lookup();
 780         return;
 781       }
 782
 783       /* ditto for -ivity */
 784       if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'v')) {
 785         word.setLength(j + 1);
 786         word.unsafeWrite('e'); /* convert to -ive */
 787         k = j + 1;
 788         lookup();
 789         return;
 790       }
 791       /* ditto for -ality */
 792       if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) {
 793         word.setLength(j + 1);
 794         k = j;
 795         lookup();
 796         return;
 797       }
 798
 799       /*
 800        * if the root isn't in the dictionary, and the variant *is* there, then
 801        * use the variant. This allows `immunity'->`immune', but prevents
 802        * `capacity'->`capac'. If neither the variant nor the root form are in
 803        * the dictionary, then remove the ending as a default
 804        */
 805
 806       if (lookup()) return;
 807
 808       /* the default is to remove -ity altogether */
 809       word.setLength(j + 1);
 810       k = j;
 811       // nolookup(), we already did it.
 812       return;
 813     }
 814   }
 815
 816   /* handle -ence and -ance */
 817   private void nceEndings() {
 818     int old_k = k;
 819     char word_char;
 820
 821     if (endsIn('n', 'c', 'e')) {
 822       word_char = word.charAt(j);
 823       if (!((word_char == 'e') || (word_char == 'a'))) return;
 824       word.setLength(j);
 825       word.unsafeWrite('e'); /* try converting -e/ance to -e (adherance/adhere) */
 826       k = j;
 827       if (lookup()) return;
 828       word.setLength(j); /*
 829                           * try removing -e/ance altogether
 830                           * (disappearance/disappear)
 831                           */
 832       k = j - 1;
 833       if (lookup()) return;
 834       word.unsafeWrite(word_char); /* restore the original ending */
 835       word.append("nce");
 836       k = old_k;
 837       // nolookup() because we restored the original ending
 838     }
 839     return;
 840   }
 841
 842   /* handle -ness */
 843   private void nessEndings() {
 844     if (endsIn('n', 'e', 's', 's')) { /*
 845                                        * this is a very productive endings, so
 846                                        * just accept it
 847                                        */
 848       word.setLength(j + 1);
 849       k = j;
 850       if (word.charAt(j) == 'i') word.setCharAt(j, 'y');
 851       lookup();
 852     }
 853     return;
 854   }
 855
 856   /* handle -ism */
 857   private void ismEndings() {
 858     if (endsIn('i', 's', 'm')) { /*
 859                                   * this is a very productive ending, so just
 860                                   * accept it
 861                                   */
 862       word.setLength(j + 1);
 863       k = j;
 864       lookup();
 865     }
 866     return;
 867   }
 868
 869   /* this routine deals with -ment endings. */
 870   private void mentEndings() {
 871     int old_k = k;
 872
 873     if (endsIn('m', 'e', 'n', 't')) {
 874       word.setLength(j + 1);
 875       k = j;
 876       if (lookup()) return;
 877       word.append("ment");
 878       k = old_k;
 879       // nolookup
 880     }
 881     return;
 882   }
 883
 884   /* this routine deals with -ize endings. */
 885   private void izeEndings() {
 886     int old_k = k;
 887
 888     if (endsIn('i', 'z', 'e')) {
 889       word.setLength(j + 1); /* try removing -ize entirely */
 890       k = j;
 891       if (lookup()) return;
 892       word.unsafeWrite('i');
 893
 894       if (doubleC(j)) { /* allow for a doubled consonant */
 895         word.setLength(j);
 896         k = j - 1;
 897         if (lookup()) return;
 898         word.unsafeWrite(word.charAt(j - 1));
 899       }
 900
 901       word.setLength(j + 1);
 902       word.unsafeWrite('e'); /* try removing -ize and adding -e */
 903       k = j + 1;
 904       if (lookup()) return;
 905       word.setLength(j + 1);
 906       word.append("ize");
 907       k = old_k;
 908       // nolookup()
 909     }
 910     return;
 911   }
 912
 913   /* handle -ency and -ancy */
 914   private void ncyEndings() {
 915     if (endsIn('n', 'c', 'y')) {
 916       if (!((word.charAt(j) == 'e') || (word.charAt(j) == 'a'))) return;
 917       word.setCharAt(j + 2, 't'); /* try converting -ncy to -nt */
 918       word.setLength(j + 3);
 919       k = j + 2;
 920
 921       if (lookup()) return;
 922
 923       word.setCharAt(j + 2, 'c'); /* the default is to convert it to -nce */
 924       word.unsafeWrite('e');
 925       k = j + 3;
 926       lookup();
 927     }
 928     return;
 929   }
 930
 931   /* handle -able and -ible */
 932   private void bleEndings() {
 933     int old_k = k;
 934     char word_char;
 935
 936     if (endsIn('b', 'l', 'e')) {
 937       if (!((word.charAt(j) == 'a') || (word.charAt(j) == 'i'))) return;
 938       word_char = word.charAt(j);
 939       word.setLength(j); /* try just removing the ending */
 940       k = j - 1;
 941       if (lookup()) return;
 942       if (doubleC(k)) { /* allow for a doubled consonant */
 943         word.setLength(k);
 944         k--;
 945         if (lookup()) return;
 946         k++;
 947         word.unsafeWrite(word.charAt(k - 1));
 948       }
 949       word.setLength(j);
 950       word.unsafeWrite('e'); /* try removing -a/ible and adding -e */
 951       k = j;
 952       if (lookup()) return;
 953       word.setLength(j);
 954       word.append("ate"); /* try removing -able and adding -ate */
 955       /* (e.g., compensable/compensate) */
 956       k = j + 2;
 957       if (lookup()) return;
 958       word.setLength(j);
 959       word.unsafeWrite(word_char); /* restore the original values */
 960       word.append("ble");
 961       k = old_k;
 962       // nolookup()
 963     }
 964     return;
 965   }
 966
 967   /*
 968    * handle -ic endings. This is fairly straightforward, but this is also the
 969    * only place we try *expanding* an ending, -ic -> -ical. This is to handle
 970    * cases like `canonic' -> `canonical'
 971    */
 972   private void icEndings() {
 973     if (endsIn('i', 'c')) {
 974       word.setLength(j + 3);
 975       word.append("al"); /* try converting -ic to -ical */
 976       k = j + 4;
 977       if (lookup()) return;
 978
 979       word.setCharAt(j + 1, 'y'); /* try converting -ic to -y */
 980       word.setLength(j + 2);
 981       k = j + 1;
 982       if (lookup()) return;
 983
 984       word.setCharAt(j + 1, 'e'); /* try converting -ic to -e */
 985       if (lookup()) return;
 986
 987       word.setLength(j + 1); /* try removing -ic altogether */
 988       k = j;
 989       if (lookup()) return;
 990       word.append("ic"); /* restore the original ending */
 991       k = j + 2;
 992       // nolookup()
 993     }
 994     return;
 995   }
 996
 997   private static char[] ization = "ization".toCharArray();
 998   private static char[] ition = "ition".toCharArray();
 999   private static char[] ation = "ation".toCharArray();
1000   private static char[] ication = "ication".toCharArray();
1001
1002   /* handle some derivational endings */
1003   /*
1004    * this routine deals with -ion, -ition, -ation, -ization, and -ication. The
1005    * -ization ending is always converted to -ize
1006    */
1007   private void ionEndings() {
1008     int old_k = k;
1009     if (!endsIn('i', 'o', 'n')) {
1010       return;
1011     }
1012
1013     if (endsIn(ization)) { /*
1014                             * the -ize ending is very productive, so simply
1015                             * accept it as the root
1016                             */
1017       word.setLength(j + 3);
1018       word.unsafeWrite('e');
1019       k = j + 3;
1020       lookup();
1021       return;
1022     }
1023
1024     if (endsIn(ition)) {
1025       word.setLength(j + 1);
1026       word.unsafeWrite('e');
1027       k = j + 1;
1028       if (lookup()) /*
1029                      * remove -ition and add `e', and check against the
1030                      * dictionary
1031                      */
1032       return; /* (e.g., definition->define, opposition->oppose) */
1033
1034       /* restore original values */
1035       word.setLength(j + 1);
1036       word.append("ition");
1037       k = old_k;
1038       // nolookup()
1039     } else if (endsIn(ation)) {
1040       word.setLength(j + 3);
1041       word.unsafeWrite('e');
1042       k = j + 3;
1043       if (lookup()) /* remove -ion and add `e', and check against the dictionary */
1044       return; /* (elmination -> eliminate) */
1045
1046       word.setLength(j + 1);
1047       word.unsafeWrite('e'); /*
1048                               * remove -ation and add `e', and check against the
1049                               * dictionary
1050                               */
1051       k = j + 1;
1052       if (lookup()) return;
1053
1054       word.setLength(j + 1);/*
1055                              * just remove -ation (resignation->resign) and
1056                              * check dictionary
1057                              */
1058       k = j;
1059       if (lookup()) return;
1060
1061       /* restore original values */
1062       word.setLength(j + 1);
1063       word.append("ation");
1064       k = old_k;
1065       // nolookup()
1066
1067     }
1068
1069     /*
1070      * test -ication after -ation is attempted (e.g., `complication->complicate'
1071      * rather than `complication->comply')
1072      */
1073
1074     if (endsIn(ication)) {
1075       word.setLength(j + 1);
1076       word.unsafeWrite('y');
1077       k = j + 1;
1078       if (lookup()) /*
1079                      * remove -ication and add `y', and check against the
1080                      * dictionary
1081                      */
1082       return; /* (e.g., amplification -> amplify) */
1083
1084       /* restore original values */
1085       word.setLength(j + 1);
1086       word.append("ication");
1087       k = old_k;
1088       // nolookup()
1089     }
1090
1091     // if (endsIn(ion)) {
1092     if (true) { // we checked for this earlier... just need to set "j"
1093       j = k - 3; // YCS
1094
1095       word.setLength(j + 1);
1096       word.unsafeWrite('e');
1097       k = j + 1;
1098       if (lookup()) /* remove -ion and add `e', and check against the dictionary */
1099       return;
1100
1101       word.setLength(j + 1);
1102       k = j;
1103       if (lookup()) /* remove -ion, and if it's found, treat that as the root */
1104       return;
1105
1106       /* restore original values */
1107       word.setLength(j + 1);
1108       word.append("ion");
1109       k = old_k;
1110       // nolookup()
1111     }
1112
1113     // nolookup(); all of the other paths restored original values
1114     return;
1115   }
1116
1117   /*
1118    * this routine deals with -er, -or, -ier, and -eer. The -izer ending is
1119    * always converted to -ize
1120    */
1121   private void erAndOrEndings() {
1122     int old_k = k;
1123
1124     if (word.charAt(k) != 'r') return; // YCS
1125
1126     char word_char; /* so we can remember if it was -er or -or */
1127
1128     if (endsIn('i', 'z', 'e', 'r')) { /*
1129                                        * -ize is very productive, so accept it
1130                                        * as the root
1131                                        */
1132       word.setLength(j + 4);
1133       k = j + 3;
1134       lookup();
1135       return;
1136     }
1137
1138     if (endsIn('e', 'r') || endsIn('o', 'r')) {
1139       word_char = word.charAt(j + 1);
1140       if (doubleC(j)) {
1141         word.setLength(j);
1142         k = j - 1;
1143         if (lookup()) return;
1144         word.unsafeWrite(word.charAt(j - 1)); /* restore the doubled consonant */
1145       }
1146
1147       if (word.charAt(j) == 'i') { /* do we have a -ier ending? */
1148         word.setCharAt(j, 'y');
1149         word.setLength(j + 1);
1150         k = j;
1151         if (lookup()) /* yes, so check against the dictionary */
1152         return;
1153         word.setCharAt(j, 'i'); /* restore the endings */
1154         word.unsafeWrite('e');
1155       }
1156
1157       if (word.charAt(j) == 'e') { /* handle -eer */
1158         word.setLength(j);
1159         k = j - 1;
1160         if (lookup()) return;
1161         word.unsafeWrite('e');
1162       }
1163
1164       word.setLength(j + 2); /* remove the -r ending */
1165       k = j + 1;
1166       if (lookup()) return;
1167       word.setLength(j + 1); /* try removing -er/-or */
1168       k = j;
1169       if (lookup()) return;
1170       word.unsafeWrite('e'); /* try removing -or and adding -e */
1171       k = j + 1;
1172       if (lookup()) return;
1173       word.setLength(j + 1);
1174       word.unsafeWrite(word_char);
1175       word.unsafeWrite('r'); /* restore the word to the way it was */
1176       k = old_k;
1177       // nolookup()
1178     }
1179
1180   }
1181
1182   /*
1183    * this routine deals with -ly endings. The -ally ending is always converted
1184    * to -al Sometimes this will temporarily leave us with a non-word (e.g.,
1185    * heuristically maps to heuristical), but then the -al is removed in the next
1186    * step.
1187    */
1188   private void lyEndings() {
1189     int old_k = k;
1190
1191     if (endsIn('l', 'y')) {
1192
1193       word.setCharAt(j + 2, 'e'); /* try converting -ly to -le */
1194
1195       if (lookup()) return;
1196       word.setCharAt(j + 2, 'y');
1197
1198       word.setLength(j + 1); /* try just removing the -ly */
1199       k = j;
1200
1201       if (lookup()) return;
1202
1203       if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) /*
1204                                                                               * always
1205                                                                               * convert
1206                                                                               * -
1207                                                                               * ally
1208                                                                               * to
1209                                                                               * -
1210                                                                               * al
1211                                                                               */
1212       return;
1213       word.append("ly");
1214       k = old_k;
1215
1216       if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'b')) { /*
1217                                                                                 * always
1218                                                                                 * convert
1219                                                                                 * -
1220                                                                                 * ably
1221                                                                                 * to
1222                                                                                 * -
1223                                                                                 * able
1224                                                                                 */
1225         word.setCharAt(j + 2, 'e');
1226         k = j + 2;
1227         return;
1228       }
1229
1230       if (word.charAt(j) == 'i') { /* e.g., militarily -> military */
1231         word.setLength(j);
1232         word.unsafeWrite('y');
1233         k = j;
1234         if (lookup()) return;
1235         word.setLength(j);
1236         word.append("ily");
1237         k = old_k;
1238       }
1239
1240       word.setLength(j + 1); /* the default is to remove -ly */
1241
1242       k = j;
1243       // nolookup()... we already tried removing the "ly" variant
1244     }
1245     return;
1246   }
1247
1248   /*
1249    * this routine deals with -al endings. Some of the endings from the previous
1250    * routine are finished up here.
1251    */
1252   private void alEndings() {
1253     int old_k = k;
1254
1255     if (word.length() < 4) return;
1256     if (endsIn('a', 'l')) {
1257       word.setLength(j + 1);
1258       k = j;
1259       if (lookup()) /* try just removing the -al */
1260       return;
1261
1262       if (doubleC(j)) { /* allow for a doubled consonant */
1263         word.setLength(j);
1264         k = j - 1;
1265         if (lookup()) return;
1266         word.unsafeWrite(word.charAt(j - 1));
1267       }
1268
1269       word.setLength(j + 1);
1270       word.unsafeWrite('e'); /* try removing the -al and adding -e */
1271       k = j + 1;
1272       if (lookup()) return;
1273
1274       word.setLength(j + 1);
1275       word.append("um"); /* try converting -al to -um */
1276       /* (e.g., optimal - > optimum ) */
1277       k = j + 2;
1278       if (lookup()) return;
1279
1280       word.setLength(j + 1);
1281       word.append("al"); /* restore the ending to the way it was */
1282       k = old_k;
1283
1284       if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'c')) {
1285         word.setLength(j - 1); /* try removing -ical */
1286         k = j - 2;
1287         if (lookup()) return;
1288
1289         word.setLength(j - 1);
1290         word.unsafeWrite('y');/* try turning -ical to -y (e.g., bibliographical) */
1291         k = j - 1;
1292         if (lookup()) return;
1293
1294         word.setLength(j - 1);
1295         word.append("ic"); /* the default is to convert -ical to -ic */
1296         k = j;
1297         // nolookup() ... converting ical to ic means removing "al" which we
1298         // already tried
1299         // ERROR
1300         lookup();
1301         return;
1302       }
1303
1304       if (word.charAt(j) == 'i') { /* sometimes -ial endings should be removed */
1305         word.setLength(j); /* (sometimes it gets turned into -y, but we */
1306         k = j - 1; /* aren't dealing with that case for now) */
1307         if (lookup()) return;
1308         word.append("ial");
1309         k = old_k;
1310         lookup();
1311       }
1312
1313     }
1314     return;
1315   }
1316
1317   /*
1318    * this routine deals with -ive endings. It normalizes some of the -ative
1319    * endings directly, and also maps some -ive endings to -ion.
1320    */
1321   private void iveEndings() {
1322     int old_k = k;
1323
1324     if (endsIn('i', 'v', 'e')) {
1325       word.setLength(j + 1); /* try removing -ive entirely */
1326       k = j;
1327       if (lookup()) return;
1328
1329       word.unsafeWrite('e'); /* try removing -ive and adding -e */
1330       k = j + 1;
1331       if (lookup()) return;
1332       word.setLength(j + 1);
1333       word.append("ive");
1334       if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 't')) {
1335         word.setCharAt(j - 1, 'e'); /* try removing -ative and adding -e */
1336         word.setLength(j); /* (e.g., determinative -> determine) */
1337         k = j - 1;
1338         if (lookup()) return;
1339         word.setLength(j - 1); /* try just removing -ative */
1340         if (lookup()) return;
1341
1342         word.append("ative");
1343         k = old_k;
1344       }
1345
1346       /* try mapping -ive to -ion (e.g., injunctive/injunction) */
1347       word.setCharAt(j + 2, 'o');
1348       word.setCharAt(j + 3, 'n');
1349       if (lookup()) return;
1350
1351       word.setCharAt(j + 2, 'v'); /* restore the original values */
1352       word.setCharAt(j + 3, 'e');
1353       k = old_k;
1354       // nolookup()
1355     }
1356     return;
1357   }
1358
1359   KStemmer() {}
1360
1361   String stem(String term) {
1362     boolean changed = stem(term.toCharArray(), term.length());
1363     if (!changed) return term;
1364     return asString();
1365   }
1366
1367   /**
1368    * Returns the result of the stem (assuming the word was changed) as a String.
1369    */
1370   String asString() {
1371     String s = getString();
1372     if (s != null) return s;
1373     return word.toString();
1374   }
1375
1376   CharSequence asCharSequence() {
1377     return result != null ? result : word;
1378   }
1379
1380   String getString() {
1381     return result;
1382   }
1383
1384   char[] getChars() {
1385     return word.getArray();
1386   }
1387
1388   int getLength() {
1389     return word.length();
1390   }
1391
1392   String result;
1393
1394   private boolean matched() {
1395     /***
1396      * if (!lookups.contains(word.toString())) { throw new
1397      * RuntimeException("didn't look up "+word.toString()+" prev="+prevLookup);
1398      * }
1399      ***/
1400     // lookup();
1401     return matchedEntry != null;
1402   }
1403
1404   /**
1405    * Stems the text in the token. Returns true if changed.
1406    */
1407   boolean stem(char[] term, int len) {
1408
1409     result = null;
1410
1411     k = len - 1;
1412     if ((k <= 1) || (k >= MaxWordLen - 1)) {
1413       return false; // don't stem
1414     }
1415
1416     // first check the stemmer dictionaries, and avoid using the
1417     // cache if it's in there.
1418     DictEntry entry = dict_ht.get(term, 0, len);
1419     if (entry != null) {
1420       if (entry.root != null) {
1421         result = entry.root;
1422         return true;
1423       }
1424       return false;
1425     }
1426
1427     /***
1428      * caching off is normally faster if (cache == null) initializeStemHash();
1429      *
1430      * // now check the cache, before we copy chars to "word" if (cache != null)
1431      * { String val = cache.get(term, 0, len); if (val != null) { if (val !=
1432      * SAME) { result = val; return true; } return false; } }
1433      ***/
1434
1435     word.reset();
1436     // allocate enough space so that an expansion is never needed
1437     word.reserve(len + 10);
1438     for (int i = 0; i < len; i++) {
1439       char ch = term[i];
1440       if (!isAlpha(ch)) return false; // don't stem
1441       // don't lowercase... it's a requirement that lowercase filter be
1442       // used before this stemmer.
1443       word.unsafeWrite(ch);
1444     }
1445
1446     matchedEntry = null;
1447     /***
1448      * lookups.clear(); lookups.add(word.toString());
1449      ***/
1450
1451     /*
1452      * This while loop will never be executed more than one time; it is here
1453      * only to allow the break statement to be used to escape as soon as a word
1454      * is recognized
1455      */
1456     while (true) {
1457       // YCS: extra lookup()s were inserted so we don't need to
1458       // do an extra wordInDict() here.
1459       plural();
1460       if (matched()) break;
1461       pastTense();
1462       if (matched()) break;
1463       aspect();
1464       if (matched()) break;
1465       ityEndings();
1466       if (matched()) break;
1467       nessEndings();
1468       if (matched()) break;
1469       ionEndings();
1470       if (matched()) break;
1471       erAndOrEndings();
1472       if (matched()) break;
1473       lyEndings();
1474       if (matched()) break;
1475       alEndings();
1476       if (matched()) break;
1477       entry = wordInDict();
1478       iveEndings();
1479       if (matched()) break;
1480       izeEndings();
1481       if (matched()) break;
1482       mentEndings();
1483       if (matched()) break;
1484       bleEndings();
1485       if (matched()) break;
1486       ismEndings();
1487       if (matched()) break;
1488       icEndings();
1489       if (matched()) break;
1490       ncyEndings();
1491       if (matched()) break;
1492       nceEndings();
1493       matched();
1494       break;
1495     }
1496
1497     /*
1498      * try for a direct mapping (allows for cases like `Italian'->`Italy' and
1499      * `Italians'->`Italy')
1500      */
1501     entry = matchedEntry;
1502     if (entry != null) {
1503       result = entry.root; // may be null, which means that "word" is the stem
1504     }
1505
1506     /***
1507      * caching off is normally faster if (cache != null && cache.size() <
1508      * maxCacheSize) { char[] key = new char[len]; System.arraycopy(term, 0,
1509      * key, 0, len); if (result != null) { cache.put(key, result); } else {
1510      * cache.put(key, word.toString()); } }
1511      ***/
1512
1513     /***
1514      * if (entry == null) { if (!word.toString().equals(new String(term,0,len)))
1515      * { System.out.println("CASE:" + word.toString() + "," + new
1516      * String(term,0,len));
1517      *
1518      * } }
1519      ***/
1520
1521     // no entry matched means result is "word"
1522     return true;
1523   }
1524
1525 }