2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
19 This file was partially derived from the
20 original CIIR University of Massachusetts Amherst version of KStemmer.java (license for
21 the original shown below)
26 Center for Intelligent Information Retrieval,
27 University of Massachusetts, Amherst.
30 Redistribution and use in source and binary forms, with or without modification,
31 are permitted provided that the following conditions are met:
33 1. Redistributions of source code must retain the above copyright notice, this
34 list of conditions and the following disclaimer.
36 2. Redistributions in binary form must reproduce the above copyright notice,
37 this list of conditions and the following disclaimer in the documentation
38 and/or other materials provided with the distribution.
40 3. The names "Center for Intelligent Information Retrieval" and
41 "University of Massachusetts" must not be used to endorse or promote products
42 derived from this software without prior written permission. To obtain
43 permission, contact info@ciir.cs.umass.edu.
45 THIS SOFTWARE IS PROVIDED BY UNIVERSITY OF MASSACHUSETTS AND OTHER CONTRIBUTORS
46 "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO,
47 THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
48 ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDERS OR CONTRIBUTORS BE
49 LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
50 CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE
51 GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
52 HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
53 LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
54 OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
57 package org.apache.lucene.analysis.en;
59 import org.apache.lucene.analysis.CharArrayMap;
60 import org.apache.lucene.analysis.util.OpenStringBuilder;
62 * <p>Title: Kstemmer</p>
63 * <p>Description: This is a java version of Bob Krovetz' kstem stemmer</p>
64 * <p>Copyright: Copyright 2008, Luicid Imagination, Inc. </p>
65 * <p>Copyright: Copyright 2003, CIIR University of Massachusetts Amherst (http://ciir.cs.umass.edu) </p>
67 import org.apache.lucene.util.Version;
70 * This class implements the Kstem algorithm
72 public class KStemmer {
73 static private final int MaxWordLen = 50;
75 static private final String[] exceptionWords = {"aide", "bathe", "caste",
76 "cute", "dame", "dime", "doge", "done", "dune", "envelope", "gage",
77 "grille", "grippe", "lobe", "mane", "mare", "nape", "node", "pane",
78 "pate", "plane", "pope", "programme", "quite", "ripe", "rote", "rune",
79 "sage", "severe", "shoppe", "sine", "slime", "snipe", "steppe", "suite",
80 "swinge", "tare", "tine", "tope", "tripe", "twine"};
82 static private final String[][] directConflations = { {"aging", "age"},
83 {"going", "go"}, {"goes", "go"}, {"lying", "lie"}, {"using", "use"},
84 {"owing", "owe"}, {"suing", "sue"}, {"dying", "die"}, {"tying", "tie"},
85 {"vying", "vie"}, {"aged", "age"}, {"used", "use"}, {"vied", "vie"},
86 {"cued", "cue"}, {"died", "die"}, {"eyed", "eye"}, {"hued", "hue"},
87 {"iced", "ice"}, {"lied", "lie"}, {"owed", "owe"}, {"sued", "sue"},
88 {"toed", "toe"}, {"tied", "tie"}, {"does", "do"}, {"doing", "do"},
89 {"aeronautical", "aeronautics"}, {"mathematical", "mathematics"},
90 {"political", "politics"}, {"metaphysical", "metaphysics"},
91 {"cylindrical", "cylinder"}, {"nazism", "nazi"},
92 {"ambiguity", "ambiguous"}, {"barbarity", "barbarous"},
93 {"credulity", "credulous"}, {"generosity", "generous"},
94 {"spontaneity", "spontaneous"}, {"unanimity", "unanimous"},
95 {"voracity", "voracious"}, {"fled", "flee"}, {"miscarriage", "miscarry"}};
97 static private final String[][] countryNationality = {
98 {"afghan", "afghanistan"}, {"african", "africa"},
99 {"albanian", "albania"}, {"algerian", "algeria"},
100 {"american", "america"}, {"andorran", "andorra"}, {"angolan", "angola"},
101 {"arabian", "arabia"}, {"argentine", "argentina"},
102 {"armenian", "armenia"}, {"asian", "asia"}, {"australian", "australia"},
103 {"austrian", "austria"}, {"azerbaijani", "azerbaijan"},
104 {"azeri", "azerbaijan"}, {"bangladeshi", "bangladesh"},
105 {"belgian", "belgium"}, {"bermudan", "bermuda"}, {"bolivian", "bolivia"},
106 {"bosnian", "bosnia"}, {"botswanan", "botswana"},
107 {"brazilian", "brazil"}, {"british", "britain"},
108 {"bulgarian", "bulgaria"}, {"burmese", "burma"},
109 {"californian", "california"}, {"cambodian", "cambodia"},
110 {"canadian", "canada"}, {"chadian", "chad"}, {"chilean", "chile"},
111 {"chinese", "china"}, {"colombian", "colombia"}, {"croat", "croatia"},
112 {"croatian", "croatia"}, {"cuban", "cuba"}, {"cypriot", "cyprus"},
113 {"czechoslovakian", "czechoslovakia"}, {"danish", "denmark"},
114 {"egyptian", "egypt"}, {"equadorian", "equador"},
115 {"eritrean", "eritrea"}, {"estonian", "estonia"},
116 {"ethiopian", "ethiopia"}, {"european", "europe"}, {"fijian", "fiji"},
117 {"filipino", "philippines"}, {"finnish", "finland"},
118 {"french", "france"}, {"gambian", "gambia"}, {"georgian", "georgia"},
119 {"german", "germany"}, {"ghanian", "ghana"}, {"greek", "greece"},
120 {"grenadan", "grenada"}, {"guamian", "guam"},
121 {"guatemalan", "guatemala"}, {"guinean", "guinea"},
122 {"guyanan", "guyana"}, {"haitian", "haiti"}, {"hawaiian", "hawaii"},
123 {"holland", "dutch"}, {"honduran", "honduras"}, {"hungarian", "hungary"},
124 {"icelandic", "iceland"}, {"indonesian", "indonesia"},
125 {"iranian", "iran"}, {"iraqi", "iraq"}, {"iraqui", "iraq"},
126 {"irish", "ireland"}, {"israeli", "israel"},
127 {"italian", "italy"},
128 {"jamaican", "jamaica"},
129 {"japanese", "japan"},
130 {"jordanian", "jordan"},
131 {"kampuchean", "cambodia"},
134 {"kuwaiti", "kuwait"},
137 {"latvian", "latvia"},
138 {"lebanese", "lebanon"},
139 {"liberian", "liberia"},
141 {"lithuanian", "lithuania"},
142 {"macedonian", "macedonia"},
143 {"madagascan", "madagascar"},
144 {"malaysian", "malaysia"},
145 {"maltese", "malta"},
146 {"mauritanian", "mauritania"},
147 {"mexican", "mexico"},
148 {"micronesian", "micronesia"},
149 {"moldovan", "moldova"},
150 {"monacan", "monaco"},
151 {"mongolian", "mongolia"},
152 {"montenegran", "montenegro"},
153 {"moroccan", "morocco"},
154 {"myanmar", "burma"},
155 {"namibian", "namibia"},
156 {"nepalese", "nepal"},
157 // {"netherlands", "dutch"},
158 {"nicaraguan", "nicaragua"}, {"nigerian", "nigeria"},
159 {"norwegian", "norway"}, {"omani", "oman"}, {"pakistani", "pakistan"},
160 {"panamanian", "panama"}, {"papuan", "papua"},
161 {"paraguayan", "paraguay"}, {"peruvian", "peru"},
162 {"portuguese", "portugal"}, {"romanian", "romania"},
163 {"rumania", "romania"}, {"rumanian", "romania"}, {"russian", "russia"},
164 {"rwandan", "rwanda"}, {"samoan", "samoa"}, {"scottish", "scotland"},
165 {"serb", "serbia"}, {"serbian", "serbia"}, {"siam", "thailand"},
166 {"siamese", "thailand"}, {"slovakia", "slovak"}, {"slovakian", "slovak"},
167 {"slovenian", "slovenia"}, {"somali", "somalia"},
168 {"somalian", "somalia"}, {"spanish", "spain"}, {"swedish", "sweden"},
169 {"swiss", "switzerland"}, {"syrian", "syria"}, {"taiwanese", "taiwan"},
170 {"tanzanian", "tanzania"}, {"texan", "texas"}, {"thai", "thailand"},
171 {"tunisian", "tunisia"}, {"turkish", "turkey"}, {"ugandan", "uganda"},
172 {"ukrainian", "ukraine"}, {"uruguayan", "uruguay"},
173 {"uzbek", "uzbekistan"}, {"venezuelan", "venezuela"},
174 {"vietnamese", "viet"}, {"virginian", "virginia"}, {"yemeni", "yemen"},
175 {"yugoslav", "yugoslavia"}, {"yugoslavian", "yugoslavia"},
176 {"zambian", "zambia"}, {"zealander", "zealand"},
177 {"zimbabwean", "zimbabwe"}};
179 static private final String[] supplementDict = {"aids", "applicator",
180 "capacitor", "digitize", "electromagnet", "ellipsoid", "exosphere",
181 "extensible", "ferromagnet", "graphics", "hydromagnet", "polygraph",
182 "toroid", "superconduct", "backscatter", "connectionism"};
184 static private final String[] properNouns = {"abrams", "achilles",
185 "acropolis", "adams", "agnes", "aires", "alexander", "alexis", "alfred",
186 "algiers", "alps", "amadeus", "ames", "amos", "andes", "angeles",
187 "annapolis", "antilles", "aquarius", "archimedes", "arkansas", "asher",
188 "ashly", "athens", "atkins", "atlantis", "avis", "bahamas", "bangor",
189 "barbados", "barger", "bering", "brahms", "brandeis", "brussels",
190 "bruxelles", "cairns", "camoros", "camus", "carlos", "celts", "chalker",
191 "charles", "cheops", "ching", "christmas", "cocos", "collins",
192 "columbus", "confucius", "conners", "connolly", "copernicus", "cramer",
193 "cyclops", "cygnus", "cyprus", "dallas", "damascus", "daniels", "davies",
194 "davis", "decker", "denning", "dennis", "descartes", "dickens", "doris",
195 "douglas", "downs", "dreyfus", "dukakis", "dulles", "dumfries",
196 "ecclesiastes", "edwards", "emily", "erasmus", "euphrates", "evans",
197 "everglades", "fairbanks", "federales", "fisher", "fitzsimmons",
198 "fleming", "forbes", "fowler", "france", "francis", "goering",
199 "goodling", "goths", "grenadines", "guiness", "hades", "harding",
200 "harris", "hastings", "hawkes", "hawking", "hayes", "heights",
201 "hercules", "himalayas", "hippocrates", "hobbs", "holmes", "honduras",
202 "hopkins", "hughes", "humphreys", "illinois", "indianapolis",
203 "inverness", "iris", "iroquois", "irving", "isaacs", "italy", "james",
204 "jarvis", "jeffreys", "jesus", "jones", "josephus", "judas", "julius",
205 "kansas", "keynes", "kipling", "kiwanis", "lansing", "laos", "leeds",
206 "levis", "leviticus", "lewis", "louis", "maccabees", "madras",
207 "maimonides", "maldive", "massachusetts", "matthews", "mauritius",
208 "memphis", "mercedes", "midas", "mingus", "minneapolis", "mohammed",
209 "moines", "morris", "moses", "myers", "myknos", "nablus", "nanjing",
210 "nantes", "naples", "neal", "netherlands", "nevis", "nostradamus",
211 "oedipus", "olympus", "orleans", "orly", "papas", "paris", "parker",
212 "pauling", "peking", "pershing", "peter", "peters", "philippines",
213 "phineas", "pisces", "pryor", "pythagoras", "queens", "rabelais",
214 "ramses", "reynolds", "rhesus", "rhodes", "richards", "robins",
215 "rodgers", "rogers", "rubens", "sagittarius", "seychelles", "socrates",
216 "texas", "thames", "thomas", "tiberias", "tunis", "venus", "vilnius",
217 "wales", "warner", "wilkins", "williams", "wyoming", "xmas", "yonkers",
218 "zeus", "frances", "aarhus", "adonis", "andrews", "angus", "antares",
219 "aquinas", "arcturus", "ares", "artemis", "augustus", "ayers",
220 "barnabas", "barnes", "becker", "bejing", "biggs", "billings", "boeing",
221 "boris", "borroughs", "briggs", "buenos", "calais", "caracas", "cassius",
222 "cerberus", "ceres", "cervantes", "chantilly", "chartres", "chester",
223 "connally", "conner", "coors", "cummings", "curtis", "daedalus",
224 "dionysus", "dobbs", "dolores", "edmonds"};
226 static class DictEntry {
230 DictEntry(String root, boolean isException) {
232 this.exception = isException;
236 private static final CharArrayMap<DictEntry> dict_ht = initializeDictHash();
239 * caching off private int maxCacheSize; private CharArrayMap<String> cache =
240 * null; private static final String SAME = "SAME"; // use if stemmed form is
244 private final OpenStringBuilder word = new OpenStringBuilder();
245 private int j; /* index of final letter in stem (within word) */
247 * INDEX of final letter in word. You must add 1 to k to get
248 * the current length of word. When you want the length of
249 * word, use the method wordLength, which returns (k+1).
253 * private void initializeStemHash() { if (maxCacheSize > 0) cache = new
254 * CharArrayMap<String>(maxCacheSize,false); }
257 private char finalChar() {
258 return word.charAt(k);
261 private char penultChar() {
262 return word.charAt(k - 1);
265 private boolean isVowel(int index) {
266 return !isCons(index);
269 private boolean isCons(int index) {
272 ch = word.charAt(index);
274 if ((ch == 'a') || (ch == 'e') || (ch == 'i') || (ch == 'o') || (ch == 'u')) return false;
275 if ((ch != 'y') || (index == 0)) return true;
276 else return (!isCons(index - 1));
279 private static CharArrayMap<DictEntry> initializeDictHash() {
280 DictEntry defaultEntry;
283 CharArrayMap<DictEntry> d = new CharArrayMap<DictEntry>(
284 Version.LUCENE_31, 1000, false);
286 d = new CharArrayMap<DictEntry>(Version.LUCENE_31, 1000, false);
287 for (int i = 0; i < exceptionWords.length; i++) {
288 if (!d.containsKey(exceptionWords[i])) {
289 entry = new DictEntry(exceptionWords[i], true);
290 d.put(exceptionWords[i], entry);
292 System.out.println("Warning: Entry [" + exceptionWords[i]
293 + "] already in dictionary 1");
297 for (int i = 0; i < directConflations.length; i++) {
298 if (!d.containsKey(directConflations[i][0])) {
299 entry = new DictEntry(directConflations[i][1], false);
300 d.put(directConflations[i][0], entry);
302 System.out.println("Warning: Entry [" + directConflations[i][0]
303 + "] already in dictionary 2");
307 for (int i = 0; i < countryNationality.length; i++) {
308 if (!d.containsKey(countryNationality[i][0])) {
309 entry = new DictEntry(countryNationality[i][1], false);
310 d.put(countryNationality[i][0], entry);
312 System.out.println("Warning: Entry [" + countryNationality[i][0]
313 + "] already in dictionary 3");
317 defaultEntry = new DictEntry(null, false);
320 array = KStemData1.data;
322 for (int i = 0; i < array.length; i++) {
323 if (!d.containsKey(array[i])) {
324 d.put(array[i], defaultEntry);
326 System.out.println("Warning: Entry [" + array[i]
327 + "] already in dictionary 4");
331 array = KStemData2.data;
332 for (int i = 0; i < array.length; i++) {
333 if (!d.containsKey(array[i])) {
334 d.put(array[i], defaultEntry);
336 System.out.println("Warning: Entry [" + array[i]
337 + "] already in dictionary 4");
341 array = KStemData3.data;
342 for (int i = 0; i < array.length; i++) {
343 if (!d.containsKey(array[i])) {
344 d.put(array[i], defaultEntry);
346 System.out.println("Warning: Entry [" + array[i]
347 + "] already in dictionary 4");
351 array = KStemData4.data;
352 for (int i = 0; i < array.length; i++) {
353 if (!d.containsKey(array[i])) {
354 d.put(array[i], defaultEntry);
356 System.out.println("Warning: Entry [" + array[i]
357 + "] already in dictionary 4");
361 array = KStemData5.data;
362 for (int i = 0; i < array.length; i++) {
363 if (!d.containsKey(array[i])) {
364 d.put(array[i], defaultEntry);
366 System.out.println("Warning: Entry [" + array[i]
367 + "] already in dictionary 4");
371 array = KStemData6.data;
372 for (int i = 0; i < array.length; i++) {
373 if (!d.containsKey(array[i])) {
374 d.put(array[i], defaultEntry);
376 System.out.println("Warning: Entry [" + array[i]
377 + "] already in dictionary 4");
381 array = KStemData7.data;
382 for (int i = 0; i < array.length; i++) {
383 if (!d.containsKey(array[i])) {
384 d.put(array[i], defaultEntry);
386 System.out.println("Warning: Entry [" + array[i]
387 + "] already in dictionary 4");
391 for (int i = 0; i < KStemData8.data.length; i++) {
392 if (!d.containsKey(KStemData8.data[i])) {
393 d.put(KStemData8.data[i], defaultEntry);
395 System.out.println("Warning: Entry [" + KStemData8.data[i]
396 + "] already in dictionary 4");
400 for (int i = 0; i < supplementDict.length; i++) {
401 if (!d.containsKey(supplementDict[i])) {
402 d.put(supplementDict[i], defaultEntry);
404 System.out.println("Warning: Entry [" + supplementDict[i]
405 + "] already in dictionary 5");
409 for (int i = 0; i < properNouns.length; i++) {
410 if (!d.containsKey(properNouns[i])) {
411 d.put(properNouns[i], defaultEntry);
413 System.out.println("Warning: Entry [" + properNouns[i]
414 + "] already in dictionary 6");
421 private boolean isAlpha(char ch) {
422 return ch >= 'a' && ch <= 'z'; // terms must be lowercased already
425 /* length of stem within word */
426 private int stemLength() {
430 private boolean endsIn(char[] s) {
431 if (s.length > k) return false;
433 int r = word.length() - s.length; /* length of word before this suffix */
435 for (int r1 = r, i = 0; i < s.length; i++, r1++) {
436 if (s[i] != word.charAt(r1)) return false;
438 j = r - 1; /* index of the character BEFORE the posfix */
442 private boolean endsIn(char a, char b) {
443 if (2 > k) return false;
444 // check left to right since the endings have often already matched
445 if (word.charAt(k - 1) == a && word.charAt(k) == b) {
452 private boolean endsIn(char a, char b, char c) {
453 if (3 > k) return false;
454 if (word.charAt(k - 2) == a && word.charAt(k - 1) == b
455 && word.charAt(k) == c) {
462 private boolean endsIn(char a, char b, char c, char d) {
463 if (4 > k) return false;
464 if (word.charAt(k - 3) == a && word.charAt(k - 2) == b
465 && word.charAt(k - 1) == c && word.charAt(k) == d) {
472 private DictEntry wordInDict() {
474 * if (matchedEntry != null) { if (dict_ht.get(word.getArray(), 0,
475 * word.size()) != matchedEntry) {
476 * System.out.println("Uh oh... cached entry doesn't match"); } return
479 if (matchedEntry != null) return matchedEntry;
480 DictEntry e = dict_ht.get(word.getArray(), 0, word.length());
481 if (e != null && !e.exception) {
482 matchedEntry = e; // only cache if it's not an exception.
484 // lookups.add(word.toString());
488 /* Convert plurals to singular form, and '-ies' to 'y' */
489 private void plural() {
490 if (word.charAt(k) == 's') {
491 if (endsIn('i', 'e', 's')) {
492 word.setLength(j + 3);
494 if (lookup()) /* ensure calories -> calorie */
497 word.unsafeWrite('s');
500 } else if (endsIn('e', 's')) {
501 /* try just removing the "s" */
502 word.setLength(j + 2);
506 * note: don't check for exceptions here. So, `aides' -> `aide', but
507 * `aided' -> `aid'. The exception for double s is used to prevent
508 * crosses -> crosse. This is actually correct if crosses is a plural
509 * noun (a type of racket used in lacrosse), but the verb is much more
514 * YCS: this was the one place where lookup was not followed by return.
515 * So restructure it. if ((j>0)&&(lookup(word.toString())) &&
516 * !((word.charAt(j) == 's') && (word.charAt(j-1) == 's'))) return;
519 && !((word.charAt(j) == 's') && (word.charAt(j - 1) == 's'));
520 if (tryE && lookup()) return;
522 /* try removing the "es" */
524 word.setLength(j + 1);
526 if (lookup()) return;
528 /* the default is to retain the "e" */
529 word.unsafeWrite('e');
532 if (!tryE) lookup(); // if we didn't try the "e" ending before
535 if (word.length() > 3 && penultChar() != 's' && !endsIn('o', 'u', 's')) {
536 /* unless the word ends in "ous" or a double "s", remove the final "s" */
546 private void setSuffix(String s) {
547 setSuff(s, s.length());
550 /* replace old suffix with s */
551 private void setSuff(String s, int len) {
552 word.setLength(j + 1);
553 for (int l = 0; l < len; l++) {
554 word.unsafeWrite(s.charAt(l));
559 /* Returns true if the word is found in the dictionary */
560 // almost all uses of lookup() return immediately and are
561 // followed by another lookup in the dict. Store the match
562 // to avoid this double lookup.
563 DictEntry matchedEntry = null;
565 private boolean lookup() {
567 * debugging code String thisLookup = word.toString(); boolean added =
568 * lookups.add(thisLookup); if (!added) {
569 * System.out.println("######extra lookup:" + thisLookup); // occaasional
570 * extra lookups aren't necessarily errors... could happen by diff
571 * manipulations // throw new RuntimeException("######extra lookup:" +
572 * thisLookup); } else { // System.out.println("new lookup:" + thisLookup);
576 matchedEntry = dict_ht.get(word.getArray(), 0, word.size());
577 return matchedEntry != null;
580 // Set<String> lookups = new HashSet<String>();
582 /* convert past tense (-ed) to present, and `-ied' to `y' */
583 private void pastTense() {
585 * Handle words less than 5 letters with a direct mapping This prevents
588 if (word.length() <= 4) return;
590 if (endsIn('i', 'e', 'd')) {
591 word.setLength(j + 3);
593 if (lookup()) /* we almost always want to convert -ied to -y, but */
594 return; /* this isn't true for short words (died->die) */
595 k++; /* I don't know any long words that this applies to, */
596 word.unsafeWrite('d'); /* but just in case... */
602 /* the vowelInStem() is necessary so we don't stem acronyms */
603 if (endsIn('e', 'd') && vowelInStem()) {
604 /* see if the root ends in `e' */
605 word.setLength(j + 2);
608 DictEntry entry = wordInDict();
609 if (entry != null) if (!entry.exception) /*
610 * if it's in the dictionary and
615 /* try removing the "ed" */
616 word.setLength(j + 1);
618 if (lookup()) return;
621 * try removing a doubled consonant. if the root isn't found in the
622 * dictionary, the default is to leave it doubled. This will correctly
623 * capture `backfilled' -> `backfill' instead of `backfill' ->
624 * `backfille', and seems correct most of the time
630 if (lookup()) return;
631 word.unsafeWrite(word.charAt(k));
637 /* if we have a `un-' prefix, then leave the word alone */
638 /* (this will sometimes screw up with `under-', but we */
639 /* will take care of that later) */
641 if ((word.charAt(0) == 'u') && (word.charAt(1) == 'n')) {
642 word.unsafeWrite('e');
643 word.unsafeWrite('d');
650 * it wasn't found by just removing the `d' or the `ed', so prefer to end
651 * with an `e' (e.g., `microcoded' -> `microcode').
654 word.setLength(j + 1);
655 word.unsafeWrite('e');
657 // nolookup() - we already tried the "e" ending
662 /* return TRUE if word ends with a double consonant */
663 private boolean doubleC(int i) {
664 if (i < 1) return false;
666 if (word.charAt(i) != word.charAt(i - 1)) return false;
670 private boolean vowelInStem() {
671 for (int i = 0; i < stemLength(); i++) {
672 if (isVowel(i)) return true;
677 /* handle `-ing' endings */
678 private void aspect() {
680 * handle short words (aging -> age) via a direct mapping. This prevents
681 * (thing -> the) in the version of this routine that ignores inflectional
682 * variants that are mentioned in the dictionary (when the root is also
686 if (word.length() <= 5) return;
688 /* the vowelinstem() is necessary so we don't stem acronyms */
689 if (endsIn('i', 'n', 'g') && vowelInStem()) {
691 /* try adding an `e' to the stem and check against the dictionary */
692 word.setCharAt(j + 1, 'e');
693 word.setLength(j + 2);
696 DictEntry entry = wordInDict();
698 if (!entry.exception) /* if it's in the dictionary and not an exception */
702 /* adding on the `e' didn't work, so remove it */
704 k--; /* note that `ing' has also been removed */
706 if (lookup()) return;
708 /* if I can remove a doubled consonant and get a word, then do so */
711 word.setLength(k + 1);
712 if (lookup()) return;
713 word.unsafeWrite(word.charAt(k)); /* restore the doubled consonant */
715 /* the default is to leave the consonant doubled */
716 /* (e.g.,`fingerspelling' -> `fingerspell'). Unfortunately */
717 /* `bookselling' -> `booksell' and `mislabelling' -> `mislabell'). */
718 /* Without making the algorithm significantly more complicated, this */
719 /* is the best I can do */
726 * the word wasn't in the dictionary after removing the stem, and then
727 * checking with and without a final `e'. The default is to add an `e'
728 * unless the word ends in two consonants, so `microcoding' ->
729 * `microcode'. The two consonants restriction wouldn't normally be
730 * necessary, but is needed because we don't try to deal with prefixes and
731 * compounds, and most of the time it is correct (e.g., footstamping ->
732 * footstamp, not footstampe; however, decoupled -> decoupl). We can
733 * prevent almost all of the incorrect stems if we try to do some prefix
737 if ((j > 0) && isCons(j) && isCons(j - 1)) {
739 word.setLength(k + 1);
740 // nolookup() because we already did according to the comment
744 word.setLength(j + 1);
745 word.unsafeWrite('e');
747 // nolookup(); we already tried an 'e' ending
753 * this routine deals with -ity endings. It accepts -ability, -ibility, and
754 * -ality, even without checking the dictionary because they are so
755 * productive. The first two are mapped to -ble, and the -ity is remove for
758 private void ityEndings() {
761 if (endsIn('i', 't', 'y')) {
762 word.setLength(j + 1); /* try just removing -ity */
764 if (lookup()) return;
765 word.unsafeWrite('e'); /* try removing -ity and adding -e */
767 if (lookup()) return;
768 word.setCharAt(j + 1, 'i');
772 * the -ability and -ibility endings are highly productive, so just accept
775 if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'l')) {
776 word.setLength(j - 1);
777 word.append("le"); /* convert to -ble */
783 /* ditto for -ivity */
784 if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'v')) {
785 word.setLength(j + 1);
786 word.unsafeWrite('e'); /* convert to -ive */
791 /* ditto for -ality */
792 if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) {
793 word.setLength(j + 1);
800 * if the root isn't in the dictionary, and the variant *is* there, then
801 * use the variant. This allows `immunity'->`immune', but prevents
802 * `capacity'->`capac'. If neither the variant nor the root form are in
803 * the dictionary, then remove the ending as a default
806 if (lookup()) return;
808 /* the default is to remove -ity altogether */
809 word.setLength(j + 1);
811 // nolookup(), we already did it.
816 /* handle -ence and -ance */
817 private void nceEndings() {
821 if (endsIn('n', 'c', 'e')) {
822 word_char = word.charAt(j);
823 if (!((word_char == 'e') || (word_char == 'a'))) return;
825 word.unsafeWrite('e'); /* try converting -e/ance to -e (adherance/adhere) */
827 if (lookup()) return;
828 word.setLength(j); /*
829 * try removing -e/ance altogether
830 * (disappearance/disappear)
833 if (lookup()) return;
834 word.unsafeWrite(word_char); /* restore the original ending */
837 // nolookup() because we restored the original ending
843 private void nessEndings() {
844 if (endsIn('n', 'e', 's', 's')) { /*
845 * this is a very productive endings, so
848 word.setLength(j + 1);
850 if (word.charAt(j) == 'i') word.setCharAt(j, 'y');
857 private void ismEndings() {
858 if (endsIn('i', 's', 'm')) { /*
859 * this is a very productive ending, so just
862 word.setLength(j + 1);
869 /* this routine deals with -ment endings. */
870 private void mentEndings() {
873 if (endsIn('m', 'e', 'n', 't')) {
874 word.setLength(j + 1);
876 if (lookup()) return;
884 /* this routine deals with -ize endings. */
885 private void izeEndings() {
888 if (endsIn('i', 'z', 'e')) {
889 word.setLength(j + 1); /* try removing -ize entirely */
891 if (lookup()) return;
892 word.unsafeWrite('i');
894 if (doubleC(j)) { /* allow for a doubled consonant */
897 if (lookup()) return;
898 word.unsafeWrite(word.charAt(j - 1));
901 word.setLength(j + 1);
902 word.unsafeWrite('e'); /* try removing -ize and adding -e */
904 if (lookup()) return;
905 word.setLength(j + 1);
913 /* handle -ency and -ancy */
914 private void ncyEndings() {
915 if (endsIn('n', 'c', 'y')) {
916 if (!((word.charAt(j) == 'e') || (word.charAt(j) == 'a'))) return;
917 word.setCharAt(j + 2, 't'); /* try converting -ncy to -nt */
918 word.setLength(j + 3);
921 if (lookup()) return;
923 word.setCharAt(j + 2, 'c'); /* the default is to convert it to -nce */
924 word.unsafeWrite('e');
931 /* handle -able and -ible */
932 private void bleEndings() {
936 if (endsIn('b', 'l', 'e')) {
937 if (!((word.charAt(j) == 'a') || (word.charAt(j) == 'i'))) return;
938 word_char = word.charAt(j);
939 word.setLength(j); /* try just removing the ending */
941 if (lookup()) return;
942 if (doubleC(k)) { /* allow for a doubled consonant */
945 if (lookup()) return;
947 word.unsafeWrite(word.charAt(k - 1));
950 word.unsafeWrite('e'); /* try removing -a/ible and adding -e */
952 if (lookup()) return;
954 word.append("ate"); /* try removing -able and adding -ate */
955 /* (e.g., compensable/compensate) */
957 if (lookup()) return;
959 word.unsafeWrite(word_char); /* restore the original values */
968 * handle -ic endings. This is fairly straightforward, but this is also the
969 * only place we try *expanding* an ending, -ic -> -ical. This is to handle
970 * cases like `canonic' -> `canonical'
972 private void icEndings() {
973 if (endsIn('i', 'c')) {
974 word.setLength(j + 3);
975 word.append("al"); /* try converting -ic to -ical */
977 if (lookup()) return;
979 word.setCharAt(j + 1, 'y'); /* try converting -ic to -y */
980 word.setLength(j + 2);
982 if (lookup()) return;
984 word.setCharAt(j + 1, 'e'); /* try converting -ic to -e */
985 if (lookup()) return;
987 word.setLength(j + 1); /* try removing -ic altogether */
989 if (lookup()) return;
990 word.append("ic"); /* restore the original ending */
997 private static char[] ization = "ization".toCharArray();
998 private static char[] ition = "ition".toCharArray();
999 private static char[] ation = "ation".toCharArray();
1000 private static char[] ication = "ication".toCharArray();
1002 /* handle some derivational endings */
1004 * this routine deals with -ion, -ition, -ation, -ization, and -ication. The
1005 * -ization ending is always converted to -ize
1007 private void ionEndings() {
1009 if (!endsIn('i', 'o', 'n')) {
1013 if (endsIn(ization)) { /*
1014 * the -ize ending is very productive, so simply
1015 * accept it as the root
1017 word.setLength(j + 3);
1018 word.unsafeWrite('e');
1024 if (endsIn(ition)) {
1025 word.setLength(j + 1);
1026 word.unsafeWrite('e');
1029 * remove -ition and add `e', and check against the
1032 return; /* (e.g., definition->define, opposition->oppose) */
1034 /* restore original values */
1035 word.setLength(j + 1);
1036 word.append("ition");
1039 } else if (endsIn(ation)) {
1040 word.setLength(j + 3);
1041 word.unsafeWrite('e');
1043 if (lookup()) /* remove -ion and add `e', and check against the dictionary */
1044 return; /* (elmination -> eliminate) */
1046 word.setLength(j + 1);
1047 word.unsafeWrite('e'); /*
1048 * remove -ation and add `e', and check against the
1052 if (lookup()) return;
1054 word.setLength(j + 1);/*
1055 * just remove -ation (resignation->resign) and
1059 if (lookup()) return;
1061 /* restore original values */
1062 word.setLength(j + 1);
1063 word.append("ation");
1070 * test -ication after -ation is attempted (e.g., `complication->complicate'
1071 * rather than `complication->comply')
1074 if (endsIn(ication)) {
1075 word.setLength(j + 1);
1076 word.unsafeWrite('y');
1079 * remove -ication and add `y', and check against the
1082 return; /* (e.g., amplification -> amplify) */
1084 /* restore original values */
1085 word.setLength(j + 1);
1086 word.append("ication");
1091 // if (endsIn(ion)) {
1092 if (true) { // we checked for this earlier... just need to set "j"
1095 word.setLength(j + 1);
1096 word.unsafeWrite('e');
1098 if (lookup()) /* remove -ion and add `e', and check against the dictionary */
1101 word.setLength(j + 1);
1103 if (lookup()) /* remove -ion, and if it's found, treat that as the root */
1106 /* restore original values */
1107 word.setLength(j + 1);
1113 // nolookup(); all of the other paths restored original values
1118 * this routine deals with -er, -or, -ier, and -eer. The -izer ending is
1119 * always converted to -ize
1121 private void erAndOrEndings() {
1124 if (word.charAt(k) != 'r') return; // YCS
1126 char word_char; /* so we can remember if it was -er or -or */
1128 if (endsIn('i', 'z', 'e', 'r')) { /*
1129 * -ize is very productive, so accept it
1132 word.setLength(j + 4);
1138 if (endsIn('e', 'r') || endsIn('o', 'r')) {
1139 word_char = word.charAt(j + 1);
1143 if (lookup()) return;
1144 word.unsafeWrite(word.charAt(j - 1)); /* restore the doubled consonant */
1147 if (word.charAt(j) == 'i') { /* do we have a -ier ending? */
1148 word.setCharAt(j, 'y');
1149 word.setLength(j + 1);
1151 if (lookup()) /* yes, so check against the dictionary */
1153 word.setCharAt(j, 'i'); /* restore the endings */
1154 word.unsafeWrite('e');
1157 if (word.charAt(j) == 'e') { /* handle -eer */
1160 if (lookup()) return;
1161 word.unsafeWrite('e');
1164 word.setLength(j + 2); /* remove the -r ending */
1166 if (lookup()) return;
1167 word.setLength(j + 1); /* try removing -er/-or */
1169 if (lookup()) return;
1170 word.unsafeWrite('e'); /* try removing -or and adding -e */
1172 if (lookup()) return;
1173 word.setLength(j + 1);
1174 word.unsafeWrite(word_char);
1175 word.unsafeWrite('r'); /* restore the word to the way it was */
1183 * this routine deals with -ly endings. The -ally ending is always converted
1184 * to -al Sometimes this will temporarily leave us with a non-word (e.g.,
1185 * heuristically maps to heuristical), but then the -al is removed in the next
1188 private void lyEndings() {
1191 if (endsIn('l', 'y')) {
1193 word.setCharAt(j + 2, 'e'); /* try converting -ly to -le */
1195 if (lookup()) return;
1196 word.setCharAt(j + 2, 'y');
1198 word.setLength(j + 1); /* try just removing the -ly */
1201 if (lookup()) return;
1203 if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'l')) /*
1216 if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 'b')) { /*
1225 word.setCharAt(j + 2, 'e');
1230 if (word.charAt(j) == 'i') { /* e.g., militarily -> military */
1232 word.unsafeWrite('y');
1234 if (lookup()) return;
1240 word.setLength(j + 1); /* the default is to remove -ly */
1243 // nolookup()... we already tried removing the "ly" variant
1249 * this routine deals with -al endings. Some of the endings from the previous
1250 * routine are finished up here.
1252 private void alEndings() {
1255 if (word.length() < 4) return;
1256 if (endsIn('a', 'l')) {
1257 word.setLength(j + 1);
1259 if (lookup()) /* try just removing the -al */
1262 if (doubleC(j)) { /* allow for a doubled consonant */
1265 if (lookup()) return;
1266 word.unsafeWrite(word.charAt(j - 1));
1269 word.setLength(j + 1);
1270 word.unsafeWrite('e'); /* try removing the -al and adding -e */
1272 if (lookup()) return;
1274 word.setLength(j + 1);
1275 word.append("um"); /* try converting -al to -um */
1276 /* (e.g., optimal - > optimum ) */
1278 if (lookup()) return;
1280 word.setLength(j + 1);
1281 word.append("al"); /* restore the ending to the way it was */
1284 if ((j > 0) && (word.charAt(j - 1) == 'i') && (word.charAt(j) == 'c')) {
1285 word.setLength(j - 1); /* try removing -ical */
1287 if (lookup()) return;
1289 word.setLength(j - 1);
1290 word.unsafeWrite('y');/* try turning -ical to -y (e.g., bibliographical) */
1292 if (lookup()) return;
1294 word.setLength(j - 1);
1295 word.append("ic"); /* the default is to convert -ical to -ic */
1297 // nolookup() ... converting ical to ic means removing "al" which we
1304 if (word.charAt(j) == 'i') { /* sometimes -ial endings should be removed */
1305 word.setLength(j); /* (sometimes it gets turned into -y, but we */
1306 k = j - 1; /* aren't dealing with that case for now) */
1307 if (lookup()) return;
1318 * this routine deals with -ive endings. It normalizes some of the -ative
1319 * endings directly, and also maps some -ive endings to -ion.
1321 private void iveEndings() {
1324 if (endsIn('i', 'v', 'e')) {
1325 word.setLength(j + 1); /* try removing -ive entirely */
1327 if (lookup()) return;
1329 word.unsafeWrite('e'); /* try removing -ive and adding -e */
1331 if (lookup()) return;
1332 word.setLength(j + 1);
1334 if ((j > 0) && (word.charAt(j - 1) == 'a') && (word.charAt(j) == 't')) {
1335 word.setCharAt(j - 1, 'e'); /* try removing -ative and adding -e */
1336 word.setLength(j); /* (e.g., determinative -> determine) */
1338 if (lookup()) return;
1339 word.setLength(j - 1); /* try just removing -ative */
1340 if (lookup()) return;
1342 word.append("ative");
1346 /* try mapping -ive to -ion (e.g., injunctive/injunction) */
1347 word.setCharAt(j + 2, 'o');
1348 word.setCharAt(j + 3, 'n');
1349 if (lookup()) return;
1351 word.setCharAt(j + 2, 'v'); /* restore the original values */
1352 word.setCharAt(j + 3, 'e');
1361 String stem(String term) {
1362 boolean changed = stem(term.toCharArray(), term.length());
1363 if (!changed) return term;
1368 * Returns the result of the stem (assuming the word was changed) as a String.
1371 String s = getString();
1372 if (s != null) return s;
1373 return word.toString();
1376 CharSequence asCharSequence() {
1377 return result != null ? result : word;
1380 String getString() {
1385 return word.getArray();
1389 return word.length();
1394 private boolean matched() {
1396 * if (!lookups.contains(word.toString())) { throw new
1397 * RuntimeException("didn't look up "+word.toString()+" prev="+prevLookup);
1401 return matchedEntry != null;
1405 * Stems the text in the token. Returns true if changed.
1407 boolean stem(char[] term, int len) {
1412 if ((k <= 1) || (k >= MaxWordLen - 1)) {
1413 return false; // don't stem
1416 // first check the stemmer dictionaries, and avoid using the
1417 // cache if it's in there.
1418 DictEntry entry = dict_ht.get(term, 0, len);
1419 if (entry != null) {
1420 if (entry.root != null) {
1421 result = entry.root;
1428 * caching off is normally faster if (cache == null) initializeStemHash();
1430 * // now check the cache, before we copy chars to "word" if (cache != null)
1431 * { String val = cache.get(term, 0, len); if (val != null) { if (val !=
1432 * SAME) { result = val; return true; } return false; } }
1436 // allocate enough space so that an expansion is never needed
1437 word.reserve(len + 10);
1438 for (int i = 0; i < len; i++) {
1440 if (!isAlpha(ch)) return false; // don't stem
1441 // don't lowercase... it's a requirement that lowercase filter be
1442 // used before this stemmer.
1443 word.unsafeWrite(ch);
1446 matchedEntry = null;
1448 * lookups.clear(); lookups.add(word.toString());
1452 * This while loop will never be executed more than one time; it is here
1453 * only to allow the break statement to be used to escape as soon as a word
1457 // YCS: extra lookup()s were inserted so we don't need to
1458 // do an extra wordInDict() here.
1460 if (matched()) break;
1462 if (matched()) break;
1464 if (matched()) break;
1466 if (matched()) break;
1468 if (matched()) break;
1470 if (matched()) break;
1472 if (matched()) break;
1474 if (matched()) break;
1476 if (matched()) break;
1477 entry = wordInDict();
1479 if (matched()) break;
1481 if (matched()) break;
1483 if (matched()) break;
1485 if (matched()) break;
1487 if (matched()) break;
1489 if (matched()) break;
1491 if (matched()) break;
1498 * try for a direct mapping (allows for cases like `Italian'->`Italy' and
1499 * `Italians'->`Italy')
1501 entry = matchedEntry;
1502 if (entry != null) {
1503 result = entry.root; // may be null, which means that "word" is the stem
1507 * caching off is normally faster if (cache != null && cache.size() <
1508 * maxCacheSize) { char[] key = new char[len]; System.arraycopy(term, 0,
1509 * key, 0, len); if (result != null) { cache.put(key, result); } else {
1510 * cache.put(key, word.toString()); } }
1514 * if (entry == null) { if (!word.toString().equals(new String(term,0,len)))
1515 * { System.out.println("CASE:" + word.toString() + "," + new
1516 * String(term,0,len));
1521 // no entry matched means result is "word"