X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/PorterStemmer.java diff --git a/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/PorterStemmer.java b/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/PorterStemmer.java deleted file mode 100644 index 9d460d2..0000000 --- a/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/PorterStemmer.java +++ /dev/null @@ -1,547 +0,0 @@ -package org.apache.lucene.analysis; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -/* - - Porter stemmer in Java. The original paper is in - - Porter, 1980, An algorithm for suffix stripping, Program, Vol. 14, - no. 3, pp 130-137, - - See also http://www.tartarus.org/~martin/PorterStemmer/index.html - - Bug 1 (reported by Gonzalo Parra 16/10/99) fixed as marked below. - Tthe words 'aed', 'eed', 'oed' leave k at 'a' for step 3, and b[k-1] - is then out outside the bounds of b. - - Similarly, - - Bug 2 (reported by Steve Dyrdahl 22/2/00) fixed as marked below. - 'ion' by itself leaves j = -1 in the test for 'ion' in step 5, and - b[j] is then outside the bounds of b. - - Release 3. - - [ This version is derived from Release 3, modified by Brian Goetz to - optimize for fewer object creations. ] - -*/ - - -import java.io.IOException; -import java.io.InputStream; -import java.io.FileInputStream; - -import static org.apache.lucene.util.RamUsageEstimator.NUM_BYTES_CHAR; -import org.apache.lucene.util.ArrayUtil; - -/** - * - * Stemmer, implementing the Porter Stemming Algorithm - * - * The Stemmer class transforms a word into its root form. The input - * word can be provided a character at time (by calling add()), or at once - * by calling one of the various stem(something) methods. - */ - -class PorterStemmer -{ - private char[] b; - private int i, /* offset into b */ - j, k, k0; - private boolean dirty = false; - private static final int INITIAL_SIZE = 50; - - public PorterStemmer() { - b = new char[INITIAL_SIZE]; - i = 0; - } - - /** - * reset() resets the stemmer so it can stem another word. If you invoke - * the stemmer by calling add(char) and then stem(), you must call reset() - * before starting another word. - */ - public void reset() { i = 0; dirty = false; } - - /** - * Add a character to the word being stemmed. When you are finished - * adding characters, you can call stem(void) to process the word. - */ - public void add(char ch) { - if (b.length <= i) { - b = ArrayUtil.grow(b, i+1); - } - b[i++] = ch; - } - - /** - * After a word has been stemmed, it can be retrieved by toString(), - * or a reference to the internal buffer can be retrieved by getResultBuffer - * and getResultLength (which is generally more efficient.) - */ - @Override - public String toString() { return new String(b,0,i); } - - /** - * Returns the length of the word resulting from the stemming process. - */ - public int getResultLength() { return i; } - - /** - * Returns a reference to a character buffer containing the results of - * the stemming process. You also need to consult getResultLength() - * to determine the length of the result. - */ - public char[] getResultBuffer() { return b; } - - /* cons(i) is true <=> b[i] is a consonant. */ - - private final boolean cons(int i) { - switch (b[i]) { - case 'a': case 'e': case 'i': case 'o': case 'u': - return false; - case 'y': - return (i==k0) ? true : !cons(i-1); - default: - return true; - } - } - - /* m() measures the number of consonant sequences between k0 and j. if c is - a consonant sequence and v a vowel sequence, and <..> indicates arbitrary - presence, - - gives 0 - vc gives 1 - vcvc gives 2 - vcvcvc gives 3 - .... - */ - - private final int m() { - int n = 0; - int i = k0; - while(true) { - if (i > j) - return n; - if (! cons(i)) - break; - i++; - } - i++; - while(true) { - while(true) { - if (i > j) - return n; - if (cons(i)) - break; - i++; - } - i++; - n++; - while(true) { - if (i > j) - return n; - if (! cons(i)) - break; - i++; - } - i++; - } - } - - /* vowelinstem() is true <=> k0,...j contains a vowel */ - - private final boolean vowelinstem() { - int i; - for (i = k0; i <= j; i++) - if (! cons(i)) - return true; - return false; - } - - /* doublec(j) is true <=> j,(j-1) contain a double consonant. */ - - private final boolean doublec(int j) { - if (j < k0+1) - return false; - if (b[j] != b[j-1]) - return false; - return cons(j); - } - - /* cvc(i) is true <=> i-2,i-1,i has the form consonant - vowel - consonant - and also if the second c is not w,x or y. this is used when trying to - restore an e at the end of a short word. e.g. - - cav(e), lov(e), hop(e), crim(e), but - snow, box, tray. - - */ - - private final boolean cvc(int i) { - if (i < k0+2 || !cons(i) || cons(i-1) || !cons(i-2)) - return false; - else { - int ch = b[i]; - if (ch == 'w' || ch == 'x' || ch == 'y') return false; - } - return true; - } - - private final boolean ends(String s) { - int l = s.length(); - int o = k-l+1; - if (o < k0) - return false; - for (int i = 0; i < l; i++) - if (b[o+i] != s.charAt(i)) - return false; - j = k-l; - return true; - } - - /* setto(s) sets (j+1),...k to the characters in the string s, readjusting - k. */ - - void setto(String s) { - int l = s.length(); - int o = j+1; - for (int i = 0; i < l; i++) - b[o+i] = s.charAt(i); - k = j+l; - dirty = true; - } - - /* r(s) is used further down. */ - - void r(String s) { if (m() > 0) setto(s); } - - /* step1() gets rid of plurals and -ed or -ing. e.g. - - caresses -> caress - ponies -> poni - ties -> ti - caress -> caress - cats -> cat - - feed -> feed - agreed -> agree - disabled -> disable - - matting -> mat - mating -> mate - meeting -> meet - milling -> mill - messing -> mess - - meetings -> meet - - */ - - private final void step1() { - if (b[k] == 's') { - if (ends("sses")) k -= 2; - else if (ends("ies")) setto("i"); - else if (b[k-1] != 's') k--; - } - if (ends("eed")) { - if (m() > 0) - k--; - } - else if ((ends("ed") || ends("ing")) && vowelinstem()) { - k = j; - if (ends("at")) setto("ate"); - else if (ends("bl")) setto("ble"); - else if (ends("iz")) setto("ize"); - else if (doublec(k)) { - int ch = b[k--]; - if (ch == 'l' || ch == 's' || ch == 'z') - k++; - } - else if (m() == 1 && cvc(k)) - setto("e"); - } - } - - /* step2() turns terminal y to i when there is another vowel in the stem. */ - - private final void step2() { - if (ends("y") && vowelinstem()) { - b[k] = 'i'; - dirty = true; - } - } - - /* step3() maps double suffices to single ones. so -ization ( = -ize plus - -ation) maps to -ize etc. note that the string before the suffix must give - m() > 0. */ - - private final void step3() { - if (k == k0) return; /* For Bug 1 */ - switch (b[k-1]) { - case 'a': - if (ends("ational")) { r("ate"); break; } - if (ends("tional")) { r("tion"); break; } - break; - case 'c': - if (ends("enci")) { r("ence"); break; } - if (ends("anci")) { r("ance"); break; } - break; - case 'e': - if (ends("izer")) { r("ize"); break; } - break; - case 'l': - if (ends("bli")) { r("ble"); break; } - if (ends("alli")) { r("al"); break; } - if (ends("entli")) { r("ent"); break; } - if (ends("eli")) { r("e"); break; } - if (ends("ousli")) { r("ous"); break; } - break; - case 'o': - if (ends("ization")) { r("ize"); break; } - if (ends("ation")) { r("ate"); break; } - if (ends("ator")) { r("ate"); break; } - break; - case 's': - if (ends("alism")) { r("al"); break; } - if (ends("iveness")) { r("ive"); break; } - if (ends("fulness")) { r("ful"); break; } - if (ends("ousness")) { r("ous"); break; } - break; - case 't': - if (ends("aliti")) { r("al"); break; } - if (ends("iviti")) { r("ive"); break; } - if (ends("biliti")) { r("ble"); break; } - break; - case 'g': - if (ends("logi")) { r("log"); break; } - } - } - - /* step4() deals with -ic-, -full, -ness etc. similar strategy to step3. */ - - private final void step4() { - switch (b[k]) { - case 'e': - if (ends("icate")) { r("ic"); break; } - if (ends("ative")) { r(""); break; } - if (ends("alize")) { r("al"); break; } - break; - case 'i': - if (ends("iciti")) { r("ic"); break; } - break; - case 'l': - if (ends("ical")) { r("ic"); break; } - if (ends("ful")) { r(""); break; } - break; - case 's': - if (ends("ness")) { r(""); break; } - break; - } - } - - /* step5() takes off -ant, -ence etc., in context vcvc. */ - - private final void step5() { - if (k == k0) return; /* for Bug 1 */ - switch (b[k-1]) { - case 'a': - if (ends("al")) break; - return; - case 'c': - if (ends("ance")) break; - if (ends("ence")) break; - return; - case 'e': - if (ends("er")) break; return; - case 'i': - if (ends("ic")) break; return; - case 'l': - if (ends("able")) break; - if (ends("ible")) break; return; - case 'n': - if (ends("ant")) break; - if (ends("ement")) break; - if (ends("ment")) break; - /* element etc. not stripped before the m */ - if (ends("ent")) break; - return; - case 'o': - if (ends("ion") && j >= 0 && (b[j] == 's' || b[j] == 't')) break; - /* j >= 0 fixes Bug 2 */ - if (ends("ou")) break; - return; - /* takes care of -ous */ - case 's': - if (ends("ism")) break; - return; - case 't': - if (ends("ate")) break; - if (ends("iti")) break; - return; - case 'u': - if (ends("ous")) break; - return; - case 'v': - if (ends("ive")) break; - return; - case 'z': - if (ends("ize")) break; - return; - default: - return; - } - if (m() > 1) - k = j; - } - - /* step6() removes a final -e if m() > 1. */ - - private final void step6() { - j = k; - if (b[k] == 'e') { - int a = m(); - if (a > 1 || a == 1 && !cvc(k-1)) - k--; - } - if (b[k] == 'l' && doublec(k) && m() > 1) - k--; - } - - - /** - * Stem a word provided as a String. Returns the result as a String. - */ - public String stem(String s) { - if (stem(s.toCharArray(), s.length())) - return toString(); - else - return s; - } - - /** Stem a word contained in a char[]. Returns true if the stemming process - * resulted in a word different from the input. You can retrieve the - * result with getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem(char[] word) { - return stem(word, word.length); - } - - /** Stem a word contained in a portion of a char[] array. Returns - * true if the stemming process resulted in a word different from - * the input. You can retrieve the result with - * getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem(char[] wordBuffer, int offset, int wordLen) { - reset(); - if (b.length < wordLen) { - b = new char[ArrayUtil.oversize(wordLen, NUM_BYTES_CHAR)]; - } - System.arraycopy(wordBuffer, offset, b, 0, wordLen); - i = wordLen; - return stem(0); - } - - /** Stem a word contained in a leading portion of a char[] array. - * Returns true if the stemming process resulted in a word different - * from the input. You can retrieve the result with - * getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem(char[] word, int wordLen) { - return stem(word, 0, wordLen); - } - - /** Stem the word placed into the Stemmer buffer through calls to add(). - * Returns true if the stemming process resulted in a word different - * from the input. You can retrieve the result with - * getResultLength()/getResultBuffer() or toString(). - */ - public boolean stem() { - return stem(0); - } - - public boolean stem(int i0) { - k = i - 1; - k0 = i0; - if (k > k0+1) { - step1(); step2(); step3(); step4(); step5(); step6(); - } - // Also, a word is considered dirty if we lopped off letters - // Thanks to Ifigenia Vairelles for pointing this out. - if (i != k+1) - dirty = true; - i = k+1; - return dirty; - } - - /** Test program for demonstrating the Stemmer. It reads a file and - * stems each word, writing the result to standard out. - * Usage: Stemmer file-name - */ - public static void main(String[] args) { - PorterStemmer s = new PorterStemmer(); - - for (int i = 0; i < args.length; i++) { - try { - InputStream in = new FileInputStream(args[i]); - byte[] buffer = new byte[1024]; - int bufferLen, offset, ch; - - bufferLen = in.read(buffer); - offset = 0; - s.reset(); - - while(true) { - if (offset < bufferLen) - ch = buffer[offset++]; - else { - bufferLen = in.read(buffer); - offset = 0; - if (bufferLen < 0) - ch = -1; - else - ch = buffer[offset++]; - } - - if (Character.isLetter((char) ch)) { - s.add(Character.toLowerCase((char) ch)); - } - else { - s.stem(); - System.out.print(s.toString()); - s.reset(); - if (ch < 0) - break; - else { - System.out.print((char) ch); - } - } - } - - in.close(); - } - catch (IOException e) { - System.out.println("error reading " + args[i]); - } - } - } -} -