--- /dev/null
+/*
+
+Copyright (c) 2001, Dr Martin Porter
+Copyright (c) 2002, Richard Boulton
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+ * Redistributions of source code must retain the above copyright notice,
+ * this list of conditions and the following disclaimer.
+ * Redistributions in binary form must reproduce the above copyright
+ * notice, this list of conditions and the following disclaimer in the
+ * documentation and/or other materials provided with the distribution.
+ * Neither the name of the copyright holders nor the names of its contributors
+ * may be used to endorse or promote products derived from this software
+ * without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
+DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
+FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
+DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
+SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
+CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
+OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
+OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+
+ */
+
+
+package org.tartarus.snowball;
+
+import java.lang.reflect.InvocationTargetException;
+
+import org.apache.lucene.util.ArrayUtil;
+import org.apache.lucene.util.RamUsageEstimator;
+
+/**
+ * This is the rev 502 of the Snowball SVN trunk,
+ * but modified:
+ * made abstract and introduced abstract method stem to avoid expensive reflection in filter class.
+ * refactored StringBuffers to StringBuilder
+ * uses char[] as buffer instead of StringBuffer/StringBuilder
+ * eq_s,eq_s_b,insert,replace_s take CharSequence like eq_v and eq_v_b
+ * reflection calls (Lovins, etc) use EMPTY_ARGS/EMPTY_PARAMS
+ */
+public abstract class SnowballProgram {
+ private static final Object[] EMPTY_ARGS = new Object[0];
+
+ protected SnowballProgram()
+ {
+ current = new char[8];
+ setCurrent("");
+ }
+
+ public abstract boolean stem();
+
+ /**
+ * Set the current string.
+ */
+ public void setCurrent(String value)
+ {
+ current = value.toCharArray();
+ cursor = 0;
+ limit = value.length();
+ limit_backward = 0;
+ bra = cursor;
+ ket = limit;
+ }
+
+ /**
+ * Get the current string.
+ */
+ public String getCurrent()
+ {
+ return new String(current, 0, limit);
+ }
+
+ /**
+ * Set the current string.
+ * @param text character array containing input
+ * @param length valid length of text.
+ */
+ public void setCurrent(char text[], int length) {
+ current = text;
+ cursor = 0;
+ limit = length;
+ limit_backward = 0;
+ bra = cursor;
+ ket = limit;
+ }
+
+ /**
+ * Get the current buffer containing the stem.
+ * <p>
+ * NOTE: this may be a reference to a different character array than the
+ * one originally provided with setCurrent, in the exceptional case that
+ * stemming produced a longer intermediate or result string.
+ * </p>
+ * <p>
+ * It is necessary to use {@link #getCurrentBufferLength()} to determine
+ * the valid length of the returned buffer. For example, many words are
+ * stemmed simply by subtracting from the length to remove suffixes.
+ * </p>
+ * @see #getCurrentBufferLength()
+ */
+ public char[] getCurrentBuffer() {
+ return current;
+ }
+
+ /**
+ * Get the valid length of the character array in
+ * {@link #getCurrentBuffer()}.
+ * @return valid length of the array.
+ */
+ public int getCurrentBufferLength() {
+ return limit;
+ }
+
+ // current string
+ private char current[];
+
+ protected int cursor;
+ protected int limit;
+ protected int limit_backward;
+ protected int bra;
+ protected int ket;
+
+ protected void copy_from(SnowballProgram other)
+ {
+ current = other.current;
+ cursor = other.cursor;
+ limit = other.limit;
+ limit_backward = other.limit_backward;
+ bra = other.bra;
+ ket = other.ket;
+ }
+
+ protected boolean in_grouping(char [] s, int min, int max)
+ {
+ if (cursor >= limit) return false;
+ char ch = current[cursor];
+ if (ch > max || ch < min) return false;
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
+ cursor++;
+ return true;
+ }
+
+ protected boolean in_grouping_b(char [] s, int min, int max)
+ {
+ if (cursor <= limit_backward) return false;
+ char ch = current[cursor - 1];
+ if (ch > max || ch < min) return false;
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
+ cursor--;
+ return true;
+ }
+
+ protected boolean out_grouping(char [] s, int min, int max)
+ {
+ if (cursor >= limit) return false;
+ char ch = current[cursor];
+ if (ch > max || ch < min) {
+ cursor++;
+ return true;
+ }
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
+ cursor ++;
+ return true;
+ }
+ return false;
+ }
+
+ protected boolean out_grouping_b(char [] s, int min, int max)
+ {
+ if (cursor <= limit_backward) return false;
+ char ch = current[cursor - 1];
+ if (ch > max || ch < min) {
+ cursor--;
+ return true;
+ }
+ ch -= min;
+ if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
+ cursor--;
+ return true;
+ }
+ return false;
+ }
+
+ protected boolean in_range(int min, int max)
+ {
+ if (cursor >= limit) return false;
+ char ch = current[cursor];
+ if (ch > max || ch < min) return false;
+ cursor++;
+ return true;
+ }
+
+ protected boolean in_range_b(int min, int max)
+ {
+ if (cursor <= limit_backward) return false;
+ char ch = current[cursor - 1];
+ if (ch > max || ch < min) return false;
+ cursor--;
+ return true;
+ }
+
+ protected boolean out_range(int min, int max)
+ {
+ if (cursor >= limit) return false;
+ char ch = current[cursor];
+ if (!(ch > max || ch < min)) return false;
+ cursor++;
+ return true;
+ }
+
+ protected boolean out_range_b(int min, int max)
+ {
+ if (cursor <= limit_backward) return false;
+ char ch = current[cursor - 1];
+ if(!(ch > max || ch < min)) return false;
+ cursor--;
+ return true;
+ }
+
+ protected boolean eq_s(int s_size, CharSequence s)
+ {
+ if (limit - cursor < s_size) return false;
+ int i;
+ for (i = 0; i != s_size; i++) {
+ if (current[cursor + i] != s.charAt(i)) return false;
+ }
+ cursor += s_size;
+ return true;
+ }
+
+ /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
+ @Deprecated
+ protected boolean eq_s(int s_size, String s)
+ {
+ return eq_s(s_size, (CharSequence)s);
+ }
+
+ protected boolean eq_s_b(int s_size, CharSequence s)
+ {
+ if (cursor - limit_backward < s_size) return false;
+ int i;
+ for (i = 0; i != s_size; i++) {
+ if (current[cursor - s_size + i] != s.charAt(i)) return false;
+ }
+ cursor -= s_size;
+ return true;
+ }
+
+ /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
+ @Deprecated
+ protected boolean eq_s_b(int s_size, String s)
+ {
+ return eq_s_b(s_size, (CharSequence)s);
+ }
+
+ protected boolean eq_v(CharSequence s)
+ {
+ return eq_s(s.length(), s);
+ }
+
+ /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
+ @Deprecated
+ protected boolean eq_v(StringBuilder s)
+ {
+ return eq_s(s.length(), (CharSequence)s);
+ }
+
+ protected boolean eq_v_b(CharSequence s)
+ { return eq_s_b(s.length(), s);
+ }
+
+ /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
+ @Deprecated
+ protected boolean eq_v_b(StringBuilder s)
+ { return eq_s_b(s.length(), (CharSequence)s);
+ }
+
+ protected int find_among(Among v[], int v_size)
+ {
+ int i = 0;
+ int j = v_size;
+
+ int c = cursor;
+ int l = limit;
+
+ int common_i = 0;
+ int common_j = 0;
+
+ boolean first_key_inspected = false;
+
+ while(true) {
+ int k = i + ((j - i) >> 1);
+ int diff = 0;
+ int common = common_i < common_j ? common_i : common_j; // smaller
+ Among w = v[k];
+ int i2;
+ for (i2 = common; i2 < w.s_size; i2++) {
+ if (c + common == l) {
+ diff = -1;
+ break;
+ }
+ diff = current[c + common] - w.s[i2];
+ if (diff != 0) break;
+ common++;
+ }
+ if (diff < 0) {
+ j = k;
+ common_j = common;
+ } else {
+ i = k;
+ common_i = common;
+ }
+ if (j - i <= 1) {
+ if (i > 0) break; // v->s has been inspected
+ if (j == i) break; // only one item in v
+
+ // - but now we need to go round once more to get
+ // v->s inspected. This looks messy, but is actually
+ // the optimal approach.
+
+ if (first_key_inspected) break;
+ first_key_inspected = true;
+ }
+ }
+ while(true) {
+ Among w = v[i];
+ if (common_i >= w.s_size) {
+ cursor = c + w.s_size;
+ if (w.method == null) return w.result;
+ boolean res;
+ try {
+ Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
+ res = resobj.toString().equals("true");
+ } catch (InvocationTargetException e) {
+ res = false;
+ // FIXME - debug message
+ } catch (IllegalAccessException e) {
+ res = false;
+ // FIXME - debug message
+ }
+ cursor = c + w.s_size;
+ if (res) return w.result;
+ }
+ i = w.substring_i;
+ if (i < 0) return 0;
+ }
+ }
+
+ // find_among_b is for backwards processing. Same comments apply
+ protected int find_among_b(Among v[], int v_size)
+ {
+ int i = 0;
+ int j = v_size;
+
+ int c = cursor;
+ int lb = limit_backward;
+
+ int common_i = 0;
+ int common_j = 0;
+
+ boolean first_key_inspected = false;
+
+ while(true) {
+ int k = i + ((j - i) >> 1);
+ int diff = 0;
+ int common = common_i < common_j ? common_i : common_j;
+ Among w = v[k];
+ int i2;
+ for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
+ if (c - common == lb) {
+ diff = -1;
+ break;
+ }
+ diff = current[c - 1 - common] - w.s[i2];
+ if (diff != 0) break;
+ common++;
+ }
+ if (diff < 0) {
+ j = k;
+ common_j = common;
+ } else {
+ i = k;
+ common_i = common;
+ }
+ if (j - i <= 1) {
+ if (i > 0) break;
+ if (j == i) break;
+ if (first_key_inspected) break;
+ first_key_inspected = true;
+ }
+ }
+ while(true) {
+ Among w = v[i];
+ if (common_i >= w.s_size) {
+ cursor = c - w.s_size;
+ if (w.method == null) return w.result;
+
+ boolean res;
+ try {
+ Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
+ res = resobj.toString().equals("true");
+ } catch (InvocationTargetException e) {
+ res = false;
+ // FIXME - debug message
+ } catch (IllegalAccessException e) {
+ res = false;
+ // FIXME - debug message
+ }
+ cursor = c - w.s_size;
+ if (res) return w.result;
+ }
+ i = w.substring_i;
+ if (i < 0) return 0;
+ }
+ }
+
+ /* to replace chars between c_bra and c_ket in current by the
+ * chars in s.
+ */
+ protected int replace_s(int c_bra, int c_ket, CharSequence s)
+ {
+ final int adjustment = s.length() - (c_ket - c_bra);
+ final int newLength = limit + adjustment;
+ //resize if necessary
+ if (newLength > current.length) {
+ char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
+ System.arraycopy(current, 0, newBuffer, 0, limit);
+ current = newBuffer;
+ }
+ // if the substring being replaced is longer or shorter than the
+ // replacement, need to shift things around
+ if (adjustment != 0 && c_ket < limit) {
+ System.arraycopy(current, c_ket, current, c_bra + s.length(),
+ limit - c_ket);
+ }
+ // insert the replacement text
+ // Note, faster is s.getChars(0, s.length(), current, c_bra);
+ // but would have to duplicate this method for both String and StringBuilder
+ for (int i = 0; i < s.length(); i++)
+ current[c_bra + i] = s.charAt(i);
+
+ limit += adjustment;
+ if (cursor >= c_ket) cursor += adjustment;
+ else if (cursor > c_bra) cursor = c_bra;
+ return adjustment;
+ }
+
+ /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
+ @Deprecated
+ protected int replace_s(int c_bra, int c_ket, String s) {
+ return replace_s(c_bra, c_ket, (CharSequence)s);
+ }
+
+ protected void slice_check()
+ {
+ if (bra < 0 ||
+ bra > ket ||
+ ket > limit)
+ {
+ System.err.println("faulty slice operation");
+ // FIXME: report error somehow.
+ /*
+ fprintf(stderr, "faulty slice operation:\n");
+ debug(z, -1, 0);
+ exit(1);
+ */
+ }
+ }
+
+ protected void slice_from(CharSequence s)
+ {
+ slice_check();
+ replace_s(bra, ket, s);
+ }
+
+ /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
+ @Deprecated
+ protected void slice_from(String s)
+ {
+ slice_from((CharSequence)s);
+ }
+
+ /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
+ @Deprecated
+ protected void slice_from(StringBuilder s)
+ {
+ slice_from((CharSequence)s);
+ }
+
+ protected void slice_del()
+ {
+ slice_from((CharSequence)"");
+ }
+
+ protected void insert(int c_bra, int c_ket, CharSequence s)
+ {
+ int adjustment = replace_s(c_bra, c_ket, s);
+ if (c_bra <= bra) bra += adjustment;
+ if (c_bra <= ket) ket += adjustment;
+ }
+
+ /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
+ @Deprecated
+ protected void insert(int c_bra, int c_ket, String s)
+ {
+ insert(c_bra, c_ket, (CharSequence)s);
+ }
+
+ /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
+ @Deprecated
+ protected void insert(int c_bra, int c_ket, StringBuilder s)
+ {
+ insert(c_bra, c_ket, (CharSequence)s);
+ }
+
+ /* Copy the slice into the supplied StringBuffer */
+ protected StringBuilder slice_to(StringBuilder s)
+ {
+ slice_check();
+ int len = ket - bra;
+ s.setLength(0);
+ s.append(current, bra, len);
+ return s;
+ }
+
+ protected StringBuilder assign_to(StringBuilder s)
+ {
+ s.setLength(0);
+ s.append(current, 0, limit);
+ return s;
+ }
+
+/*
+extern void debug(struct SN_env * z, int number, int line_count)
+{ int i;
+ int limit = SIZE(z->p);
+ //if (number >= 0) printf("%3d (line %4d): '", number, line_count);
+ if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
+ for (i = 0; i <= limit; i++)
+ { if (z->lb == i) printf("{");
+ if (z->bra == i) printf("[");
+ if (z->c == i) printf("|");
+ if (z->ket == i) printf("]");
+ if (z->l == i) printf("}");
+ if (i < limit)
+ { int ch = z->p[i];
+ if (ch == 0) ch = '#';
+ printf("%c", ch);
+ }
+ }
+ printf("'\n");
+}
+*/
+
+};
+