3 Copyright (c) 2001, Dr Martin Porter
4 Copyright (c) 2002, Richard Boulton
7 Redistribution and use in source and binary forms, with or without
8 modification, are permitted provided that the following conditions are met:
10 * Redistributions of source code must retain the above copyright notice,
11 * this list of conditions and the following disclaimer.
12 * Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 * Neither the name of the copyright holders nor the names of its contributors
16 * may be used to endorse or promote products derived from this software
17 * without specific prior written permission.
19 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
20 AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
21 IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE
22 DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE LIABLE
23 FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
24 DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR
25 SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER
26 CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY,
27 OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE
28 OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
33 package org.tartarus.snowball;
35 import java.lang.reflect.InvocationTargetException;
37 import org.apache.lucene.util.ArrayUtil;
38 import org.apache.lucene.util.RamUsageEstimator;
41 * This is the rev 502 of the Snowball SVN trunk,
43 * made abstract and introduced abstract method stem to avoid expensive reflection in filter class.
44 * refactored StringBuffers to StringBuilder
45 * uses char[] as buffer instead of StringBuffer/StringBuilder
46 * eq_s,eq_s_b,insert,replace_s take CharSequence like eq_v and eq_v_b
47 * reflection calls (Lovins, etc) use EMPTY_ARGS/EMPTY_PARAMS
49 public abstract class SnowballProgram {
// Shared zero-length argument array handed to Method.invoke() by find_among
// and find_among_b, so those reflective calls do not allocate one each time.
50 private static final Object[] EMPTY_ARGS = new Object[0];
// Starts the program with a small 8-char working buffer; setCurrent(String)
// later swaps in a buffer sized to its input.
52 protected SnowballProgram()
54 current = new char[8];
// Entry point implemented by each concrete stemmer. It is abstract here so
// callers can invoke it directly instead of through reflection (see the
// change notes in the class comment).
// NOTE(review): the meaning of the boolean result is not shown in this class.
58 public abstract boolean stem();
61 * Set the current string.
// Loads a new word: value is copied into a fresh char[] that becomes the
// working buffer, and limit is set to the word's length.
// NOTE(review): any reset of cursor and the other position fields is not
// visible in this view.
63 public void setCurrent(String value)
65 current = value.toCharArray();
67 limit = value.length();
74 * Get the current string.
// Copies the first `limit` chars of the working buffer into a new String.
76 public String getCurrent()
78 return new String(current, 0, limit);
82 * Set the current string.
83 * @param text character array containing input
84 * @param length valid length of text.
// NOTE(review): the body is not visible in this view. The getCurrentBuffer()
// documentation implies the supplied array is used as the working buffer
// rather than copied -- confirm against the full file.
86 public void setCurrent(char text[], int length) {
96 * Get the current buffer containing the stem.
98 * NOTE: this may be a reference to a different character array than the
99 * one originally provided with setCurrent, in the exceptional case that
100 * stemming produced a longer intermediate or result string.
103 * It is necessary to use {@link #getCurrentBufferLength()} to determine
104 * the valid length of the returned buffer. For example, many words are
105 * stemmed simply by subtracting from the length to remove suffixes.
107 * @see #getCurrentBufferLength()
// NOTE(review): body not visible in this view; presumably returns the
// `current` working buffer itself (no copy) -- confirm.
109 public char[] getCurrentBuffer() {
114 * Get the valid length of the character array in
115 * {@link #getCurrentBuffer()}.
116 * @return valid length of the array.
// NOTE(review): body not visible in this view; presumably returns `limit`,
// the count getCurrent() uses as the valid length -- confirm.
118 public int getCurrentBufferLength() {
// Working character buffer; getCurrent() treats only the first `limit` chars as valid.
123 private char current[];
// Current position: forward tests read current[cursor], backward tests read current[cursor - 1].
125 protected int cursor;
// Lower bound for backward processing: the *_b tests fail once cursor <= limit_backward.
127 protected int limit_backward;
// Takes over the state of another program. The character buffer is shared by
// reference (not cloned), so both programs refer to the same array afterwards.
// NOTE(review): only some of the field copies are visible in this view.
131 protected void copy_from(SnowballProgram other)
133 current = other.current;
134 cursor = other.cursor;
136 limit_backward = other.limit_backward;
// Forward test of the char at the cursor against the grouping bitmap s.
// Fails when the cursor has reached the limit, when the char lies outside
// [min, max], or when the char's bit in s is clear.
// NOTE(review): the statements between the range check and the bitmap lookup,
// and those that follow a successful test, are not visible in this view.
141 protected boolean in_grouping(char [] s, int min, int max)
143 if (cursor >= limit) return false;
144 char ch = current[cursor];
145 if (ch > max || ch < min) return false;
// s packs eight flags per element: element index ch >> 3, bit index ch & 7.
147 if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
// Backward version of in_grouping: examines the char just before the cursor
// and fails once the cursor has reached limit_backward.
// NOTE(review): the statements between the range check and the bitmap lookup,
// and those that follow a successful test, are not visible in this view.
152 protected boolean in_grouping_b(char [] s, int min, int max)
154 if (cursor <= limit_backward) return false;
155 char ch = current[cursor - 1];
156 if (ch > max || ch < min) return false;
// s packs eight flags per element: element index ch >> 3, bit index ch & 7.
158 if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) return false;
// Forward test of the char at the cursor against the grouping bitmap s,
// branching on the two ways a char can be absent from the grouping.
// Fails outright when the cursor has reached the limit.
// NOTE(review): the bodies of both branches are not visible in this view.
163 protected boolean out_grouping(char [] s, int min, int max)
165 if (cursor >= limit) return false;
166 char ch = current[cursor];
// Branch 1: the char lies outside [min, max].
167 if (ch > max || ch < min) {
// Branch 2: the char's bit (element ch >> 3, bit ch & 7) is clear in s.
172 if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
// Backward version of out_grouping: examines the char just before the cursor
// and fails outright once the cursor has reached limit_backward.
// NOTE(review): the bodies of both branches are not visible in this view.
179 protected boolean out_grouping_b(char [] s, int min, int max)
181 if (cursor <= limit_backward) return false;
182 char ch = current[cursor - 1];
// Branch 1: the char lies outside [min, max].
183 if (ch > max || ch < min) {
// Branch 2: the char's bit (element ch >> 3, bit ch & 7) is clear in s.
188 if ((s[ch >> 3] & (0X1 << (ch & 0X7))) == 0) {
// Forward range test: fails when the cursor has reached the limit or the char
// at the cursor lies outside [min, max].
// NOTE(review): the statements for the success path are not visible in this view.
195 protected boolean in_range(int min, int max)
197 if (cursor >= limit) return false;
198 char ch = current[cursor];
199 if (ch > max || ch < min) return false;
// Backward range test: fails when the cursor has reached limit_backward or the
// char just before the cursor lies outside [min, max].
// NOTE(review): the statements for the success path are not visible in this view.
204 protected boolean in_range_b(int min, int max)
206 if (cursor <= limit_backward) return false;
207 char ch = current[cursor - 1];
208 if (ch > max || ch < min) return false;
// Forward complement of in_range: fails when the cursor has reached the limit
// or the char at the cursor lies inside [min, max].
// NOTE(review): the statements for the success path are not visible in this view.
213 protected boolean out_range(int min, int max)
215 if (cursor >= limit) return false;
216 char ch = current[cursor];
217 if (!(ch > max || ch < min)) return false;
// Backward complement of in_range_b: fails when the cursor has reached
// limit_backward or the char just before the cursor lies inside [min, max].
// NOTE(review): the statements for the success path are not visible in this view.
222 protected boolean out_range_b(int min, int max)
224 if (cursor <= limit_backward) return false;
225 char ch = current[cursor - 1];
226 if(!(ch > max || ch < min)) return false;
// Forward literal match: compares the s_size chars starting at the cursor with
// the first s_size chars of s, failing on the first mismatch.
// NOTE(review): the statements for the success path are not visible in this view.
231 protected boolean eq_s(int s_size, CharSequence s)
// Not enough input left before the limit to hold s_size chars.
233 if (limit - cursor < s_size) return false;
235 for (i = 0; i != s_size; i++) {
236 if (current[cursor + i] != s.charAt(i)) return false;
242 /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
// The explicit cast routes the call to the CharSequence overload; without it
// the call would resolve back to this String overload.
244 protected boolean eq_s(int s_size, String s)
246 return eq_s(s_size, (CharSequence)s);
// Backward literal match: compares the s_size chars that end just before the
// cursor with the first s_size chars of s, failing on the first mismatch.
// NOTE(review): the statements for the success path are not visible in this view.
249 protected boolean eq_s_b(int s_size, CharSequence s)
// Not enough input between limit_backward and the cursor to hold s_size chars.
251 if (cursor - limit_backward < s_size) return false;
253 for (i = 0; i != s_size; i++) {
254 if (current[cursor - s_size + i] != s.charAt(i)) return false;
260 /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
// The explicit cast routes the call to the CharSequence overload; without it
// the call would resolve back to this String overload.
262 protected boolean eq_s_b(int s_size, String s)
264 return eq_s_b(s_size, (CharSequence)s);
// Forward match against the whole of s: delegates to eq_s with s's own length.
267 protected boolean eq_v(CharSequence s)
269 return eq_s(s.length(), s);
272 /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
// Same comparison as eq_v(CharSequence): forwards to the CharSequence
// overload of eq_s with s's own length.
274 protected boolean eq_v(StringBuilder s)
276 return eq_s(s.length(), (CharSequence)s);
// Backward match against the whole of s: delegates to eq_s_b with s's own length.
279 protected boolean eq_v_b(CharSequence s)
280 { return eq_s_b(s.length(), s);
283 /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
// Same comparison as eq_v_b(CharSequence): forwards to the CharSequence
// overload of eq_s_b with s's own length.
285 protected boolean eq_v_b(StringBuilder s)
286 { return eq_s_b(s.length(), (CharSequence)s);
// Forward lookup of the text at the cursor in the table v; on a hit the
// matching entry's result is returned.
// NOTE(review): only part of this method is visible in this view. The
// declarations of i, j, c, l, common_i, common_j, w, diff, i2 and res, the
// enclosing loops and the no-match path are not shown; the notes below
// describe the visible statements only.
289 protected int find_among(Among v[], int v_size)
300 boolean first_key_inspected = false;
// Midpoint of i and j, written so the addition cannot overflow.
303 int k = i + ((j - i) >> 1);
305 int common = common_i < common_j ? common_i : common_j; // smaller
// Comparison against w.s starts at index `common` rather than at 0.
308 for (i2 = common; i2 < w.s_size; i2++) {
// Position c + common has reached l (presumably the input limit -- confirm).
309 if (c + common == l) {
// Difference between the input char and the entry's char; non-zero ends the comparison.
313 diff = current[c + common] - w.s[i2];
314 if (diff != 0) break;
325 if (i > 0) break; // v->s has been inspected
326 if (j == i) break; // only one item in v
328 // - but now we need to go round once more to get
329 // v->s inspected. This looks messy, but is actually
330 // the optimal approach.
332 if (first_key_inspected) break;
333 first_key_inspected = true;
// At least w.s_size chars are known to match, i.e. all of w.s.
338 if (common_i >= w.s_size) {
// Place the cursor just past the matched text.
339 cursor = c + w.s_size;
// Entry has no condition method: return its result directly.
340 if (w.method == null) return w.result;
// Otherwise call the entry's method reflectively with no arguments.
343 Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
// The outcome is read from the returned object's string form.
344 res = resobj.toString().equals("true");
345 } catch (InvocationTargetException e) {
347 // FIXME - debug message
348 } catch (IllegalAccessException e) {
350 // FIXME - debug message
// Cursor is set again after the call (presumably because the invoked method may move it -- confirm).
352 cursor = c + w.s_size;
353 if (res) return w.result;
360 // find_among_b is for backwards processing. Same comments apply
// Backward lookup: entries are compared from their last char towards their
// first against the text that ends at the cursor.
// NOTE(review): only part of this method is visible in this view. The
// declarations of i, j, c, common_i, common_j, w, diff, i2 and res, the
// enclosing loops and the no-match path are not shown; the notes below
// describe the visible statements only.
361 protected int find_among_b(Among v[], int v_size)
367 int lb = limit_backward;
372 boolean first_key_inspected = false;
// Midpoint of i and j, written so the addition cannot overflow.
375 int k = i + ((j - i) >> 1);
377 int common = common_i < common_j ? common_i : common_j;
// Comparison starts `common` chars in from the end of w.s and walks towards index 0.
380 for (i2 = w.s_size - 1 - common; i2 >= 0; i2--) {
// Position c - common has reached the backward limit.
381 if (c - common == lb) {
// Difference between the input char and the entry's char; non-zero ends the comparison.
385 diff = current[c - 1 - common] - w.s[i2];
386 if (diff != 0) break;
399 if (first_key_inspected) break;
400 first_key_inspected = true;
// At least w.s_size chars are known to match, i.e. all of w.s.
405 if (common_i >= w.s_size) {
// Place the cursor at the start of the matched text.
406 cursor = c - w.s_size;
// Entry has no condition method: return its result directly.
407 if (w.method == null) return w.result;
// Otherwise call the entry's method reflectively with no arguments.
411 Object resobj = w.method.invoke(w.methodobject, EMPTY_ARGS);
// The outcome is read from the returned object's string form.
412 res = resobj.toString().equals("true");
413 } catch (InvocationTargetException e) {
415 // FIXME - debug message
416 } catch (IllegalAccessException e) {
418 // FIXME - debug message
// Cursor is set again after the call (presumably because the invoked method may move it -- confirm).
420 cursor = c - w.s_size;
421 if (res) return w.result;
428 /* Replace the chars between c_bra and c_ket in current with the chars in s. */
// Overwrites the region [c_bra, c_ket) of the working buffer with s, growing
// the buffer and shifting the tail as needed, then repositions the cursor.
// insert() uses the returned int as the length change.
// NOTE(review): the return statement, the update of limit and the assignment
// of the enlarged buffer are not visible in this view.
431 protected int replace_s(int c_bra, int c_ket, CharSequence s)
// Length change: positive when s is longer than the region it replaces.
433 final int adjustment = s.length() - (c_ket - c_bra);
434 final int newLength = limit + adjustment;
435 //resize if necessary
436 if (newLength > current.length) {
// Over-allocate via ArrayUtil.oversize so repeated growth does not reallocate every time.
437 char newBuffer[] = new char[ArrayUtil.oversize(newLength, RamUsageEstimator.NUM_BYTES_CHAR)];
// Carry over the valid part of the old buffer.
438 System.arraycopy(current, 0, newBuffer, 0, limit);
441 // if the substring being replaced is longer or shorter than the
442 // replacement, need to shift things around
// Nothing to shift when the lengths match or the region ends at the limit.
443 if (adjustment != 0 && c_ket < limit) {
// Move the text that followed the region so it starts right after the replacement.
444 System.arraycopy(current, c_ket, current, c_bra + s.length(),
447 // insert the replacement text
448 // Note, faster is s.getChars(0, s.length(), current, c_bra);
449 // but would have to duplicate this method for both String and StringBuilder
450 for (int i = 0; i < s.length(); i++)
451 current[c_bra + i] = s.charAt(i);
// A cursor at or after the old region moves with the tail; one strictly inside
// the region is pulled back to its start.
454 if (cursor >= c_ket) cursor += adjustment;
455 else if (cursor > c_bra) cursor = c_bra;
459 /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
// The explicit cast routes the call to the CharSequence overload; without it
// the call would resolve back to this String overload.
461 protected int replace_s(int c_bra, int c_ket, String s) {
462 return replace_s(c_bra, c_ket, (CharSequence)s);
// Reports a bad slice by printing "faulty slice operation" to System.err.
// NOTE(review): the condition that triggers the report is not visible in this
// view. The fprintf line below is C, not Java; presumably it sits inside a
// block comment in the complete file -- confirm.
465 protected void slice_check()
471 System.err.println("faulty slice operation");
472 // FIXME: report error somehow.
474 fprintf(stderr, "faulty slice operation:\n");
// Replaces the current slice, the region between bra and ket, with s.
// NOTE(review): any statements before the replace_s call are not visible in this view.
481 protected void slice_from(CharSequence s)
484 replace_s(bra, ket, s);
487 /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
// The explicit cast routes the call to the CharSequence overload; without it
// the call would resolve back to this String overload.
489 protected void slice_from(String s)
491 slice_from((CharSequence)s);
494 /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
// The explicit cast routes the call to the CharSequence overload; without it
// the call would resolve back to this StringBuilder overload.
496 protected void slice_from(StringBuilder s)
498 slice_from((CharSequence)s);
// Deletes the current slice by replacing it with the empty string. The cast
// selects the CharSequence overload rather than the deprecated String one.
501 protected void slice_del()
503 slice_from((CharSequence)"");
// Replaces [c_bra, c_ket) with s, then keeps the slice markers in step with
// the edited text.
506 protected void insert(int c_bra, int c_ket, CharSequence s)
508 int adjustment = replace_s(c_bra, c_ket, s);
// Each marker at or after c_bra is shifted by the length change.
509 if (c_bra <= bra) bra += adjustment;
510 if (c_bra <= ket) ket += adjustment;
513 /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
// The explicit cast routes the call to the CharSequence overload; without it
// the call would resolve back to this String overload.
515 protected void insert(int c_bra, int c_ket, String s)
517 insert(c_bra, c_ket, (CharSequence)s);
520 /** @deprecated for binary back compat. Will be removed in Lucene 4.0 */
// The explicit cast routes the call to the CharSequence overload; without it
// the call would resolve back to this StringBuilder overload.
522 protected void insert(int c_bra, int c_ket, StringBuilder s)
524 insert(c_bra, c_ket, (CharSequence)s);
527 /* Copy the slice into the supplied StringBuilder */
// Appends `len` chars of the working buffer, starting at bra, to s.
// NOTE(review): how len is computed, whether s is cleared first and what is
// returned are not visible in this view.
528 protected StringBuilder slice_to(StringBuilder s)
533 s.append(current, bra, len);
// Appends the whole valid text, the first `limit` chars of the working buffer, to s.
// NOTE(review): whether s is cleared first and what is returned are not
// visible in this view.
537 protected StringBuilder assign_to(StringBuilder s)
540 s.append(current, 0, limit);
545 extern void debug(struct SN_env * z, int number, int line_count)
547 int limit = SIZE(z->p);
548 //if (number >= 0) printf("%3d (line %4d): '", number, line_count);
549 if (number >= 0) printf("%3d (line %4d): [%d]'", number, line_count,limit);
550 for (i = 0; i <= limit; i++)
551 { if (z->lb == i) printf("{");
552 if (z->bra == i) printf("[");
553 if (z->c == i) printf("|");
554 if (z->ket == i) printf("]");
555 if (z->l == i) printf("}");
558 if (ch == 0) ch = '#';