import java.text.BreakIterator;
import java.text.CharacterIterator;
import java.util.Locale;

/**
 * A {@link CharacterIterator} over a region of a {@code char[]} buffer, used internally
 * together with {@link BreakIterator}.
 * <p>
 * Positions exposed through the {@link CharacterIterator} API are relative to the start of
 * the configured region: {@link #getBeginIndex()} is always 0 and {@link #getEndIndex()} is
 * the region length. Every character handed out by the iterator is first passed through
 * {@link #jreBugWorkaround(char)}, which lets concrete instances hide characters that a
 * buggy JRE {@link BreakIterator} cannot cope with.
 * <p>
 * Instances are obtained from {@link #newSentenceInstance()} or {@link #newWordInstance()}.
 *
 * @lucene.internal
 */
public abstract class CharArrayIterator implements CharacterIterator {
  /** Buffer being iterated; never copied, so it is shared with the caller. */
  private char[] array;
  /** Absolute offset into {@link #array} of the first character of the region. */
  private int start;
  /** Absolute offset into {@link #array} of the current position. */
  private int index;
  /** Number of characters in the region. */
  private int length;
  /** Absolute offset one past the last character of the region ({@code start + length}). */
  private int limit;

  /**
   * Returns the buffer most recently passed to {@link #setText(char[], int, int)}.
   * The array itself is returned, not a copy.
   */
  public char[] getText() {
    return array;
  }

  /** Returns the offset into the buffer at which the examined region begins. */
  public int getStart() {
    return start;
  }

  /** Returns the number of characters in the examined region. */
  public int getLength() {
    return length;
  }

  /**
   * Set a new region of text to be examined by this iterator, and move the iterator to
   * the beginning of that region.
   *
   * @param array text buffer to examine; may be {@code null} only for an empty region
   *              starting at offset 0
   * @param start offset into buffer
   * @param length maximum length to examine
   * @throws IllegalArgumentException if {@code start} or {@code length} is negative, or
   *         if the region does not fit inside {@code array}; the iterator state is left
   *         untouched in that case
   */
  public void setText(final char[] array, int start, int length) {
    if (start < 0 || length < 0) {
      throw new IllegalArgumentException(
          "Illegal region: start=" + start + ", length=" + length);
    }
    final int capacity = (array == null) ? 0 : array.length;
    // Compare against (capacity - start) rather than (start + length) to avoid int overflow.
    if (length > capacity - start) {
      throw new IllegalArgumentException(
          "Illegal region: start=" + start + ", length=" + length
              + ", buffer length=" + capacity);
    }
    this.array = array;
    this.start = start;
    this.index = start;
    this.length = length;
    this.limit = start + length;
  }

  /**
   * Returns the character at the current position (after applying
   * {@link #jreBugWorkaround(char)}), or {@link CharacterIterator#DONE} when the
   * position is at the end of the region.
   */
  public char current() {
    return (index == limit) ? DONE : jreBugWorkaround(array[index]);
  }

  /**
   * Hook applied to every character before it is returned to the caller.
   *
   * @param ch the character stored in the buffer
   * @return the character to report in its place
   */
  protected abstract char jreBugWorkaround(char ch);

  /** Moves to the first character of the region and returns it (DONE if the region is empty). */
  public char first() {
    index = start;
    return current();
  }

  /** Always 0: reported positions are relative to the start of the region. */
  public int getBeginIndex() {
    return 0;
  }

  /** Returns the region length, i.e. the relative position one past the last character. */
  public int getEndIndex() {
    return length;
  }

  /** Returns the current position relative to the start of the region. */
  public int getIndex() {
    return index - start;
  }

  /** Moves to the last character of the region and returns it (DONE if the region is empty). */
  public char last() {
    // For an empty region there is no last character: stay at the limit so current() is DONE.
    index = (limit == start) ? limit : limit - 1;
    return current();
  }

  /**
   * Advances by one character and returns it. When the end of the region is reached the
   * position is pinned at the end index and DONE is returned.
   */
  public char next() {
    if (++index >= limit) {
      index = limit;
      return DONE;
    } else {
      return current();
    }
  }

  /**
   * Steps back by one character and returns it. When already at the first character the
   * position stays there and DONE is returned.
   */
  public char previous() {
    if (--index < start) {
      index = start;
      return DONE;
    } else {
      return current();
    }
  }

  /**
   * Moves to the given region-relative position and returns the character there.
   *
   * @param position a value between {@link #getBeginIndex()} and {@link #getEndIndex()},
   *                 both inclusive
   * @return the character at that position, or DONE if {@code position} is the end index
   * @throws IllegalArgumentException if {@code position} is outside that range
   */
  public char setIndex(int position) {
    if (position < getBeginIndex() || position > getEndIndex()) {
      throw new IllegalArgumentException("Illegal Position: " + position);
    }
    index = start + position;
    return current();
  }

  /**
   * Returns a shallow copy of this iterator: the copy has its own position but refers to
   * the same underlying buffer.
   */
  @Override
  public Object clone() {
    try {
      return super.clone();
    } catch (CloneNotSupportedException e) {
      // CharacterIterator does not allow you to throw CloneNotSupported
      throw new RuntimeException(e);
    }
  }

  /**
   * Create a new CharArrayIterator that works around JRE bugs
   * in a manner suitable for {@link BreakIterator#getSentenceInstance()}
   */
  public static CharArrayIterator newSentenceInstance() {
    if (!HAS_BUGGY_BREAKITERATORS) {
      return newPassThroughInstance();
    }
    return new CharArrayIterator() {
      // work around this for now by lying about all surrogates to
      // the sentence tokenizer, instead we treat them all as
      // SContinue so we won't break around them.
      @Override
      protected char jreBugWorkaround(char ch) {
        return isSurrogate(ch) ? 0x002C : ch;
      }
    };
  }

  /**
   * Create a new CharArrayIterator that works around JRE bugs
   * in a manner suitable for {@link BreakIterator#getWordInstance()}
   */
  public static CharArrayIterator newWordInstance() {
    if (!HAS_BUGGY_BREAKITERATORS) {
      return newPassThroughInstance();
    }
    return new CharArrayIterator() {
      // work around this for now by lying about all surrogates to the word,
      // instead we treat them all as ALetter so we won't break around them.
      @Override
      protected char jreBugWorkaround(char ch) {
        return isSurrogate(ch) ? 0x0041 : ch;
      }
    };
  }

  /** Creates an iterator that reports every character unchanged (no workaround needed). */
  private static CharArrayIterator newPassThroughInstance() {
    return new CharArrayIterator() {
      // no bugs
      @Override
      protected char jreBugWorkaround(char ch) {
        return ch;
      }
    };
  }

  /** Returns true if {@code ch} is a UTF-16 surrogate code unit (0xD800 to 0xDFFF). */
  private static boolean isSurrogate(char ch) {
    return ch >= Character.MIN_SURROGATE && ch <= Character.MAX_SURROGATE;
  }

  /**
   * True if this JRE has a buggy BreakIterator implementation
   */
  public static final boolean HAS_BUGGY_BREAKITERATORS;
  static {
    boolean v;
    try {
      // Probe: feed a surrogate pair to a sentence BreakIterator and see whether it throws.
      BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
      bi.setText("\udb40\udc53");
      bi.next();
      v = false;
    } catch (Exception e) {
      // Any failure on the probe text is taken as evidence of the bug.
      v = true;
    }
    HAS_BUGGY_BREAKITERATORS = v;
  }
}