1 package org.apache.lucene.analysis.util;
3 import java.text.BreakIterator; // javadoc
4 import java.text.CharacterIterator;
5 import java.util.Locale;
8 * A CharacterIterator used internally with {@link BreakIterator}
11 public abstract class CharArrayIterator implements CharacterIterator {
18 public char [] getText() {
22 public int getStart() {
26 public int getLength() {
31 * Set a new region of text to be examined by this iterator
33 * @param array text buffer to examine
34 * @param start offset into buffer
35 * @param length maximum length to examine
37 public void setText(final char array[], int start, int length) {
42 this.limit = start + length;
45 public char current() {
46 return (index == limit) ? DONE : jreBugWorkaround(array[index]);
49 protected abstract char jreBugWorkaround(char ch);
56 public int getBeginIndex() {
60 public int getEndIndex() {
64 public int getIndex() {
69 index = (limit == start) ? limit : limit - 1;
74 if (++index >= limit) {
82 public char previous() {
83 if (--index < start) {
91 public char setIndex(int position) {
92 if (position < getBeginIndex() || position > getEndIndex())
93 throw new IllegalArgumentException("Illegal Position: " + position);
94 index = start + position;
99 public Object clone() {
101 return super.clone();
102 } catch (CloneNotSupportedException e) {
103 // CharacterIterator does not allow you to throw CloneNotSupported
104 throw new RuntimeException(e);
109 * Create a new CharArrayIterator that works around JRE bugs
110 * in a manner suitable for {@link BreakIterator#getSentenceInstance()}
112 public static CharArrayIterator newSentenceInstance() {
113 if (HAS_BUGGY_BREAKITERATORS) {
114 return new CharArrayIterator() {
115 // work around the JRE bug for now by lying about all surrogates to
116 // the sentence tokenizer; instead we treat them all as
117 // SContinue so we won't break around them.
119 protected char jreBugWorkaround(char ch) {
120 return ch >= 0xD800 && ch <= 0xDFFF ? 0x002C : ch;
124 return new CharArrayIterator() {
127 protected char jreBugWorkaround(char ch) {
135 * Create a new CharArrayIterator that works around JRE bugs
136 * in a manner suitable for {@link BreakIterator#getWordInstance()}
138 public static CharArrayIterator newWordInstance() {
139 if (HAS_BUGGY_BREAKITERATORS) {
140 return new CharArrayIterator() {
141 // work around the JRE bug for now by lying about all surrogates to the word
142 // tokenizer; instead we treat them all as ALetter so we won't break around them.
144 protected char jreBugWorkaround(char ch) {
145 return ch >= 0xD800 && ch <= 0xDFFF ? 0x0041 : ch;
149 return new CharArrayIterator() {
152 protected char jreBugWorkaround(char ch) {
160 * True if this JRE has a buggy BreakIterator implementation
162 public static final boolean HAS_BUGGY_BREAKITERATORS;
166 BreakIterator bi = BreakIterator.getSentenceInstance(Locale.US);
167 bi.setText("\udb40\udc53");
170 } catch (Exception e) {
173 HAS_BUGGY_BREAKITERATORS = v;