1 package org.apache.lucene.analysis.icu.segmentation;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.text.CharacterIterator;
22 import com.ibm.icu.lang.UCharacter;
23 import com.ibm.icu.text.BreakIterator;
24 import com.ibm.icu.text.DictionaryBasedBreakIterator;
25 import com.ibm.icu.text.RuleBasedBreakIterator;
26 import com.ibm.icu.text.UTF16;
29 * Contain all the issues surrounding BreakIterators in ICU in one place.
30 * Basically this boils down to the fact that they aren't very friendly to any
33 * http://bugs.icu-project.org/trac/ticket/5901: RBBI.getRuleStatus(), hoist to
34 * BreakIterator from RuleBasedBreakIterator
36 * DictionaryBasedBreakIterator is a subclass of RuleBasedBreakIterator, but
37 * doesn't actually behave as a subclass: it always returns 0 for
39 * http://bugs.icu-project.org/trac/ticket/4730: Thai RBBI, no boundary type
41 * @lucene.experimental
43 abstract class BreakIteratorWrapper {
44 protected final CharArrayIterator textIterator = new CharArrayIterator();
45 protected char text[];
50 abstract int current();
51 abstract int getRuleStatus();
52 abstract void setText(CharacterIterator text);
54 void setText(char text[], int start, int length) {
58 textIterator.setText(text, start, length);
59 setText(textIterator);
63 * If its a DictionaryBasedBreakIterator, it doesn't return rulestatus, so
64 * treat it like a generic BreakIterator If its any other
65 * RuleBasedBreakIterator, the rule status can be used for token type. If its
66 * any other BreakIterator, the rulestatus method is not available, so treat
67 * it like a generic BreakIterator.
69 static BreakIteratorWrapper wrap(BreakIterator breakIterator) {
70 if (breakIterator instanceof RuleBasedBreakIterator
71 && !(breakIterator instanceof DictionaryBasedBreakIterator))
72 return new RBBIWrapper((RuleBasedBreakIterator) breakIterator);
74 return new BIWrapper(breakIterator);
78 * RuleBasedBreakIterator wrapper: RuleBasedBreakIterator (as long as it's not
79 * a DictionaryBasedBreakIterator) behaves correctly.
81 static final class RBBIWrapper extends BreakIteratorWrapper {
82 private final RuleBasedBreakIterator rbbi;
84 RBBIWrapper(RuleBasedBreakIterator rbbi) {
90 return rbbi.current();
95 return rbbi.getRuleStatus();
104 void setText(CharacterIterator text) {
110 * Generic BreakIterator wrapper: Either the rulestatus method is not
111 * available or always returns 0. Calculate a rulestatus here so it behaves
112 * like RuleBasedBreakIterator.
114 * Note: This is slower than RuleBasedBreakIterator.
116 static final class BIWrapper extends BreakIteratorWrapper {
117 private final BreakIterator bi;
120 BIWrapper(BreakIterator bi) {
130 int getRuleStatus() {
136 int current = bi.current();
137 int next = bi.next();
138 status = calcStatus(current, next);
142 private int calcStatus(int current, int next) {
143 if (current == BreakIterator.DONE || next == BreakIterator.DONE)
144 return RuleBasedBreakIterator.WORD_NONE;
146 int begin = start + current;
147 int end = start + next;
150 for (int i = begin; i < end; i += UTF16.getCharCount(codepoint)) {
151 codepoint = UTF16.charAt(text, 0, end, begin);
153 if (UCharacter.isDigit(codepoint))
154 return RuleBasedBreakIterator.WORD_NUMBER;
155 else if (UCharacter.isLetter(codepoint)) {
156 // TODO: try to separately specify ideographic, kana?
157 // [currently all bundled as letter for this case]
158 return RuleBasedBreakIterator.WORD_LETTER;
162 return RuleBasedBreakIterator.WORD_NONE;
166 void setText(CharacterIterator text) {
168 status = RuleBasedBreakIterator.WORD_NONE;