1 package org.apache.lucene.analysis.icu.segmentation;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.Reader;
23 import org.apache.lucene.analysis.Tokenizer;
24 import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
25 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
29 import com.ibm.icu.lang.UCharacter;
30 import com.ibm.icu.text.BreakIterator;
33 * Breaks text into words according to UAX #29: Unicode Text Segmentation
34 * (http://www.unicode.org/reports/tr29/)
36 * Words are broken across script boundaries, then segmented according to
37 * the BreakIterator and typing provided by the {@link ICUTokenizerConfig}
39 * @see ICUTokenizerConfig
40 * @lucene.experimental
42 public final class ICUTokenizer extends Tokenizer {
43 private static final int IOBUFFER = 4096;
44 private final char buffer[] = new char[IOBUFFER];
45 /** true length of text in the buffer */
46 private int length = 0;
47 /** length in buffer that can be evaluated safely, up to a safe end point */
48 private int usableLength = 0;
49 /** accumulated offset of previous buffers for this reader, for offsetAtt */
50 private int offset = 0;
52 private final CompositeBreakIterator breaker; /* tokenizes a char[] of text */
53 private final ICUTokenizerConfig config;
54 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
55 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
56 private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
57 private final ScriptAttribute scriptAtt = addAttribute(ScriptAttribute.class);
60 * Construct a new ICUTokenizer that breaks text into words from the given Reader.
63 * The default script-specific handling is used.
65 * @param input Reader containing text to tokenize.
66 * @see DefaultICUTokenizerConfig
// Convenience constructor: forwards to the (Reader, ICUTokenizerConfig)
// constructor, passing a newly created DefaultICUTokenizerConfig.
68 public ICUTokenizer(Reader input) {
69 this(input, new DefaultICUTokenizerConfig());
73 * Construct a new ICUTokenizer that breaks text into words from the given
74 * Reader, using a tailored BreakIterator configuration.
76 * @param input Reader containing text to tokenize.
77 * @param config Tailored BreakIterator configuration
79 public ICUTokenizer(Reader input, ICUTokenizerConfig config) {
82 breaker = new CompositeBreakIterator(config);
86 public boolean incrementToken() throws IOException {
90 while (!incrementTokenBuffer()) {
92 if (length <= 0) // no more bytes to read;
99 public void reset() throws IOException {
101 breaker.setText(buffer, 0, 0);
102 length = usableLength = offset = 0;
106 public void reset(Reader input) throws IOException {
112 public void end() throws IOException {
113 final int finalOffset = (length < 0) ? offset : offset + length;
114 offsetAtt.setOffset(finalOffset, finalOffset);
118 * This tokenizes text based upon the longest matching rule, and because of
119 * this, isn't friendly to a Reader.
121 * Text is read from the input stream in 4kB chunks. Within a 4kB chunk of
122 * text, the last unambiguous break point is found (in this implementation:
123 * white space character). Any remaining characters represent possible partial
124 * words, so are appended to the front of the next chunk.
126 * There is the possibility that there are no unambiguous break points within
127 * an entire 4kB chunk of text (binary data). So there is a maximum word limit
128 * of 4kB since it will not try to grow the buffer in this case.
132 * Returns the last unambiguous break position in the text.
134 * @return position of character, or -1 if one does not exist
136 private int findSafeEnd() {
137 for (int i = length - 1; i >= 0; i--)
138 if (UCharacter.isWhitespace(buffer[i]))
144 * Refill the buffer, accumulating the offset and setting usableLength to the
145 * last unambiguous break position
147 * @throws IOException
149 private void refill() throws IOException {
150 offset += usableLength;
151 int leftover = length - usableLength;
152 System.arraycopy(buffer, usableLength, buffer, 0, leftover);
153 int requested = buffer.length - leftover;
154 int returned = input.read(buffer, leftover, requested);
155 length = returned < 0 ? leftover : returned + leftover;
156 if (returned < requested) /* reader has been emptied, process the rest */
157 usableLength = length;
158 else { /* still more data to be read, find a safe-stopping place */
159 usableLength = findSafeEnd();
160 if (usableLength < 0)
161 usableLength = length; /*
162 * more than IOBUFFER of text without space,
163 * gonna possibly truncate tokens
167 breaker.setText(buffer, 0, Math.max(0, usableLength));
171 * return true if there is a token from the buffer, or false if it is exhausted.
174 private boolean incrementTokenBuffer() {
175 int start = breaker.current();
176 if (start == BreakIterator.DONE)
177 return false; // BreakIterator exhausted
179 // find the next set of boundaries, skipping over non-tokens (rule status 0)
180 int end = breaker.next();
181 while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
183 end = breaker.next();
186 if (start == BreakIterator.DONE)
187 return false; // BreakIterator exhausted
189 termAtt.copyBuffer(buffer, start, end - start);
190 offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
191 typeAtt.setType(config.getType(breaker.getScriptCode(), breaker.getRuleStatus()));
192 scriptAtt.setCode(breaker.getScriptCode());