1 package org.apache.lucene.util;
import java.io.IOException;
import java.io.Reader;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
/**
 * {@link CharacterUtils} provides a unified interface to Character-related
 * operations to implement backwards compatible character operations based on a
 * {@link Version} instance.
 */
30 public abstract class CharacterUtils {
// Shared instance returned by getInstance(Version) when the requested version
// is before Version.LUCENE_31.
31 private static final Java4CharacterUtils JAVA_4 = new Java4CharacterUtils();
// Shared instance returned by getInstance(Version) when the requested version
// is Version.LUCENE_31 or later.
32 private static final Java5CharacterUtils JAVA_5 = new Java5CharacterUtils();
35 * Returns a {@link CharacterUtils} implementation according to the given
36 * {@link Version} instance.
40 * @return a {@link CharacterUtils} implementation according to the given
41 * {@link Version} instance.
43 public static CharacterUtils getInstance(final Version matchVersion) {
44 return matchVersion.onOrAfter(Version.LUCENE_31) ? JAVA_5 : JAVA_4;
48 * Returns the code point at the given index of the char array.
49 * Depending on the {@link Version} passed to
50 * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
51 * of {@link Character#codePointAt(char[], int)} as it would have been
52 * available on a Java 1.4 JVM or on a later virtual machine version.
57 * the offset to the char values in the chars array to be converted
59 * @return the Unicode code point at the given index
60 * @throws NullPointerException
61 * - if the array is null.
62 * @throws IndexOutOfBoundsException
63 * - if the value offset is negative or not less than the length of
66 public abstract int codePointAt(final char[] chars, final int offset);
69 * Returns the code point at the given index of the {@link CharSequence}.
70 * Depending on the {@link Version} passed to
71 * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
72 * of {@link Character#codePointAt(char[], int)} as it would have been
73 * available on a Java 1.4 JVM or on a later virtual machine version.
76 * a character sequence
78 * the offset to the char values in the chars array to be converted
80 * @return the Unicode code point at the given index
81 * @throws NullPointerException
82 * - if the sequence is null.
83 * @throws IndexOutOfBoundsException
84 * - if the value offset is negative or not less than the length of
85 * the character sequence.
87 public abstract int codePointAt(final CharSequence seq, final int offset);
90 * Returns the code point at the given index of the char array where only elements
91 * with index less than the limit are used.
92 * Depending on the {@link Version} passed to
93 * {@link CharacterUtils#getInstance(Version)} this method mimics the behavior
94 * of {@link Character#codePointAt(char[], int)} as it would have been
95 * available on a Java 1.4 JVM or on a later virtual machine version.
100 * the offset to the char values in the chars array to be converted
101 * @param limit the index afer the last element that should be used to calculate
104 * @return the Unicode code point at the given index
105 * @throws NullPointerException
106 * - if the array is null.
107 * @throws IndexOutOfBoundsException
108 * - if the value offset is negative or not less than the length of
111 public abstract int codePointAt(final char[] chars, final int offset, final int limit);
114 * Creates a new {@link CharacterBuffer} and allocates a <code>char[]</code>
115 * of the given bufferSize.
118 * the internal char buffer size, must be <code>>= 2</code>
119 * @return a new {@link CharacterBuffer} instance.
121 public static CharacterBuffer newCharacterBuffer(final int bufferSize) {
123 throw new IllegalArgumentException("buffersize must be >= 2");
124 return new CharacterBuffer(new char[bufferSize], 0, 0);
128 * Fills the {@link CharacterBuffer} with characters read from the given
129 * reader {@link Reader}. This method tries to read as many characters into
130 * the {@link CharacterBuffer} as possible, each call to fill will start
131 * filling the buffer from offset <code>0</code> up to the length of the size
132 * of the internal character array.
134 * Depending on the {@link Version} passed to
135 * {@link CharacterUtils#getInstance(Version)} this method implements
136 * supplementary character awareness when filling the given buffer. For all
137 * {@link Version} > 3.0 {@link #fill(CharacterBuffer, Reader)} guarantees
138 * that the given {@link CharacterBuffer} will never contain a high surrogate
139 * character as the last element in the buffer unless it is the last available
140 * character in the reader. In other words, high and low surrogate pairs will
141 * always be preserved across buffer boarders.
145 * the buffer to fill.
147 * the reader to read characters from.
148 * @return <code>true</code> if and only if no more characters are available
149 * in the reader, otherwise <code>false</code>.
150 * @throws IOException
151 * if the reader throws an {@link IOException}.
153 public abstract boolean fill(CharacterBuffer buffer, Reader reader) throws IOException;
// CharacterUtils implementation whose codePointAt methods delegate to the
// java.lang.Character code point API.
155 private static final class Java5CharacterUtils extends CharacterUtils {
// Constructor without an access modifier; no body statements are visible.
// NOTE(review): its closing brace is not present in this listing.
156 Java5CharacterUtils() {
160 public final int codePointAt(final char[] chars, final int offset) {
161 return Character.codePointAt(chars, offset);
165 public int codePointAt(final CharSequence seq, final int offset) {
166 return Character.codePointAt(seq, offset);
170 public int codePointAt(final char[] chars, final int offset, final int limit) {
171 return Character.codePointAt(chars, offset, limit);
// Fills the buffer's backing array from the reader while holding back a
// trailing high surrogate, so that it can be placed in front of the data read
// by the next call.
// NOTE(review): this listing is incomplete. The remainder of the
// reader.read(...) call, the condition separating the two buffer.length
// assignments, every return statement and the closing braces are not visible.
// Restore them from the authoritative source; do not infer the return values
// from this fragment.
175 public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
176 final char[] charBuffer = buffer.buffer;
// Put the char held back by the previous call into slot 0; when nothing was
// held back this writes 0, and the read below starts at index 0 as well.
178 charBuffer[0] = buffer.lastTrailingHighSurrogate;
// New data is read behind the carried-over char, if there is one.
179 final int offset = buffer.lastTrailingHighSurrogate == 0 ? 0 : 1;
// The carried-over char now lives in the array, so clear the marker.
180 buffer.lastTrailingHighSurrogate = 0;
// NOTE(review): the call is cut off here; its last argument and closing
// parenthesis are missing from the listing.
181 final int read = reader.read(charBuffer, offset, charBuffer.length
// Length when only the carried-over char (if any) is available -- presumably
// the end-of-stream branch; TODO confirm against the missing condition.
184 buffer.length = offset;
// Length when chars were read: carried-over char plus the newly read chars.
187 buffer.length = read + offset;
188 // special case if the read returns 0 and the lastTrailingHighSurrogate was set
189 if (buffer.length > 1
190 && Character.isHighSurrogate(charBuffer[buffer.length - 1])) {
// Hold back the trailing high surrogate: shrink the reported length by one and
// remember the char for the next call.
191 buffer.lastTrailingHighSurrogate = charBuffer[--buffer.length];
// CharacterUtils implementation that treats every single char as a code point;
// its codePointAt methods return the char at the requested index.
197 private static final class Java4CharacterUtils extends CharacterUtils {
// Constructor without an access modifier; no body statements are visible.
// NOTE(review): its closing brace is not present in this listing.
198 Java4CharacterUtils() {
202 public final int codePointAt(final char[] chars, final int offset) {
203 return chars[offset];
207 public int codePointAt(final CharSequence seq, final int offset) {
208 return seq.charAt(offset);
212 public int codePointAt(final char[] chars, final int offset, final int limit) {
214 throw new IndexOutOfBoundsException("offset must be less than limit");
215 return chars[offset];
// Fills the buffer with a single bulk read; no surrogate handling is visible.
// NOTE(review): this listing is incomplete. Statements between the lines
// below, every return statement and the closing brace are not visible.
// Restore them from the authoritative source; do not infer the return values
// from this fragment.
219 public boolean fill(final CharacterBuffer buffer, final Reader reader) throws IOException {
// Reads into the whole backing array, starting at index 0.
221 final int read = reader.read(buffer.buffer);
// NOTE(review): as shown, an end-of-stream result of -1 would be stored as the
// length; presumably a guard sits on the lines missing above -- TODO confirm.
224 buffer.length = read;
231 * A simple IO buffer to use with
232 * {@link CharacterUtils#fill(CharacterBuffer, Reader)}.
234 public static final class CharacterBuffer {
236 private final char[] buffer;
239 private char lastTrailingHighSurrogate = 0;
241 CharacterBuffer(char[] buffer, int offset, int length) {
242 this.buffer = buffer;
243 this.offset = offset;
244 this.length = length;
248 * Returns the internal buffer
252 public char[] getBuffer() {
257 * Returns the data offset in the internal buffer.
261 public int getOffset() {
266 * Return the length of the data in the internal buffer starting at
267 * {@link #getOffset()}
271 public int getLength() {
276 * Resets the CharacterBuffer. All internals are reset to its default
279 public void reset() {
282 lastTrailingHighSurrogate = 0;