1 package org.apache.lucene.util;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.util.Comparator;
21 import java.io.UnsupportedEncodingException;
23 /** Represents byte[], as a slice (offset + length) into an
26 * @lucene.experimental */
27 public final class BytesRef implements Comparable<BytesRef> {
29 static final int HASH_PRIME = 31;
30 public static final byte[] EMPTY_BYTES = new byte[0];
32 /** The contents of the BytesRef. Should never be {@code null}. */
35 /** Offset of first valid byte. */
38 /** Length of used bytes. */
45 /** This instance will directly reference bytes w/o making a copy.
46 * bytes should not be null.
48 public BytesRef(byte[] bytes, int offset, int length) {
55 /** This instance will directly reference bytes w/o making a copy.
56 * bytes should not be null */
57 public BytesRef(byte[] bytes) {
61 this.length = bytes.length;
64 public BytesRef(int capacity) {
65 this.bytes = new byte[capacity];
69 * @param text Initialize the byte[] from the UTF8 bytes
70 * for the provided Sring. This must be well-formed
71 * unicode text, with no unpaired surrogates or U+FFFF.
73 public BytesRef(CharSequence text) {
79 * @param text Initialize the byte[] from the UTF8 bytes
80 * for the provided array. This must be well-formed
81 * unicode text, with no unpaired surrogates or U+FFFF.
83 public BytesRef(char text[], int offset, int length) {
85 copy(text, offset, length);
88 public BytesRef(BytesRef other) {
94 public BytesRef(BytesRef other, boolean shallow) {
97 offset = other.offset;
98 length = other.length;
107 * Copies the UTF8 bytes for this string.
109 * @param text Must be well-formed unicode text, with no
110 * unpaired surrogates or invalid UTF16 code units.
112 public void copy(CharSequence text) {
113 UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this);
117 * Copies the UTF8 bytes for this string.
119 * @param text Must be well-formed unicode text, with no
120 * unpaired surrogates or invalid UTF16 code units.
122 public void copy(char text[], int offset, int length) {
123 UnicodeUtil.UTF16toUTF8(text, offset, length, this);
126 public boolean bytesEquals(BytesRef other) {
127 if (length == other.length) {
128 int otherUpto = other.offset;
129 final byte[] otherBytes = other.bytes;
130 final int end = offset + length;
131 for(int upto=offset;upto<end;upto++,otherUpto++) {
132 if (bytes[upto] != otherBytes[otherUpto]) {
143 public Object clone() {
144 return new BytesRef(this);
147 private boolean sliceEquals(BytesRef other, int pos) {
148 if (pos < 0 || length - pos < other.length) {
151 int i = offset + pos;
152 int j = other.offset;
153 final int k = other.offset + other.length;
156 if (bytes[i++] != other.bytes[j++]) {
164 public boolean startsWith(BytesRef other) {
165 return sliceEquals(other, 0);
168 public boolean endsWith(BytesRef other) {
169 return sliceEquals(other, length - other.length);
172 /** Calculates the hash code as required by TermsHash during indexing.
173 * <p>It is defined as:
176 * for (int i = offset; i < offset + length; i++) {
177 * hash = 31*hash + bytes[i];
182 public int hashCode() {
184 final int end = offset + length;
185 for(int i=offset;i<end;i++) {
186 result = HASH_PRIME * result + bytes[i];
192 public boolean equals(Object other) {
196 return this.bytesEquals((BytesRef) other);
199 /** Interprets stored bytes as UTF8 bytes, returning the
200 * resulting string */
201 public String utf8ToString() {
203 return new String(bytes, offset, length, "UTF-8");
204 } catch (UnsupportedEncodingException uee) {
205 // should not happen -- UTF8 is presumably supported
207 throw new RuntimeException(uee);
211 /** Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] */
213 public String toString() {
214 StringBuilder sb = new StringBuilder();
216 final int end = offset + length;
217 for(int i=offset;i<end;i++) {
221 sb.append(Integer.toHexString(bytes[i]&0xff));
224 return sb.toString();
227 public void copy(BytesRef other) {
228 if (bytes.length < other.length) {
229 bytes = new byte[other.length];
231 System.arraycopy(other.bytes, other.offset, bytes, 0, other.length);
232 length = other.length;
236 public void append(BytesRef other) {
237 int newLen = length + other.length;
238 if (bytes.length < newLen) {
239 byte[] newBytes = new byte[newLen];
240 System.arraycopy(bytes, offset, newBytes, 0, length);
244 System.arraycopy(other.bytes, other.offset, bytes, length+offset, other.length);
248 public void grow(int newLength) {
249 bytes = ArrayUtil.grow(bytes, newLength);
252 /** Unsigned byte order comparison */
253 public int compareTo(BytesRef other) {
254 if (this == other) return 0;
256 final byte[] aBytes = this.bytes;
257 int aUpto = this.offset;
258 final byte[] bBytes = other.bytes;
259 int bUpto = other.offset;
261 final int aStop = aUpto + Math.min(this.length, other.length);
263 while(aUpto < aStop) {
264 int aByte = aBytes[aUpto++] & 0xff;
265 int bByte = bBytes[bUpto++] & 0xff;
266 int diff = aByte - bByte;
267 if (diff != 0) return diff;
270 // One is a prefix of the other, or, they are equal:
271 return this.length - other.length;
274 private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator();
276 public static Comparator<BytesRef> getUTF8SortedAsUnicodeComparator() {
277 return utf8SortedAsUnicodeSortOrder;
280 private static class UTF8SortedAsUnicodeComparator implements Comparator<BytesRef> {
282 private UTF8SortedAsUnicodeComparator() {};
284 public int compare(BytesRef a, BytesRef b) {
285 final byte[] aBytes = a.bytes;
286 int aUpto = a.offset;
287 final byte[] bBytes = b.bytes;
288 int bUpto = b.offset;
291 if (a.length < b.length) {
292 aStop = aUpto + a.length;
294 aStop = aUpto + b.length;
297 while(aUpto < aStop) {
298 int aByte = aBytes[aUpto++] & 0xff;
299 int bByte = bBytes[bUpto++] & 0xff;
301 int diff = aByte - bByte;
307 // One is a prefix of the other, or, they are equal:
308 return a.length - b.length;
312 private final static Comparator<BytesRef> utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator();
314 public static Comparator<BytesRef> getUTF8SortedAsUTF16Comparator() {
315 return utf8SortedAsUTF16SortOrder;
318 private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> {
320 private UTF8SortedAsUTF16Comparator() {};
322 public int compare(BytesRef a, BytesRef b) {
324 final byte[] aBytes = a.bytes;
325 int aUpto = a.offset;
326 final byte[] bBytes = b.bytes;
327 int bUpto = b.offset;
330 if (a.length < b.length) {
331 aStop = aUpto + a.length;
333 aStop = aUpto + b.length;
336 while(aUpto < aStop) {
337 int aByte = aBytes[aUpto++] & 0xff;
338 int bByte = bBytes[bUpto++] & 0xff;
340 if (aByte != bByte) {
342 // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
344 // We know the terms are not equal, but, we may
345 // have to carefully fixup the bytes at the
346 // difference to match UTF16's sort order:
348 // NOTE: instead of moving supplementary code points (0xee and 0xef) to the unused 0xfe and 0xff,
349 // we move them to the unused 0xfc and 0xfd [reserved for future 6-byte character sequences]
350 // this reserves 0xff for preflex's term reordering (surrogate dance), and if unicode grows such
351 // that 6-byte sequences are needed we have much bigger problems anyway.
352 if (aByte >= 0xee && bByte >= 0xee) {
353 if ((aByte & 0xfe) == 0xee) {
356 if ((bByte&0xfe) == 0xee) {
360 return aByte - bByte;
364 // One is a prefix of the other, or, they are equal:
365 return a.length - b.length;