lucene-java-3.4.0/lucene/src/java/org/apache/lucene/util/BytesRef.java

   1 package org.apache.lucene.util;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.util.Comparator;
  21 import java.io.UnsupportedEncodingException;
  22
  23 /** Represents byte[], as a slice (offset + length) into an
  24  *  existing byte[].
  25  *
  26  *  @lucene.experimental */
  27 public final class BytesRef implements Comparable<BytesRef> {
  28
  29   static final int HASH_PRIME = 31;
  30   public static final byte[] EMPTY_BYTES = new byte[0];
  31
  32   /** The contents of the BytesRef. Should never be {@code null}. */
  33   public byte[] bytes;
  34
  35   /** Offset of first valid byte. */
  36   public int offset;
  37
  38   /** Length of used bytes. */
  39   public int length;
  40
  41   public BytesRef() {
  42     bytes = EMPTY_BYTES;
  43   }
  44
  45   /** This instance will directly reference bytes w/o making a copy.
  46    * bytes should not be null.
  47    */
  48   public BytesRef(byte[] bytes, int offset, int length) {
  49     assert bytes != null;
  50     this.bytes = bytes;
  51     this.offset = offset;
  52     this.length = length;
  53   }
  54
  55   /** This instance will directly reference bytes w/o making a copy.
  56    * bytes should not be null */
  57   public BytesRef(byte[] bytes) {
  58     assert bytes != null;
  59     this.bytes = bytes;
  60     this.offset = 0;
  61     this.length = bytes.length;
  62   }
  63
  64   public BytesRef(int capacity) {
  65     this.bytes = new byte[capacity];
  66   }
  67
  68   /**
  69    * @param text Initialize the byte[] from the UTF8 bytes
  70    * for the provided Sring.  This must be well-formed
  71    * unicode text, with no unpaired surrogates or U+FFFF.
  72    */
  73   public BytesRef(CharSequence text) {
  74     this();
  75     copy(text);
  76   }
  77
  78   /**
  79    * @param text Initialize the byte[] from the UTF8 bytes
  80    * for the provided array.  This must be well-formed
  81    * unicode text, with no unpaired surrogates or U+FFFF.
  82    */
  83   public BytesRef(char text[], int offset, int length) {
  84     this(length * 4);
  85     copy(text, offset, length);
  86   }
  87
  88   public BytesRef(BytesRef other) {
  89     this();
  90     copy(other);
  91   }
  92
  93   /* // maybe?
  94   public BytesRef(BytesRef other, boolean shallow) {
  95     this();
  96     if (shallow) {
  97       offset = other.offset;
  98       length = other.length;
  99       bytes = other.bytes;
 100     } else {
 101       copy(other);
 102     }
 103   }
 104   */
 105
 106   /**
 107    * Copies the UTF8 bytes for this string.
 108    *
 109    * @param text Must be well-formed unicode text, with no
 110    * unpaired surrogates or invalid UTF16 code units.
 111    */
 112   public void copy(CharSequence text) {
 113     UnicodeUtil.UTF16toUTF8(text, 0, text.length(), this);
 114   }
 115
 116   /**
 117    * Copies the UTF8 bytes for this string.
 118    *
 119    * @param text Must be well-formed unicode text, with no
 120    * unpaired surrogates or invalid UTF16 code units.
 121    */
 122   public void copy(char text[], int offset, int length) {
 123     UnicodeUtil.UTF16toUTF8(text, offset, length, this);
 124   }
 125
 126   public boolean bytesEquals(BytesRef other) {
 127     if (length == other.length) {
 128       int otherUpto = other.offset;
 129       final byte[] otherBytes = other.bytes;
 130       final int end = offset + length;
 131       for(int upto=offset;upto<end;upto++,otherUpto++) {
 132         if (bytes[upto] != otherBytes[otherUpto]) {
 133           return false;
 134         }
 135       }
 136       return true;
 137     } else {
 138       return false;
 139     }
 140   }
 141
 142   @Override
 143   public Object clone() {
 144     return new BytesRef(this);
 145   }
 146
 147   private boolean sliceEquals(BytesRef other, int pos) {
 148     if (pos < 0 || length - pos < other.length) {
 149       return false;
 150     }
 151     int i = offset + pos;
 152     int j = other.offset;
 153     final int k = other.offset + other.length;
 154
 155     while (j < k) {
 156       if (bytes[i++] != other.bytes[j++]) {
 157         return false;
 158       }
 159     }
 160
 161     return true;
 162   }
 163
 164   public boolean startsWith(BytesRef other) {
 165     return sliceEquals(other, 0);
 166   }
 167
 168   public boolean endsWith(BytesRef other) {
 169     return sliceEquals(other, length - other.length);
 170   }
 171
 172   /** Calculates the hash code as required by TermsHash during indexing.
 173    * <p>It is defined as:
 174    * <pre>
 175    *  int hash = 0;
 176    *  for (int i = offset; i &lt; offset + length; i++) {
 177    *    hash = 31*hash + bytes[i];
 178    *  }
 179    * </pre>
 180    */
 181   @Override
 182   public int hashCode() {
 183     int result = 0;
 184     final int end = offset + length;
 185     for(int i=offset;i<end;i++) {
 186       result = HASH_PRIME * result + bytes[i];
 187     }
 188     return result;
 189   }
 190
 191   @Override
 192   public boolean equals(Object other) {
 193     if (other == null) {
 194       return false;
 195     }
 196     return this.bytesEquals((BytesRef) other);
 197   }
 198
 199   /** Interprets stored bytes as UTF8 bytes, returning the
 200    *  resulting string */
 201   public String utf8ToString() {
 202     try {
 203       return new String(bytes, offset, length, "UTF-8");
 204     } catch (UnsupportedEncodingException uee) {
 205       // should not happen -- UTF8 is presumably supported
 206       // by all JREs
 207       throw new RuntimeException(uee);
 208     }
 209   }
 210
 211   /** Returns hex encoded bytes, eg [0x6c 0x75 0x63 0x65 0x6e 0x65] */
 212   @Override
 213   public String toString() {
 214     StringBuilder sb = new StringBuilder();
 215     sb.append('[');
 216     final int end = offset + length;
 217     for(int i=offset;i<end;i++) {
 218       if (i > offset) {
 219         sb.append(' ');
 220       }
 221       sb.append(Integer.toHexString(bytes[i]&0xff));
 222     }
 223     sb.append(']');
 224     return sb.toString();
 225   }
 226
 227   public void copy(BytesRef other) {
 228     if (bytes.length < other.length) {
 229       bytes = new byte[other.length];
 230     }
 231     System.arraycopy(other.bytes, other.offset, bytes, 0, other.length);
 232     length = other.length;
 233     offset = 0;
 234   }
 235
 236   public void append(BytesRef other) {
 237     int newLen = length + other.length;
 238     if (bytes.length < newLen) {
 239       byte[] newBytes = new byte[newLen];
 240       System.arraycopy(bytes, offset, newBytes, 0, length);
 241       offset = 0;
 242       bytes = newBytes;
 243     }
 244     System.arraycopy(other.bytes, other.offset, bytes, length+offset, other.length);
 245     length = newLen;
 246   }
 247
 248   public void grow(int newLength) {
 249     bytes = ArrayUtil.grow(bytes, newLength);
 250   }
 251
 252   /** Unsigned byte order comparison */
 253   public int compareTo(BytesRef other) {
 254     if (this == other) return 0;
 255
 256     final byte[] aBytes = this.bytes;
 257     int aUpto = this.offset;
 258     final byte[] bBytes = other.bytes;
 259     int bUpto = other.offset;
 260
 261     final int aStop = aUpto + Math.min(this.length, other.length);
 262
 263     while(aUpto < aStop) {
 264       int aByte = aBytes[aUpto++] & 0xff;
 265       int bByte = bBytes[bUpto++] & 0xff;
 266       int diff = aByte - bByte;
 267       if (diff != 0) return diff;
 268     }
 269
 270     // One is a prefix of the other, or, they are equal:
 271     return this.length - other.length;
 272   }
 273
 274   private final static Comparator<BytesRef> utf8SortedAsUnicodeSortOrder = new UTF8SortedAsUnicodeComparator();
 275
 276   public static Comparator<BytesRef> getUTF8SortedAsUnicodeComparator() {
 277     return utf8SortedAsUnicodeSortOrder;
 278   }
 279
 280   private static class UTF8SortedAsUnicodeComparator implements Comparator<BytesRef> {
 281     // Only singleton
 282     private UTF8SortedAsUnicodeComparator() {};
 283
 284     public int compare(BytesRef a, BytesRef b) {
 285       final byte[] aBytes = a.bytes;
 286       int aUpto = a.offset;
 287       final byte[] bBytes = b.bytes;
 288       int bUpto = b.offset;
 289
 290       final int aStop;
 291       if (a.length < b.length) {
 292         aStop = aUpto + a.length;
 293       } else {
 294         aStop = aUpto + b.length;
 295       }
 296
 297       while(aUpto < aStop) {
 298         int aByte = aBytes[aUpto++] & 0xff;
 299         int bByte = bBytes[bUpto++] & 0xff;
 300
 301         int diff = aByte - bByte;
 302         if (diff != 0) {
 303           return diff;
 304         }
 305       }
 306
 307       // One is a prefix of the other, or, they are equal:
 308       return a.length - b.length;
 309     }
 310   }
 311
 312   private final static Comparator<BytesRef> utf8SortedAsUTF16SortOrder = new UTF8SortedAsUTF16Comparator();
 313
 314   public static Comparator<BytesRef> getUTF8SortedAsUTF16Comparator() {
 315     return utf8SortedAsUTF16SortOrder;
 316   }
 317
 318   private static class UTF8SortedAsUTF16Comparator implements Comparator<BytesRef> {
 319     // Only singleton
 320     private UTF8SortedAsUTF16Comparator() {};
 321
 322     public int compare(BytesRef a, BytesRef b) {
 323
 324       final byte[] aBytes = a.bytes;
 325       int aUpto = a.offset;
 326       final byte[] bBytes = b.bytes;
 327       int bUpto = b.offset;
 328
 329       final int aStop;
 330       if (a.length < b.length) {
 331         aStop = aUpto + a.length;
 332       } else {
 333         aStop = aUpto + b.length;
 334       }
 335
 336       while(aUpto < aStop) {
 337         int aByte = aBytes[aUpto++] & 0xff;
 338         int bByte = bBytes[bUpto++] & 0xff;
 339
 340         if (aByte != bByte) {
 341
 342           // See http://icu-project.org/docs/papers/utf16_code_point_order.html#utf-8-in-utf-16-order
 343
 344           // We know the terms are not equal, but, we may
 345           // have to carefully fixup the bytes at the
 346           // difference to match UTF16's sort order:
 347
 348           // NOTE: instead of moving supplementary code points (0xee and 0xef) to the unused 0xfe and 0xff,
 349           // we move them to the unused 0xfc and 0xfd [reserved for future 6-byte character sequences]
 350           // this reserves 0xff for preflex's term reordering (surrogate dance), and if unicode grows such
 351           // that 6-byte sequences are needed we have much bigger problems anyway.
 352           if (aByte >= 0xee && bByte >= 0xee) {
 353             if ((aByte & 0xfe) == 0xee) {
 354               aByte += 0xe;
 355             }
 356             if ((bByte&0xfe) == 0xee) {
 357               bByte += 0xe;
 358             }
 359           }
 360           return aByte - bByte;
 361         }
 362       }
 363
 364       // One is a prefix of the other, or, they are equal:
 365       return a.length - b.length;
 366     }
 367   }
 368 }