lucene-java-3.4.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java

   1 package org.apache.lucene.store.instantiated;
   2
   3 /**
   4  * Copyright 2006 The Apache Software Foundation
   5  *
   6  * Licensed under the Apache License, Version 2.0 (the "License");
   7  * you may not use this file except in compliance with the License.
   8  * You may obtain a copy of the License at
   9  *
  10  *     http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  * Unless required by applicable law or agreed to in writing, software
  13  * distributed under the License is distributed on an "AS IS" BASIS,
  14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15  * See the License for the specific language governing permissions and
  16  * limitations under the License.
  17  */
  18
  19 import java.io.Serializable;
  20 import java.util.Comparator;
  21
  22 import org.apache.lucene.index.Term;
  23
  24 /**
  25  * A term in the inverted index, coupled to the documents it occurs in.
  26  *
  27  * @see org.apache.lucene.index.Term
  28  */
  29 public class InstantiatedTerm
  30     implements Serializable {
  31
  32   private static final long serialVersionUID = 1l;
  33
  34   public static final Comparator<InstantiatedTerm> comparator = new Comparator<InstantiatedTerm>() {
  35     public int compare(InstantiatedTerm instantiatedTerm, InstantiatedTerm instantiatedTerm1) {
  36       return instantiatedTerm.getTerm().compareTo(instantiatedTerm1.getTerm());
  37     }
  38   };
  39
  40   public static final Comparator<Object> termComparator = new Comparator<Object>() {
  41     public int compare(Object o, Object o1) {
  42       return ((InstantiatedTerm)o).getTerm().compareTo((Term)o1);
  43     }
  44   };
  45
  46   private Term term;
  47
  48   /**
  49    * index of term in InstantiatedIndex
  50    * @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */
  51   private int termIndex;
  52
  53   /**
  54    * @return Term associated with this entry of the index object graph
  55    */
  56   public Term getTerm() {
  57     return term;
  58   }
  59
  60   InstantiatedTerm(String field, String text) {
  61     this.term = new Term(field, text);
  62   }
  63
  64 //  this could speed up TermDocs.skipTo even more
  65 //  private Map</** document number*/Integer, /** index in associatedDocuments */Integer> associatedDocumentIndexByDocumentNumber = new HashMap<Integer, Integer>();
  66 //
  67 //  public Map</** document number*/Integer, /** index in associatedDocuments */Integer> getAssociatedDocumentIndexByDocumentNumber() {
  68 //    return associatedDocumentIndexByDocumentNumber;
  69 //  }
  70
  71   /** Ordered by document number */
  72   private InstantiatedTermDocumentInformation[] associatedDocuments;
  73
  74   /**
  75    * Meta data per document in which this term is occurring.
  76    * Ordered by document number.
  77    *
  78    * @return Meta data per document in which this term is occurring.
  79    */
  80   public InstantiatedTermDocumentInformation[] getAssociatedDocuments() {
  81     return associatedDocuments;
  82   }
  83
  84
  85   /**
  86    * Meta data per document in which this term is occurring.
  87    * Ordered by document number.
  88    *
  89    * @param associatedDocuments meta data per document in which this term is occurring, ordered by document number
  90    */
  91   void setAssociatedDocuments(InstantiatedTermDocumentInformation[] associatedDocuments) {
  92     this.associatedDocuments = associatedDocuments;
  93   }
  94
  95   /**
  96    * Finds index to the first beyond the current whose document number is
  97    * greater than or equal to <i>target</i>, -1 if there is no such element.
  98    *
  99    * @param target the document number to match
 100    * @return -1 if there is no such element
 101    */
 102   public int seekCeilingDocumentInformationIndex(int target) {
 103     return seekCeilingDocumentInformationIndex(target, 0, getAssociatedDocuments().length);
 104   }
 105
 106   /**
 107    * Finds index to the first beyond the current whose document number is
 108    * greater than or equal to <i>target</i>, -1 if there is no such element.
 109    *
 110    * @param target the document number to match
 111    * @param startOffset associated documents index start offset
 112    * @return -1 if there is no such element
 113    */
 114   public int seekCeilingDocumentInformationIndex(int target, int startOffset) {
 115     return seekCeilingDocumentInformationIndex(target, startOffset, getAssociatedDocuments().length);
 116   }
 117
 118   /**
 119    * Finds index to the first beyond the current whose document number is
 120    * greater than or equal to <i>target</i>, -1 if there is no such element.
 121    *
 122    * @param target the document number to match
 123    * @param startOffset associated documents index start offset
 124    * @param endPosition associated documents index end position
 125    * @return -1 if there is no such element
 126    */
 127   public int seekCeilingDocumentInformationIndex(int target, int startOffset, int endPosition) {
 128
 129     int pos = binarySearchAssociatedDocuments(target, startOffset, endPosition - startOffset);
 130
 131 //    int pos = Arrays.binarySearch(getAssociatedDocuments(), target, InstantiatedTermDocumentInformation.doumentNumberIntegerComparator);
 132
 133     if (pos < 0) {
 134       pos = -1 - pos;
 135     }
 136     if (getAssociatedDocuments().length <= pos) {
 137       return -1;
 138     } else {
 139       return pos;
 140     }
 141   }
 142
 143   public int binarySearchAssociatedDocuments(int target) {
 144     return binarySearchAssociatedDocuments(target, 0);
 145   }
 146
 147   public int binarySearchAssociatedDocuments(int target, int offset) {
 148     return binarySearchAssociatedDocuments(target, offset, associatedDocuments.length - offset);
 149   }
 150
 151   /**
 152    * @param target value to search for in the array
 153    * @param offset index of the first valid value in the array
 154    * @param length number of valid values in the array
 155    * @return index of an occurrence of key in array, or -(insertionIndex + 1) if key is not contained in array (<i>insertionIndex</i> is then the index at which key could be inserted).
 156    */
 157   public int binarySearchAssociatedDocuments(int target, int offset, int length) {
 158
 159     // implementation originally from http://ochafik.free.fr/blog/?p=106
 160
 161     if (length == 0) {
 162       return -1 - offset;
 163     }
 164     int min = offset, max = offset + length - 1;
 165     int minVal = getAssociatedDocuments()[min].getDocument().getDocumentNumber();
 166     int maxVal = getAssociatedDocuments()[max].getDocument().getDocumentNumber();
 167
 168
 169     int nPreviousSteps = 0;
 170
 171     for (; ;) {
 172
 173       // be careful not to compute key - minVal, for there might be an integer overflow.
 174       if (target <= minVal) return target == minVal ? min : -1 - min;
 175       if (target >= maxVal) return target == maxVal ? max : -2 - max;
 176
 177       assert min != max;
 178
 179       int pivot;
 180       // A typical binarySearch algorithm uses pivot = (min + max) / 2.
 181       // The pivot we use here tries to be smarter and to choose a pivot close to the expectable location of the key.
 182       // This reduces dramatically the number of steps needed to get to the key.
 183       // However, it does not work well with a logarithmic distribution of values, for instance.
 184       // When the key is not found quickly the smart way, we switch to the standard pivot.
 185       if (nPreviousSteps > 2) {
 186         pivot = (min + max) >> 1;
 187         // stop increasing nPreviousSteps from now on
 188       } else {
 189         // NOTE: We cannot do the following operations in int precision, because there might be overflows.
 190         //       long operations are slower than float operations with the hardware this was tested on (intel core duo 2, JVM 1.6.0).
 191         //       Overall, using float proved to be the safest and fastest approach.
 192         pivot = min + (int) ((target - (float) minVal) / (maxVal - (float) minVal) * (max - min));
 193         nPreviousSteps++;
 194       }
 195
 196       int pivotVal = getAssociatedDocuments()[pivot].getDocument().getDocumentNumber();
 197
 198       // NOTE: do not store key - pivotVal because of overflows
 199       if (target > pivotVal) {
 200         min = pivot + 1;
 201         max--;
 202       } else if (target == pivotVal) {
 203         return pivot;
 204       } else {
 205         min++;
 206         max = pivot - 1;
 207       }
 208       maxVal = getAssociatedDocuments()[max].getDocument().getDocumentNumber();
 209       minVal = getAssociatedDocuments()[min].getDocument().getDocumentNumber();
 210     }
 211   }
 212
 213
 214   /**
 215    * Navigates to the view of this occurrences of this term in a specific document.
 216    *
 217    * This method is only used by InstantiatedIndex(IndexReader) and
 218    * should not be optimized for less CPU at the cost of more RAM.
 219    *
 220    * @param documentNumber the n:th document in the index
 221    * @return view of this term from specified document
 222    */
 223   public InstantiatedTermDocumentInformation getAssociatedDocument(int documentNumber) {
 224     int pos = binarySearchAssociatedDocuments(documentNumber);
 225     return pos < 0 ? null : getAssociatedDocuments()[pos];
 226   }
 227
 228   public final String field() {
 229     return term.field();
 230   }
 231
 232   public final String text() {
 233     return term.text();
 234   }
 235
 236   @Override
 237   public String toString() {
 238     return term.toString();
 239   }
 240
 241
 242   public int getTermIndex() {
 243     return termIndex;
 244   }
 245
 246   public void setTermIndex(int termIndex) {
 247     this.termIndex = termIndex;
 248   }
 249 }