X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java diff --git a/lucene-java-3.5.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java b/lucene-java-3.5.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java new file mode 100644 index 0000000..b893e04 --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/instantiated/src/java/org/apache/lucene/store/instantiated/InstantiatedTerm.java @@ -0,0 +1,249 @@ +package org.apache.lucene.store.instantiated; + +/** + * Copyright 2006 The Apache Software Foundation + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.Serializable; +import java.util.Comparator; + +import org.apache.lucene.index.Term; + +/** + * A term in the inverted index, coupled to the documents it occurs in. + * + * @see org.apache.lucene.index.Term + */ +public class InstantiatedTerm + implements Serializable { + + private static final long serialVersionUID = 1l; + + public static final Comparator comparator = new Comparator() { + public int compare(InstantiatedTerm instantiatedTerm, InstantiatedTerm instantiatedTerm1) { + return instantiatedTerm.getTerm().compareTo(instantiatedTerm1.getTerm()); + } + }; + + public static final Comparator termComparator = new Comparator() { + public int compare(Object o, Object o1) { + return ((InstantiatedTerm)o).getTerm().compareTo((Term)o1); + } + }; + + private Term term; + + /** + * index of term in InstantiatedIndex + * @see org.apache.lucene.store.instantiated.InstantiatedIndex#getOrderedTerms() */ + private int termIndex; + + /** + * @return Term associated with this entry of the index object graph + */ + public Term getTerm() { + return term; + } + + InstantiatedTerm(String field, String text) { + this.term = new Term(field, text); + } + +// this could speed up TermDocs.skipTo even more +// private Map associatedDocumentIndexByDocumentNumber = new HashMap(); +// +// public Map getAssociatedDocumentIndexByDocumentNumber() { +// return associatedDocumentIndexByDocumentNumber; +// } + + /** Ordered by document number */ + private InstantiatedTermDocumentInformation[] associatedDocuments; + + /** + * Meta data per document in which this term is occurring. + * Ordered by document number. + * + * @return Meta data per document in which this term is occurring. + */ + public InstantiatedTermDocumentInformation[] getAssociatedDocuments() { + return associatedDocuments; + } + + + /** + * Meta data per document in which this term is occurring. + * Ordered by document number. + * + * @param associatedDocuments meta data per document in which this term is occurring, ordered by document number + */ + void setAssociatedDocuments(InstantiatedTermDocumentInformation[] associatedDocuments) { + this.associatedDocuments = associatedDocuments; + } + + /** + * Finds index to the first beyond the current whose document number is + * greater than or equal to target, -1 if there is no such element. + * + * @param target the document number to match + * @return -1 if there is no such element + */ + public int seekCeilingDocumentInformationIndex(int target) { + return seekCeilingDocumentInformationIndex(target, 0, getAssociatedDocuments().length); + } + + /** + * Finds index to the first beyond the current whose document number is + * greater than or equal to target, -1 if there is no such element. + * + * @param target the document number to match + * @param startOffset associated documents index start offset + * @return -1 if there is no such element + */ + public int seekCeilingDocumentInformationIndex(int target, int startOffset) { + return seekCeilingDocumentInformationIndex(target, startOffset, getAssociatedDocuments().length); + } + + /** + * Finds index to the first beyond the current whose document number is + * greater than or equal to target, -1 if there is no such element. + * + * @param target the document number to match + * @param startOffset associated documents index start offset + * @param endPosition associated documents index end position + * @return -1 if there is no such element + */ + public int seekCeilingDocumentInformationIndex(int target, int startOffset, int endPosition) { + + int pos = binarySearchAssociatedDocuments(target, startOffset, endPosition - startOffset); + +// int pos = Arrays.binarySearch(getAssociatedDocuments(), target, InstantiatedTermDocumentInformation.doumentNumberIntegerComparator); + + if (pos < 0) { + pos = -1 - pos; + } + if (getAssociatedDocuments().length <= pos) { + return -1; + } else { + return pos; + } + } + + public int binarySearchAssociatedDocuments(int target) { + return binarySearchAssociatedDocuments(target, 0); + } + + public int binarySearchAssociatedDocuments(int target, int offset) { + return binarySearchAssociatedDocuments(target, offset, associatedDocuments.length - offset); + } + + /** + * @param target value to search for in the array + * @param offset index of the first valid value in the array + * @param length number of valid values in the array + * @return index of an occurrence of key in array, or -(insertionIndex + 1) if key is not contained in array (insertionIndex is then the index at which key could be inserted). + */ + public int binarySearchAssociatedDocuments(int target, int offset, int length) { + + // implementation originally from http://ochafik.free.fr/blog/?p=106 + + if (length == 0) { + return -1 - offset; + } + int min = offset, max = offset + length - 1; + int minVal = getAssociatedDocuments()[min].getDocument().getDocumentNumber(); + int maxVal = getAssociatedDocuments()[max].getDocument().getDocumentNumber(); + + + int nPreviousSteps = 0; + + for (; ;) { + + // be careful not to compute key - minVal, for there might be an integer overflow. + if (target <= minVal) return target == minVal ? min : -1 - min; + if (target >= maxVal) return target == maxVal ? max : -2 - max; + + assert min != max; + + int pivot; + // A typical binarySearch algorithm uses pivot = (min + max) / 2. + // The pivot we use here tries to be smarter and to choose a pivot close to the expectable location of the key. + // This reduces dramatically the number of steps needed to get to the key. + // However, it does not work well with a logarithmic distribution of values, for instance. + // When the key is not found quickly the smart way, we switch to the standard pivot. + if (nPreviousSteps > 2) { + pivot = (min + max) >> 1; + // stop increasing nPreviousSteps from now on + } else { + // NOTE: We cannot do the following operations in int precision, because there might be overflows. + // long operations are slower than float operations with the hardware this was tested on (intel core duo 2, JVM 1.6.0). + // Overall, using float proved to be the safest and fastest approach. + pivot = min + (int) ((target - (float) minVal) / (maxVal - (float) minVal) * (max - min)); + nPreviousSteps++; + } + + int pivotVal = getAssociatedDocuments()[pivot].getDocument().getDocumentNumber(); + + // NOTE: do not store key - pivotVal because of overflows + if (target > pivotVal) { + min = pivot + 1; + max--; + } else if (target == pivotVal) { + return pivot; + } else { + min++; + max = pivot - 1; + } + maxVal = getAssociatedDocuments()[max].getDocument().getDocumentNumber(); + minVal = getAssociatedDocuments()[min].getDocument().getDocumentNumber(); + } + } + + + /** + * Navigates to the view of this occurrences of this term in a specific document. + * + * This method is only used by InstantiatedIndex(IndexReader) and + * should not be optimized for less CPU at the cost of more RAM. + * + * @param documentNumber the n:th document in the index + * @return view of this term from specified document + */ + public InstantiatedTermDocumentInformation getAssociatedDocument(int documentNumber) { + int pos = binarySearchAssociatedDocuments(documentNumber); + return pos < 0 ? null : getAssociatedDocuments()[pos]; + } + + public final String field() { + return term.field(); + } + + public final String text() { + return term.text(); + } + + @Override + public String toString() { + return term.toString(); + } + + + public int getTermIndex() { + return termIndex; + } + + public void setTermIndex(int termIndex) { + this.termIndex = termIndex; + } +}