package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.Closeable;
import java.io.IOException;

import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.DoubleBarrelLRUCache;
import org.apache.lucene.util.CloseableThreadLocal;

/** This stores a monotonically increasing set of {@code <Term, TermInfo>} pairs in a
 * Directory.  Pairs are accessed either by Term or by ordinal position the
 * set. */
final class TermInfosReader implements Closeable {
  private final Directory directory;
  private final String segment;
  private final FieldInfos fieldInfos;

  // Per-thread cached SegmentTermEnum, so sequential lookups on the same
  // thread can scan forward from the previous position instead of seeking.
  private final CloseableThreadLocal<ThreadResources> threadResources = new CloseableThreadLocal<ThreadResources>();
  private final SegmentTermEnum origEnum;
  private final long size;

  // In-memory terms index (null when indexDivisor == -1, i.e. index not loaded).
  private final TermInfosReaderIndex index;
  private final int indexLength;

  // indexInterval * indexDivisor; -1 when the terms index is not loaded.
  private final int totalIndexInterval;

  private final static int DEFAULT_CACHE_SIZE = 1024;

  // Just adds term's ord to TermInfo
  private final static class TermInfoAndOrd extends TermInfo {
    final long termOrd;
    public TermInfoAndOrd(TermInfo ti, long termOrd) {
      super(ti);
      assert termOrd >= 0;
      this.termOrd = termOrd;
    }
  }

  /** Cache key wrapping a Term; cloneable as required by DoubleBarrelLRUCache. */
  private static class CloneableTerm extends DoubleBarrelLRUCache.CloneableKey {
    private final Term term;

    public CloneableTerm(Term t) {
      // Defensive copy so cache entries are independent of caller-held Terms.
      this.term = new Term(t.field(), t.text());
    }

    @Override
    public Object clone() {
      return new CloneableTerm(term);
    }

    @Override
    public boolean equals(Object _other) {
      CloneableTerm other = (CloneableTerm) _other;
      return term.equals(other.term);
    }

    @Override
    public int hashCode() {
      return term.hashCode();
    }
  }

  private final DoubleBarrelLRUCache<CloneableTerm,TermInfoAndOrd> termsCache = new DoubleBarrelLRUCache<CloneableTerm,TermInfoAndOrd>(DEFAULT_CACHE_SIZE);

  /**
   * Per-thread resources managed by ThreadLocal
   */
  private static final class ThreadResources {
    SegmentTermEnum termEnum;
  }

  /**
   * Opens the term dictionary ({@code .tis}) and, unless {@code indexDivisor}
   * is -1, the terms index ({@code .tii}) for the given segment.
   *
   * @param indexDivisor -1 to skip loading the terms index, otherwise a
   *        positive subsampling factor applied to the on-disk index.
   * @throws IllegalArgumentException if indexDivisor is 0 or negative (other than -1)
   */
  TermInfosReader(Directory dir, String seg, FieldInfos fis, int readBufferSize, int indexDivisor)
    throws CorruptIndexException, IOException {
    boolean success = false;

    if (indexDivisor < 1 && indexDivisor != -1) {
      throw new IllegalArgumentException("indexDivisor must be -1 (don't load terms index) or greater than 0: got " + indexDivisor);
    }

    try {
      directory = dir;
      segment = seg;
      fieldInfos = fis;

      origEnum = new SegmentTermEnum(directory.openInput(IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_EXTENSION),
                                                         readBufferSize), fieldInfos, false);
      size = origEnum.size;


      if (indexDivisor != -1) {
        // Load terms index
        totalIndexInterval = origEnum.indexInterval * indexDivisor;
        final String indexFileName = IndexFileNames.segmentFileName(segment, IndexFileNames.TERMS_INDEX_EXTENSION);
        final SegmentTermEnum indexEnum = new SegmentTermEnum(directory.openInput(indexFileName,
                                                                                  readBufferSize), fieldInfos, true);
        try {
          index = new TermInfosReaderIndex(indexEnum, indexDivisor, dir.fileLength(indexFileName), totalIndexInterval);
          indexLength = index.length();
        } finally {
          indexEnum.close();
        }
      } else {
        // Do not load terms index:
        totalIndexInterval = -1;
        index = null;
        indexLength = -1;
      }
      success = true;
    } finally {
      // With lock-less commits, it's entirely possible (and
      // fine) to hit a FileNotFound exception above. In
      // this case, we want to explicitly close any subset
      // of things that were opened so that we don't have to
      // wait for a GC to do so.
      if (!success) {
        close();
      }
    }
  }

  public int getSkipInterval() {
    return origEnum.skipInterval;
  }

  public int getMaxSkipLevels() {
    return origEnum.maxSkipLevels;
  }

  public final void close() throws IOException {
    // origEnum may be null if the constructor failed before assigning it.
    if (origEnum != null)
      origEnum.close();
    threadResources.close();
  }

  /** Returns the number of term/value pairs in the set. */
  final long size() {
    return size;
  }

  /** Lazily creates this thread's cached enum on first use. */
  private ThreadResources getThreadResources() {
    ThreadResources resources = threadResources.get();
    if (resources == null) {
      resources = new ThreadResources();
      resources.termEnum = terms();
      threadResources.set(resources);
    }
    return resources;
  }

  /** Returns the TermInfo for a Term in the set, or null. */
  TermInfo get(Term term) throws IOException {
    BytesRef termBytesRef = new BytesRef(term.text);
    return get(term, false, termBytesRef);
  }

  /** Returns the TermInfo for a Term in the set, or null.
   *
   * @param mustSeekEnum when true, always position the thread's enum on the
   *        term (used by {@link #terms(Term)}) instead of answering from cache.
   */
  private TermInfo get(Term term, boolean mustSeekEnum, BytesRef termBytesRef) throws IOException {
    if (size == 0) return null;

    ensureIndexIsRead();

    final CloneableTerm cacheKey = new CloneableTerm(term);

    TermInfoAndOrd tiOrd = termsCache.get(cacheKey);
    ThreadResources resources = getThreadResources();

    if (!mustSeekEnum && tiOrd != null) {
      return tiOrd;
    }

    // optimize sequential access: first try scanning cached enum w/o seeking
    SegmentTermEnum enumerator = resources.termEnum;
    if (enumerator.term() != null                 // term is at or past current
        && ((enumerator.prev() != null && term.compareTo(enumerator.prev())> 0)
            || term.compareTo(enumerator.term()) >= 0)) {
      int enumOffset = (int)(enumerator.position/totalIndexInterval)+1;
      if (indexLength == enumOffset    // but before end of block
          || index.compareTo(term,termBytesRef,enumOffset) < 0) {
        // no need to seek

        final TermInfo ti;

        int numScans = enumerator.scanTo(term);
        if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
          ti = enumerator.termInfo();
          if (numScans > 1) {
            // we only want to put this TermInfo into the cache if
            // scanEnum skipped more than one dictionary entry.
            // This prevents RangeQueries or WildcardQueries to
            // wipe out the cache when they iterate over a large numbers
            // of terms in order
            if (tiOrd == null) {
              termsCache.put(cacheKey, new TermInfoAndOrd(ti, enumerator.position));
            } else {
              assert sameTermInfo(ti, tiOrd, enumerator);
              assert (int) enumerator.position == tiOrd.termOrd;
            }
          }
        } else {
          ti = null;
        }

        return ti;
      }
    }

    // random-access: must seek
    final int indexPos;
    if (tiOrd != null) {
      // Cached ord lets us skip the binary search over the terms index.
      indexPos = (int) (tiOrd.termOrd / totalIndexInterval);
    } else {
      // Must do binary search:
      indexPos = index.getIndexOffset(term,termBytesRef);
    }

    index.seekEnum(enumerator, indexPos);
    enumerator.scanTo(term);
    final TermInfo ti;
    if (enumerator.term() != null && term.compareTo(enumerator.term()) == 0) {
      ti = enumerator.termInfo();
      if (tiOrd == null) {
        termsCache.put(cacheKey, new TermInfoAndOrd(ti, enumerator.position));
      } else {
        assert sameTermInfo(ti, tiOrd, enumerator);
        assert enumerator.position == tiOrd.termOrd;
      }
    } else {
      ti = null;
    }
    return ti;
  }

  // called only from asserts
  private final boolean sameTermInfo(TermInfo ti1, TermInfo ti2, SegmentTermEnum enumerator) {
    if (ti1.docFreq != ti2.docFreq) {
      return false;
    }
    if (ti1.freqPointer != ti2.freqPointer) {
      return false;
    }
    if (ti1.proxPointer != ti2.proxPointer) {
      return false;
    }
    // skipOffset is only valid when docFreq >= skipInterval:
    if (ti1.docFreq >= enumerator.skipInterval &&
        ti1.skipOffset != ti2.skipOffset) {
      return false;
    }
    return true;
  }

  private void ensureIndexIsRead() {
    if (index == null) {
      throw new IllegalStateException("terms index was not loaded when this reader was created");
    }
  }

  /** Returns the position of a Term in the set or -1. */
  final long getPosition(Term term) throws IOException {
    if (size == 0) return -1;

    ensureIndexIsRead();
    BytesRef termBytesRef = new BytesRef(term.text);
    int indexOffset = index.getIndexOffset(term,termBytesRef);

    SegmentTermEnum enumerator = getThreadResources().termEnum;
    index.seekEnum(enumerator, indexOffset);

    // Linear scan from the index point to the first term >= the target.
    while(term.compareTo(enumerator.term()) > 0 && enumerator.next()) {}

    if (term.compareTo(enumerator.term()) == 0)
      return enumerator.position;
    else
      return -1;
  }

  /** Returns an enumeration of all the Terms and TermInfos in the set. */
  public SegmentTermEnum terms() {
    // Clone so callers don't disturb the shared/original enum's position.
    return (SegmentTermEnum)origEnum.clone();
  }

  /** Returns an enumeration of terms starting at or after the named term. */
  public SegmentTermEnum terms(Term term) throws IOException {
    // get() with mustSeekEnum=true positions this thread's enum on the term...
    BytesRef termBytesRef = new BytesRef(term.text);
    get(term, true, termBytesRef);
    // ...then clone it so the caller owns an independent enumerator.
    return (SegmentTermEnum)getThreadResources().termEnum.clone();
  }
}