pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / src / java / org / apache / lucene / search / ExactPhraseScorer.java
diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java

new file mode 100644 (file)

index 0000000..9d927c3
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/search/ExactPhraseScorer.java
@@ -0,0 +1,354 @@
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.index.*;
+
+final class ExactPhraseScorer extends Scorer {
+  private final byte[] norms;
+  private final float value;
+
+  private static final int SCORE_CACHE_SIZE = 32;
+  private final float[] scoreCache = new float[SCORE_CACHE_SIZE];
+
+  private final int endMinus1;
+
+  private final static int CHUNK = 4096;
+
+  private int gen;
+  private final int[] counts = new int[CHUNK];
+  private final int[] gens = new int[CHUNK];
+
+  boolean noDocs;
+
+  private final static class ChunkState {
+    final TermPositions posEnum;
+    final int offset;
+    final boolean useAdvance;
+    int posUpto;
+    int posLimit;
+    int pos;
+    int lastPos;
+
+    public ChunkState(TermPositions posEnum, int offset, boolean useAdvance) {
+      this.posEnum = posEnum;
+      this.offset = offset;
+      this.useAdvance = useAdvance;
+    }
+  }
+
+  private final ChunkState[] chunkStates;
+
+  private int docID = -1;
+  private int freq;
+
+  ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
+                    Similarity similarity, byte[] norms) throws IOException {
+    super(similarity, weight);
+    this.norms = norms;
+    this.value = weight.getValue();
+
+    chunkStates = new ChunkState[postings.length];
+
+    endMinus1 = postings.length-1;
+
+    for(int i=0;i<postings.length;i++) {
+
+      // Coarse optimization: advance(target) is fairly
+      // costly, so, if the relative freq of the 2nd
+      // rarest term is not that much (> 1/5th) rarer than
+      // the first term, then we just use .nextDoc() when
+      // ANDing.  This buys ~15% gain for phrases where
+      // freq of rarest 2 terms is close:
+      final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq;
+      chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance);
+      if (i > 0 && !postings[i].postings.next()) {
+        noDocs = true;
+        return;
+      }
+    }
+
+    for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
+      scoreCache[i] = getSimilarity().tf((float) i) * value;
+    }
+  }
+
+  @Override
+  public int nextDoc() throws IOException {
+    while(true) {
+
+      // first (rarest) term
+      if (!chunkStates[0].posEnum.next()) {
+        docID = DocIdSetIterator.NO_MORE_DOCS;
+        return docID;
+      }
+      
+      final int doc = chunkStates[0].posEnum.doc();
+
+      // not-first terms
+      int i = 1;
+      while(i < chunkStates.length) {
+        final ChunkState cs = chunkStates[i];
+        int doc2 = cs.posEnum.doc();
+        if (cs.useAdvance) {
+          if (doc2 < doc) {
+            if (!cs.posEnum.skipTo(doc)) {
+              docID = DocIdSetIterator.NO_MORE_DOCS;
+              return docID;
+            } else {
+              doc2 = cs.posEnum.doc();
+            }
+          }
+        } else {
+          int iter = 0;
+          while(doc2 < doc) {
+            // safety net -- fallback to .skipTo if we've
+            // done too many .nextDocs
+            if (++iter == 50) {
+              if (!cs.posEnum.skipTo(doc)) {
+                docID = DocIdSetIterator.NO_MORE_DOCS;
+                return docID;
+              } else {
+                doc2 = cs.posEnum.doc();
+              }
+              break;
+            } else {
+              if (cs.posEnum.next()) {
+                doc2 = cs.posEnum.doc();
+              } else {
+                docID = DocIdSetIterator.NO_MORE_DOCS;
+                return docID;
+              }
+            }
+          }
+        }
+        if (doc2 > doc) {
+          break;
+        }
+        i++;
+      }
+
+      if (i == chunkStates.length) {
+        // this doc has all the terms -- now test whether
+        // phrase occurs
+        docID = doc;
+
+        freq = phraseFreq();
+        if (freq != 0) {
+          return docID;
+        }
+      }
+    }
+  }
+
+  @Override
+  public int advance(int target) throws IOException {
+
+    // first term
+    if (!chunkStates[0].posEnum.skipTo(target)) {
+      docID = DocIdSetIterator.NO_MORE_DOCS;
+      return docID;
+    }
+    int doc = chunkStates[0].posEnum.doc();
+
+    while(true) {
+      
+      // not-first terms
+      int i = 1;
+      while(i < chunkStates.length) {
+        int doc2 = chunkStates[i].posEnum.doc();
+        if (doc2 < doc) {
+          if (!chunkStates[i].posEnum.skipTo(doc)) {
+            docID = DocIdSetIterator.NO_MORE_DOCS;
+            return docID;
+          } else {
+            doc2 = chunkStates[i].posEnum.doc();
+          }
+        }
+        if (doc2 > doc) {
+          break;
+        }
+        i++;
+      }
+
+      if (i == chunkStates.length) {
+        // this doc has all the terms -- now test whether
+        // phrase occurs
+        docID = doc;
+        freq = phraseFreq();
+        if (freq != 0) {
+          return docID;
+        }
+      }
+
+      if (!chunkStates[0].posEnum.next()) {
+        docID = DocIdSetIterator.NO_MORE_DOCS;
+        return docID;
+      } else {
+        doc = chunkStates[0].posEnum.doc();
+      }
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "ExactPhraseScorer(" + weight + ")";
+  }
+
+  @Override
+  public float freq() {
+    return freq;
+  }
+
+  @Override
+  public int docID() {
+    return docID;
+  }
+
+  @Override
+  public float score() throws IOException {
+    final float raw; // raw score
+    if (freq < SCORE_CACHE_SIZE) {
+      raw = scoreCache[freq];
+    } else {
+      raw = getSimilarity().tf((float) freq) * value;
+    }
+    return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize
+  }
+
+  private int phraseFreq() throws IOException {
+
+    freq = 0;
+
+    // init chunks
+    for(int i=0;i<chunkStates.length;i++) {
+      final ChunkState cs = chunkStates[i];
+      cs.posLimit = cs.posEnum.freq();
+      cs.pos = cs.offset + cs.posEnum.nextPosition();
+      cs.posUpto = 1;
+      cs.lastPos = -1;
+    }
+
+    int chunkStart = 0;
+    int chunkEnd = CHUNK;
+
+    // process chunk by chunk
+    boolean end = false;
+
+    // TODO: we could fold in chunkStart into offset and
+    // save one subtract per pos incr
+
+    while(!end) {
+
+      gen++;
+
+      if (gen == 0) {
+        // wraparound
+        Arrays.fill(gens, 0);
+        gen++;
+      }
+
+      // first term
+      {
+        final ChunkState cs = chunkStates[0];
+        while(cs.pos < chunkEnd) {
+          if (cs.pos > cs.lastPos) {
+            cs.lastPos = cs.pos;
+            final int posIndex = cs.pos - chunkStart;
+            counts[posIndex] = 1;
+            assert gens[posIndex] != gen;
+            gens[posIndex] = gen;
+          }
+
+          if (cs.posUpto == cs.posLimit) {
+            end = true;
+            break;
+          }
+          cs.posUpto++;
+          cs.pos = cs.offset + cs.posEnum.nextPosition();
+        }
+      }
+
+      // middle terms
+      boolean any = true;
+      for(int t=1;t<endMinus1;t++) {
+        final ChunkState cs = chunkStates[t];
+        any = false;
+        while(cs.pos < chunkEnd) {
+          if (cs.pos > cs.lastPos) {
+            cs.lastPos = cs.pos;
+            final int posIndex = cs.pos - chunkStart;
+            if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
+              // viable
+              counts[posIndex]++;
+              any = true;
+            }
+          }
+
+          if (cs.posUpto == cs.posLimit) {
+            end = true;
+            break;
+          }
+          cs.posUpto++;
+          cs.pos = cs.offset + cs.posEnum.nextPosition();
+        }
+
+        if (!any) {
+          break;
+        }
+      }
+
+      if (!any) {
+        // petered out for this chunk
+        chunkStart += CHUNK;
+        chunkEnd += CHUNK;
+        continue;
+      }
+
+      // last term
+
+      {
+        final ChunkState cs = chunkStates[endMinus1];
+        while(cs.pos < chunkEnd) {
+          if (cs.pos > cs.lastPos) {
+            cs.lastPos = cs.pos;
+            final int posIndex = cs.pos - chunkStart;
+            if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
+              freq++;
+            }
+          }
+
+          if (cs.posUpto == cs.posLimit) {
+            end = true;
+            break;
+          }
+          cs.posUpto++;
+          cs.pos = cs.offset + cs.posEnum.nextPosition();
+        }
+      }
+
+      chunkStart += CHUNK;
+      chunkEnd += CHUNK;
+    }
+
+    return freq;
+  }
+}