--- /dev/null
+package org.apache.lucene.search;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+
+import org.apache.lucene.index.*;
+
+/**
+ * Scorer for an exact (slop = 0) phrase query.
+ *
+ * <p>Doc matching ANDs the terms' postings, driven by the first (rarest)
+ * term; see {@link #nextDoc} / {@link #advance}.  For each candidate doc,
+ * {@link #phraseFreq} counts phrase occurrences by walking positions in
+ * fixed-size chunks: each term's positions are shifted by minus its
+ * position in the phrase, so a true match puts every term at the same
+ * shifted position, which is tallied in {@code counts}.
+ */
+final class ExactPhraseScorer extends Scorer {
+  // Field norms for the query's field, or null when norms are omitted.
+  private final byte[] norms;
+  // The weight's value, folded into every raw score.
+  private final float value;
+
+  // Pre-computed tf(freq) * value for small freqs, so score() avoids a
+  // Similarity.tf() call in the common case.
+  private static final int SCORE_CACHE_SIZE = 32;
+  private final float[] scoreCache = new float[SCORE_CACHE_SIZE];
+
+  // Index of the last phrase term (postings.length - 1).
+  private final int endMinus1;
+
+  // Positions are processed in windows of this many shifted positions.
+  private final static int CHUNK = 4096;
+
+  // Generation stamp: counts[i] is valid only while gens[i] == gen, which
+  // lets us skip clearing counts between chunks and docs.
+  private int gen;
+  private final int[] counts = new int[CHUNK];
+  private final int[] gens = new int[CHUNK];
+
+  // Set in the ctor when some term has no docs at all; the scorer then
+  // matches nothing.
+  boolean noDocs;
+
+  /** Per-term cursor: a positions enum plus its chunk-iteration state. */
+  private final static class ChunkState {
+    final TermPositions posEnum;
+    // Negated position of this term within the phrase; added to each
+    // document position to align all terms of a match on one index.
+    final int offset;
+    // Whether doc-ANDing should use skipTo() instead of next() scans.
+    final boolean useAdvance;
+    int posUpto;   // positions consumed so far for the current doc
+    int posLimit;  // freq() of the current doc
+    int pos;       // current shifted position (offset + position)
+    int lastPos;   // last shifted position already tallied
+
+    public ChunkState(TermPositions posEnum, int offset, boolean useAdvance) {
+      this.posEnum = posEnum;
+      this.offset = offset;
+      this.useAdvance = useAdvance;
+    }
+  }
+
+  private final ChunkState[] chunkStates;
+
+  private int docID = -1;  // current doc, or -1 before first nextDoc()
+  private int freq;        // phrase freq within docID
+
+  // NOTE(review): the docFreq heuristic below compares every term against
+  // postings[0], so postings are presumably sorted rarest-first by the
+  // caller (PhraseQuery) -- confirm there.
+  ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
+                    Similarity similarity, byte[] norms) throws IOException {
+    super(similarity, weight);
+    this.norms = norms;
+    this.value = weight.getValue();
+
+    chunkStates = new ChunkState[postings.length];
+
+    endMinus1 = postings.length-1;
+
+    for(int i=0;i<postings.length;i++) {
+
+      // Coarse optimization: advance(target) is fairly
+      // costly, so, if the relative freq of the 2nd
+      // rarest term is not that much (> 1/5th) rarer than
+      // the first term, then we just use .nextDoc() when
+      // ANDing.  This buys ~15% gain for phrases where
+      // freq of rarest 2 terms is close:
+      final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq;
+      chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance);
+      // Prime every enum except the first; nextDoc()/advance() drive
+      // chunkStates[0] themselves.
+      if (i > 0 && !postings[i].postings.next()) {
+        noDocs = true;
+        return;
+      }
+    }
+
+    // Pre-compute raw scores for small phrase freqs.
+    for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
+      scoreCache[i] = getSimilarity().tf((float) i) * value;
+    }
+  }
+
+  @Override
+  public int nextDoc() throws IOException {
+    while(true) {
+
+      // first (rarest) term: its next doc is the next candidate
+      if (!chunkStates[0].posEnum.next()) {
+        docID = DocIdSetIterator.NO_MORE_DOCS;
+        return docID;
+      }
+
+      final int doc = chunkStates[0].posEnum.doc();
+
+      // not-first terms: catch each one up to the candidate doc
+      int i = 1;
+      while(i < chunkStates.length) {
+        final ChunkState cs = chunkStates[i];
+        int doc2 = cs.posEnum.doc();
+        if (cs.useAdvance) {
+          if (doc2 < doc) {
+            if (!cs.posEnum.skipTo(doc)) {
+              docID = DocIdSetIterator.NO_MORE_DOCS;
+              return docID;
+            } else {
+              doc2 = cs.posEnum.doc();
+            }
+          }
+        } else {
+          int iter = 0;
+          while(doc2 < doc) {
+            // safety net -- fallback to .skipTo if we've
+            // done too many .nextDocs
+            if (++iter == 50) {
+              if (!cs.posEnum.skipTo(doc)) {
+                docID = DocIdSetIterator.NO_MORE_DOCS;
+                return docID;
+              } else {
+                doc2 = cs.posEnum.doc();
+              }
+              break;
+            } else {
+              if (cs.posEnum.next()) {
+                doc2 = cs.posEnum.doc();
+              } else {
+                docID = DocIdSetIterator.NO_MORE_DOCS;
+                return docID;
+              }
+            }
+          }
+        }
+        // this term overshot the candidate doc: pick a new candidate
+        if (doc2 > doc) {
+          break;
+        }
+        i++;
+      }
+
+      if (i == chunkStates.length) {
+        // this doc has all the terms -- now test whether
+        // phrase occurs
+        docID = doc;
+
+        freq = phraseFreq();
+        if (freq != 0) {
+          return docID;
+        }
+      }
+    }
+  }
+
+  @Override
+  public int advance(int target) throws IOException {
+
+    // first term
+    if (!chunkStates[0].posEnum.skipTo(target)) {
+      docID = DocIdSetIterator.NO_MORE_DOCS;
+      return docID;
+    }
+    int doc = chunkStates[0].posEnum.doc();
+
+    while(true) {
+
+      // not-first terms: always skipTo() here since advance() implies
+      // a potentially large jump
+      int i = 1;
+      while(i < chunkStates.length) {
+        int doc2 = chunkStates[i].posEnum.doc();
+        if (doc2 < doc) {
+          if (!chunkStates[i].posEnum.skipTo(doc)) {
+            docID = DocIdSetIterator.NO_MORE_DOCS;
+            return docID;
+          } else {
+            doc2 = chunkStates[i].posEnum.doc();
+          }
+        }
+        if (doc2 > doc) {
+          break;
+        }
+        i++;
+      }
+
+      if (i == chunkStates.length) {
+        // this doc has all the terms -- now test whether
+        // phrase occurs
+        docID = doc;
+        freq = phraseFreq();
+        if (freq != 0) {
+          return docID;
+        }
+      }
+
+      // no phrase here: fall back to one-doc-at-a-time stepping of the
+      // rarest term
+      if (!chunkStates[0].posEnum.next()) {
+        docID = DocIdSetIterator.NO_MORE_DOCS;
+        return docID;
+      } else {
+        doc = chunkStates[0].posEnum.doc();
+      }
+    }
+  }
+
+  @Override
+  public String toString() {
+    return "ExactPhraseScorer(" + weight + ")";
+  }
+
+  /** Phrase frequency within the current doc; valid only after a
+   *  successful nextDoc()/advance(). */
+  @Override
+  public float freq() {
+    return freq;
+  }
+
+  @Override
+  public int docID() {
+    return docID;
+  }
+
+  /** tf(phraseFreq) * weight value, normalized by the doc's field norm
+   *  when norms are present. */
+  @Override
+  public float score() throws IOException {
+    final float raw; // raw score
+    if (freq < SCORE_CACHE_SIZE) {
+      raw = scoreCache[freq];
+    } else {
+      raw = getSimilarity().tf((float) freq) * value;
+    }
+    return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize
+  }
+
+  /**
+   * Counts how many times the exact phrase occurs in the current doc.
+   * Walks all terms' shifted positions one CHUNK-sized window at a time:
+   * counts[p] climbs from 1 up to the term count as successive terms are
+   * seen at shifted position p within the same chunk; reaching the last
+   * term there is one full match.  Matching cannot span chunks because a
+   * match occupies a single shifted position.
+   */
+  private int phraseFreq() throws IOException {
+
+    freq = 0;
+
+    // init chunks: position each term on its first shifted position
+    for(int i=0;i<chunkStates.length;i++) {
+      final ChunkState cs = chunkStates[i];
+      cs.posLimit = cs.posEnum.freq();
+      cs.pos = cs.offset + cs.posEnum.nextPosition();
+      cs.posUpto = 1;
+      // -1 sentinel: besides de-duping, the cs.pos > cs.lastPos test
+      // below then also skips negative shifted positions (a term that
+      // occurs before its phrase offset), keeping posIndex >= 0.
+      cs.lastPos = -1;
+    }
+
+    int chunkStart = 0;
+    int chunkEnd = CHUNK;
+
+    // process chunk by chunk; 'end' is set only after a term's final
+    // position has been tallied, so that position is never dropped
+    boolean end = false;
+
+    // TODO: we could fold in chunkStart into offset and
+    // save one subtract per pos incr
+
+    while(!end) {
+
+      gen++;
+
+      if (gen == 0) {
+        // wraparound
+        Arrays.fill(gens, 0);
+        gen++;
+      }
+
+      // first term: seed counts[] with 1 at each of its positions
+      {
+        final ChunkState cs = chunkStates[0];
+        while(cs.pos < chunkEnd) {
+          if (cs.pos > cs.lastPos) {
+            cs.lastPos = cs.pos;
+            final int posIndex = cs.pos - chunkStart;
+            counts[posIndex] = 1;
+            assert gens[posIndex] != gen;
+            gens[posIndex] = gen;
+          }
+
+          if (cs.posUpto == cs.posLimit) {
+            end = true;
+            break;
+          }
+          cs.posUpto++;
+          cs.pos = cs.offset + cs.posEnum.nextPosition();
+        }
+      }
+
+      // middle terms; 'any' starts true so phrases with <= 2 terms
+      // (no middle terms) proceed straight to the last-term pass
+      boolean any = true;
+      for(int t=1;t<endMinus1;t++) {
+        final ChunkState cs = chunkStates[t];
+        any = false;
+        while(cs.pos < chunkEnd) {
+          if (cs.pos > cs.lastPos) {
+            cs.lastPos = cs.pos;
+            final int posIndex = cs.pos - chunkStart;
+            // viable only if all t earlier terms lined up at this
+            // shifted position in this chunk; posIndex >= 0 should be
+            // guaranteed by the lastPos filter, kept as cheap defense
+            if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
+              // viable
+              counts[posIndex]++;
+              any = true;
+            }
+          }
+
+          if (cs.posUpto == cs.posLimit) {
+            end = true;
+            break;
+          }
+          cs.posUpto++;
+          cs.pos = cs.offset + cs.posEnum.nextPosition();
+        }
+
+        // no position of this term extended any partial match: the
+        // remaining terms cannot complete one in this chunk
+        if (!any) {
+          break;
+        }
+      }
+
+      if (!any) {
+        // petered out for this chunk
+        chunkStart += CHUNK;
+        chunkEnd += CHUNK;
+        continue;
+      }
+
+      // last term
+
+      {
+        final ChunkState cs = chunkStates[endMinus1];
+        while(cs.pos < chunkEnd) {
+          if (cs.pos > cs.lastPos) {
+            cs.lastPos = cs.pos;
+            final int posIndex = cs.pos - chunkStart;
+            // all endMinus1 earlier terms aligned here: a full match
+            if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
+              freq++;
+            }
+          }
+
+          if (cs.posUpto == cs.posLimit) {
+            end = true;
+            break;
+          }
+          cs.posUpto++;
+          cs.pos = cs.offset + cs.posEnum.nextPosition();
+        }
+      }
+
+      chunkStart += CHUNK;
+      chunkEnd += CHUNK;
+    }
+
+    return freq;
+  }
+}