1 package org.apache.lucene.search;
/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.IOException;
21 import java.util.Arrays;
23 import org.apache.lucene.index.*;
25 final class ExactPhraseScorer extends Scorer {
26 private final byte[] norms;
27 private final float value;
29 private static final int SCORE_CACHE_SIZE = 32;
30 private final float[] scoreCache = new float[SCORE_CACHE_SIZE];
32 private final int endMinus1;
34 private final static int CHUNK = 4096;
37 private final int[] counts = new int[CHUNK];
38 private final int[] gens = new int[CHUNK];
42 private final static class ChunkState {
43 final TermPositions posEnum;
45 final boolean useAdvance;
51 public ChunkState(TermPositions posEnum, int offset, boolean useAdvance) {
52 this.posEnum = posEnum;
54 this.useAdvance = useAdvance;
58 private final ChunkState[] chunkStates;
60 private int docID = -1;
63 ExactPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings,
64 Similarity similarity, byte[] norms) throws IOException {
65 super(similarity, weight);
67 this.value = weight.getValue();
69 chunkStates = new ChunkState[postings.length];
71 endMinus1 = postings.length-1;
73 for(int i=0;i<postings.length;i++) {
75 // Coarse optimization: advance(target) is fairly
76 // costly, so, if the relative freq of the 2nd
77 // rarest term is not that much (> 1/5th) rarer than
78 // the first term, then we just use .nextDoc() when
79 // ANDing. This buys ~15% gain for phrases where
80 // freq of rarest 2 terms is close:
81 final boolean useAdvance = postings[i].docFreq > 5*postings[0].docFreq;
82 chunkStates[i] = new ChunkState(postings[i].postings, -postings[i].position, useAdvance);
83 if (i > 0 && !postings[i].postings.next()) {
89 for (int i = 0; i < SCORE_CACHE_SIZE; i++) {
90 scoreCache[i] = getSimilarity().tf((float) i) * value;
95 public int nextDoc() throws IOException {
98 // first (rarest) term
99 if (!chunkStates[0].posEnum.next()) {
100 docID = DocIdSetIterator.NO_MORE_DOCS;
104 final int doc = chunkStates[0].posEnum.doc();
108 while(i < chunkStates.length) {
109 final ChunkState cs = chunkStates[i];
110 int doc2 = cs.posEnum.doc();
113 if (!cs.posEnum.skipTo(doc)) {
114 docID = DocIdSetIterator.NO_MORE_DOCS;
117 doc2 = cs.posEnum.doc();
123 // safety net -- fallback to .skipTo if we've
124 // done too many .nextDocs
126 if (!cs.posEnum.skipTo(doc)) {
127 docID = DocIdSetIterator.NO_MORE_DOCS;
130 doc2 = cs.posEnum.doc();
134 if (cs.posEnum.next()) {
135 doc2 = cs.posEnum.doc();
137 docID = DocIdSetIterator.NO_MORE_DOCS;
149 if (i == chunkStates.length) {
150 // this doc has all the terms -- now test whether
163 public int advance(int target) throws IOException {
166 if (!chunkStates[0].posEnum.skipTo(target)) {
167 docID = DocIdSetIterator.NO_MORE_DOCS;
170 int doc = chunkStates[0].posEnum.doc();
176 while(i < chunkStates.length) {
177 int doc2 = chunkStates[i].posEnum.doc();
179 if (!chunkStates[i].posEnum.skipTo(doc)) {
180 docID = DocIdSetIterator.NO_MORE_DOCS;
183 doc2 = chunkStates[i].posEnum.doc();
192 if (i == chunkStates.length) {
193 // this doc has all the terms -- now test whether
202 if (!chunkStates[0].posEnum.next()) {
203 docID = DocIdSetIterator.NO_MORE_DOCS;
206 doc = chunkStates[0].posEnum.doc();
212 public String toString() {
213 return "ExactPhraseScorer(" + weight + ")";
217 public float freq() {
227 public float score() throws IOException {
228 final float raw; // raw score
229 if (freq < SCORE_CACHE_SIZE) {
230 raw = scoreCache[freq];
232 raw = getSimilarity().tf((float) freq) * value;
234 return norms == null ? raw : raw * getSimilarity().decodeNormValue(norms[docID]); // normalize
237 private int phraseFreq() throws IOException {
242 for(int i=0;i<chunkStates.length;i++) {
243 final ChunkState cs = chunkStates[i];
244 cs.posLimit = cs.posEnum.freq();
245 cs.pos = cs.offset + cs.posEnum.nextPosition();
251 int chunkEnd = CHUNK;
253 // process chunk by chunk
256 // TODO: we could fold in chunkStart into offset and
257 // save one subtract per pos incr
265 Arrays.fill(gens, 0);
271 final ChunkState cs = chunkStates[0];
272 while(cs.pos < chunkEnd) {
273 if (cs.pos > cs.lastPos) {
275 final int posIndex = cs.pos - chunkStart;
276 counts[posIndex] = 1;
277 assert gens[posIndex] != gen;
278 gens[posIndex] = gen;
281 if (cs.posUpto == cs.posLimit) {
286 cs.pos = cs.offset + cs.posEnum.nextPosition();
292 for(int t=1;t<endMinus1;t++) {
293 final ChunkState cs = chunkStates[t];
295 while(cs.pos < chunkEnd) {
296 if (cs.pos > cs.lastPos) {
298 final int posIndex = cs.pos - chunkStart;
299 if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == t) {
306 if (cs.posUpto == cs.posLimit) {
311 cs.pos = cs.offset + cs.posEnum.nextPosition();
320 // petered out for this chunk
329 final ChunkState cs = chunkStates[endMinus1];
330 while(cs.pos < chunkEnd) {
331 if (cs.pos > cs.lastPos) {
333 final int posIndex = cs.pos - chunkStart;
334 if (posIndex >= 0 && gens[posIndex] == gen && counts[posIndex] == endMinus1) {
339 if (cs.posUpto == cs.posLimit) {
344 cs.pos = cs.offset + cs.posEnum.nextPosition();