lucene-java-3.4.0/lucene/src/java/org/apache/lucene/search/SloppyPhraseScorer.java

   1 package org.apache.lucene.search;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.HashSet;
  22
  23 final class SloppyPhraseScorer extends PhraseScorer {
  24     private int slop;
  25     private PhrasePositions repeats[];
  26     private PhrasePositions tmpPos[]; // for flipping repeating pps.
  27     private boolean checkedRepeats;
  28
  29     SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity,
  30                        int slop, byte[] norms) {
  31         super(weight, postings, similarity, norms);
  32         this.slop = slop;
  33     }
  34
  35     /**
  36      * Score a candidate doc for all slop-valid position-combinations (matches)
  37      * encountered while traversing/hopping the PhrasePositions.
  38      * <br> The score contribution of a match depends on the distance:
  39      * <br> - highest score for distance=0 (exact match).
  40      * <br> - score gets lower as distance gets higher.
  41      * <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
  42      * once for "a b" (distance=0), and once for "b a" (distance=2).
  43      * <br>Possibly not all valid combinations are encountered, because for efficiency
  44      * we always propagate the least PhrasePosition. This allows to base on
  45      * PriorityQueue and move forward faster.
  46      * As result, for example, document "a b c b a"
  47      * would score differently for queries "a b c"~4 and "c b a"~4, although
  48      * they really are equivalent.
  49      * Similarly, for doc "a b c b a f g", query "c b"~2
  50      * would get same score as "g f"~2, although "c b"~2 could be matched twice.
  51      * We may want to fix this in the future (currently not, for performance reasons).
  52      */
  53     @Override
  54     protected float phraseFreq() throws IOException {
  55         int end = initPhrasePositions();
  56
  57         float freq = 0.0f;
  58         boolean done = (end<0);
  59         while (!done) {
  60             PhrasePositions pp = pq.pop();
  61             int start = pp.position;
  62             int next = pq.top().position;
  63
  64             boolean tpsDiffer = true;
  65             for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) {
  66                 if (pos<=next && tpsDiffer)
  67                     start = pos;                  // advance pp to min window
  68                 if (!pp.nextPosition()) {
  69                     done = true;          // ran out of a term -- done
  70                     break;
  71                 }
  72                 PhrasePositions pp2 = null;
  73                 tpsDiffer = !pp.repeats || (pp2 = termPositionsDiffer(pp))==null;
  74                 if (pp2!=null && pp2!=pp) {
  75                   pp = flip(pp,pp2); // flip pp to pp2
  76                 }
  77             }
  78
  79             int matchLength = end - start;
  80             if (matchLength <= slop)
  81                 freq += getSimilarity().sloppyFreq(matchLength); // score match
  82
  83             if (pp.position > end)
  84                 end = pp.position;
  85             pq.add(pp);               // restore pq
  86         }
  87
  88         return freq;
  89     }
  90
  91     // flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back.
  92     // assumes: pp!=pp2, pp2 in pq, pp not in pq.
  93     // called only when there are repeating pps.
  94     private PhrasePositions flip(PhrasePositions pp, PhrasePositions pp2) {
  95       int n=0;
  96       PhrasePositions pp3;
  97       //pop until finding pp2
  98       while ((pp3=pq.pop()) != pp2) {
  99         tmpPos[n++] = pp3;
 100       }
 101       //insert back all but pp2
 102       for (n--; n>=0; n--) {
 103         pq.insertWithOverflow(tmpPos[n]);
 104       }
 105       //insert pp back
 106       pq.add(pp);
 107       return pp2;
 108     }
 109
 110     /**
 111      * Init PhrasePositions in place.
 112      * There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
 113      * <br>- Put in repeats[] each pp that has another pp with same position in the doc.
 114      *       This relies on that the position in PP is computed as (TP.position - offset) and
 115      *       so by adding offset we actually compare positions and identify that the two are
 116      *       the same term.
 117      *       An exclusion to this is two distinct terms in the same offset in query and same
 118      *       position in doc. This case is detected by comparing just the (query) offsets,
 119      *       and two such PPs are not considered "repeating".
 120      * <br>- Also mark each such pp by pp.repeats = true.
 121      * <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
 122      * In particular, this allows to score queries with no repetitions with no overhead due to this computation.
 123      * <br>- Example 1 - query with no repetitions: "ho my"~2
 124      * <br>- Example 2 - query with repetitions: "ho my my"~2
 125      * <br>- Example 3 - query with repetitions: "my ho my"~2
 126      * <br>Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection.
 127      * @return end (max position), or -1 if any term ran out (i.e. done)
 128      * @throws IOException
 129      */
 130     private int initPhrasePositions() throws IOException {
 131         int end = 0;
 132
 133         // no repeats at all (most common case is also the simplest one)
 134         if (checkedRepeats && repeats==null) {
 135             // build queue from list
 136             pq.clear();
 137             for (PhrasePositions pp = first; pp != null; pp = pp.next) {
 138                 pp.firstPosition();
 139                 if (pp.position > end)
 140                     end = pp.position;
 141                 pq.add(pp);         // build pq from list
 142             }
 143             return end;
 144         }
 145
 146         // position the pp's
 147         for (PhrasePositions pp = first; pp != null; pp = pp.next)
 148             pp.firstPosition();
 149
 150         // one time initializatin for this scorer
 151         if (!checkedRepeats) {
 152             checkedRepeats = true;
 153             // check for repeats
 154             HashSet<PhrasePositions> m = null;
 155             for (PhrasePositions pp = first; pp != null; pp = pp.next) {
 156                 int tpPos = pp.position + pp.offset;
 157                 for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
 158                     if (pp.offset == pp2.offset) {
 159                       continue; // not a repetition: the two PPs are originally in same offset in the query!
 160                     }
 161                     int tpPos2 = pp2.position + pp2.offset;
 162                     if (tpPos2 == tpPos) {
 163                         if (m == null)
 164                             m = new HashSet<PhrasePositions>();
 165                         pp.repeats = true;
 166                         pp2.repeats = true;
 167                         m.add(pp);
 168                         m.add(pp2);
 169                     }
 170                 }
 171             }
 172             if (m!=null)
 173                 repeats = m.toArray(new PhrasePositions[0]);
 174         }
 175
 176         // with repeats must advance some repeating pp's so they all start with differing tp's
 177         if (repeats!=null) {
 178             for (int i = 0; i < repeats.length; i++) {
 179                 PhrasePositions pp = repeats[i];
 180                 PhrasePositions pp2;
 181                 while ((pp2 = termPositionsDiffer(pp)) != null) {
 182                   if (!pp2.nextPosition())  // out of pps that do not differ, advance the pp with higher offset
 183                       return -1;           // ran out of a term -- done
 184                 }
 185             }
 186         }
 187
 188         // build queue from list
 189         pq.clear();
 190         for (PhrasePositions pp = first; pp != null; pp = pp.next) {
 191             if (pp.position > end)
 192                 end = pp.position;
 193             pq.add(pp);         // build pq from list
 194         }
 195
 196         if (repeats!=null) {
 197           tmpPos = new PhrasePositions[pq.size()];
 198         }
 199         return end;
 200     }
 201
 202     /**
 203      * We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences
 204      * in the query of the same word would go elsewhere in the matched doc.
 205      * @return null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
 206      * out of the first two PPs found to not differ.
 207      */
 208     private PhrasePositions termPositionsDiffer(PhrasePositions pp) {
 209         // efficiency note: a more efficient implementation could keep a map between repeating
 210         // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats
 211         // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
 212         // However this would complicate code, for a rather rare case, so choice is to compromise here.
 213         int tpPos = pp.position + pp.offset;
 214         for (int i = 0; i < repeats.length; i++) {
 215             PhrasePositions pp2 = repeats[i];
 216             if (pp2 == pp) {
 217                 continue;
 218             }
 219             if (pp.offset == pp2.offset) {
 220               continue; // not a repetition: the two PPs are originally in same offset in the query!
 221             }
 222             int tpPos2 = pp2.position + pp2.offset;
 223             if (tpPos2 == tpPos) {
 224                 return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset.
 225             }
 226         }
 227         return null;
 228     }
 229 }