1 package org.apache.lucene.search;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.util.HashSet;
23 final class SloppyPhraseScorer extends PhraseScorer {
25 private PhrasePositions repeats[];
26 private PhrasePositions tmpPos[]; // for flipping repeating pps.
27 private boolean checkedRepeats;
29 SloppyPhraseScorer(Weight weight, PhraseQuery.PostingsAndFreq[] postings, Similarity similarity,
30 int slop, byte[] norms) {
31 super(weight, postings, similarity, norms);
36 * Score a candidate doc for all slop-valid position-combinations (matches)
37 * encountered while traversing/hopping the PhrasePositions.
38 * <br> The score contribution of a match depends on the distance:
39 * <br> - highest score for distance=0 (exact match).
40 * <br> - score gets lower as distance gets higher.
41 * <br>Example: for query "a b"~2, a document "x a b a y" can be scored twice:
42 * once for "a b" (distance=0), and once for "b a" (distance=2).
43 * <br>Possibly not all valid combinations are encountered, because for efficiency
44 * we always propagate the least PhrasePosition. This allows to base on
45 * PriorityQueue and move forward faster.
46 * As result, for example, document "a b c b a"
47 * would score differently for queries "a b c"~4 and "c b a"~4, although
48 * they really are equivalent.
49 * Similarly, for doc "a b c b a f g", query "c b"~2
50 * would get same score as "g f"~2, although "c b"~2 could be matched twice.
51 * We may want to fix this in the future (currently not, for performance reasons).
54 protected float phraseFreq() throws IOException {
55 int end = initPhrasePositions();
58 boolean done = (end<0);
60 PhrasePositions pp = pq.pop();
61 int start = pp.position;
62 int next = pq.top().position;
64 boolean tpsDiffer = true;
65 for (int pos = start; pos <= next || !tpsDiffer; pos = pp.position) {
66 if (pos<=next && tpsDiffer)
67 start = pos; // advance pp to min window
68 if (!pp.nextPosition()) {
69 done = true; // ran out of a term -- done
72 PhrasePositions pp2 = null;
73 tpsDiffer = !pp.repeats || (pp2 = termPositionsDiffer(pp))==null;
74 if (pp2!=null && pp2!=pp) {
75 pp = flip(pp,pp2); // flip pp to pp2
79 int matchLength = end - start;
80 if (matchLength <= slop)
81 freq += getSimilarity().sloppyFreq(matchLength); // score match
83 if (pp.position > end)
85 pq.add(pp); // restore pq
91 // flip pp2 and pp in the queue: pop until finding pp2, insert back all but pp2, insert pp back.
92 // assumes: pp!=pp2, pp2 in pq, pp not in pq.
93 // called only when there are repeating pps.
94 private PhrasePositions flip(PhrasePositions pp, PhrasePositions pp2) {
97 //pop until finding pp2
98 while ((pp3=pq.pop()) != pp2) {
101 //insert back all but pp2
102 for (n--; n>=0; n--) {
103 pq.insertWithOverflow(tmpPos[n]);
111 * Init PhrasePositions in place.
112 * There is a one time initialization for this scorer (taking place at the first doc that matches all terms):
113 * <br>- Put in repeats[] each pp that has another pp with same position in the doc.
114 * This relies on that the position in PP is computed as (TP.position - offset) and
115 * so by adding offset we actually compare positions and identify that the two are
117 * An exclusion to this is two distinct terms in the same offset in query and same
118 * position in doc. This case is detected by comparing just the (query) offsets,
119 * and two such PPs are not considered "repeating".
120 * <br>- Also mark each such pp by pp.repeats = true.
121 * <br>Later can consult with repeats[] in termPositionsDiffer(pp), making that check efficient.
122 * In particular, this allows to score queries with no repetitions with no overhead due to this computation.
123 * <br>- Example 1 - query with no repetitions: "ho my"~2
124 * <br>- Example 2 - query with repetitions: "ho my my"~2
125 * <br>- Example 3 - query with repetitions: "my ho my"~2
126 * <br>Init per doc w/repeats in query, includes propagating some repeating pp's to avoid false phrase detection.
127 * @return end (max position), or -1 if any term ran out (i.e. done)
128 * @throws IOException
130 private int initPhrasePositions() throws IOException {
133 // no repeats at all (most common case is also the simplest one)
134 if (checkedRepeats && repeats==null) {
135 // build queue from list
137 for (PhrasePositions pp = first; pp != null; pp = pp.next) {
139 if (pp.position > end)
141 pq.add(pp); // build pq from list
147 for (PhrasePositions pp = first; pp != null; pp = pp.next)
150 // one time initializatin for this scorer
151 if (!checkedRepeats) {
152 checkedRepeats = true;
154 HashSet<PhrasePositions> m = null;
155 for (PhrasePositions pp = first; pp != null; pp = pp.next) {
156 int tpPos = pp.position + pp.offset;
157 for (PhrasePositions pp2 = pp.next; pp2 != null; pp2 = pp2.next) {
158 if (pp.offset == pp2.offset) {
159 continue; // not a repetition: the two PPs are originally in same offset in the query!
161 int tpPos2 = pp2.position + pp2.offset;
162 if (tpPos2 == tpPos) {
164 m = new HashSet<PhrasePositions>();
173 repeats = m.toArray(new PhrasePositions[0]);
176 // with repeats must advance some repeating pp's so they all start with differing tp's
178 for (int i = 0; i < repeats.length; i++) {
179 PhrasePositions pp = repeats[i];
181 while ((pp2 = termPositionsDiffer(pp)) != null) {
182 if (!pp2.nextPosition()) // out of pps that do not differ, advance the pp with higher offset
183 return -1; // ran out of a term -- done
188 // build queue from list
190 for (PhrasePositions pp = first; pp != null; pp = pp.next) {
191 if (pp.position > end)
193 pq.add(pp); // build pq from list
197 tmpPos = new PhrasePositions[pq.size()];
203 * We disallow two pp's to have the same TermPosition, thereby verifying multiple occurrences
204 * in the query of the same word would go elsewhere in the matched doc.
205 * @return null if differ (i.e. valid) otherwise return the higher offset PhrasePositions
206 * out of the first two PPs found to not differ.
208 private PhrasePositions termPositionsDiffer(PhrasePositions pp) {
209 // efficiency note: a more efficient implementation could keep a map between repeating
210 // pp's, so that if pp1a, pp1b, pp1c are repeats term1, and pp2a, pp2b are repeats
211 // of term2, pp2a would only be checked against pp2b but not against pp1a, pp1b, pp1c.
212 // However this would complicate code, for a rather rare case, so choice is to compromise here.
213 int tpPos = pp.position + pp.offset;
214 for (int i = 0; i < repeats.length; i++) {
215 PhrasePositions pp2 = repeats[i];
219 if (pp.offset == pp2.offset) {
220 continue; // not a repetition: the two PPs are originally in same offset in the query!
222 int tpPos2 = pp2.position + pp2.offset;
223 if (tpPos2 == tpPos) {
224 return pp.offset > pp2.offset ? pp : pp2; // do not differ: return the one with higher offset.