1 package org.apache.lucene.search.spans;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.index.IndexReader;
21 import org.apache.lucene.util.ArrayUtil;
23 import java.io.IOException;
24 import java.util.ArrayList;
25 import java.util.Comparator;
26 import java.util.HashSet;
27 import java.util.LinkedList;
28 import java.util.List;
29 import java.util.Collection;
32 /** A Spans that is formed from the ordered subspans of a SpanNearQuery
33 * where the subspans do not overlap and have a maximum slop between them.
35 * The formed spans only contains minimum slop matches.<br>
36 * The matching slop is computed from the distance(s) between
37 * the non overlapping matching Spans.<br>
38 * Successive matches are always formed from the successive Spans
39 * of the SpanNearQuery.
41 * The formed spans may contain overlaps when the slop is at least 1.
42 * For example, when querying using
44 * with slop at least 1, the fragment:
45 * <pre>t1 t2 t1 t3 t2 t3</pre>
47 * <pre>t1 t2 .. t3 </pre>
48 * <pre> t1 .. t2 t3</pre>
52 * Only public for subclassing. Most implementations should not need this class
54 public class NearSpansOrdered extends Spans {
55 private final int allowedSlop;
56 private boolean firstTime = true;
57 private boolean more = false;
59 /** The spans in the same order as the SpanNearQuery */
60 private final Spans[] subSpans;
62 /** Indicates that all subSpans have same doc() */
63 private boolean inSameDoc = false;
65 private int matchDoc = -1;
66 private int matchStart = -1;
67 private int matchEnd = -1;
68 private List<byte[]> matchPayload;
70 private final Spans[] subSpansByDoc;
71 private final Comparator<Spans> spanDocComparator = new Comparator<Spans>() {
72 public int compare(Spans o1, Spans o2) {
73 return o1.doc() - o2.doc();
77 private SpanNearQuery query;
78 private boolean collectPayloads = true;
80 public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader) throws IOException {
81 this(spanNearQuery, reader, true);
84 public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader, boolean collectPayloads)
86 if (spanNearQuery.getClauses().length < 2) {
87 throw new IllegalArgumentException("Less than 2 clauses: "
90 this.collectPayloads = collectPayloads;
91 allowedSlop = spanNearQuery.getSlop();
92 SpanQuery[] clauses = spanNearQuery.getClauses();
93 subSpans = new Spans[clauses.length];
94 matchPayload = new LinkedList<byte[]>();
95 subSpansByDoc = new Spans[clauses.length];
96 for (int i = 0; i < clauses.length; i++) {
97 subSpans[i] = clauses[i].getSpans(reader);
98 subSpansByDoc[i] = subSpans[i]; // used in toSameDoc()
100 query = spanNearQuery; // kept for toString() only.
105 public int doc() { return matchDoc; }
109 public int start() { return matchStart; }
113 public int end() { return matchEnd; }
115 public Spans[] getSubSpans() {
119 // TODO: Remove warning after API has been finalized
120 // TODO: Would be nice to be able to lazy load payloads
122 public Collection<byte[]> getPayload() throws IOException {
126 // TODO: Remove warning after API has been finalized
128 public boolean isPayloadAvailable() {
129 return matchPayload.isEmpty() == false;
134 public boolean next() throws IOException {
137 for (int i = 0; i < subSpans.length; i++) {
138 if (! subSpans[i].next()) {
145 if(collectPayloads) {
146 matchPayload.clear();
148 return advanceAfterOrdered();
153 public boolean skipTo(int target) throws IOException {
156 for (int i = 0; i < subSpans.length; i++) {
157 if (! subSpans[i].skipTo(target)) {
163 } else if (more && (subSpans[0].doc() < target)) {
164 if (subSpans[0].skipTo(target)) {
171 if(collectPayloads) {
172 matchPayload.clear();
174 return advanceAfterOrdered();
177 /** Advances the subSpans to just after an ordered match with a minimum slop
178 * that is smaller than the slop allowed by the SpanNearQuery.
179 * @return true iff there is such a match.
181 private boolean advanceAfterOrdered() throws IOException {
182 while (more && (inSameDoc || toSameDoc())) {
183 if (stretchToOrder() && shrinkToAfterShortestMatch()) {
187 return false; // no more matches
191 /** Advance the subSpans to the same document */
192 private boolean toSameDoc() throws IOException {
193 ArrayUtil.mergeSort(subSpansByDoc, spanDocComparator);
195 int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc();
196 while (subSpansByDoc[firstIndex].doc() != maxDoc) {
197 if (! subSpansByDoc[firstIndex].skipTo(maxDoc)) {
202 maxDoc = subSpansByDoc[firstIndex].doc();
203 if (++firstIndex == subSpansByDoc.length) {
207 for (int i = 0; i < subSpansByDoc.length; i++) {
208 assert (subSpansByDoc[i].doc() == maxDoc)
209 : " NearSpansOrdered.toSameDoc() spans " + subSpansByDoc[0]
210 + "\n at doc " + subSpansByDoc[i].doc()
211 + ", but should be at " + maxDoc;
217 /** Check whether two Spans in the same document are ordered.
220 * @return true iff spans1 starts before spans2
221 * or the spans start at the same position,
222 * and spans1 ends before spans2.
224 static final boolean docSpansOrdered(Spans spans1, Spans spans2) {
225 assert spans1.doc() == spans2.doc() : "doc1 " + spans1.doc() + " != doc2 " + spans2.doc();
226 int start1 = spans1.start();
227 int start2 = spans2.start();
228 /* Do not call docSpansOrdered(int,int,int,int) to avoid invoking .end() : */
229 return (start1 == start2) ? (spans1.end() < spans2.end()) : (start1 < start2);
232 /** Like {@link #docSpansOrdered(Spans,Spans)}, but use the spans
233 * starts and ends as parameters.
235 private static final boolean docSpansOrdered(int start1, int end1, int start2, int end2) {
236 return (start1 == start2) ? (end1 < end2) : (start1 < start2);
239 /** Order the subSpans within the same document by advancing all later spans
240 * after the previous one.
242 private boolean stretchToOrder() throws IOException {
243 matchDoc = subSpans[0].doc();
244 for (int i = 1; inSameDoc && (i < subSpans.length); i++) {
245 while (! docSpansOrdered(subSpans[i-1], subSpans[i])) {
246 if (! subSpans[i].next()) {
250 } else if (matchDoc != subSpans[i].doc()) {
259 /** The subSpans are ordered in the same doc, so there is a possible match.
260 * Compute the slop while making the match as short as possible by advancing
261 * all subSpans except the last one in reverse order.
263 private boolean shrinkToAfterShortestMatch() throws IOException {
264 matchStart = subSpans[subSpans.length - 1].start();
265 matchEnd = subSpans[subSpans.length - 1].end();
266 Set<byte[]> possibleMatchPayloads = new HashSet<byte[]>();
267 if (subSpans[subSpans.length - 1].isPayloadAvailable()) {
268 possibleMatchPayloads.addAll(subSpans[subSpans.length - 1].getPayload());
271 Collection<byte[]> possiblePayload = null;
274 int lastStart = matchStart;
275 int lastEnd = matchEnd;
276 for (int i = subSpans.length - 2; i >= 0; i--) {
277 Spans prevSpans = subSpans[i];
278 if (collectPayloads && prevSpans.isPayloadAvailable()) {
279 Collection<byte[]> payload = prevSpans.getPayload();
280 possiblePayload = new ArrayList<byte[]>(payload.size());
281 possiblePayload.addAll(payload);
284 int prevStart = prevSpans.start();
285 int prevEnd = prevSpans.end();
286 while (true) { // Advance prevSpans until after (lastStart, lastEnd)
287 if (! prevSpans.next()) {
290 break; // Check remaining subSpans for final match.
291 } else if (matchDoc != prevSpans.doc()) {
292 inSameDoc = false; // The last subSpans is not advanced here.
293 break; // Check remaining subSpans for last match in this document.
295 int ppStart = prevSpans.start();
296 int ppEnd = prevSpans.end(); // Cannot avoid invoking .end()
297 if (! docSpansOrdered(ppStart, ppEnd, lastStart, lastEnd)) {
298 break; // Check remaining subSpans.
299 } else { // prevSpans still before (lastStart, lastEnd)
302 if (collectPayloads && prevSpans.isPayloadAvailable()) {
303 Collection<byte[]> payload = prevSpans.getPayload();
304 possiblePayload = new ArrayList<byte[]>(payload.size());
305 possiblePayload.addAll(payload);
311 if (collectPayloads && possiblePayload != null) {
312 possibleMatchPayloads.addAll(possiblePayload);
315 assert prevStart <= matchStart;
316 if (matchStart > prevEnd) { // Only non overlapping spans add to slop.
317 matchSlop += (matchStart - prevEnd);
320 /* Do not break on (matchSlop > allowedSlop) here to make sure
321 * that subSpans[0] is advanced after the match, if any.
323 matchStart = prevStart;
324 lastStart = prevStart;
328 boolean match = matchSlop <= allowedSlop;
330 if(collectPayloads && match && possibleMatchPayloads.size() > 0) {
331 matchPayload.addAll(possibleMatchPayloads);
334 return match; // ordered and allowed slop
338 public String toString() {
339 return getClass().getName() + "("+query.toString()+")@"+
340 (firstTime?"START":(more?(doc()+":"+start()+"-"+end()):"END"));