lucene-java-3.4.0/lucene/src/java/org/apache/lucene/search/spans/NearSpansOrdered.java

   1 package org.apache.lucene.search.spans;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.index.IndexReader;
  21 import org.apache.lucene.util.ArrayUtil;
  22
  23 import java.io.IOException;
  24 import java.util.ArrayList;
  25 import java.util.Comparator;
  26 import java.util.HashSet;
  27 import java.util.LinkedList;
  28 import java.util.List;
  29 import java.util.Collection;
  30 import java.util.Set;
  31
  32 /** A Spans that is formed from the ordered subspans of a SpanNearQuery
  33  * where the subspans do not overlap and have a maximum slop between them.
  34  * <p>
  35  * The formed spans only contains minimum slop matches.<br>
  36  * The matching slop is computed from the distance(s) between
  37  * the non overlapping matching Spans.<br>
  38  * Successive matches are always formed from the successive Spans
  39  * of the SpanNearQuery.
  40  * <p>
  41  * The formed spans may contain overlaps when the slop is at least 1.
  42  * For example, when querying using
  43  * <pre>t1 t2 t3</pre>
  44  * with slop at least 1, the fragment:
  45  * <pre>t1 t2 t1 t3 t2 t3</pre>
  46  * matches twice:
  47  * <pre>t1 t2 .. t3      </pre>
  48  * <pre>      t1 .. t2 t3</pre>
  49  *
  50  *
  51  * Expert:
  52  * Only public for subclassing.  Most implementations should not need this class
  53  */
  54 public class NearSpansOrdered extends Spans {
  55   private final int allowedSlop;
  56   private boolean firstTime = true;
  57   private boolean more = false;
  58
  59   /** The spans in the same order as the SpanNearQuery */
  60   private final Spans[] subSpans;
  61
  62   /** Indicates that all subSpans have same doc() */
  63   private boolean inSameDoc = false;
  64
  65   private int matchDoc = -1;
  66   private int matchStart = -1;
  67   private int matchEnd = -1;
  68   private List<byte[]> matchPayload;
  69
  70   private final Spans[] subSpansByDoc;
  71   private final Comparator<Spans> spanDocComparator = new Comparator<Spans>() {
  72     public int compare(Spans o1, Spans o2) {
  73       return o1.doc() - o2.doc();
  74     }
  75   };
  76
  77   private SpanNearQuery query;
  78   private boolean collectPayloads = true;
  79
  80   public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader) throws IOException {
  81     this(spanNearQuery, reader, true);
  82   }
  83
  84   public NearSpansOrdered(SpanNearQuery spanNearQuery, IndexReader reader, boolean collectPayloads)
  85   throws IOException {
  86     if (spanNearQuery.getClauses().length < 2) {
  87       throw new IllegalArgumentException("Less than 2 clauses: "
  88                                          + spanNearQuery);
  89     }
  90     this.collectPayloads = collectPayloads;
  91     allowedSlop = spanNearQuery.getSlop();
  92     SpanQuery[] clauses = spanNearQuery.getClauses();
  93     subSpans = new Spans[clauses.length];
  94     matchPayload = new LinkedList<byte[]>();
  95     subSpansByDoc = new Spans[clauses.length];
  96     for (int i = 0; i < clauses.length; i++) {
  97       subSpans[i] = clauses[i].getSpans(reader);
  98       subSpansByDoc[i] = subSpans[i]; // used in toSameDoc()
  99     }
 100     query = spanNearQuery; // kept for toString() only.
 101   }
 102
 103   // inherit javadocs
 104   @Override
 105   public int doc() { return matchDoc; }
 106
 107   // inherit javadocs
 108   @Override
 109   public int start() { return matchStart; }
 110
 111   // inherit javadocs
 112   @Override
 113   public int end() { return matchEnd; }
 114
 115   public Spans[] getSubSpans() {
 116           return subSpans;
 117   }
 118
 119   // TODO: Remove warning after API has been finalized
 120   // TODO: Would be nice to be able to lazy load payloads
 121   @Override
 122   public Collection<byte[]> getPayload() throws IOException {
 123     return matchPayload;
 124   }
 125
 126   // TODO: Remove warning after API has been finalized
 127   @Override
 128   public boolean isPayloadAvailable() {
 129     return matchPayload.isEmpty() == false;
 130   }
 131
 132   // inherit javadocs
 133   @Override
 134   public boolean next() throws IOException {
 135     if (firstTime) {
 136       firstTime = false;
 137       for (int i = 0; i < subSpans.length; i++) {
 138         if (! subSpans[i].next()) {
 139           more = false;
 140           return false;
 141         }
 142       }
 143       more = true;
 144     }
 145     if(collectPayloads) {
 146       matchPayload.clear();
 147     }
 148     return advanceAfterOrdered();
 149   }
 150
 151   // inherit javadocs
 152   @Override
 153   public boolean skipTo(int target) throws IOException {
 154     if (firstTime) {
 155       firstTime = false;
 156       for (int i = 0; i < subSpans.length; i++) {
 157         if (! subSpans[i].skipTo(target)) {
 158           more = false;
 159           return false;
 160         }
 161       }
 162       more = true;
 163     } else if (more && (subSpans[0].doc() < target)) {
 164       if (subSpans[0].skipTo(target)) {
 165         inSameDoc = false;
 166       } else {
 167         more = false;
 168         return false;
 169       }
 170     }
 171     if(collectPayloads) {
 172       matchPayload.clear();
 173     }
 174     return advanceAfterOrdered();
 175   }
 176
 177   /** Advances the subSpans to just after an ordered match with a minimum slop
 178    * that is smaller than the slop allowed by the SpanNearQuery.
 179    * @return true iff there is such a match.
 180    */
 181   private boolean advanceAfterOrdered() throws IOException {
 182     while (more && (inSameDoc || toSameDoc())) {
 183       if (stretchToOrder() && shrinkToAfterShortestMatch()) {
 184         return true;
 185       }
 186     }
 187     return false; // no more matches
 188   }
 189
 190
 191   /** Advance the subSpans to the same document */
 192   private boolean toSameDoc() throws IOException {
 193     ArrayUtil.mergeSort(subSpansByDoc, spanDocComparator);
 194     int firstIndex = 0;
 195     int maxDoc = subSpansByDoc[subSpansByDoc.length - 1].doc();
 196     while (subSpansByDoc[firstIndex].doc() != maxDoc) {
 197       if (! subSpansByDoc[firstIndex].skipTo(maxDoc)) {
 198         more = false;
 199         inSameDoc = false;
 200         return false;
 201       }
 202       maxDoc = subSpansByDoc[firstIndex].doc();
 203       if (++firstIndex == subSpansByDoc.length) {
 204         firstIndex = 0;
 205       }
 206     }
 207     for (int i = 0; i < subSpansByDoc.length; i++) {
 208       assert (subSpansByDoc[i].doc() == maxDoc)
 209              : " NearSpansOrdered.toSameDoc() spans " + subSpansByDoc[0]
 210                                  + "\n at doc " + subSpansByDoc[i].doc()
 211                                  + ", but should be at " + maxDoc;
 212     }
 213     inSameDoc = true;
 214     return true;
 215   }
 216
 217   /** Check whether two Spans in the same document are ordered.
 218    * @param spans1
 219    * @param spans2
 220    * @return true iff spans1 starts before spans2
 221    *              or the spans start at the same position,
 222    *              and spans1 ends before spans2.
 223    */
 224   static final boolean docSpansOrdered(Spans spans1, Spans spans2) {
 225     assert spans1.doc() == spans2.doc() : "doc1 " + spans1.doc() + " != doc2 " + spans2.doc();
 226     int start1 = spans1.start();
 227     int start2 = spans2.start();
 228     /* Do not call docSpansOrdered(int,int,int,int) to avoid invoking .end() : */
 229     return (start1 == start2) ? (spans1.end() < spans2.end()) : (start1 < start2);
 230   }
 231
 232   /** Like {@link #docSpansOrdered(Spans,Spans)}, but use the spans
 233    * starts and ends as parameters.
 234    */
 235   private static final boolean docSpansOrdered(int start1, int end1, int start2, int end2) {
 236     return (start1 == start2) ? (end1 < end2) : (start1 < start2);
 237   }
 238
 239   /** Order the subSpans within the same document by advancing all later spans
 240    * after the previous one.
 241    */
 242   private boolean stretchToOrder() throws IOException {
 243     matchDoc = subSpans[0].doc();
 244     for (int i = 1; inSameDoc && (i < subSpans.length); i++) {
 245       while (! docSpansOrdered(subSpans[i-1], subSpans[i])) {
 246         if (! subSpans[i].next()) {
 247           inSameDoc = false;
 248           more = false;
 249           break;
 250         } else if (matchDoc != subSpans[i].doc()) {
 251           inSameDoc = false;
 252           break;
 253         }
 254       }
 255     }
 256     return inSameDoc;
 257   }
 258
 259   /** The subSpans are ordered in the same doc, so there is a possible match.
 260    * Compute the slop while making the match as short as possible by advancing
 261    * all subSpans except the last one in reverse order.
 262    */
 263   private boolean shrinkToAfterShortestMatch() throws IOException {
 264     matchStart = subSpans[subSpans.length - 1].start();
 265     matchEnd = subSpans[subSpans.length - 1].end();
 266     Set<byte[]> possibleMatchPayloads = new HashSet<byte[]>();
 267     if (subSpans[subSpans.length - 1].isPayloadAvailable()) {
 268       possibleMatchPayloads.addAll(subSpans[subSpans.length - 1].getPayload());
 269     }
 270
 271     Collection<byte[]> possiblePayload = null;
 272
 273     int matchSlop = 0;
 274     int lastStart = matchStart;
 275     int lastEnd = matchEnd;
 276     for (int i = subSpans.length - 2; i >= 0; i--) {
 277       Spans prevSpans = subSpans[i];
 278       if (collectPayloads && prevSpans.isPayloadAvailable()) {
 279         Collection<byte[]> payload = prevSpans.getPayload();
 280         possiblePayload = new ArrayList<byte[]>(payload.size());
 281         possiblePayload.addAll(payload);
 282       }
 283
 284       int prevStart = prevSpans.start();
 285       int prevEnd = prevSpans.end();
 286       while (true) { // Advance prevSpans until after (lastStart, lastEnd)
 287         if (! prevSpans.next()) {
 288           inSameDoc = false;
 289           more = false;
 290           break; // Check remaining subSpans for final match.
 291         } else if (matchDoc != prevSpans.doc()) {
 292           inSameDoc = false; // The last subSpans is not advanced here.
 293           break; // Check remaining subSpans for last match in this document.
 294         } else {
 295           int ppStart = prevSpans.start();
 296           int ppEnd = prevSpans.end(); // Cannot avoid invoking .end()
 297           if (! docSpansOrdered(ppStart, ppEnd, lastStart, lastEnd)) {
 298             break; // Check remaining subSpans.
 299           } else { // prevSpans still before (lastStart, lastEnd)
 300             prevStart = ppStart;
 301             prevEnd = ppEnd;
 302             if (collectPayloads && prevSpans.isPayloadAvailable()) {
 303               Collection<byte[]> payload = prevSpans.getPayload();
 304               possiblePayload = new ArrayList<byte[]>(payload.size());
 305               possiblePayload.addAll(payload);
 306             }
 307           }
 308         }
 309       }
 310
 311       if (collectPayloads && possiblePayload != null) {
 312         possibleMatchPayloads.addAll(possiblePayload);
 313       }
 314
 315       assert prevStart <= matchStart;
 316       if (matchStart > prevEnd) { // Only non overlapping spans add to slop.
 317         matchSlop += (matchStart - prevEnd);
 318       }
 319
 320       /* Do not break on (matchSlop > allowedSlop) here to make sure
 321        * that subSpans[0] is advanced after the match, if any.
 322        */
 323       matchStart = prevStart;
 324       lastStart = prevStart;
 325       lastEnd = prevEnd;
 326     }
 327
 328     boolean match = matchSlop <= allowedSlop;
 329
 330     if(collectPayloads && match && possibleMatchPayloads.size() > 0) {
 331       matchPayload.addAll(possibleMatchPayloads);
 332     }
 333
 334     return match; // ordered and allowed slop
 335   }
 336
 337   @Override
 338   public String toString() {
 339     return getClass().getName() + "("+query.toString()+")@"+
 340       (firstTime?"START":(more?(doc()+":"+start()+"-"+end()):"END"));
 341   }
 342 }
 343