lucene-java-3.5.0/lucene/src/java/org/apache/lucene/search/PhraseQuery.java

   1 package org.apache.lucene.search;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.Set;
  22 import java.util.ArrayList;
  23
  24 import org.apache.lucene.index.Term;
  25 import org.apache.lucene.index.TermPositions;
  26 import org.apache.lucene.index.IndexReader;
  27 import org.apache.lucene.search.Explanation.IDFExplanation;
  28 import org.apache.lucene.util.ToStringUtils;
  29 import org.apache.lucene.util.ArrayUtil;
  30
  31 /** A Query that matches documents containing a particular sequence of terms.
  32  * A PhraseQuery is built by QueryParser for input like <code>"new york"</code>.
  33  *
  34  * <p>This query may be combined with other terms or queries with a {@link BooleanQuery}.
  35  */
  36 public class PhraseQuery extends Query {
  37   private String field;
  38   private ArrayList<Term> terms = new ArrayList<Term>(4);
  39   private ArrayList<Integer> positions = new ArrayList<Integer>(4);
  40   private int maxPosition = 0;
  41   private int slop = 0;
  42
  43   /** Constructs an empty phrase query. */
  44   public PhraseQuery() {}
  45
  46   /** Sets the number of other words permitted between words in query phrase.
  47     If zero, then this is an exact phrase search.  For larger values this works
  48     like a <code>WITHIN</code> or <code>NEAR</code> operator.
  49
  50     <p>The slop is in fact an edit-distance, where the units correspond to
  51     moves of terms in the query phrase out of position.  For example, to switch
  52     the order of two words requires two moves (the first move places the words
  53     atop one another), so to permit re-orderings of phrases, the slop must be
  54     at least two.
  55
  56     <p>More exact matches are scored higher than sloppier matches, thus search
  57     results are sorted by exactness.
  58
  59     <p>The slop is zero by default, requiring exact matches.*/
  60   public void setSlop(int s) { slop = s; }
  61   /** Returns the slop.  See setSlop(). */
  62   public int getSlop() { return slop; }
  63
  64   /**
  65    * Adds a term to the end of the query phrase.
  66    * The relative position of the term is the one immediately after the last term added.
  67    */
  68   public void add(Term term) {
  69     int position = 0;
  70     if(positions.size() > 0)
  71         position = positions.get(positions.size()-1).intValue() + 1;
  72
  73     add(term, position);
  74   }
  75
  76   /**
  77    * Adds a term to the end of the query phrase.
  78    * The relative position of the term within the phrase is specified explicitly.
  79    * This allows e.g. phrases with more than one term at the same position
  80    * or phrases with gaps (e.g. in connection with stopwords).
  81    *
  82    * @param term
  83    * @param position
  84    */
  85   public void add(Term term, int position) {
  86       if (terms.size() == 0)
  87           field = term.field();
  88       else if (term.field() != field)
  89           throw new IllegalArgumentException("All phrase terms must be in the same field: " + term);
  90
  91       terms.add(term);
  92       positions.add(Integer.valueOf(position));
  93       if (position > maxPosition) maxPosition = position;
  94   }
  95
  96   /** Returns the set of terms in this phrase. */
  97   public Term[] getTerms() {
  98     return terms.toArray(new Term[0]);
  99   }
 100
 101   /**
 102    * Returns the relative positions of terms in this phrase.
 103    */
 104   public int[] getPositions() {
 105       int[] result = new int[positions.size()];
 106       for(int i = 0; i < positions.size(); i++)
 107           result[i] = positions.get(i).intValue();
 108       return result;
 109   }
 110
 111   @Override
 112   public Query rewrite(IndexReader reader) throws IOException {
 113     if (terms.size() == 1) {
 114       TermQuery tq = new TermQuery(terms.get(0));
 115       tq.setBoost(getBoost());
 116       return tq;
 117     } else
 118       return super.rewrite(reader);
 119   }
 120
 121   static class PostingsAndFreq implements Comparable<PostingsAndFreq> {
 122     final TermPositions postings;
 123     final int docFreq;
 124     final int position;
 125     final Term term;
 126
 127     public PostingsAndFreq(TermPositions postings, int docFreq, int position, Term term) {
 128       this.postings = postings;
 129       this.docFreq = docFreq;
 130       this.position = position;
 131       this.term = term;
 132     }
 133
 134     public int compareTo(PostingsAndFreq other) {
 135       if (docFreq == other.docFreq) {
 136         if (position == other.position) {
 137           return term.compareTo(other.term);
 138         }
 139         return position - other.position;
 140       }
 141       return docFreq - other.docFreq;
 142     }
 143
 144     @Override
 145     public int hashCode() {
 146       final int prime = 31;
 147       int result = 1;
 148       result = prime * result + docFreq;
 149       result = prime * result + position;
 150       result = prime * result + ((term == null) ? 0 : term.hashCode());
 151       return result;
 152     }
 153
 154     @Override
 155     public boolean equals(Object obj) {
 156       if (this == obj) return true;
 157       if (obj == null) return false;
 158       if (getClass() != obj.getClass()) return false;
 159       PostingsAndFreq other = (PostingsAndFreq) obj;
 160       if (docFreq != other.docFreq) return false;
 161       if (position != other.position) return false;
 162       if (term == null) {
 163         if (other.term != null) return false;
 164       } else if (!term.equals(other.term)) return false;
 165       return true;
 166     }
 167   }
 168
 169   private class PhraseWeight extends Weight {
 170     private final Similarity similarity;
 171     private float value;
 172     private float idf;
 173     private float queryNorm;
 174     private float queryWeight;
 175     private IDFExplanation idfExp;
 176
 177     public PhraseWeight(Searcher searcher)
 178       throws IOException {
 179       this.similarity = getSimilarity(searcher);
 180
 181       idfExp = similarity.idfExplain(terms, searcher);
 182       idf = idfExp.getIdf();
 183     }
 184
 185     @Override
 186     public String toString() { return "weight(" + PhraseQuery.this + ")"; }
 187
 188     @Override
 189     public Query getQuery() { return PhraseQuery.this; }
 190
 191     @Override
 192     public float getValue() { return value; }
 193
 194     @Override
 195     public float sumOfSquaredWeights() {
 196       queryWeight = idf * getBoost();             // compute query weight
 197       return queryWeight * queryWeight;           // square it
 198     }
 199
 200     @Override
 201     public void normalize(float queryNorm) {
 202       this.queryNorm = queryNorm;
 203       queryWeight *= queryNorm;                   // normalize query weight
 204       value = queryWeight * idf;                  // idf for document
 205     }
 206
 207     @Override
 208     public Scorer scorer(IndexReader reader, boolean scoreDocsInOrder, boolean topScorer) throws IOException {
 209       if (terms.size() == 0)                      // optimize zero-term case
 210         return null;
 211
 212       PostingsAndFreq[] postingsFreqs = new PostingsAndFreq[terms.size()];
 213       for (int i = 0; i < terms.size(); i++) {
 214         final Term t = terms.get(i);
 215         TermPositions p = reader.termPositions(t);
 216         if (p == null)
 217           return null;
 218         postingsFreqs[i] = new PostingsAndFreq(p, reader.docFreq(t), positions.get(i).intValue(), t);
 219       }
 220
 221       // sort by increasing docFreq order
 222       if (slop == 0) {
 223         ArrayUtil.mergeSort(postingsFreqs);
 224       }
 225
 226       if (slop == 0) {                            // optimize exact case
 227         ExactPhraseScorer s = new ExactPhraseScorer(this, postingsFreqs, similarity,
 228                                                     reader.norms(field));
 229         if (s.noDocs) {
 230           return null;
 231         } else {
 232           return s;
 233         }
 234       } else {
 235         return
 236           new SloppyPhraseScorer(this, postingsFreqs, similarity, slop,
 237                                  reader.norms(field));
 238       }
 239     }
 240
 241     @Override
 242     public Explanation explain(IndexReader reader, int doc)
 243       throws IOException {
 244
 245       ComplexExplanation result = new ComplexExplanation();
 246       result.setDescription("weight("+getQuery()+" in "+doc+"), product of:");
 247
 248       StringBuilder docFreqs = new StringBuilder();
 249       StringBuilder query = new StringBuilder();
 250       query.append('\"');
 251       docFreqs.append(idfExp.explain());
 252       for (int i = 0; i < terms.size(); i++) {
 253         if (i != 0) {
 254           query.append(" ");
 255         }
 256
 257         Term term = terms.get(i);
 258
 259         query.append(term.text());
 260       }
 261       query.append('\"');
 262
 263       Explanation idfExpl =
 264         new Explanation(idf, "idf(" + field + ":" + docFreqs + ")");
 265
 266       // explain query weight
 267       Explanation queryExpl = new Explanation();
 268       queryExpl.setDescription("queryWeight(" + getQuery() + "), product of:");
 269
 270       Explanation boostExpl = new Explanation(getBoost(), "boost");
 271       if (getBoost() != 1.0f)
 272         queryExpl.addDetail(boostExpl);
 273       queryExpl.addDetail(idfExpl);
 274
 275       Explanation queryNormExpl = new Explanation(queryNorm,"queryNorm");
 276       queryExpl.addDetail(queryNormExpl);
 277
 278       queryExpl.setValue(boostExpl.getValue() *
 279                          idfExpl.getValue() *
 280                          queryNormExpl.getValue());
 281
 282       result.addDetail(queryExpl);
 283
 284       // explain field weight
 285       Explanation fieldExpl = new Explanation();
 286       fieldExpl.setDescription("fieldWeight("+field+":"+query+" in "+doc+
 287                                "), product of:");
 288
 289       Scorer scorer = scorer(reader, true, false);
 290       if (scorer == null) {
 291         return new Explanation(0.0f, "no matching docs");
 292       }
 293       Explanation tfExplanation = new Explanation();
 294       int d = scorer.advance(doc);
 295       float phraseFreq;
 296       if (d == doc) {
 297         phraseFreq = scorer.freq();
 298       } else {
 299         phraseFreq = 0.0f;
 300       }
 301
 302       tfExplanation.setValue(similarity.tf(phraseFreq));
 303       tfExplanation.setDescription("tf(phraseFreq=" + phraseFreq + ")");
 304
 305       fieldExpl.addDetail(tfExplanation);
 306       fieldExpl.addDetail(idfExpl);
 307
 308       Explanation fieldNormExpl = new Explanation();
 309       byte[] fieldNorms = reader.norms(field);
 310       float fieldNorm =
 311         fieldNorms!=null ? similarity.decodeNormValue(fieldNorms[doc]) : 1.0f;
 312       fieldNormExpl.setValue(fieldNorm);
 313       fieldNormExpl.setDescription("fieldNorm(field="+field+", doc="+doc+")");
 314       fieldExpl.addDetail(fieldNormExpl);
 315
 316       fieldExpl.setValue(tfExplanation.getValue() *
 317                          idfExpl.getValue() *
 318                          fieldNormExpl.getValue());
 319
 320       result.addDetail(fieldExpl);
 321
 322       // combine them
 323       result.setValue(queryExpl.getValue() * fieldExpl.getValue());
 324       result.setMatch(tfExplanation.isMatch());
 325       return result;
 326     }
 327   }
 328
 329   @Override
 330   public Weight createWeight(Searcher searcher) throws IOException {
 331     if (terms.size() == 1) {                      // optimize one-term case
 332       Term term = terms.get(0);
 333       Query termQuery = new TermQuery(term);
 334       termQuery.setBoost(getBoost());
 335       return termQuery.createWeight(searcher);
 336     }
 337     return new PhraseWeight(searcher);
 338   }
 339
 340   /**
 341    * @see org.apache.lucene.search.Query#extractTerms(Set)
 342    */
 343   @Override
 344   public void extractTerms(Set<Term> queryTerms) {
 345     queryTerms.addAll(terms);
 346   }
 347
 348   /** Prints a user-readable version of this query. */
 349   @Override
 350   public String toString(String f) {
 351     StringBuilder buffer = new StringBuilder();
 352     if (field != null && !field.equals(f)) {
 353       buffer.append(field);
 354       buffer.append(":");
 355     }
 356
 357     buffer.append("\"");
 358     String[] pieces = new String[maxPosition + 1];
 359     for (int i = 0; i < terms.size(); i++) {
 360       int pos = positions.get(i).intValue();
 361       String s = pieces[pos];
 362       if (s == null) {
 363         s = (terms.get(i)).text();
 364       } else {
 365         s = s + "|" + (terms.get(i)).text();
 366       }
 367       pieces[pos] = s;
 368     }
 369     for (int i = 0; i < pieces.length; i++) {
 370       if (i > 0) {
 371         buffer.append(' ');
 372       }
 373       String s = pieces[i];
 374       if (s == null) {
 375         buffer.append('?');
 376       } else {
 377         buffer.append(s);
 378       }
 379     }
 380     buffer.append("\"");
 381
 382     if (slop != 0) {
 383       buffer.append("~");
 384       buffer.append(slop);
 385     }
 386
 387     buffer.append(ToStringUtils.boost(getBoost()));
 388
 389     return buffer.toString();
 390   }
 391
 392   /** Returns true iff <code>o</code> is equal to this. */
 393   @Override
 394   public boolean equals(Object o) {
 395     if (!(o instanceof PhraseQuery))
 396       return false;
 397     PhraseQuery other = (PhraseQuery)o;
 398     return (this.getBoost() == other.getBoost())
 399       && (this.slop == other.slop)
 400       &&  this.terms.equals(other.terms)
 401       && this.positions.equals(other.positions);
 402   }
 403
 404   /** Returns a hash code value for this object.*/
 405   @Override
 406   public int hashCode() {
 407     return Float.floatToIntBits(getBoost())
 408       ^ slop
 409       ^ terms.hashCode()
 410       ^ positions.hashCode();
 411   }
 412
 413 }