lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryScorer.java

   1 package org.apache.lucene.search.highlight;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.HashMap;
  22 import java.util.HashSet;
  23 import java.util.Map;
  24 import java.util.Set;
  25
  26 import org.apache.lucene.analysis.CachingTokenFilter;
  27 import org.apache.lucene.analysis.TokenStream;
  28 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  29 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  30 import org.apache.lucene.index.IndexReader;
  31 import org.apache.lucene.index.memory.MemoryIndex;
  32 import org.apache.lucene.search.Query;
  33 import org.apache.lucene.search.spans.SpanQuery;
  34 import org.apache.lucene.util.StringHelper;
  35
  36 /**
  37  * {@link Scorer} implementation which scores text fragments by the number of
  38  * unique query terms found. This class converts appropriate {@link Query}s to
  39  * {@link SpanQuery}s and attempts to score only those terms that participated in
  40  * generating the 'hit' on the document.
  41  */
  42 public class QueryScorer implements Scorer {
  43   private float totalScore;
  44   private Set<String> foundTerms;
  45   private Map<String,WeightedSpanTerm> fieldWeightedSpanTerms;
  46   private float maxTermWeight;
  47   private int position = -1;
  48   private String defaultField;
  49   private CharTermAttribute termAtt;
  50   private PositionIncrementAttribute posIncAtt;
  51   private boolean expandMultiTermQuery = true;
  52   private Query query;
  53   private String field;
  54   private IndexReader reader;
  55   private boolean skipInitExtractor;
  56   private boolean wrapToCaching = true;
  57   private int maxCharsToAnalyze;
  58
  59   /**
  60    * @param query Query to use for highlighting
  61    */
  62   public QueryScorer(Query query) {
  63     init(query, null, null, true);
  64   }
  65
  66   /**
  67    * @param query Query to use for highlighting
  68    * @param field Field to highlight - pass null to ignore fields
  69    */
  70   public QueryScorer(Query query, String field) {
  71     init(query, field, null, true);
  72   }
  73
  74   /**
  75    * @param query Query to use for highlighting
  76    * @param field Field to highlight - pass null to ignore fields
  77    * @param reader {@link IndexReader} to use for quasi tf/idf scoring
  78    */
  79   public QueryScorer(Query query, IndexReader reader, String field) {
  80     init(query, field, reader, true);
  81   }
  82
  83
  84   /**
  85    * @param query to use for highlighting
  86    * @param reader {@link IndexReader} to use for quasi tf/idf scoring
  87    * @param field to highlight - pass null to ignore fields
  88    * @param defaultField
  89    */
  90   public QueryScorer(Query query, IndexReader reader, String field, String defaultField) {
  91     this.defaultField = StringHelper.intern(defaultField);
  92     init(query, field, reader, true);
  93   }
  94
  95   /**
  96    * @param defaultField - The default field for queries with the field name unspecified
  97    */
  98   public QueryScorer(Query query, String field, String defaultField) {
  99     this.defaultField = StringHelper.intern(defaultField);
 100     init(query, field, null, true);
 101   }
 102
 103   /**
 104    * @param weightedTerms an array of pre-created {@link WeightedSpanTerm}s
 105    */
 106   public QueryScorer(WeightedSpanTerm[] weightedTerms) {
 107     this.fieldWeightedSpanTerms = new HashMap<String,WeightedSpanTerm>(weightedTerms.length);
 108
 109     for (int i = 0; i < weightedTerms.length; i++) {
 110       WeightedSpanTerm existingTerm = fieldWeightedSpanTerms.get(weightedTerms[i].term);
 111
 112       if ((existingTerm == null) ||
 113             (existingTerm.weight < weightedTerms[i].weight)) {
 114         // if a term is defined more than once, always use the highest
 115         // scoring weight
 116         fieldWeightedSpanTerms.put(weightedTerms[i].term, weightedTerms[i]);
 117         maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight());
 118       }
 119     }
 120     skipInitExtractor = true;
 121   }
 122
 123   /*
 124    * (non-Javadoc)
 125    *
 126    * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
 127    */
 128   public float getFragmentScore() {
 129     return totalScore;
 130   }
 131
 132   /**
 133    *
 134    * @return The highest weighted term (useful for passing to
 135    *         GradientFormatter to set top end of coloring scale).
 136    */
 137   public float getMaxTermWeight() {
 138     return maxTermWeight;
 139   }
 140
 141   /*
 142    * (non-Javadoc)
 143    *
 144    * @see org.apache.lucene.search.highlight.Scorer#getTokenScore(org.apache.lucene.analysis.Token,
 145    *      int)
 146    */
 147   public float getTokenScore() {
 148     position += posIncAtt.getPositionIncrement();
 149     String termText = termAtt.toString();
 150
 151     WeightedSpanTerm weightedSpanTerm;
 152
 153     if ((weightedSpanTerm = fieldWeightedSpanTerms.get(
 154               termText)) == null) {
 155       return 0;
 156     }
 157
 158     if (weightedSpanTerm.positionSensitive &&
 159           !weightedSpanTerm.checkPosition(position)) {
 160       return 0;
 161     }
 162
 163     float score = weightedSpanTerm.getWeight();
 164
 165     // found a query term - is it unique in this doc?
 166     if (!foundTerms.contains(termText)) {
 167       totalScore += score;
 168       foundTerms.add(termText);
 169     }
 170
 171     return score;
 172   }
 173
 174   /* (non-Javadoc)
 175    * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream)
 176    */
 177   public TokenStream init(TokenStream tokenStream) throws IOException {
 178     position = -1;
 179     termAtt = tokenStream.addAttribute(CharTermAttribute.class);
 180     posIncAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
 181     if(!skipInitExtractor) {
 182       if(fieldWeightedSpanTerms != null) {
 183         fieldWeightedSpanTerms.clear();
 184       }
 185       return initExtractor(tokenStream);
 186     }
 187     return null;
 188   }
 189
 190   /**
 191    * Retrieve the {@link WeightedSpanTerm} for the specified token. Useful for passing
 192    * Span information to a {@link Fragmenter}.
 193    *
 194    * @param token to get {@link WeightedSpanTerm} for
 195    * @return WeightedSpanTerm for token
 196    */
 197   public WeightedSpanTerm getWeightedSpanTerm(String token) {
 198     return fieldWeightedSpanTerms.get(token);
 199   }
 200
 201   /**
 202    */
 203   private void init(Query query, String field, IndexReader reader, boolean expandMultiTermQuery) {
 204     this.reader = reader;
 205     this.expandMultiTermQuery = expandMultiTermQuery;
 206     this.query = query;
 207     this.field = field;
 208   }
 209
 210   private TokenStream initExtractor(TokenStream tokenStream) throws IOException {
 211     WeightedSpanTermExtractor qse = defaultField == null ? new WeightedSpanTermExtractor()
 212         : new WeightedSpanTermExtractor(defaultField);
 213     qse.setMaxDocCharsToAnalyze(maxCharsToAnalyze);
 214     qse.setExpandMultiTermQuery(expandMultiTermQuery);
 215     qse.setWrapIfNotCachingTokenFilter(wrapToCaching);
 216     if (reader == null) {
 217       this.fieldWeightedSpanTerms = qse.getWeightedSpanTerms(query,
 218           tokenStream, field);
 219     } else {
 220       this.fieldWeightedSpanTerms = qse.getWeightedSpanTermsWithScores(query,
 221           tokenStream, field, reader);
 222     }
 223     if(qse.isCachedTokenStream()) {
 224       return qse.getTokenStream();
 225     }
 226
 227     return null;
 228   }
 229
 230   /*
 231    * (non-Javadoc)
 232    *
 233    * @see org.apache.lucene.search.highlight.Scorer#startFragment(org.apache.lucene.search.highlight.TextFragment)
 234    */
 235   public void startFragment(TextFragment newFragment) {
 236     foundTerms = new HashSet<String>();
 237     totalScore = 0;
 238   }
 239
 240   /**
 241    * @return true if multi-term queries should be expanded
 242    */
 243   public boolean isExpandMultiTermQuery() {
 244     return expandMultiTermQuery;
 245   }
 246
 247   /**
 248    * Controls whether or not multi-term queries are expanded
 249    * against a {@link MemoryIndex} {@link IndexReader}.
 250    *
 251    * @param expandMultiTermQuery true if multi-term queries should be expanded
 252    */
 253   public void setExpandMultiTermQuery(boolean expandMultiTermQuery) {
 254     this.expandMultiTermQuery = expandMultiTermQuery;
 255   }
 256
 257   /**
 258    * By default, {@link TokenStream}s that are not of the type
 259    * {@link CachingTokenFilter} are wrapped in a {@link CachingTokenFilter} to
 260    * ensure an efficient reset - if you are already using a different caching
 261    * {@link TokenStream} impl and you don't want it to be wrapped, set this to
 262    * false.
 263    *
 264    * @param wrap
 265    */
 266   public void setWrapIfNotCachingTokenFilter(boolean wrap) {
 267     this.wrapToCaching = wrap;
 268   }
 269
 270   public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
 271     this.maxCharsToAnalyze = maxDocCharsToAnalyze;
 272   }
 273 }