lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java

   1 package org.apache.lucene.search.highlight;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.util.HashMap;
  21 import java.util.HashSet;
  22
  23 import org.apache.lucene.analysis.TokenStream;
  24 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  25 import org.apache.lucene.index.IndexReader;
  26 import org.apache.lucene.search.Query;
  27
  28 /**
  29  * {@link Scorer} implementation which scores text fragments by the number of
  30  * unique query terms found. This class uses the {@link QueryTermExtractor}
  31  * class to process determine the query terms and their boosts to be used.
  32  */
  33 // TODO: provide option to boost score of fragments near beginning of document
  34 // based on fragment.getFragNum()
  35 public class QueryTermScorer implements Scorer {
  36
  37   TextFragment currentTextFragment = null;
  38   HashSet<String> uniqueTermsInFragment;
  39
  40   float totalScore = 0;
  41   float maxTermWeight = 0;
  42   private HashMap<String,WeightedTerm> termsToFind;
  43
  44   private CharTermAttribute termAtt;
  45
  46   /**
  47    *
  48    * @param query a Lucene query (ideally rewritten using query.rewrite before
  49    *        being passed to this class and the searcher)
  50    */
  51   public QueryTermScorer(Query query) {
  52     this(QueryTermExtractor.getTerms(query));
  53   }
  54
  55   /**
  56    *
  57    * @param query a Lucene query (ideally rewritten using query.rewrite before
  58    *        being passed to this class and the searcher)
  59    * @param fieldName the Field name which is used to match Query terms
  60    */
  61   public QueryTermScorer(Query query, String fieldName) {
  62     this(QueryTermExtractor.getTerms(query, false, fieldName));
  63   }
  64
  65   /**
  66    *
  67    * @param query a Lucene query (ideally rewritten using query.rewrite before
  68    *        being passed to this class and the searcher)
  69    * @param reader used to compute IDF which can be used to a) score selected
  70    *        fragments better b) use graded highlights eg set font color
  71    *        intensity
  72    * @param fieldName the field on which Inverse Document Frequency (IDF)
  73    *        calculations are based
  74    */
  75   public QueryTermScorer(Query query, IndexReader reader, String fieldName) {
  76     this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName));
  77   }
  78
  79   public QueryTermScorer(WeightedTerm[] weightedTerms) {
  80     termsToFind = new HashMap<String,WeightedTerm>();
  81     for (int i = 0; i < weightedTerms.length; i++) {
  82       WeightedTerm existingTerm = termsToFind
  83           .get(weightedTerms[i].term);
  84       if ((existingTerm == null)
  85           || (existingTerm.weight < weightedTerms[i].weight)) {
  86         // if a term is defined more than once, always use the highest scoring
  87         // weight
  88         termsToFind.put(weightedTerms[i].term, weightedTerms[i]);
  89         maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight());
  90       }
  91     }
  92   }
  93
  94   /* (non-Javadoc)
  95    * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream)
  96    */
  97   public TokenStream init(TokenStream tokenStream) {
  98     termAtt = tokenStream.addAttribute(CharTermAttribute.class);
  99     return null;
 100   }
 101
 102   /*
 103    * (non-Javadoc)
 104    *
 105    * @see
 106    * org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache
 107    * .lucene.search.highlight.TextFragment)
 108    */
 109   public void startFragment(TextFragment newFragment) {
 110     uniqueTermsInFragment = new HashSet<String>();
 111     currentTextFragment = newFragment;
 112     totalScore = 0;
 113
 114   }
 115
 116
 117   /* (non-Javadoc)
 118    * @see org.apache.lucene.search.highlight.Scorer#getTokenScore()
 119    */
 120   public float getTokenScore() {
 121     String termText = termAtt.toString();
 122
 123     WeightedTerm queryTerm = termsToFind.get(termText);
 124     if (queryTerm == null) {
 125       // not a query term - return
 126       return 0;
 127     }
 128     // found a query term - is it unique in this doc?
 129     if (!uniqueTermsInFragment.contains(termText)) {
 130       totalScore += queryTerm.getWeight();
 131       uniqueTermsInFragment.add(termText);
 132     }
 133     return queryTerm.getWeight();
 134   }
 135
 136
 137   /* (non-Javadoc)
 138    * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore()
 139    */
 140   public float getFragmentScore() {
 141     return totalScore;
 142   }
 143
 144   /*
 145    * (non-Javadoc)
 146    *
 147    * @see
 148    * org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed()
 149    */
 150   public void allFragmentsProcessed() {
 151     // this class has no special operations to perform at end of processing
 152   }
 153
 154   /**
 155    *
 156    * @return The highest weighted term (useful for passing to GradientFormatter
 157    *         to set top end of coloring scale.
 158    */
 159   public float getMaxTermWeight() {
 160     return maxTermWeight;
 161   }
 162 }