X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java?ds=sidebyside diff --git a/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java b/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java new file mode 100644 index 0000000..167bf3d --- /dev/null +++ b/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/QueryTermScorer.java @@ -0,0 +1,162 @@ +package org.apache.lucene.search.highlight; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.util.HashMap; +import java.util.HashSet; + +import org.apache.lucene.analysis.TokenStream; +import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.search.Query; + +/** + * {@link Scorer} implementation which scores text fragments by the number of + * unique query terms found. This class uses the {@link QueryTermExtractor} + * class to process determine the query terms and their boosts to be used. + */ +// TODO: provide option to boost score of fragments near beginning of document +// based on fragment.getFragNum() +public class QueryTermScorer implements Scorer { + + TextFragment currentTextFragment = null; + HashSet uniqueTermsInFragment; + + float totalScore = 0; + float maxTermWeight = 0; + private HashMap termsToFind; + + private CharTermAttribute termAtt; + + /** + * + * @param query a Lucene query (ideally rewritten using query.rewrite before + * being passed to this class and the searcher) + */ + public QueryTermScorer(Query query) { + this(QueryTermExtractor.getTerms(query)); + } + + /** + * + * @param query a Lucene query (ideally rewritten using query.rewrite before + * being passed to this class and the searcher) + * @param fieldName the Field name which is used to match Query terms + */ + public QueryTermScorer(Query query, String fieldName) { + this(QueryTermExtractor.getTerms(query, false, fieldName)); + } + + /** + * + * @param query a Lucene query (ideally rewritten using query.rewrite before + * being passed to this class and the searcher) + * @param reader used to compute IDF which can be used to a) score selected + * fragments better b) use graded highlights eg set font color + * intensity + * @param fieldName the field on which Inverse Document Frequency (IDF) + * calculations are based + */ + public QueryTermScorer(Query query, IndexReader reader, String fieldName) { + this(QueryTermExtractor.getIdfWeightedTerms(query, reader, fieldName)); + } + + public QueryTermScorer(WeightedTerm[] weightedTerms) { + termsToFind = new HashMap(); + for (int i = 0; i < weightedTerms.length; i++) { + WeightedTerm existingTerm = termsToFind + .get(weightedTerms[i].term); + if ((existingTerm == null) + || (existingTerm.weight < weightedTerms[i].weight)) { + // if a term is defined more than once, always use the highest scoring + // weight + termsToFind.put(weightedTerms[i].term, weightedTerms[i]); + maxTermWeight = Math.max(maxTermWeight, weightedTerms[i].getWeight()); + } + } + } + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.Scorer#init(org.apache.lucene.analysis.TokenStream) + */ + public TokenStream init(TokenStream tokenStream) { + termAtt = tokenStream.addAttribute(CharTermAttribute.class); + return null; + } + + /* + * (non-Javadoc) + * + * @see + * org.apache.lucene.search.highlight.FragmentScorer#startFragment(org.apache + * .lucene.search.highlight.TextFragment) + */ + public void startFragment(TextFragment newFragment) { + uniqueTermsInFragment = new HashSet(); + currentTextFragment = newFragment; + totalScore = 0; + + } + + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.Scorer#getTokenScore() + */ + public float getTokenScore() { + String termText = termAtt.toString(); + + WeightedTerm queryTerm = termsToFind.get(termText); + if (queryTerm == null) { + // not a query term - return + return 0; + } + // found a query term - is it unique in this doc? + if (!uniqueTermsInFragment.contains(termText)) { + totalScore += queryTerm.getWeight(); + uniqueTermsInFragment.add(termText); + } + return queryTerm.getWeight(); + } + + + /* (non-Javadoc) + * @see org.apache.lucene.search.highlight.Scorer#getFragmentScore() + */ + public float getFragmentScore() { + return totalScore; + } + + /* + * (non-Javadoc) + * + * @see + * org.apache.lucene.search.highlight.FragmentScorer#allFragmentsProcessed() + */ + public void allFragmentsProcessed() { + // this class has no special operations to perform at end of processing + } + + /** + * + * @return The highest weighted term (useful for passing to GradientFormatter + * to set top end of coloring scale. + */ + public float getMaxTermWeight() { + return maxTermWeight; + } +}