+++ /dev/null
-package org.apache.lucene.search.highlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Iterator;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.util.PriorityQueue;
-
-/**
- * Class used to markup highlighted terms found in the best sections of a
- * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
- * {@link Encoder} and tokenizers.
- */
-public class Highlighter
-{
- public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
-
- private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
- private Formatter formatter;
- private Encoder encoder;
- private Fragmenter textFragmenter=new SimpleFragmenter();
- private Scorer fragmentScorer=null;
-
- public Highlighter(Scorer fragmentScorer)
- {
- this(new SimpleHTMLFormatter(),fragmentScorer);
- }
-
-
- public Highlighter(Formatter formatter, Scorer fragmentScorer)
- {
- this(formatter,new DefaultEncoder(),fragmentScorer);
- }
-
-
- public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
- {
- this.formatter = formatter;
- this.encoder = encoder;
- this.fragmentScorer = fragmentScorer;
- }
-
- /**
- * Highlights chosen terms in a text, extracting the most relevant section.
- * This is a convenience method that calls
- * {@link #getBestFragment(TokenStream, String)}
- *
- * @param analyzer the analyzer that will be used to split <code>text</code>
- * into chunks
- * @param text text to highlight terms in
- * @param fieldName Name of field used to influence analyzer's tokenization policy
- *
- * @return highlighted text fragment or null if no terms found
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
- */
- public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
- throws IOException, InvalidTokenOffsetsException
- {
- TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
- return getBestFragment(tokenStream, text);
- }
-
- /**
- * Highlights chosen terms in a text, extracting the most relevant section.
- * The document text is analysed in chunks to record hit statistics
- * across the document. After accumulating stats, the fragment with the highest score
- * is returned
- *
- * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
- * This is typically produced by an analyzer re-parsing a document's
- * text. Some work may be done on retrieving TokenStreams more efficiently
- * by adding support for storing original text position data in the Lucene
- * index but this support is not currently available (as of Lucene 1.4 rc2).
- * @param text text to highlight terms in
- *
- * @return highlighted text fragment or null if no terms found
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
- */
- public final String getBestFragment(TokenStream tokenStream, String text)
- throws IOException, InvalidTokenOffsetsException
- {
- String[] results = getBestFragments(tokenStream,text, 1);
- if (results.length > 0)
- {
- return results[0];
- }
- return null;
- }
-
- /**
- * Highlights chosen terms in a text, extracting the most relevant sections.
- * This is a convenience method that calls
- * {@link #getBestFragments(TokenStream, String, int)}
- *
- * @param analyzer the analyzer that will be used to split <code>text</code>
- * into chunks
- * @param fieldName the name of the field being highlighted (used by analyzer)
- * @param text text to highlight terms in
- * @param maxNumFragments the maximum number of fragments.
- *
- * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
- */
- public final String[] getBestFragments(
- Analyzer analyzer,
- String fieldName,
- String text,
- int maxNumFragments)
- throws IOException, InvalidTokenOffsetsException
- {
- TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
- return getBestFragments(tokenStream, text, maxNumFragments);
- }
-
- /**
- * Highlights chosen terms in a text, extracting the most relevant sections.
- * The document text is analysed in chunks to record hit statistics
- * across the document. After accumulating stats, the fragments with the highest scores
- * are returned as an array of strings in order of score (contiguous fragments are merged into
- * one in their original order to improve readability)
- *
- * @param text text to highlight terms in
- * @param maxNumFragments the maximum number of fragments.
- *
- * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
- */
- public final String[] getBestFragments(
- TokenStream tokenStream,
- String text,
- int maxNumFragments)
- throws IOException, InvalidTokenOffsetsException
- {
- maxNumFragments = Math.max(1, maxNumFragments); //sanity check
-
- TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
-
- //Get text
- ArrayList<String> fragTexts = new ArrayList<String>();
- for (int i = 0; i < frag.length; i++)
- {
- if ((frag[i] != null) && (frag[i].getScore() > 0))
- {
- fragTexts.add(frag[i].toString());
- }
- }
- return fragTexts.toArray(new String[0]);
- }
-
-
- /**
- * Low level api to get the most relevant (formatted) sections of the document.
- * This method has been made public to allow visibility of score information held in TextFragment objects.
- * Thanks to Jason Calabrese for help in redefining the interface.
- * @param tokenStream
- * @param text
- * @param maxNumFragments
- * @param mergeContiguousFragments
- * @throws IOException
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
- */
- public final TextFragment[] getBestTextFragments(
- TokenStream tokenStream,
- String text,
- boolean mergeContiguousFragments,
- int maxNumFragments)
- throws IOException, InvalidTokenOffsetsException
- {
- ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
- StringBuilder newText=new StringBuilder();
-
- CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
- OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
- tokenStream.addAttribute(PositionIncrementAttribute.class);
- tokenStream.reset();
-
- TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
-
- if (fragmentScorer instanceof QueryScorer) {
- ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
- }
-
- TokenStream newStream = fragmentScorer.init(tokenStream);
- if(newStream != null) {
- tokenStream = newStream;
- }
- fragmentScorer.startFragment(currentFrag);
- docFrags.add(currentFrag);
-
- FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
-
- try
- {
-
- String tokenText;
- int startOffset;
- int endOffset;
- int lastEndOffset = 0;
- textFragmenter.start(text, tokenStream);
-
- TokenGroup tokenGroup=new TokenGroup(tokenStream);
-
- for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
- next = tokenStream.incrementToken())
- {
- if( (offsetAtt.endOffset()>text.length())
- ||
- (offsetAtt.startOffset()>text.length())
- )
- {
- throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
- +" exceeds length of provided text sized "+text.length());
- }
- if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
- {
- //the current token is distinct from previous tokens -
- // markup the cached token group info
- startOffset = tokenGroup.matchStartOffset;
- endOffset = tokenGroup.matchEndOffset;
- tokenText = text.substring(startOffset, endOffset);
- String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
- //store any whitespace etc from between this and last group
- if (startOffset > lastEndOffset)
- newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
- newText.append(markedUpText);
- lastEndOffset=Math.max(endOffset, lastEndOffset);
- tokenGroup.clear();
-
- //check if current token marks the start of a new fragment
- if(textFragmenter.isNewFragment())
- {
- currentFrag.setScore(fragmentScorer.getFragmentScore());
- //record stats for a new fragment
- currentFrag.textEndPos = newText.length();
- currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
- fragmentScorer.startFragment(currentFrag);
- docFrags.add(currentFrag);
- }
- }
-
- tokenGroup.addToken(fragmentScorer.getTokenScore());
-
-// if(lastEndOffset>maxDocBytesToAnalyze)
-// {
-// break;
-// }
- }
- currentFrag.setScore(fragmentScorer.getFragmentScore());
-
- if(tokenGroup.numTokens>0)
- {
- //flush the accumulated text (same code as in above loop)
- startOffset = tokenGroup.matchStartOffset;
- endOffset = tokenGroup.matchEndOffset;
- tokenText = text.substring(startOffset, endOffset);
- String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
- //store any whitespace etc from between this and last group
- if (startOffset > lastEndOffset)
- newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
- newText.append(markedUpText);
- lastEndOffset=Math.max(lastEndOffset,endOffset);
- }
-
- //Test what remains of the original text beyond the point where we stopped analyzing
- if (
-// if there is text beyond the last token considered..
- (lastEndOffset < text.length())
- &&
-// and that text is not too large...
- (text.length()<= maxDocCharsToAnalyze)
- )
- {
- //append it to the last fragment
- newText.append(encoder.encodeText(text.substring(lastEndOffset)));
- }
-
- currentFrag.textEndPos = newText.length();
-
- //sort the most relevant sections of the text
- for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
- {
- currentFrag = i.next();
-
- //If you are running with a version of Lucene before 11th Sept 03
- // you do not have PriorityQueue.insert() - so uncomment the code below
- /*
- if (currentFrag.getScore() >= minScore)
- {
- fragQueue.put(currentFrag);
- if (fragQueue.size() > maxNumFragments)
- { // if hit queue overfull
- fragQueue.pop(); // remove lowest in hit queue
- minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
- }
-
-
- }
- */
- //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
- //fix to PriorityQueue. The correct method to use here is the new "insert" method
- // USE ABOVE CODE IF THIS DOES NOT COMPILE!
- fragQueue.insertWithOverflow(currentFrag);
- }
-
- //return the most relevant fragments
- TextFragment frag[] = new TextFragment[fragQueue.size()];
- for (int i = frag.length - 1; i >= 0; i--)
- {
- frag[i] = fragQueue.pop();
- }
-
- //merge any contiguous fragments to improve readability
- if(mergeContiguousFragments)
- {
- mergeContiguousFragments(frag);
- ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
- for (int i = 0; i < frag.length; i++)
- {
- if ((frag[i] != null) && (frag[i].getScore() > 0))
- {
- fragTexts.add(frag[i]);
- }
- }
- frag= fragTexts.toArray(new TextFragment[0]);
- }
-
- return frag;
-
- }
- finally
- {
- if (tokenStream != null)
- {
- try
- {
- tokenStream.end();
- tokenStream.close();
- }
- catch (Exception e)
- {
- }
- }
- }
- }
-
-
- /** Improves readability of a score-sorted list of TextFragments by merging any fragments
- * that were contiguous in the original text into one larger fragment with the correct order.
- * This will leave a "null" in the array entry for the lesser scored fragment.
- *
- * @param frag An array of document fragments in descending score
- */
- private void mergeContiguousFragments(TextFragment[] frag)
- {
- boolean mergingStillBeingDone;
- if (frag.length > 1)
- do
- {
- mergingStillBeingDone = false; //initialise loop control flag
- //for each fragment, scan other frags looking for contiguous blocks
- for (int i = 0; i < frag.length; i++)
- {
- if (frag[i] == null)
- {
- continue;
- }
- //merge any contiguous blocks
- for (int x = 0; x < frag.length; x++)
- {
- if (frag[x] == null)
- {
- continue;
- }
- if (frag[i] == null)
- {
- break;
- }
- TextFragment frag1 = null;
- TextFragment frag2 = null;
- int frag1Num = 0;
- int frag2Num = 0;
- int bestScoringFragNum;
- int worstScoringFragNum;
- //if blocks are contiguous....
- if (frag[i].follows(frag[x]))
- {
- frag1 = frag[x];
- frag1Num = x;
- frag2 = frag[i];
- frag2Num = i;
- }
- else
- if (frag[x].follows(frag[i]))
- {
- frag1 = frag[i];
- frag1Num = i;
- frag2 = frag[x];
- frag2Num = x;
- }
- //merging required..
- if (frag1 != null)
- {
- if (frag1.getScore() > frag2.getScore())
- {
- bestScoringFragNum = frag1Num;
- worstScoringFragNum = frag2Num;
- }
- else
- {
- bestScoringFragNum = frag2Num;
- worstScoringFragNum = frag1Num;
- }
- frag1.merge(frag2);
- frag[worstScoringFragNum] = null;
- mergingStillBeingDone = true;
- frag[bestScoringFragNum] = frag1;
- }
- }
- }
- }
- while (mergingStillBeingDone);
- }
-
-
- /**
- * Highlights terms in the text , extracting the most relevant sections
- * and concatenating the chosen fragments with a separator (typically "...").
- * The document text is analysed in chunks to record hit statistics
- * across the document. After accumulating stats, the fragments with the highest scores
- * are returned in order as "separator" delimited strings.
- *
- * @param text text to highlight terms in
- * @param maxNumFragments the maximum number of fragments.
- * @param separator the separator used to intersperse the document fragments (typically "...")
- *
- * @return highlighted text
- * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
- */
- public final String getBestFragments(
- TokenStream tokenStream,
- String text,
- int maxNumFragments,
- String separator)
- throws IOException, InvalidTokenOffsetsException
- {
- String sections[] = getBestFragments(tokenStream,text, maxNumFragments);
- StringBuilder result = new StringBuilder();
- for (int i = 0; i < sections.length; i++)
- {
- if (i > 0)
- {
- result.append(separator);
- }
- result.append(sections[i]);
- }
- return result.toString();
- }
-
- public int getMaxDocCharsToAnalyze() {
- return maxDocCharsToAnalyze;
- }
-
- public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
- this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
- }
-
-
- public Fragmenter getTextFragmenter()
- {
- return textFragmenter;
- }
-
- /**
- * @param fragmenter
- */
- public void setTextFragmenter(Fragmenter fragmenter)
- {
- textFragmenter = fragmenter;
- }
-
- /**
- * @return Object used to score each text fragment
- */
- public Scorer getFragmentScorer()
- {
- return fragmentScorer;
- }
-
-
- /**
- * @param scorer
- */
- public void setFragmentScorer(Scorer scorer)
- {
- fragmentScorer = scorer;
- }
-
- public Encoder getEncoder()
- {
- return encoder;
- }
- public void setEncoder(Encoder encoder)
- {
- this.encoder = encoder;
- }
-}
-class FragmentQueue extends PriorityQueue<TextFragment>
-{
- public FragmentQueue(int size)
- {
- initialize(size);
- }
-
- @Override
- public final boolean lessThan(TextFragment fragA, TextFragment fragB)
- {
- if (fragA.getScore() == fragB.getScore())
- return fragA.fragNum > fragB.fragNum;
- else
- return fragA.getScore() < fragB.getScore();
- }
-}