1 package org.apache.lucene.search.highlight;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.Iterator;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.PriorityQueue;
/**
 * Class used to markup highlighted terms found in the best sections of a
 * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
 * {@link Encoder} and tokenizers.
 */
36 public class Highlighter
38 public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
40 private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
41 private Formatter formatter;
42 private Encoder encoder;
43 private Fragmenter textFragmenter=new SimpleFragmenter();
44 private Scorer fragmentScorer=null;
46 public Highlighter(Scorer fragmentScorer)
48 this(new SimpleHTMLFormatter(),fragmentScorer);
52 public Highlighter(Formatter formatter, Scorer fragmentScorer)
54 this(formatter,new DefaultEncoder(),fragmentScorer);
58 public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
60 this.formatter = formatter;
61 this.encoder = encoder;
62 this.fragmentScorer = fragmentScorer;
66 * Highlights chosen terms in a text, extracting the most relevant section.
67 * This is a convenience method that calls
68 * {@link #getBestFragment(TokenStream, String)}
70 * @param analyzer the analyzer that will be used to split <code>text</code>
72 * @param text text to highlight terms in
73 * @param fieldName Name of field used to influence analyzer's tokenization policy
75 * @return highlighted text fragment or null if no terms found
76 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
78 public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
79 throws IOException, InvalidTokenOffsetsException
81 TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
82 return getBestFragment(tokenStream, text);
86 * Highlights chosen terms in a text, extracting the most relevant section.
87 * The document text is analysed in chunks to record hit statistics
88 * across the document. After accumulating stats, the fragment with the highest score
91 * @param tokenStream a stream of tokens identified in the text parameter, including offset information.
92 * This is typically produced by an analyzer re-parsing a document's
93 * text. Some work may be done on retrieving TokenStreams more efficiently
94 * by adding support for storing original text position data in the Lucene
95 * index but this support is not currently available (as of Lucene 1.4 rc2).
96 * @param text text to highlight terms in
98 * @return highlighted text fragment or null if no terms found
99 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
101 public final String getBestFragment(TokenStream tokenStream, String text)
102 throws IOException, InvalidTokenOffsetsException
104 String[] results = getBestFragments(tokenStream,text, 1);
105 if (results.length > 0)
113 * Highlights chosen terms in a text, extracting the most relevant sections.
114 * This is a convenience method that calls
115 * {@link #getBestFragments(TokenStream, String, int)}
117 * @param analyzer the analyzer that will be used to split <code>text</code>
119 * @param fieldName the name of the field being highlighted (used by analyzer)
120 * @param text text to highlight terms in
121 * @param maxNumFragments the maximum number of fragments.
123 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
124 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
126 public final String[] getBestFragments(
131 throws IOException, InvalidTokenOffsetsException
133 TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
134 return getBestFragments(tokenStream, text, maxNumFragments);
138 * Highlights chosen terms in a text, extracting the most relevant sections.
139 * The document text is analysed in chunks to record hit statistics
140 * across the document. After accumulating stats, the fragments with the highest scores
141 * are returned as an array of strings in order of score (contiguous fragments are merged into
142 * one in their original order to improve readability)
144 * @param text text to highlight terms in
145 * @param maxNumFragments the maximum number of fragments.
147 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
148 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
150 public final String[] getBestFragments(
151 TokenStream tokenStream,
154 throws IOException, InvalidTokenOffsetsException
156 maxNumFragments = Math.max(1, maxNumFragments); //sanity check
158 TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
161 ArrayList<String> fragTexts = new ArrayList<String>();
162 for (int i = 0; i < frag.length; i++)
164 if ((frag[i] != null) && (frag[i].getScore() > 0))
166 fragTexts.add(frag[i].toString());
169 return fragTexts.toArray(new String[0]);
174 * Low level api to get the most relevant (formatted) sections of the document.
175 * This method has been made public to allow visibility of score information held in TextFragment objects.
176 * Thanks to Jason Calabrese for help in redefining the interface.
179 * @param maxNumFragments
180 * @param mergeContiguousFragments
181 * @throws IOException
182 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
184 public final TextFragment[] getBestTextFragments(
185 TokenStream tokenStream,
187 boolean mergeContiguousFragments,
189 throws IOException, InvalidTokenOffsetsException
191 ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
192 StringBuilder newText=new StringBuilder();
194 CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
195 OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
196 tokenStream.addAttribute(PositionIncrementAttribute.class);
199 TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size());
201 if (fragmentScorer instanceof QueryScorer) {
202 ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
205 TokenStream newStream = fragmentScorer.init(tokenStream);
206 if(newStream != null) {
207 tokenStream = newStream;
209 fragmentScorer.startFragment(currentFrag);
210 docFrags.add(currentFrag);
212 FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
220 int lastEndOffset = 0;
221 textFragmenter.start(text, tokenStream);
223 TokenGroup tokenGroup=new TokenGroup(tokenStream);
225 for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
226 next = tokenStream.incrementToken())
228 if( (offsetAtt.endOffset()>text.length())
230 (offsetAtt.startOffset()>text.length())
233 throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
234 +" exceeds length of provided text sized "+text.length());
236 if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
238 //the current token is distinct from previous tokens -
239 // markup the cached token group info
240 startOffset = tokenGroup.matchStartOffset;
241 endOffset = tokenGroup.matchEndOffset;
242 tokenText = text.substring(startOffset, endOffset);
243 String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
244 //store any whitespace etc from between this and last group
245 if (startOffset > lastEndOffset)
246 newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
247 newText.append(markedUpText);
248 lastEndOffset=Math.max(endOffset, lastEndOffset);
251 //check if current token marks the start of a new fragment
252 if(textFragmenter.isNewFragment())
254 currentFrag.setScore(fragmentScorer.getFragmentScore());
255 //record stats for a new fragment
256 currentFrag.textEndPos = newText.length();
257 currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
258 fragmentScorer.startFragment(currentFrag);
259 docFrags.add(currentFrag);
263 tokenGroup.addToken(fragmentScorer.getTokenScore());
265 // if(lastEndOffset>maxDocBytesToAnalyze)
270 currentFrag.setScore(fragmentScorer.getFragmentScore());
272 if(tokenGroup.numTokens>0)
274 //flush the accumulated text (same code as in above loop)
275 startOffset = tokenGroup.matchStartOffset;
276 endOffset = tokenGroup.matchEndOffset;
277 tokenText = text.substring(startOffset, endOffset);
278 String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
279 //store any whitespace etc from between this and last group
280 if (startOffset > lastEndOffset)
281 newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
282 newText.append(markedUpText);
283 lastEndOffset=Math.max(lastEndOffset,endOffset);
286 //Test what remains of the original text beyond the point where we stopped analyzing
288 // if there is text beyond the last token considered..
289 (lastEndOffset < text.length())
291 // and that text is not too large...
292 (text.length()<= maxDocCharsToAnalyze)
295 //append it to the last fragment
296 newText.append(encoder.encodeText(text.substring(lastEndOffset)));
299 currentFrag.textEndPos = newText.length();
301 //sort the most relevant sections of the text
302 for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
304 currentFrag = i.next();
306 //If you are running with a version of Lucene before 11th Sept 03
307 // you do not have PriorityQueue.insert() - so uncomment the code below
309 if (currentFrag.getScore() >= minScore)
311 fragQueue.put(currentFrag);
312 if (fragQueue.size() > maxNumFragments)
313 { // if hit queue overfull
314 fragQueue.pop(); // remove lowest in hit queue
315 minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
321 //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
322 //fix to PriorityQueue. The correct method to use here is the new "insert" method
323 // USE ABOVE CODE IF THIS DOES NOT COMPILE!
324 fragQueue.insertWithOverflow(currentFrag);
327 //return the most relevant fragments
328 TextFragment frag[] = new TextFragment[fragQueue.size()];
329 for (int i = frag.length - 1; i >= 0; i--)
331 frag[i] = fragQueue.pop();
334 //merge any contiguous fragments to improve readability
335 if(mergeContiguousFragments)
337 mergeContiguousFragments(frag);
338 ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
339 for (int i = 0; i < frag.length; i++)
341 if ((frag[i] != null) && (frag[i].getScore() > 0))
343 fragTexts.add(frag[i]);
346 frag= fragTexts.toArray(new TextFragment[0]);
354 if (tokenStream != null)
369 /** Improves readability of a score-sorted list of TextFragments by merging any fragments
370 * that were contiguous in the original text into one larger fragment with the correct order.
371 * This will leave a "null" in the array entry for the lesser scored fragment.
373 * @param frag An array of document fragments in descending score
375 private void mergeContiguousFragments(TextFragment[] frag)
377 boolean mergingStillBeingDone;
381 mergingStillBeingDone = false; //initialise loop control flag
382 //for each fragment, scan other frags looking for contiguous blocks
383 for (int i = 0; i < frag.length; i++)
389 //merge any contiguous blocks
390 for (int x = 0; x < frag.length; x++)
400 TextFragment frag1 = null;
401 TextFragment frag2 = null;
404 int bestScoringFragNum;
405 int worstScoringFragNum;
406 //if blocks are contiguous....
407 if (frag[i].follows(frag[x]))
415 if (frag[x].follows(frag[i]))
425 if (frag1.getScore() > frag2.getScore())
427 bestScoringFragNum = frag1Num;
428 worstScoringFragNum = frag2Num;
432 bestScoringFragNum = frag2Num;
433 worstScoringFragNum = frag1Num;
436 frag[worstScoringFragNum] = null;
437 mergingStillBeingDone = true;
438 frag[bestScoringFragNum] = frag1;
443 while (mergingStillBeingDone);
448 * Highlights terms in the text , extracting the most relevant sections
449 * and concatenating the chosen fragments with a separator (typically "...").
450 * The document text is analysed in chunks to record hit statistics
451 * across the document. After accumulating stats, the fragments with the highest scores
452 * are returned in order as "separator" delimited strings.
454 * @param text text to highlight terms in
455 * @param maxNumFragments the maximum number of fragments.
456 * @param separator the separator used to intersperse the document fragments (typically "...")
458 * @return highlighted text
459 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
461 public final String getBestFragments(
462 TokenStream tokenStream,
466 throws IOException, InvalidTokenOffsetsException
468 String sections[] = getBestFragments(tokenStream,text, maxNumFragments);
469 StringBuilder result = new StringBuilder();
470 for (int i = 0; i < sections.length; i++)
474 result.append(separator);
476 result.append(sections[i]);
478 return result.toString();
481 public int getMaxDocCharsToAnalyze() {
482 return maxDocCharsToAnalyze;
485 public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
486 this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
490 public Fragmenter getTextFragmenter()
492 return textFragmenter;
498 public void setTextFragmenter(Fragmenter fragmenter)
500 textFragmenter = fragmenter;
504 * @return Object used to score each text fragment
506 public Scorer getFragmentScorer()
508 return fragmentScorer;
515 public void setFragmentScorer(Scorer scorer)
517 fragmentScorer = scorer;
520 public Encoder getEncoder()
524 public void setEncoder(Encoder encoder)
526 this.encoder = encoder;
529 class FragmentQueue extends PriorityQueue<TextFragment>
531 public FragmentQueue(int size)
537 public final boolean lessThan(TextFragment fragA, TextFragment fragB)
539 if (fragA.getScore() == fragB.getScore())
540 return fragA.fragNum > fragB.fragNum;
542 return fragA.getScore() < fragB.getScore();