pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / highlighter / src / java / org / apache / lucene / search / highlight / Highlighter.java
1 package org.apache.lucene.search.highlight;
2 /**
3  * Licensed to the Apache Software Foundation (ASF) under one or more
4  * contributor license agreements.  See the NOTICE file distributed with
5  * this work for additional information regarding copyright ownership.
6  * The ASF licenses this file to You under the Apache License, Version 2.0
7  * (the "License"); you may not use this file except in compliance with
8  * the License.  You may obtain a copy of the License at
9  *
10  *     http://www.apache.org/licenses/LICENSE-2.0
11  *
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  */
18
19 import java.io.IOException;
20 import java.io.StringReader;
21 import java.util.ArrayList;
22 import java.util.Iterator;
23
24 import org.apache.lucene.analysis.Analyzer;
25 import org.apache.lucene.analysis.TokenStream;
26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
28 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
29 import org.apache.lucene.util.PriorityQueue;
30
31 /**
32  * Class used to markup highlighted terms found in the best sections of a
33  * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
34  * {@link Encoder} and tokenizers.
35  */
36 public class Highlighter
37 {
38   public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
39
40   private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
41         private Formatter formatter;
42         private Encoder encoder;
43         private Fragmenter textFragmenter=new SimpleFragmenter();
44         private Scorer fragmentScorer=null;
45
46         public Highlighter(Scorer fragmentScorer)
47         {
48                 this(new SimpleHTMLFormatter(),fragmentScorer);
49         }
50
51
52         public Highlighter(Formatter formatter, Scorer fragmentScorer)
53         {
54                 this(formatter,new DefaultEncoder(),fragmentScorer);
55         }
56
57
58         public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
59         {
60                 this.formatter = formatter;
61                 this.encoder = encoder;
62                 this.fragmentScorer = fragmentScorer;
63         }
64
65         /**
66          * Highlights chosen terms in a text, extracting the most relevant section.
67          * This is a convenience method that calls
68          * {@link #getBestFragment(TokenStream, String)}
69          *
70          * @param analyzer   the analyzer that will be used to split <code>text</code>
71          * into chunks
72          * @param text text to highlight terms in
73          * @param fieldName Name of field used to influence analyzer's tokenization policy
74          *
75          * @return highlighted text fragment or null if no terms found
76          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
77          */
78         public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
79                 throws IOException, InvalidTokenOffsetsException
80         {
81                 TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
82                 return getBestFragment(tokenStream, text);
83         }
84
85         /**
86          * Highlights chosen terms in a text, extracting the most relevant section.
87          * The document text is analysed in chunks to record hit statistics
88          * across the document. After accumulating stats, the fragment with the highest score
89          * is returned
90          *
91          * @param tokenStream   a stream of tokens identified in the text parameter, including offset information.
92          * This is typically produced by an analyzer re-parsing a document's
93          * text. Some work may be done on retrieving TokenStreams more efficiently
94          * by adding support for storing original text position data in the Lucene
95          * index but this support is not currently available (as of Lucene 1.4 rc2).
96          * @param text text to highlight terms in
97          *
98          * @return highlighted text fragment or null if no terms found
99          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
100          */
101         public final String getBestFragment(TokenStream tokenStream, String text)
102                 throws IOException, InvalidTokenOffsetsException
103         {
104                 String[] results = getBestFragments(tokenStream,text, 1);
105                 if (results.length > 0)
106                 {
107                         return results[0];
108                 }
109                 return null;
110         }
111
112         /**
113          * Highlights chosen terms in a text, extracting the most relevant sections.
114          * This is a convenience method that calls
115          * {@link #getBestFragments(TokenStream, String, int)}
116          *
117          * @param analyzer   the analyzer that will be used to split <code>text</code>
118          * into chunks
119          * @param fieldName     the name of the field being highlighted (used by analyzer)
120          * @param text          text to highlight terms in
121          * @param maxNumFragments  the maximum number of fragments.
122          *
123          * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
124          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
125          */
126         public final String[] getBestFragments(
127                 Analyzer analyzer,
128                 String fieldName,
129                 String text,
130                 int maxNumFragments)
131                 throws IOException, InvalidTokenOffsetsException
132         {
133                 TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
134                 return getBestFragments(tokenStream, text, maxNumFragments);
135         }
136
137         /**
138          * Highlights chosen terms in a text, extracting the most relevant sections.
139          * The document text is analysed in chunks to record hit statistics
140          * across the document. After accumulating stats, the fragments with the highest scores
141          * are returned as an array of strings in order of score (contiguous fragments are merged into
142          * one in their original order to improve readability)
143          *
144          * @param text          text to highlight terms in
145          * @param maxNumFragments  the maximum number of fragments.
146          *
147          * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
148          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
149          */
150         public final String[] getBestFragments(
151                 TokenStream tokenStream,
152                 String text,
153                 int maxNumFragments)
154                 throws IOException, InvalidTokenOffsetsException
155         {
156                 maxNumFragments = Math.max(1, maxNumFragments); //sanity check
157
158                 TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
159
160                 //Get text
161                 ArrayList<String> fragTexts = new ArrayList<String>();
162                 for (int i = 0; i < frag.length; i++)
163                 {
164                         if ((frag[i] != null) && (frag[i].getScore() > 0))
165                         {
166                                 fragTexts.add(frag[i].toString());
167                         }
168                 }
169                 return fragTexts.toArray(new String[0]);
170         }
171
172
173         /**
174          * Low level api to get the most relevant (formatted) sections of the document.
175          * This method has been made public to allow visibility of score information held in TextFragment objects.
176          * Thanks to Jason Calabrese for help in redefining the interface.
177          * @param tokenStream
178          * @param text
179          * @param maxNumFragments
180          * @param mergeContiguousFragments
181          * @throws IOException
182          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
183          */
184         public final TextFragment[] getBestTextFragments(
185                 TokenStream tokenStream,
186                 String text,
187                 boolean mergeContiguousFragments,
188                 int maxNumFragments)
189                 throws IOException, InvalidTokenOffsetsException
190         {
191                 ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
192                 StringBuilder newText=new StringBuilder();
193                 
194             CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
195             OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
196             tokenStream.addAttribute(PositionIncrementAttribute.class);
197             tokenStream.reset();
198             
199                 TextFragment currentFrag =      new TextFragment(newText,newText.length(), docFrags.size());
200                 
201     if (fragmentScorer instanceof QueryScorer) {
202       ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
203     }
204     
205                 TokenStream newStream = fragmentScorer.init(tokenStream);
206                 if(newStream != null) {
207                   tokenStream = newStream;
208                 }
209                 fragmentScorer.startFragment(currentFrag);
210                 docFrags.add(currentFrag);
211
212                 FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
213
214                 try
215                 {
216
217                         String tokenText;
218                         int startOffset;
219                         int endOffset;
220                         int lastEndOffset = 0;
221                         textFragmenter.start(text, tokenStream);
222
223                         TokenGroup tokenGroup=new TokenGroup(tokenStream);
224
225                         for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
226                               next = tokenStream.incrementToken())
227                         {
228                                 if(     (offsetAtt.endOffset()>text.length())
229                                         ||
230                                         (offsetAtt.startOffset()>text.length())
231                                         )                                               
232                                 {
233                                         throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
234                                                         +" exceeds length of provided text sized "+text.length());
235                                 }
236                                 if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
237                                 {
238                                         //the current token is distinct from previous tokens -
239                                         // markup the cached token group info
240                                         startOffset = tokenGroup.matchStartOffset;
241                                         endOffset = tokenGroup.matchEndOffset;
242                                         tokenText = text.substring(startOffset, endOffset);
243                                         String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
244                                         //store any whitespace etc from between this and last group
245                                         if (startOffset > lastEndOffset)
246                                                 newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
247                                         newText.append(markedUpText);
248                                         lastEndOffset=Math.max(endOffset, lastEndOffset);
249                                         tokenGroup.clear();
250
251                                         //check if current token marks the start of a new fragment
252                                         if(textFragmenter.isNewFragment())
253                                         {
254                                                 currentFrag.setScore(fragmentScorer.getFragmentScore());
255                                                 //record stats for a new fragment
256                                                 currentFrag.textEndPos = newText.length();
257                                                 currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
258                                                 fragmentScorer.startFragment(currentFrag);
259                                                 docFrags.add(currentFrag);
260                                         }
261                                 }
262
263                                 tokenGroup.addToken(fragmentScorer.getTokenScore());
264
265 //                              if(lastEndOffset>maxDocBytesToAnalyze)
266 //                              {
267 //                                      break;
268 //                              }
269                         }
270                         currentFrag.setScore(fragmentScorer.getFragmentScore());
271
272                         if(tokenGroup.numTokens>0)
273                         {
274                                 //flush the accumulated text (same code as in above loop)
275                                 startOffset = tokenGroup.matchStartOffset;
276                                 endOffset = tokenGroup.matchEndOffset;
277                                 tokenText = text.substring(startOffset, endOffset);
278                                 String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
279                                 //store any whitespace etc from between this and last group
280                                 if (startOffset > lastEndOffset)
281                                         newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
282                                 newText.append(markedUpText);
283                                 lastEndOffset=Math.max(lastEndOffset,endOffset);
284                         }
285
286                         //Test what remains of the original text beyond the point where we stopped analyzing 
287                         if (
288 //                                      if there is text beyond the last token considered..
289                                         (lastEndOffset < text.length()) 
290                                         &&
291 //                                      and that text is not too large...
292                                         (text.length()<= maxDocCharsToAnalyze)
293                                 )                               
294                         {
295                                 //append it to the last fragment
296                                 newText.append(encoder.encodeText(text.substring(lastEndOffset)));
297                         }
298
299                         currentFrag.textEndPos = newText.length();
300
301                         //sort the most relevant sections of the text
302                         for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
303                         {
304                                 currentFrag = i.next();
305
306                                 //If you are running with a version of Lucene before 11th Sept 03
307                                 // you do not have PriorityQueue.insert() - so uncomment the code below
308                                 /*
309                                                                         if (currentFrag.getScore() >= minScore)
310                                                                         {
311                                                                                 fragQueue.put(currentFrag);
312                                                                                 if (fragQueue.size() > maxNumFragments)
313                                                                                 { // if hit queue overfull
314                                                                                         fragQueue.pop(); // remove lowest in hit queue
315                                                                                         minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
316                                                                                 }
317
318
319                                                                         }
320                                 */
321                                 //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
322                                 //fix to PriorityQueue. The correct method to use here is the new "insert" method
323                                 // USE ABOVE CODE IF THIS DOES NOT COMPILE!
324                                 fragQueue.insertWithOverflow(currentFrag);
325                         }
326
327                         //return the most relevant fragments
328                         TextFragment frag[] = new TextFragment[fragQueue.size()];
329                         for (int i = frag.length - 1; i >= 0; i--)
330                         {
331                                 frag[i] = fragQueue.pop();
332                         }
333
334                         //merge any contiguous fragments to improve readability
335                         if(mergeContiguousFragments)
336                         {
337                                 mergeContiguousFragments(frag);
338                                 ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
339                                 for (int i = 0; i < frag.length; i++)
340                                 {
341                                         if ((frag[i] != null) && (frag[i].getScore() > 0))
342                                         {
343                                                 fragTexts.add(frag[i]);
344                                         }
345                                 }
346                                 frag= fragTexts.toArray(new TextFragment[0]);
347                         }
348
349                         return frag;
350
351                 }
352                 finally
353                 {
354                         if (tokenStream != null)
355                         {
356                                 try
357                                 {
358                                   tokenStream.end();
359                                         tokenStream.close();
360                                 }
361                                 catch (Exception e)
362                                 {
363                                 }
364                         }
365                 }
366         }
367
368
369         /** Improves readability of a score-sorted list of TextFragments by merging any fragments
370          * that were contiguous in the original text into one larger fragment with the correct order.
371          * This will leave a "null" in the array entry for the lesser scored fragment. 
372          * 
373          * @param frag An array of document fragments in descending score
374          */
375         private void mergeContiguousFragments(TextFragment[] frag)
376         {
377                 boolean mergingStillBeingDone;
378                 if (frag.length > 1)
379                         do
380                         {
381                                 mergingStillBeingDone = false; //initialise loop control flag
382                                 //for each fragment, scan other frags looking for contiguous blocks
383                                 for (int i = 0; i < frag.length; i++)
384                                 {
385                                         if (frag[i] == null)
386                                         {
387                                                 continue;
388                                         }
389                                         //merge any contiguous blocks 
390                                         for (int x = 0; x < frag.length; x++)
391                                         {
392                                                 if (frag[x] == null)
393                                                 {
394                                                         continue;
395                                                 }
396                                                 if (frag[i] == null)
397                                                 {
398                                                         break;
399                                                 }
400                                                 TextFragment frag1 = null;
401                                                 TextFragment frag2 = null;
402                                                 int frag1Num = 0;
403                                                 int frag2Num = 0;
404                                                 int bestScoringFragNum;
405                                                 int worstScoringFragNum;
406                                                 //if blocks are contiguous....
407                                                 if (frag[i].follows(frag[x]))
408                                                 {
409                                                         frag1 = frag[x];
410                                                         frag1Num = x;
411                                                         frag2 = frag[i];
412                                                         frag2Num = i;
413                                                 }
414                                                 else
415                                                         if (frag[x].follows(frag[i]))
416                                                         {
417                                                                 frag1 = frag[i];
418                                                                 frag1Num = i;
419                                                                 frag2 = frag[x];
420                                                                 frag2Num = x;
421                                                         }
422                                                 //merging required..
423                                                 if (frag1 != null)
424                                                 {
425                                                         if (frag1.getScore() > frag2.getScore())
426                                                         {
427                                                                 bestScoringFragNum = frag1Num;
428                                                                 worstScoringFragNum = frag2Num;
429                                                         }
430                                                         else
431                                                         {
432                                                                 bestScoringFragNum = frag2Num;
433                                                                 worstScoringFragNum = frag1Num;
434                                                         }
435                                                         frag1.merge(frag2);
436                                                         frag[worstScoringFragNum] = null;
437                                                         mergingStillBeingDone = true;
438                                                         frag[bestScoringFragNum] = frag1;
439                                                 }
440                                         }
441                                 }
442                         }
443                         while (mergingStillBeingDone);
444         }
445         
446         
447         /**
448          * Highlights terms in the  text , extracting the most relevant sections
449          * and concatenating the chosen fragments with a separator (typically "...").
450          * The document text is analysed in chunks to record hit statistics
451          * across the document. After accumulating stats, the fragments with the highest scores
452          * are returned in order as "separator" delimited strings.
453          *
454          * @param text        text to highlight terms in
455          * @param maxNumFragments  the maximum number of fragments.
456          * @param separator  the separator used to intersperse the document fragments (typically "...")
457          *
458          * @return highlighted text
459          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
460          */
461         public final String getBestFragments(
462                 TokenStream tokenStream,        
463                 String text,
464                 int maxNumFragments,
465                 String separator)
466                 throws IOException, InvalidTokenOffsetsException
467         {
468                 String sections[] =     getBestFragments(tokenStream,text, maxNumFragments);
469                 StringBuilder result = new StringBuilder();
470                 for (int i = 0; i < sections.length; i++)
471                 {
472                         if (i > 0)
473                         {
474                                 result.append(separator);
475                         }
476                         result.append(sections[i]);
477                 }
478                 return result.toString();
479         }
480
481   public int getMaxDocCharsToAnalyze() {
482     return maxDocCharsToAnalyze;
483   }
484
485   public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
486     this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
487   }
488
489   
490         public Fragmenter getTextFragmenter()
491         {
492                 return textFragmenter;
493         }
494
495         /**
496          * @param fragmenter
497          */
498         public void setTextFragmenter(Fragmenter fragmenter)
499         {
500                 textFragmenter = fragmenter;
501         }
502
503         /**
504          * @return Object used to score each text fragment 
505          */
506         public Scorer getFragmentScorer()
507         {
508                 return fragmentScorer;
509         }
510
511
512         /**
513          * @param scorer
514          */
515         public void setFragmentScorer(Scorer scorer)
516         {
517                 fragmentScorer = scorer;
518         }
519
520     public Encoder getEncoder()
521     {
522         return encoder;
523     }
524     public void setEncoder(Encoder encoder)
525     {
526         this.encoder = encoder;
527     }
528 }
529 class FragmentQueue extends PriorityQueue<TextFragment>
530 {
531         public FragmentQueue(int size)
532         {
533                 initialize(size);
534         }
535
536         @Override
537         public final boolean lessThan(TextFragment fragA, TextFragment fragB)
538         {
539                 if (fragA.getScore() == fragB.getScore())
540                         return fragA.fragNum > fragB.fragNum;
541                 else
542                         return fragA.getScore() < fragB.getScore();
543         }
544 }