lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java

   1 package org.apache.lucene.search.highlight;
   2 /**
   3  * Licensed to the Apache Software Foundation (ASF) under one or more
   4  * contributor license agreements.  See the NOTICE file distributed with
   5  * this work for additional information regarding copyright ownership.
   6  * The ASF licenses this file to You under the Apache License, Version 2.0
   7  * (the "License"); you may not use this file except in compliance with
   8  * the License.  You may obtain a copy of the License at
   9  *
  10  *     http://www.apache.org/licenses/LICENSE-2.0
  11  *
  12  * Unless required by applicable law or agreed to in writing, software
  13  * distributed under the License is distributed on an "AS IS" BASIS,
  14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15  * See the License for the specific language governing permissions and
  16  * limitations under the License.
  17  */
  18
  19 import java.io.IOException;
  20 import java.io.StringReader;
  21 import java.util.ArrayList;
  22 import java.util.Iterator;
  23
  24 import org.apache.lucene.analysis.Analyzer;
  25 import org.apache.lucene.analysis.TokenStream;
  26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  27 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  28 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  29 import org.apache.lucene.util.PriorityQueue;
  30
  31 /**
  32  * Class used to markup highlighted terms found in the best sections of a
  33  * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
  34  * {@link Encoder} and tokenizers.
  35  */
  36 public class Highlighter
  37 {
  38   public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
  39
  40   private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
  41         private Formatter formatter;
  42         private Encoder encoder;
  43         private Fragmenter textFragmenter=new SimpleFragmenter();
  44         private Scorer fragmentScorer=null;
  45
  46         public Highlighter(Scorer fragmentScorer)
  47         {
  48                 this(new SimpleHTMLFormatter(),fragmentScorer);
  49         }
  50
  51
  52         public Highlighter(Formatter formatter, Scorer fragmentScorer)
  53         {
  54                 this(formatter,new DefaultEncoder(),fragmentScorer);
  55         }
  56
  57
  58         public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
  59         {
  60                 this.formatter = formatter;
  61                 this.encoder = encoder;
  62                 this.fragmentScorer = fragmentScorer;
  63         }
  64
  65         /**
  66          * Highlights chosen terms in a text, extracting the most relevant section.
  67          * This is a convenience method that calls
  68          * {@link #getBestFragment(TokenStream, String)}
  69          *
  70          * @param analyzer   the analyzer that will be used to split <code>text</code>
  71          * into chunks
  72          * @param text text to highlight terms in
  73          * @param fieldName Name of field used to influence analyzer's tokenization policy
  74          *
  75          * @return highlighted text fragment or null if no terms found
  76          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
  77          */
  78         public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
  79                 throws IOException, InvalidTokenOffsetsException
  80         {
  81                 TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
  82                 return getBestFragment(tokenStream, text);
  83         }
  84
  85         /**
  86          * Highlights chosen terms in a text, extracting the most relevant section.
  87          * The document text is analysed in chunks to record hit statistics
  88          * across the document. After accumulating stats, the fragment with the highest score
  89          * is returned
  90          *
  91          * @param tokenStream   a stream of tokens identified in the text parameter, including offset information.
  92          * This is typically produced by an analyzer re-parsing a document's
  93          * text. Some work may be done on retrieving TokenStreams more efficiently
  94          * by adding support for storing original text position data in the Lucene
  95          * index but this support is not currently available (as of Lucene 1.4 rc2).
  96          * @param text text to highlight terms in
  97          *
  98          * @return highlighted text fragment or null if no terms found
  99          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 100          */
 101         public final String getBestFragment(TokenStream tokenStream, String text)
 102                 throws IOException, InvalidTokenOffsetsException
 103         {
 104                 String[] results = getBestFragments(tokenStream,text, 1);
 105                 if (results.length > 0)
 106                 {
 107                         return results[0];
 108                 }
 109                 return null;
 110         }
 111
 112         /**
 113          * Highlights chosen terms in a text, extracting the most relevant sections.
 114          * This is a convenience method that calls
 115          * {@link #getBestFragments(TokenStream, String, int)}
 116          *
 117          * @param analyzer   the analyzer that will be used to split <code>text</code>
 118          * into chunks
 119          * @param fieldName     the name of the field being highlighted (used by analyzer)
 120          * @param text          text to highlight terms in
 121          * @param maxNumFragments  the maximum number of fragments.
 122          *
 123          * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
 124          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 125          */
 126         public final String[] getBestFragments(
 127                 Analyzer analyzer,
 128                 String fieldName,
 129                 String text,
 130                 int maxNumFragments)
 131                 throws IOException, InvalidTokenOffsetsException
 132         {
 133                 TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
 134                 return getBestFragments(tokenStream, text, maxNumFragments);
 135         }
 136
 137         /**
 138          * Highlights chosen terms in a text, extracting the most relevant sections.
 139          * The document text is analysed in chunks to record hit statistics
 140          * across the document. After accumulating stats, the fragments with the highest scores
 141          * are returned as an array of strings in order of score (contiguous fragments are merged into
 142          * one in their original order to improve readability)
 143          *
 144          * @param text          text to highlight terms in
 145          * @param maxNumFragments  the maximum number of fragments.
 146          *
 147          * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
 148          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 149          */
 150         public final String[] getBestFragments(
 151                 TokenStream tokenStream,
 152                 String text,
 153                 int maxNumFragments)
 154                 throws IOException, InvalidTokenOffsetsException
 155         {
 156                 maxNumFragments = Math.max(1, maxNumFragments); //sanity check
 157
 158                 TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
 159
 160                 //Get text
 161                 ArrayList<String> fragTexts = new ArrayList<String>();
 162                 for (int i = 0; i < frag.length; i++)
 163                 {
 164                         if ((frag[i] != null) && (frag[i].getScore() > 0))
 165                         {
 166                                 fragTexts.add(frag[i].toString());
 167                         }
 168                 }
 169                 return fragTexts.toArray(new String[0]);
 170         }
 171
 172
 173         /**
 174          * Low level api to get the most relevant (formatted) sections of the document.
 175          * This method has been made public to allow visibility of score information held in TextFragment objects.
 176          * Thanks to Jason Calabrese for help in redefining the interface.
 177          * @param tokenStream
 178          * @param text
 179          * @param maxNumFragments
 180          * @param mergeContiguousFragments
 181          * @throws IOException
 182          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 183          */
 184         public final TextFragment[] getBestTextFragments(
 185                 TokenStream tokenStream,
 186                 String text,
 187                 boolean mergeContiguousFragments,
 188                 int maxNumFragments)
 189                 throws IOException, InvalidTokenOffsetsException
 190         {
 191                 ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
 192                 StringBuilder newText=new StringBuilder();
 193
 194             CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
 195             OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
 196             tokenStream.addAttribute(PositionIncrementAttribute.class);
 197             tokenStream.reset();
 198
 199                 TextFragment currentFrag =      new TextFragment(newText,newText.length(), docFrags.size());
 200
 201     if (fragmentScorer instanceof QueryScorer) {
 202       ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
 203     }
 204
 205                 TokenStream newStream = fragmentScorer.init(tokenStream);
 206                 if(newStream != null) {
 207                   tokenStream = newStream;
 208                 }
 209                 fragmentScorer.startFragment(currentFrag);
 210                 docFrags.add(currentFrag);
 211
 212                 FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
 213
 214                 try
 215                 {
 216
 217                         String tokenText;
 218                         int startOffset;
 219                         int endOffset;
 220                         int lastEndOffset = 0;
 221                         textFragmenter.start(text, tokenStream);
 222
 223                         TokenGroup tokenGroup=new TokenGroup(tokenStream);
 224
 225                         for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
 226                               next = tokenStream.incrementToken())
 227                         {
 228                                 if(     (offsetAtt.endOffset()>text.length())
 229                                         ||
 230                                         (offsetAtt.startOffset()>text.length())
 231                                         )
 232                                 {
 233                                         throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
 234                                                         +" exceeds length of provided text sized "+text.length());
 235                                 }
 236                                 if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
 237                                 {
 238                                         //the current token is distinct from previous tokens -
 239                                         // markup the cached token group info
 240                                         startOffset = tokenGroup.matchStartOffset;
 241                                         endOffset = tokenGroup.matchEndOffset;
 242                                         tokenText = text.substring(startOffset, endOffset);
 243                                         String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
 244                                         //store any whitespace etc from between this and last group
 245                                         if (startOffset > lastEndOffset)
 246                                                 newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
 247                                         newText.append(markedUpText);
 248                                         lastEndOffset=Math.max(endOffset, lastEndOffset);
 249                                         tokenGroup.clear();
 250
 251                                         //check if current token marks the start of a new fragment
 252                                         if(textFragmenter.isNewFragment())
 253                                         {
 254                                                 currentFrag.setScore(fragmentScorer.getFragmentScore());
 255                                                 //record stats for a new fragment
 256                                                 currentFrag.textEndPos = newText.length();
 257                                                 currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
 258                                                 fragmentScorer.startFragment(currentFrag);
 259                                                 docFrags.add(currentFrag);
 260                                         }
 261                                 }
 262
 263                                 tokenGroup.addToken(fragmentScorer.getTokenScore());
 264
 265 //                              if(lastEndOffset>maxDocBytesToAnalyze)
 266 //                              {
 267 //                                      break;
 268 //                              }
 269                         }
 270                         currentFrag.setScore(fragmentScorer.getFragmentScore());
 271
 272                         if(tokenGroup.numTokens>0)
 273                         {
 274                                 //flush the accumulated text (same code as in above loop)
 275                                 startOffset = tokenGroup.matchStartOffset;
 276                                 endOffset = tokenGroup.matchEndOffset;
 277                                 tokenText = text.substring(startOffset, endOffset);
 278                                 String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
 279                                 //store any whitespace etc from between this and last group
 280                                 if (startOffset > lastEndOffset)
 281                                         newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
 282                                 newText.append(markedUpText);
 283                                 lastEndOffset=Math.max(lastEndOffset,endOffset);
 284                         }
 285
 286                         //Test what remains of the original text beyond the point where we stopped analyzing
 287                         if (
 288 //                                      if there is text beyond the last token considered..
 289                                         (lastEndOffset < text.length())
 290                                         &&
 291 //                                      and that text is not too large...
 292                                         (text.length()<= maxDocCharsToAnalyze)
 293                                 )
 294                         {
 295                                 //append it to the last fragment
 296                                 newText.append(encoder.encodeText(text.substring(lastEndOffset)));
 297                         }
 298
 299                         currentFrag.textEndPos = newText.length();
 300
 301                         //sort the most relevant sections of the text
 302                         for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
 303                         {
 304                                 currentFrag = i.next();
 305
 306                                 //If you are running with a version of Lucene before 11th Sept 03
 307                                 // you do not have PriorityQueue.insert() - so uncomment the code below
 308                                 /*
 309                                                                         if (currentFrag.getScore() >= minScore)
 310                                                                         {
 311                                                                                 fragQueue.put(currentFrag);
 312                                                                                 if (fragQueue.size() > maxNumFragments)
 313                                                                                 { // if hit queue overfull
 314                                                                                         fragQueue.pop(); // remove lowest in hit queue
 315                                                                                         minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
 316                                                                                 }
 317
 318
 319                                                                         }
 320                                 */
 321                                 //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
 322                                 //fix to PriorityQueue. The correct method to use here is the new "insert" method
 323                                 // USE ABOVE CODE IF THIS DOES NOT COMPILE!
 324                                 fragQueue.insertWithOverflow(currentFrag);
 325                         }
 326
 327                         //return the most relevant fragments
 328                         TextFragment frag[] = new TextFragment[fragQueue.size()];
 329                         for (int i = frag.length - 1; i >= 0; i--)
 330                         {
 331                                 frag[i] = fragQueue.pop();
 332                         }
 333
 334                         //merge any contiguous fragments to improve readability
 335                         if(mergeContiguousFragments)
 336                         {
 337                                 mergeContiguousFragments(frag);
 338                                 ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
 339                                 for (int i = 0; i < frag.length; i++)
 340                                 {
 341                                         if ((frag[i] != null) && (frag[i].getScore() > 0))
 342                                         {
 343                                                 fragTexts.add(frag[i]);
 344                                         }
 345                                 }
 346                                 frag= fragTexts.toArray(new TextFragment[0]);
 347                         }
 348
 349                         return frag;
 350
 351                 }
 352                 finally
 353                 {
 354                         if (tokenStream != null)
 355                         {
 356                                 try
 357                                 {
 358                                   tokenStream.end();
 359                                         tokenStream.close();
 360                                 }
 361                                 catch (Exception e)
 362                                 {
 363                                 }
 364                         }
 365                 }
 366         }
 367
 368
 369         /** Improves readability of a score-sorted list of TextFragments by merging any fragments
 370          * that were contiguous in the original text into one larger fragment with the correct order.
 371          * This will leave a "null" in the array entry for the lesser scored fragment.
 372          *
 373          * @param frag An array of document fragments in descending score
 374          */
 375         private void mergeContiguousFragments(TextFragment[] frag)
 376         {
 377                 boolean mergingStillBeingDone;
 378                 if (frag.length > 1)
 379                         do
 380                         {
 381                                 mergingStillBeingDone = false; //initialise loop control flag
 382                                 //for each fragment, scan other frags looking for contiguous blocks
 383                                 for (int i = 0; i < frag.length; i++)
 384                                 {
 385                                         if (frag[i] == null)
 386                                         {
 387                                                 continue;
 388                                         }
 389                                         //merge any contiguous blocks
 390                                         for (int x = 0; x < frag.length; x++)
 391                                         {
 392                                                 if (frag[x] == null)
 393                                                 {
 394                                                         continue;
 395                                                 }
 396                                                 if (frag[i] == null)
 397                                                 {
 398                                                         break;
 399                                                 }
 400                                                 TextFragment frag1 = null;
 401                                                 TextFragment frag2 = null;
 402                                                 int frag1Num = 0;
 403                                                 int frag2Num = 0;
 404                                                 int bestScoringFragNum;
 405                                                 int worstScoringFragNum;
 406                                                 //if blocks are contiguous....
 407                                                 if (frag[i].follows(frag[x]))
 408                                                 {
 409                                                         frag1 = frag[x];
 410                                                         frag1Num = x;
 411                                                         frag2 = frag[i];
 412                                                         frag2Num = i;
 413                                                 }
 414                                                 else
 415                                                         if (frag[x].follows(frag[i]))
 416                                                         {
 417                                                                 frag1 = frag[i];
 418                                                                 frag1Num = i;
 419                                                                 frag2 = frag[x];
 420                                                                 frag2Num = x;
 421                                                         }
 422                                                 //merging required..
 423                                                 if (frag1 != null)
 424                                                 {
 425                                                         if (frag1.getScore() > frag2.getScore())
 426                                                         {
 427                                                                 bestScoringFragNum = frag1Num;
 428                                                                 worstScoringFragNum = frag2Num;
 429                                                         }
 430                                                         else
 431                                                         {
 432                                                                 bestScoringFragNum = frag2Num;
 433                                                                 worstScoringFragNum = frag1Num;
 434                                                         }
 435                                                         frag1.merge(frag2);
 436                                                         frag[worstScoringFragNum] = null;
 437                                                         mergingStillBeingDone = true;
 438                                                         frag[bestScoringFragNum] = frag1;
 439                                                 }
 440                                         }
 441                                 }
 442                         }
 443                         while (mergingStillBeingDone);
 444         }
 445
 446
 447         /**
 448          * Highlights terms in the  text , extracting the most relevant sections
 449          * and concatenating the chosen fragments with a separator (typically "...").
 450          * The document text is analysed in chunks to record hit statistics
 451          * across the document. After accumulating stats, the fragments with the highest scores
 452          * are returned in order as "separator" delimited strings.
 453          *
 454          * @param text        text to highlight terms in
 455          * @param maxNumFragments  the maximum number of fragments.
 456          * @param separator  the separator used to intersperse the document fragments (typically "...")
 457          *
 458          * @return highlighted text
 459          * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
 460          */
 461         public final String getBestFragments(
 462                 TokenStream tokenStream,
 463                 String text,
 464                 int maxNumFragments,
 465                 String separator)
 466                 throws IOException, InvalidTokenOffsetsException
 467         {
 468                 String sections[] =     getBestFragments(tokenStream,text, maxNumFragments);
 469                 StringBuilder result = new StringBuilder();
 470                 for (int i = 0; i < sections.length; i++)
 471                 {
 472                         if (i > 0)
 473                         {
 474                                 result.append(separator);
 475                         }
 476                         result.append(sections[i]);
 477                 }
 478                 return result.toString();
 479         }
 480
 481   public int getMaxDocCharsToAnalyze() {
 482     return maxDocCharsToAnalyze;
 483   }
 484
 485   public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
 486     this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
 487   }
 488
 489
 490         public Fragmenter getTextFragmenter()
 491         {
 492                 return textFragmenter;
 493         }
 494
 495         /**
 496          * @param fragmenter
 497          */
 498         public void setTextFragmenter(Fragmenter fragmenter)
 499         {
 500                 textFragmenter = fragmenter;
 501         }
 502
 503         /**
 504          * @return Object used to score each text fragment
 505          */
 506         public Scorer getFragmentScorer()
 507         {
 508                 return fragmentScorer;
 509         }
 510
 511
 512         /**
 513          * @param scorer
 514          */
 515         public void setFragmentScorer(Scorer scorer)
 516         {
 517                 fragmentScorer = scorer;
 518         }
 519
 520     public Encoder getEncoder()
 521     {
 522         return encoder;
 523     }
 524     public void setEncoder(Encoder encoder)
 525     {
 526         this.encoder = encoder;
 527     }
 528 }
 529 class FragmentQueue extends PriorityQueue<TextFragment>
 530 {
 531         public FragmentQueue(int size)
 532         {
 533                 initialize(size);
 534         }
 535
 536         @Override
 537         public final boolean lessThan(TextFragment fragA, TextFragment fragB)
 538         {
 539                 if (fragA.getScore() == fragB.getScore())
 540                         return fragA.fragNum > fragB.fragNum;
 541                 else
 542                         return fragA.getScore() < fragB.getScore();
 543         }
 544 }