pylucene 3.5.0-3
diff --git a/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
new file mode 100644 (file)
index 0000000..f424423
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
@@ -0,0 +1,544 @@
+package org.apache.lucene.search.highlight;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Class used to mark up highlighted terms found in the best sections of a
+ * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
+ * {@link Encoder} and tokenizers.
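+ * <p>
+ * A typical usage sketch, assuming a parsed {@link org.apache.lucene.search.Query}
+ * <code>query</code>, an <code>analyzer</code> and the stored text of a matching
+ * document in <code>text</code> (the field name "contents" is purely illustrative):
+ * <pre>
+ *   QueryScorer scorer = new QueryScorer(query);
+ *   Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
+ *   highlighter.setTextFragmenter(new SimpleFragmenter(100));
+ *   String fragment = highlighter.getBestFragment(analyzer, "contents", text);
+ *   // fragment is null if no terms from the query were found in text
+ * </pre>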
+ */
+public class Highlighter
+{
+       public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
+
+       private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
+       private Formatter formatter;
+       private Encoder encoder;
+       private Fragmenter textFragmenter = new SimpleFragmenter();
+       private Scorer fragmentScorer = null;
+
+       public Highlighter(Scorer fragmentScorer)
+       {
+               this(new SimpleHTMLFormatter(),fragmentScorer);
+       }
+
+
+       public Highlighter(Formatter formatter, Scorer fragmentScorer)
+       {
+               this(formatter,new DefaultEncoder(),fragmentScorer);
+       }
+
+
+       public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
+       {
+               this.formatter = formatter;
+               this.encoder = encoder;
+               this.fragmentScorer = fragmentScorer;
+       }
+
+       /**
+        * Highlights chosen terms in a text, extracting the most relevant section.
+        * This is a convenience method that calls
+        * {@link #getBestFragment(TokenStream, String)}
+        *
+        * @param analyzer   the analyzer that will be used to split <code>text</code>
+        * into chunks
+        * @param fieldName Name of field used to influence analyzer's tokenization policy
+        * @param text text to highlight terms in
+        *
+        * @return highlighted text fragment or null if no terms found
+        * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+        */
+       public final String getBestFragment(Analyzer analyzer, String fieldName, String text)
+               throws IOException, InvalidTokenOffsetsException
+       {
+               TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
+               return getBestFragment(tokenStream, text);
+       }
+
+       /**
+        * Highlights chosen terms in a text, extracting the most relevant section.
+        * The document text is analysed in chunks to record hit statistics
+        * across the document. After accumulating stats, the fragment with the highest score
+        * is returned
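+        * <p>
+        * A minimal sketch of re-analyzing stored text to obtain the stream (the
+        * <code>analyzer</code>, field name and <code>highlighter</code> instance are
+        * assumed to exist):
+        * <pre>
+        *   TokenStream ts = analyzer.reusableTokenStream("contents", new StringReader(text));
+        *   String fragment = highlighter.getBestFragment(ts, text);
+        * </pre>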
+        *
+        * @param tokenStream   a stream of tokens identified in the text parameter, including offset information.
+        * This is typically produced by an analyzer re-parsing a document's
+        * text. Some work may be done on retrieving TokenStreams more efficiently
+        * by adding support for storing original text position data in the Lucene
+        * index but this support is not currently available (as of Lucene 1.4 rc2).
+        * @param text text to highlight terms in
+        *
+        * @return highlighted text fragment or null if no terms found
+        * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+        */
+       public final String getBestFragment(TokenStream tokenStream, String text)
+               throws IOException, InvalidTokenOffsetsException
+       {
+               String[] results = getBestFragments(tokenStream,text, 1);
+               if (results.length > 0)
+               {
+                       return results[0];
+               }
+               return null;
+       }
+
+       /**
+        * Highlights chosen terms in a text, extracting the most relevant sections.
+        * This is a convenience method that calls
+        * {@link #getBestFragments(TokenStream, String, int)}
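+        * <p>
+        * Sketch, assuming an existing <code>highlighter</code>, <code>analyzer</code> and
+        * stored field text (the field name is illustrative):
+        * <pre>
+        *   String[] frags = highlighter.getBestFragments(analyzer, "contents", text, 3);
+        *   // frags.length may be anywhere between 0 and 3
+        * </pre>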
+        *
+        * @param analyzer   the analyzer that will be used to split <code>text</code>
+        * into chunks
+        * @param fieldName     the name of the field being highlighted (used by analyzer)
+        * @param text          text to highlight terms in
+        * @param maxNumFragments  the maximum number of fragments.
+        *
+        * @return highlighted text fragments (between 0 and maxNumFragments fragments)
+        * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+        */
+       public final String[] getBestFragments(
+               Analyzer analyzer,
+               String fieldName,
+               String text,
+               int maxNumFragments)
+               throws IOException, InvalidTokenOffsetsException
+       {
+               TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
+               return getBestFragments(tokenStream, text, maxNumFragments);
+       }
+
+       /**
+        * Highlights chosen terms in a text, extracting the most relevant sections.
+        * The document text is analysed in chunks to record hit statistics
+        * across the document. After accumulating stats, the fragments with the highest scores
+        * are returned as an array of strings in order of score (contiguous fragments are merged into
+        * one in their original order to improve readability)
+        *
+        * @param text          text to highlight terms in
+        * @param maxNumFragments  the maximum number of fragments.
+        *
+        * @return highlighted text fragments (between 0 and maxNumFragments fragments)
+        * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+        */
+       public final String[] getBestFragments(
+               TokenStream tokenStream,
+               String text,
+               int maxNumFragments)
+               throws IOException, InvalidTokenOffsetsException
+       {
+               maxNumFragments = Math.max(1, maxNumFragments); //sanity check
+
+               TextFragment[] frag = getBestTextFragments(tokenStream, text, true, maxNumFragments);
+
+               //Get text
+               ArrayList<String> fragTexts = new ArrayList<String>();
+               for (int i = 0; i < frag.length; i++)
+               {
+                       if ((frag[i] != null) && (frag[i].getScore() > 0))
+                       {
+                               fragTexts.add(frag[i].toString());
+                       }
+               }
+               return fragTexts.toArray(new String[0]);
+       }
+
+
+       /**
+        * Low-level API to get the most relevant (formatted) sections of the document.
+        * This method has been made public to allow visibility of score information held in TextFragment objects.
+        * Thanks to Jason Calabrese for help in redefining the interface.
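+        * <p>
+        * Sketch of inspecting fragment scores, assuming a prepared <code>TokenStream</code>
+        * <code>ts</code> and the corresponding <code>text</code>:
+        * <pre>
+        *   TextFragment[] frags = highlighter.getBestTextFragments(ts, text, true, 5);
+        *   for (TextFragment f : frags) {
+        *     if (f != null) {   // defensive null check, mirroring getBestFragments
+        *       System.out.println(f.getScore() + ": " + f.toString());
+        *     }
+        *   }
+        * </pre>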
+        * @param tokenStream   stream of tokens identified in the text, including offset information
+        * @param text          text to highlight terms in
+        * @param mergeContiguousFragments  true if fragments that were contiguous in the original text should be merged into one
+        * @param maxNumFragments  the maximum number of fragments to return
+        * @throws IOException
+        * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+        */
+       public final TextFragment[] getBestTextFragments(
+               TokenStream tokenStream,
+               String text,
+               boolean mergeContiguousFragments,
+               int maxNumFragments)
+               throws IOException, InvalidTokenOffsetsException
+       {
+               ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
+               StringBuilder newText = new StringBuilder();
+
+               CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
+               OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
+               tokenStream.addAttribute(PositionIncrementAttribute.class);
+               tokenStream.reset();
+
+               TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
+
+               if (fragmentScorer instanceof QueryScorer) {
+                       ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
+               }
+
+               TokenStream newStream = fragmentScorer.init(tokenStream);
+               if(newStream != null) {
+                 tokenStream = newStream;
+               }
+               fragmentScorer.startFragment(currentFrag);
+               docFrags.add(currentFrag);
+
+               FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
+
+               try
+               {
+
+                       String tokenText;
+                       int startOffset;
+                       int endOffset;
+                       int lastEndOffset = 0;
+                       textFragmenter.start(text, tokenStream);
+
+                       TokenGroup tokenGroup=new TokenGroup(tokenStream);
+
+                       for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
+                            next = tokenStream.incrementToken())
+                       {
+                               if ((offsetAtt.endOffset() > text.length())
+                                       || (offsetAtt.startOffset() > text.length()))
+                               {
+                                       throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
+                                                       + " exceeds length of provided text sized " + text.length());
+                               }
+                               if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
+                               {
+                                       //the current token is distinct from previous tokens -
+                                       // markup the cached token group info
+                                       startOffset = tokenGroup.matchStartOffset;
+                                       endOffset = tokenGroup.matchEndOffset;
+                                       tokenText = text.substring(startOffset, endOffset);
+                                       String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
+                                       //store any whitespace etc from between this and last group
+                                       if (startOffset > lastEndOffset)
+                                               newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
+                                       newText.append(markedUpText);
+                                       lastEndOffset=Math.max(endOffset, lastEndOffset);
+                                       tokenGroup.clear();
+
+                                       //check if current token marks the start of a new fragment
+                                       if(textFragmenter.isNewFragment())
+                                       {
+                                               currentFrag.setScore(fragmentScorer.getFragmentScore());
+                                               //record stats for a new fragment
+                                               currentFrag.textEndPos = newText.length();
+                                               currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
+                                               fragmentScorer.startFragment(currentFrag);
+                                               docFrags.add(currentFrag);
+                                       }
+                               }
+
+                               tokenGroup.addToken(fragmentScorer.getTokenScore());
+
+//                             if(lastEndOffset>maxDocBytesToAnalyze)
+//                             {
+//                                     break;
+//                             }
+                       }
+                       currentFrag.setScore(fragmentScorer.getFragmentScore());
+
+                       if(tokenGroup.numTokens>0)
+                       {
+                               //flush the accumulated text (same code as in above loop)
+                               startOffset = tokenGroup.matchStartOffset;
+                               endOffset = tokenGroup.matchEndOffset;
+                               tokenText = text.substring(startOffset, endOffset);
+                               String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
+                               //store any whitespace etc from between this and last group
+                               if (startOffset > lastEndOffset)
+                                       newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
+                               newText.append(markedUpText);
+                               lastEndOffset=Math.max(lastEndOffset,endOffset);
+                       }
+
+                       //Test what remains of the original text beyond the point where we stopped analyzing 
+                       if (
+//                                     if there is text beyond the last token considered..
+                                       (lastEndOffset < text.length()) 
+                                       &&
+//                                     and that text is not too large...
+                                       (text.length()<= maxDocCharsToAnalyze)
+                               )                               
+                       {
+                               //append it to the last fragment
+                               newText.append(encoder.encodeText(text.substring(lastEndOffset)));
+                       }
+
+                       currentFrag.textEndPos = newText.length();
+
+                       //sort the most relevant sections of the text
+                       for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
+                       {
+                               currentFrag = i.next();
+
+                               //If you are running with a version of Lucene before 11th Sept 03
+                               // you do not have PriorityQueue.insert() - so uncomment the code below
+                               /*
+                               if (currentFrag.getScore() >= minScore)
+                               {
+                                       fragQueue.put(currentFrag);
+                                       if (fragQueue.size() > maxNumFragments)
+                                       { // if hit queue overfull
+                                               fragQueue.pop(); // remove lowest in hit queue
+                                               minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
+                                       }
+                               }
+                               */
+                               //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
+                               //fix to PriorityQueue. The correct method to use here is the new "insert" method
+                               // USE ABOVE CODE IF THIS DOES NOT COMPILE!
+                               fragQueue.insertWithOverflow(currentFrag);
+                       }
+
+                       //return the most relevant fragments
+                       TextFragment frag[] = new TextFragment[fragQueue.size()];
+                       for (int i = frag.length - 1; i >= 0; i--)
+                       {
+                               frag[i] = fragQueue.pop();
+                       }
+
+                       //merge any contiguous fragments to improve readability
+                       if(mergeContiguousFragments)
+                       {
+                               mergeContiguousFragments(frag);
+                               ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
+                               for (int i = 0; i < frag.length; i++)
+                               {
+                                       if ((frag[i] != null) && (frag[i].getScore() > 0))
+                                       {
+                                               fragTexts.add(frag[i]);
+                                       }
+                               }
+                               frag= fragTexts.toArray(new TextFragment[0]);
+                       }
+
+                       return frag;
+
+               }
+               finally
+               {
+                       if (tokenStream != null)
+                       {
+                               try
+                               {
+                                 tokenStream.end();
+                                       tokenStream.close();
+                               }
+                               catch (Exception e)
+                               {
+                                       // ignore failures while releasing the token stream
+                               }
+                       }
+               }
+       }
+
+
+       /** Improves readability of a score-sorted list of TextFragments by merging any fragments
+        * that were contiguous in the original text into one larger fragment with the correct order.
+        * This will leave a "null" in the array entry for the lesser scored fragment. 
+        * 
+        * @param frag An array of document fragments in descending score order
+        */
+       private void mergeContiguousFragments(TextFragment[] frag)
+       {
+               boolean mergingStillBeingDone;
+               if (frag.length > 1)
+                       do
+                       {
+                               mergingStillBeingDone = false; //initialise loop control flag
+                               //for each fragment, scan other frags looking for contiguous blocks
+                               for (int i = 0; i < frag.length; i++)
+                               {
+                                       if (frag[i] == null)
+                                       {
+                                               continue;
+                                       }
+                                       //merge any contiguous blocks 
+                                       for (int x = 0; x < frag.length; x++)
+                                       {
+                                               if (frag[x] == null)
+                                               {
+                                                       continue;
+                                               }
+                                               if (frag[i] == null)
+                                               {
+                                                       break;
+                                               }
+                                               TextFragment frag1 = null;
+                                               TextFragment frag2 = null;
+                                               int frag1Num = 0;
+                                               int frag2Num = 0;
+                                               int bestScoringFragNum;
+                                               int worstScoringFragNum;
+                                               //if blocks are contiguous....
+                                               if (frag[i].follows(frag[x]))
+                                               {
+                                                       frag1 = frag[x];
+                                                       frag1Num = x;
+                                                       frag2 = frag[i];
+                                                       frag2Num = i;
+                                               }
+                                               else
+                                                       if (frag[x].follows(frag[i]))
+                                                       {
+                                                               frag1 = frag[i];
+                                                               frag1Num = i;
+                                                               frag2 = frag[x];
+                                                               frag2Num = x;
+                                                       }
+                                               //merging required..
+                                               if (frag1 != null)
+                                               {
+                                                       if (frag1.getScore() > frag2.getScore())
+                                                       {
+                                                               bestScoringFragNum = frag1Num;
+                                                               worstScoringFragNum = frag2Num;
+                                                       }
+                                                       else
+                                                       {
+                                                               bestScoringFragNum = frag2Num;
+                                                               worstScoringFragNum = frag1Num;
+                                                       }
+                                                       frag1.merge(frag2);
+                                                       frag[worstScoringFragNum] = null;
+                                                       mergingStillBeingDone = true;
+                                                       frag[bestScoringFragNum] = frag1;
+                                               }
+                                       }
+                               }
+                       }
+                       while (mergingStillBeingDone);
+       }
+       
+       
+       /**
+        * Highlights terms in the text, extracting the most relevant sections
+        * and concatenating the chosen fragments with a separator (typically "...").
+        * The document text is analysed in chunks to record hit statistics
+        * across the document. After accumulating stats, the fragments with the highest scores
+        * are returned in order as "separator" delimited strings.
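+        * <p>
+        * Sketch, assuming a prepared <code>TokenStream</code> <code>ts</code> and its
+        * source <code>text</code>:
+        * <pre>
+        *   String result = highlighter.getBestFragments(ts, text, 3, "...");
+        * </pre>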
+        *
+        * @param text        text to highlight terms in
+        * @param maxNumFragments  the maximum number of fragments.
+        * @param separator  the separator used to intersperse the document fragments (typically "...")
+        *
+        * @return highlighted text
+        * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+        */
+       public final String getBestFragments(
+               TokenStream tokenStream,        
+               String text,
+               int maxNumFragments,
+               String separator)
+               throws IOException, InvalidTokenOffsetsException
+       {
+               String[] sections = getBestFragments(tokenStream, text, maxNumFragments);
+               StringBuilder result = new StringBuilder();
+               for (int i = 0; i < sections.length; i++)
+               {
+                       if (i > 0)
+                       {
+                               result.append(separator);
+                       }
+                       result.append(sections[i]);
+               }
+               return result.toString();
+       }
+
+  public int getMaxDocCharsToAnalyze() {
+    return maxDocCharsToAnalyze;
+  }
+
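+  /**
+   * @param maxDocCharsToAnalyze the maximum number of characters of the text that will be
+   * tokenized and scored; tokens starting beyond this point are ignored. Defaults to
+   * {@link #DEFAULT_MAX_CHARS_TO_ANALYZE} (50 * 1024).
+   */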
+  public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+    this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
+  }
+
+  
+       public Fragmenter getTextFragmenter()
+       {
+               return textFragmenter;
+       }
+
+       /**
+        * @param fragmenter {@link Fragmenter} used to break the text up into candidate fragments
+        */
+       public void setTextFragmenter(Fragmenter fragmenter)
+       {
+               textFragmenter = fragmenter;
+       }
+
+       /**
+        * @return Object used to score each text fragment 
+        */
+       public Scorer getFragmentScorer()
+       {
+               return fragmentScorer;
+       }
+
+
+       /**
+        * @param scorer {@link Scorer} used to score each text fragment
+        */
+       public void setFragmentScorer(Scorer scorer)
+       {
+               fragmentScorer = scorer;
+       }
+
+    public Encoder getEncoder()
+    {
+        return encoder;
+    }
+    public void setEncoder(Encoder encoder)
+    {
+        this.encoder = encoder;
+    }
+}
+class FragmentQueue extends PriorityQueue<TextFragment>
+{
+       public FragmentQueue(int size)
+       {
+               initialize(size);
+       }
+
+       @Override
+       public final boolean lessThan(TextFragment fragA, TextFragment fragB)
+       {
+               if (fragA.getScore() == fragB.getScore())
+                       return fragA.fragNum > fragB.fragNum;
+               else
+                       return fragA.getScore() < fragB.getScore();
+       }
+}