X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java

diff --git a/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
deleted file mode 100644
index f424423..0000000
--- a/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
+++ /dev/null
@@ -1,544 +0,0 @@
-package org.apache.lucene.search.highlight;
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.Iterator;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.util.PriorityQueue;
-
-/**
- * Class used to markup highlighted terms found in the best sections of a
- * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
- * {@link Encoder} and tokenizers.
- */
-public class Highlighter
-{
-  public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
-
-  private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
-	private Formatter formatter;
-	private Encoder encoder;
-	private Fragmenter textFragmenter=new SimpleFragmenter();
-	private Scorer fragmentScorer=null;
-
-	public Highlighter(Scorer fragmentScorer)
-	{
-		this(new SimpleHTMLFormatter(),fragmentScorer);
-	}
-
-
- 	public Highlighter(Formatter formatter, Scorer fragmentScorer)
- 	{
-		this(formatter,new DefaultEncoder(),fragmentScorer);
-	}
-
-
-	public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
-	{
- 		this.formatter = formatter;
-		this.encoder = encoder;
- 		this.fragmentScorer = fragmentScorer;
- 	}
-
-	/**
-	 * Highlights chosen terms in a text, extracting the most relevant section.
-	 * This is a convenience method that calls
-	 * {@link #getBestFragment(TokenStream, String)}
-	 *
-	 * @param analyzer   the analyzer that will be used to split <code>text</code>
-	 * into chunks
-	 * @param text text to highlight terms in
-	 * @param fieldName Name of field used to influence analyzer's tokenization policy
-	 *
-	 * @return highlighted text fragment or null if no terms found
-	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
-	 */
-	public final String getBestFragment(Analyzer analyzer, String fieldName,String text)
-		throws IOException, InvalidTokenOffsetsException
-	{
-		TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
-		return getBestFragment(tokenStream, text);
-	}
-
-	/**
-	 * Highlights chosen terms in a text, extracting the most relevant section.
-	 * The document text is analysed in chunks to record hit statistics
-	 * across the document. After accumulating stats, the fragment with the highest score
-	 * is returned
-	 *
-	 * @param tokenStream   a stream of tokens identified in the text parameter, including offset information.
-	 * This is typically produced by an analyzer re-parsing a document's
-	 * text. Some work may be done on retrieving TokenStreams more efficiently
-	 * by adding support for storing original text position data in the Lucene
-	 * index but this support is not currently available (as of Lucene 1.4 rc2).
-	 * @param text text to highlight terms in
-	 *
-	 * @return highlighted text fragment or null if no terms found
-	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
-	 */
-	public final String getBestFragment(TokenStream tokenStream, String text)
-		throws IOException, InvalidTokenOffsetsException
-	{
-		String[] results = getBestFragments(tokenStream,text, 1);
-		if (results.length > 0)
-		{
-			return results[0];
-		}
-		return null;
-	}
-
-	/**
-	 * Highlights chosen terms in a text, extracting the most relevant sections.
-	 * This is a convenience method that calls
-	 * {@link #getBestFragments(TokenStream, String, int)}
-	 *
-	 * @param analyzer   the analyzer that will be used to split <code>text</code>
-	 * into chunks
-	 * @param fieldName     the name of the field being highlighted (used by analyzer)
-	 * @param text        	text to highlight terms in
-	 * @param maxNumFragments  the maximum number of fragments.
-	 *
-	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
-	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
-	 */
-	public final String[] getBestFragments(
-		Analyzer analyzer,
-		String fieldName,
-		String text,
-		int maxNumFragments)
-		throws IOException, InvalidTokenOffsetsException
-	{
-		TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
-		return getBestFragments(tokenStream, text, maxNumFragments);
-	}
-
-	/**
-	 * Highlights chosen terms in a text, extracting the most relevant sections.
-	 * The document text is analysed in chunks to record hit statistics
-	 * across the document. After accumulating stats, the fragments with the highest scores
-	 * are returned as an array of strings in order of score (contiguous fragments are merged into
-	 * one in their original order to improve readability)
-	 *
-	 * @param text        	text to highlight terms in
-	 * @param maxNumFragments  the maximum number of fragments.
-	 *
-	 * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
-	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
-	 */
-	public final String[] getBestFragments(
-		TokenStream tokenStream,
-		String text,
-		int maxNumFragments)
-		throws IOException, InvalidTokenOffsetsException
-	{
-		maxNumFragments = Math.max(1, maxNumFragments); //sanity check
-
-		TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments);
-
-		//Get text
-		ArrayList<String> fragTexts = new ArrayList<String>();
-		for (int i = 0; i < frag.length; i++)
-		{
-			if ((frag[i] != null) && (frag[i].getScore() > 0))
-			{
-				fragTexts.add(frag[i].toString());
-			}
-		}
-		return fragTexts.toArray(new String[0]);
-	}
-
-
-	/**
-	 * Low level api to get the most relevant (formatted) sections of the document.
-	 * This method has been made public to allow visibility of score information held in TextFragment objects.
-	 * Thanks to Jason Calabrese for help in redefining the interface.
-	 * @param tokenStream
-	 * @param text
-	 * @param maxNumFragments
-	 * @param mergeContiguousFragments
-	 * @throws IOException
-	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
-	 */
-	public final TextFragment[] getBestTextFragments(
-		TokenStream tokenStream,
-		String text,
-		boolean mergeContiguousFragments,
-		int maxNumFragments)
-		throws IOException, InvalidTokenOffsetsException
-	{
-		ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
-		StringBuilder newText=new StringBuilder();
-		
-	    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
-	    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
-	    tokenStream.addAttribute(PositionIncrementAttribute.class);
-	    tokenStream.reset();
-	    
-		TextFragment currentFrag =	new TextFragment(newText,newText.length(), docFrags.size());
-		
-    if (fragmentScorer instanceof QueryScorer) {
-      ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
-    }
-    
-		TokenStream newStream = fragmentScorer.init(tokenStream);
-		if(newStream != null) {
-		  tokenStream = newStream;
-		}
-		fragmentScorer.startFragment(currentFrag);
-		docFrags.add(currentFrag);
-
-		FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
-
-		try
-		{
-
-			String tokenText;
-			int startOffset;
-			int endOffset;
-			int lastEndOffset = 0;
-			textFragmenter.start(text, tokenStream);
-
-			TokenGroup tokenGroup=new TokenGroup(tokenStream);
-
-			for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze);
-			      next = tokenStream.incrementToken())
-			{
-				if(	(offsetAtt.endOffset()>text.length())
-					||
-					(offsetAtt.startOffset()>text.length())
-					)						
-				{
-					throw new InvalidTokenOffsetsException("Token "+ termAtt.toString()
-							+" exceeds length of provided text sized "+text.length());
-				}
-				if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct()))
-				{
-					//the current token is distinct from previous tokens -
-					// markup the cached token group info
-					startOffset = tokenGroup.matchStartOffset;
-					endOffset = tokenGroup.matchEndOffset;
-					tokenText = text.substring(startOffset, endOffset);
-					String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
-					//store any whitespace etc from between this and last group
-					if (startOffset > lastEndOffset)
-						newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
-					newText.append(markedUpText);
-					lastEndOffset=Math.max(endOffset, lastEndOffset);
-					tokenGroup.clear();
-
-					//check if current token marks the start of a new fragment
-					if(textFragmenter.isNewFragment())
-					{
-						currentFrag.setScore(fragmentScorer.getFragmentScore());
-						//record stats for a new fragment
-						currentFrag.textEndPos = newText.length();
-						currentFrag =new TextFragment(newText, newText.length(), docFrags.size());
-						fragmentScorer.startFragment(currentFrag);
-						docFrags.add(currentFrag);
-					}
-				}
-
-				tokenGroup.addToken(fragmentScorer.getTokenScore());
-
-//				if(lastEndOffset>maxDocBytesToAnalyze)
-//				{
-//					break;
-//				}
-			}
-			currentFrag.setScore(fragmentScorer.getFragmentScore());
-
-			if(tokenGroup.numTokens>0)
-			{
-				//flush the accumulated text (same code as in above loop)
-				startOffset = tokenGroup.matchStartOffset;
-				endOffset = tokenGroup.matchEndOffset;
-				tokenText = text.substring(startOffset, endOffset);
-				String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
-				//store any whitespace etc from between this and last group
-				if (startOffset > lastEndOffset)
-					newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
-				newText.append(markedUpText);
-				lastEndOffset=Math.max(lastEndOffset,endOffset);
-			}
-
-			//Test what remains of the original text beyond the point where we stopped analyzing 
-			if (
-//					if there is text beyond the last token considered..
-					(lastEndOffset < text.length()) 
-					&&
-//					and that text is not too large...
-					(text.length()<= maxDocCharsToAnalyze)
-				)				
-			{
-				//append it to the last fragment
-				newText.append(encoder.encodeText(text.substring(lastEndOffset)));
-			}
-
-			currentFrag.textEndPos = newText.length();
-
-			//sort the most relevant sections of the text
-			for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
-			{
-				currentFrag = i.next();
-
-				//If you are running with a version of Lucene before 11th Sept 03
-				// you do not have PriorityQueue.insert() - so uncomment the code below
-				/*
-									if (currentFrag.getScore() >= minScore)
-									{
-										fragQueue.put(currentFrag);
-										if (fragQueue.size() > maxNumFragments)
-										{ // if hit queue overfull
-											fragQueue.pop(); // remove lowest in hit queue
-											minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
-										}
-
-
-									}
-				*/
-				//The above code caused a problem as a result of Christoph Goller's 11th Sept 03
-				//fix to PriorityQueue. The correct method to use here is the new "insert" method
-				// USE ABOVE CODE IF THIS DOES NOT COMPILE!
-				fragQueue.insertWithOverflow(currentFrag);
-			}
-
-			//return the most relevant fragments
-			TextFragment frag[] = new TextFragment[fragQueue.size()];
-			for (int i = frag.length - 1; i >= 0; i--)
-			{
-				frag[i] = fragQueue.pop();
-			}
-
-			//merge any contiguous fragments to improve readability
-			if(mergeContiguousFragments)
-			{
-				mergeContiguousFragments(frag);
-				ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
-				for (int i = 0; i < frag.length; i++)
-				{
-					if ((frag[i] != null) && (frag[i].getScore() > 0))
-					{
-						fragTexts.add(frag[i]);
-					}
-				}
-				frag= fragTexts.toArray(new TextFragment[0]);
-			}
-
-			return frag;
-
-		}
-		finally
-		{
-			if (tokenStream != null)
-			{
-				try
-				{
-				  tokenStream.end();
-					tokenStream.close();
-				}
-				catch (Exception e)
-				{
-				}
-			}
-		}
-	}
-
-
-	/** Improves readability of a score-sorted list of TextFragments by merging any fragments
-	 * that were contiguous in the original text into one larger fragment with the correct order.
-	 * This will leave a "null" in the array entry for the lesser scored fragment. 
-	 * 
-	 * @param frag An array of document fragments in descending score
-	 */
-	private void mergeContiguousFragments(TextFragment[] frag)
-	{
-		boolean mergingStillBeingDone;
-		if (frag.length > 1)
-			do
-			{
-				mergingStillBeingDone = false; //initialise loop control flag
-				//for each fragment, scan other frags looking for contiguous blocks
-				for (int i = 0; i < frag.length; i++)
-				{
-					if (frag[i] == null)
-					{
-						continue;
-					}
-					//merge any contiguous blocks 
-					for (int x = 0; x < frag.length; x++)
-					{
-						if (frag[x] == null)
-						{
-							continue;
-						}
-						if (frag[i] == null)
-						{
-							break;
-						}
-						TextFragment frag1 = null;
-						TextFragment frag2 = null;
-						int frag1Num = 0;
-						int frag2Num = 0;
-						int bestScoringFragNum;
-						int worstScoringFragNum;
-						//if blocks are contiguous....
-						if (frag[i].follows(frag[x]))
-						{
-							frag1 = frag[x];
-							frag1Num = x;
-							frag2 = frag[i];
-							frag2Num = i;
-						}
-						else
-							if (frag[x].follows(frag[i]))
-							{
-								frag1 = frag[i];
-								frag1Num = i;
-								frag2 = frag[x];
-								frag2Num = x;
-							}
-						//merging required..
-						if (frag1 != null)
-						{
-							if (frag1.getScore() > frag2.getScore())
-							{
-								bestScoringFragNum = frag1Num;
-								worstScoringFragNum = frag2Num;
-							}
-							else
-							{
-								bestScoringFragNum = frag2Num;
-								worstScoringFragNum = frag1Num;
-							}
-							frag1.merge(frag2);
-							frag[worstScoringFragNum] = null;
-							mergingStillBeingDone = true;
-							frag[bestScoringFragNum] = frag1;
-						}
-					}
-				}
-			}
-			while (mergingStillBeingDone);
-	}
-	
-	
-	/**
-	 * Highlights terms in the  text , extracting the most relevant sections
-	 * and concatenating the chosen fragments with a separator (typically "...").
-	 * The document text is analysed in chunks to record hit statistics
-	 * across the document. After accumulating stats, the fragments with the highest scores
-	 * are returned in order as "separator" delimited strings.
-	 *
-	 * @param text        text to highlight terms in
-	 * @param maxNumFragments  the maximum number of fragments.
-	 * @param separator  the separator used to intersperse the document fragments (typically "...")
-	 *
-	 * @return highlighted text
-	 * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
-	 */
-	public final String getBestFragments(
-		TokenStream tokenStream,	
-		String text,
-		int maxNumFragments,
-		String separator)
-		throws IOException, InvalidTokenOffsetsException
-	{
-		String sections[] =	getBestFragments(tokenStream,text, maxNumFragments);
-		StringBuilder result = new StringBuilder();
-		for (int i = 0; i < sections.length; i++)
-		{
-			if (i > 0)
-			{
-				result.append(separator);
-			}
-			result.append(sections[i]);
-		}
-		return result.toString();
-	}
-
-  public int getMaxDocCharsToAnalyze() {
-    return maxDocCharsToAnalyze;
-  }
-
-  public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
-    this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
-  }
-
-  
-	public Fragmenter getTextFragmenter()
-	{
-		return textFragmenter;
-	}
-
-	/**
-	 * @param fragmenter
-	 */
-	public void setTextFragmenter(Fragmenter fragmenter)
-	{
-		textFragmenter = fragmenter;
-	}
-
-	/**
-	 * @return Object used to score each text fragment 
-	 */
-	public Scorer getFragmentScorer()
-	{
-		return fragmentScorer;
-	}
-
-
-	/**
-	 * @param scorer
-	 */
-	public void setFragmentScorer(Scorer scorer)
-	{
-		fragmentScorer = scorer;
-	}
-
-    public Encoder getEncoder()
-    {
-        return encoder;
-    }
-    public void setEncoder(Encoder encoder)
-    {
-        this.encoder = encoder;
-    }
-}
-class FragmentQueue extends PriorityQueue<TextFragment>
-{
-	public FragmentQueue(int size)
-	{
-		initialize(size);
-	}
-
-	@Override
-	public final boolean lessThan(TextFragment fragA, TextFragment fragB)
-	{
-		if (fragA.getScore() == fragB.getScore())
-			return fragA.fragNum > fragB.fragNum;
-		else
-			return fragA.getScore() < fragB.getScore();
-	}
-}