X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java diff --git a/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java deleted file mode 100644 index f424423..0000000 --- a/lucene-java-3.4.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java +++ /dev/null @@ -1,544 +0,0 @@ -package org.apache.lucene.search.highlight; -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.Iterator; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.util.PriorityQueue; - -/** - * Class used to markup highlighted terms found in the best sections of a - * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter}, - * {@link Encoder} and tokenizers. - */ -public class Highlighter -{ - public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024; - - private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE; - private Formatter formatter; - private Encoder encoder; - private Fragmenter textFragmenter=new SimpleFragmenter(); - private Scorer fragmentScorer=null; - - public Highlighter(Scorer fragmentScorer) - { - this(new SimpleHTMLFormatter(),fragmentScorer); - } - - - public Highlighter(Formatter formatter, Scorer fragmentScorer) - { - this(formatter,new DefaultEncoder(),fragmentScorer); - } - - - public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer) - { - this.formatter = formatter; - this.encoder = encoder; - this.fragmentScorer = fragmentScorer; - } - - /** - * Highlights chosen terms in a text, extracting the most relevant section. - * This is a convenience method that calls - * {@link #getBestFragment(TokenStream, String)} - * - * @param analyzer the analyzer that will be used to split text - * into chunks - * @param text text to highlight terms in - * @param fieldName Name of field used to influence analyzer's tokenization policy - * - * @return highlighted text fragment or null if no terms found - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final String getBestFragment(Analyzer analyzer, String fieldName,String text) - throws IOException, InvalidTokenOffsetsException - { - TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text)); - return getBestFragment(tokenStream, text); - } - - /** - * Highlights chosen terms in a text, extracting the most relevant section. - * The document text is analysed in chunks to record hit statistics - * across the document. After accumulating stats, the fragment with the highest score - * is returned - * - * @param tokenStream a stream of tokens identified in the text parameter, including offset information. - * This is typically produced by an analyzer re-parsing a document's - * text. Some work may be done on retrieving TokenStreams more efficiently - * by adding support for storing original text position data in the Lucene - * index but this support is not currently available (as of Lucene 1.4 rc2). - * @param text text to highlight terms in - * - * @return highlighted text fragment or null if no terms found - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final String getBestFragment(TokenStream tokenStream, String text) - throws IOException, InvalidTokenOffsetsException - { - String[] results = getBestFragments(tokenStream,text, 1); - if (results.length > 0) - { - return results[0]; - } - return null; - } - - /** - * Highlights chosen terms in a text, extracting the most relevant sections. - * This is a convenience method that calls - * {@link #getBestFragments(TokenStream, String, int)} - * - * @param analyzer the analyzer that will be used to split text - * into chunks - * @param fieldName the name of the field being highlighted (used by analyzer) - * @param text text to highlight terms in - * @param maxNumFragments the maximum number of fragments. - * - * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final String[] getBestFragments( - Analyzer analyzer, - String fieldName, - String text, - int maxNumFragments) - throws IOException, InvalidTokenOffsetsException - { - TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text)); - return getBestFragments(tokenStream, text, maxNumFragments); - } - - /** - * Highlights chosen terms in a text, extracting the most relevant sections. - * The document text is analysed in chunks to record hit statistics - * across the document. After accumulating stats, the fragments with the highest scores - * are returned as an array of strings in order of score (contiguous fragments are merged into - * one in their original order to improve readability) - * - * @param text text to highlight terms in - * @param maxNumFragments the maximum number of fragments. - * - * @return highlighted text fragments (between 0 and maxNumFragments number of fragments) - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final String[] getBestFragments( - TokenStream tokenStream, - String text, - int maxNumFragments) - throws IOException, InvalidTokenOffsetsException - { - maxNumFragments = Math.max(1, maxNumFragments); //sanity check - - TextFragment[] frag =getBestTextFragments(tokenStream,text, true,maxNumFragments); - - //Get text - ArrayList fragTexts = new ArrayList(); - for (int i = 0; i < frag.length; i++) - { - if ((frag[i] != null) && (frag[i].getScore() > 0)) - { - fragTexts.add(frag[i].toString()); - } - } - return fragTexts.toArray(new String[0]); - } - - - /** - * Low level api to get the most relevant (formatted) sections of the document. - * This method has been made public to allow visibility of score information held in TextFragment objects. - * Thanks to Jason Calabrese for help in redefining the interface. - * @param tokenStream - * @param text - * @param maxNumFragments - * @param mergeContiguousFragments - * @throws IOException - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final TextFragment[] getBestTextFragments( - TokenStream tokenStream, - String text, - boolean mergeContiguousFragments, - int maxNumFragments) - throws IOException, InvalidTokenOffsetsException - { - ArrayList docFrags = new ArrayList(); - StringBuilder newText=new StringBuilder(); - - CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class); - OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class); - tokenStream.addAttribute(PositionIncrementAttribute.class); - tokenStream.reset(); - - TextFragment currentFrag = new TextFragment(newText,newText.length(), docFrags.size()); - - if (fragmentScorer instanceof QueryScorer) { - ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze); - } - - TokenStream newStream = fragmentScorer.init(tokenStream); - if(newStream != null) { - tokenStream = newStream; - } - fragmentScorer.startFragment(currentFrag); - docFrags.add(currentFrag); - - FragmentQueue fragQueue = new FragmentQueue(maxNumFragments); - - try - { - - String tokenText; - int startOffset; - int endOffset; - int lastEndOffset = 0; - textFragmenter.start(text, tokenStream); - - TokenGroup tokenGroup=new TokenGroup(tokenStream); - - for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset()< maxDocCharsToAnalyze); - next = tokenStream.incrementToken()) - { - if( (offsetAtt.endOffset()>text.length()) - || - (offsetAtt.startOffset()>text.length()) - ) - { - throw new InvalidTokenOffsetsException("Token "+ termAtt.toString() - +" exceeds length of provided text sized "+text.length()); - } - if((tokenGroup.numTokens>0)&&(tokenGroup.isDistinct())) - { - //the current token is distinct from previous tokens - - // markup the cached token group info - startOffset = tokenGroup.matchStartOffset; - endOffset = tokenGroup.matchEndOffset; - tokenText = text.substring(startOffset, endOffset); - String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); - //store any whitespace etc from between this and last group - if (startOffset > lastEndOffset) - newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); - newText.append(markedUpText); - lastEndOffset=Math.max(endOffset, lastEndOffset); - tokenGroup.clear(); - - //check if current token marks the start of a new fragment - if(textFragmenter.isNewFragment()) - { - currentFrag.setScore(fragmentScorer.getFragmentScore()); - //record stats for a new fragment - currentFrag.textEndPos = newText.length(); - currentFrag =new TextFragment(newText, newText.length(), docFrags.size()); - fragmentScorer.startFragment(currentFrag); - docFrags.add(currentFrag); - } - } - - tokenGroup.addToken(fragmentScorer.getTokenScore()); - -// if(lastEndOffset>maxDocBytesToAnalyze) -// { -// break; -// } - } - currentFrag.setScore(fragmentScorer.getFragmentScore()); - - if(tokenGroup.numTokens>0) - { - //flush the accumulated text (same code as in above loop) - startOffset = tokenGroup.matchStartOffset; - endOffset = tokenGroup.matchEndOffset; - tokenText = text.substring(startOffset, endOffset); - String markedUpText=formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup); - //store any whitespace etc from between this and last group - if (startOffset > lastEndOffset) - newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset))); - newText.append(markedUpText); - lastEndOffset=Math.max(lastEndOffset,endOffset); - } - - //Test what remains of the original text beyond the point where we stopped analyzing - if ( -// if there is text beyond the last token considered.. - (lastEndOffset < text.length()) - && -// and that text is not too large... - (text.length()<= maxDocCharsToAnalyze) - ) - { - //append it to the last fragment - newText.append(encoder.encodeText(text.substring(lastEndOffset))); - } - - currentFrag.textEndPos = newText.length(); - - //sort the most relevant sections of the text - for (Iterator i = docFrags.iterator(); i.hasNext();) - { - currentFrag = i.next(); - - //If you are running with a version of Lucene before 11th Sept 03 - // you do not have PriorityQueue.insert() - so uncomment the code below - /* - if (currentFrag.getScore() >= minScore) - { - fragQueue.put(currentFrag); - if (fragQueue.size() > maxNumFragments) - { // if hit queue overfull - fragQueue.pop(); // remove lowest in hit queue - minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore - } - - - } - */ - //The above code caused a problem as a result of Christoph Goller's 11th Sept 03 - //fix to PriorityQueue. The correct method to use here is the new "insert" method - // USE ABOVE CODE IF THIS DOES NOT COMPILE! - fragQueue.insertWithOverflow(currentFrag); - } - - //return the most relevant fragments - TextFragment frag[] = new TextFragment[fragQueue.size()]; - for (int i = frag.length - 1; i >= 0; i--) - { - frag[i] = fragQueue.pop(); - } - - //merge any contiguous fragments to improve readability - if(mergeContiguousFragments) - { - mergeContiguousFragments(frag); - ArrayList fragTexts = new ArrayList(); - for (int i = 0; i < frag.length; i++) - { - if ((frag[i] != null) && (frag[i].getScore() > 0)) - { - fragTexts.add(frag[i]); - } - } - frag= fragTexts.toArray(new TextFragment[0]); - } - - return frag; - - } - finally - { - if (tokenStream != null) - { - try - { - tokenStream.end(); - tokenStream.close(); - } - catch (Exception e) - { - } - } - } - } - - - /** Improves readability of a score-sorted list of TextFragments by merging any fragments - * that were contiguous in the original text into one larger fragment with the correct order. - * This will leave a "null" in the array entry for the lesser scored fragment. - * - * @param frag An array of document fragments in descending score - */ - private void mergeContiguousFragments(TextFragment[] frag) - { - boolean mergingStillBeingDone; - if (frag.length > 1) - do - { - mergingStillBeingDone = false; //initialise loop control flag - //for each fragment, scan other frags looking for contiguous blocks - for (int i = 0; i < frag.length; i++) - { - if (frag[i] == null) - { - continue; - } - //merge any contiguous blocks - for (int x = 0; x < frag.length; x++) - { - if (frag[x] == null) - { - continue; - } - if (frag[i] == null) - { - break; - } - TextFragment frag1 = null; - TextFragment frag2 = null; - int frag1Num = 0; - int frag2Num = 0; - int bestScoringFragNum; - int worstScoringFragNum; - //if blocks are contiguous.... - if (frag[i].follows(frag[x])) - { - frag1 = frag[x]; - frag1Num = x; - frag2 = frag[i]; - frag2Num = i; - } - else - if (frag[x].follows(frag[i])) - { - frag1 = frag[i]; - frag1Num = i; - frag2 = frag[x]; - frag2Num = x; - } - //merging required.. - if (frag1 != null) - { - if (frag1.getScore() > frag2.getScore()) - { - bestScoringFragNum = frag1Num; - worstScoringFragNum = frag2Num; - } - else - { - bestScoringFragNum = frag2Num; - worstScoringFragNum = frag1Num; - } - frag1.merge(frag2); - frag[worstScoringFragNum] = null; - mergingStillBeingDone = true; - frag[bestScoringFragNum] = frag1; - } - } - } - } - while (mergingStillBeingDone); - } - - - /** - * Highlights terms in the text , extracting the most relevant sections - * and concatenating the chosen fragments with a separator (typically "..."). - * The document text is analysed in chunks to record hit statistics - * across the document. After accumulating stats, the fragments with the highest scores - * are returned in order as "separator" delimited strings. - * - * @param text text to highlight terms in - * @param maxNumFragments the maximum number of fragments. - * @param separator the separator used to intersperse the document fragments (typically "...") - * - * @return highlighted text - * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length - */ - public final String getBestFragments( - TokenStream tokenStream, - String text, - int maxNumFragments, - String separator) - throws IOException, InvalidTokenOffsetsException - { - String sections[] = getBestFragments(tokenStream,text, maxNumFragments); - StringBuilder result = new StringBuilder(); - for (int i = 0; i < sections.length; i++) - { - if (i > 0) - { - result.append(separator); - } - result.append(sections[i]); - } - return result.toString(); - } - - public int getMaxDocCharsToAnalyze() { - return maxDocCharsToAnalyze; - } - - public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) { - this.maxDocCharsToAnalyze = maxDocCharsToAnalyze; - } - - - public Fragmenter getTextFragmenter() - { - return textFragmenter; - } - - /** - * @param fragmenter - */ - public void setTextFragmenter(Fragmenter fragmenter) - { - textFragmenter = fragmenter; - } - - /** - * @return Object used to score each text fragment - */ - public Scorer getFragmentScorer() - { - return fragmentScorer; - } - - - /** - * @param scorer - */ - public void setFragmentScorer(Scorer scorer) - { - fragmentScorer = scorer; - } - - public Encoder getEncoder() - { - return encoder; - } - public void setEncoder(Encoder encoder) - { - this.encoder = encoder; - } -} -class FragmentQueue extends PriorityQueue -{ - public FragmentQueue(int size) - { - initialize(size); - } - - @Override - public final boolean lessThan(TextFragment fragA, TextFragment fragB) - { - if (fragA.getScore() == fragB.getScore()) - return fragA.fragNum > fragB.fragNum; - else - return fragA.getScore() < fragB.getScore(); - } -}