X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java

diff --git a/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java b/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
new file mode 100644
index 0000000..f424423
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/highlighter/src/java/org/apache/lucene/search/highlight/Highlighter.java
@@ -0,0 +1,544 @@
+package org.apache.lucene.search.highlight;
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.Iterator;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.PriorityQueue;
+
+/**
+ * Class used to mark up highlighted terms found in the best sections of a
+ * text, using configurable {@link Fragmenter}, {@link Scorer}, {@link Formatter},
+ * {@link Encoder} and tokenizers.
+ */
+public class Highlighter
+{
+  public static final int DEFAULT_MAX_CHARS_TO_ANALYZE = 50*1024;
+
+  private int maxDocCharsToAnalyze = DEFAULT_MAX_CHARS_TO_ANALYZE;
+  private Formatter formatter;
+  private Encoder encoder;
+  private Fragmenter textFragmenter = new SimpleFragmenter();
+  private Scorer fragmentScorer = null;
+
+  public Highlighter(Scorer fragmentScorer)
+  {
+    this(new SimpleHTMLFormatter(), fragmentScorer);
+  }
+
+  public Highlighter(Formatter formatter, Scorer fragmentScorer)
+  {
+    this(formatter, new DefaultEncoder(), fragmentScorer);
+  }
+
+  public Highlighter(Formatter formatter, Encoder encoder, Scorer fragmentScorer)
+  {
+    this.formatter = formatter;
+    this.encoder = encoder;
+    this.fragmentScorer = fragmentScorer;
+  }
+
+  /**
+   * Highlights chosen terms in a text, extracting the most relevant section.
+   * This is a convenience method that calls
+   * {@link #getBestFragment(TokenStream, String)}
+   *
+   * @param analyzer   the analyzer that will be used to split text into chunks
+   * @param fieldName  Name of field used to influence analyzer's tokenization policy
+   * @param text       text to highlight terms in
+   *
+   * @return highlighted text fragment or null if no terms found
+   * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+   */
+  public final String getBestFragment(Analyzer analyzer, String fieldName, String text)
+    throws IOException, InvalidTokenOffsetsException
+  {
+    TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
+    return getBestFragment(tokenStream, text);
+  }
+
+  /**
+   * Highlights chosen terms in a text, extracting the most relevant section.
+   * The document text is analysed in chunks to record hit statistics
+   * across the document. After accumulating stats, the fragment with the highest score
+   * is returned.
+   *
+   * @param tokenStream  a stream of tokens identified in the text parameter, including offset information.
+   *                     This is typically produced by an analyzer re-parsing a document's
+   *                     text. Some work may be done on retrieving TokenStreams more efficiently
+   *                     by adding support for storing original text position data in the Lucene
+   *                     index but this support is not currently available (as of Lucene 1.4 rc2).
+   * @param text  text to highlight terms in
+   *
+   * @return highlighted text fragment or null if no terms found
+   * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+   */
+  public final String getBestFragment(TokenStream tokenStream, String text)
+    throws IOException, InvalidTokenOffsetsException
+  {
+    String[] results = getBestFragments(tokenStream, text, 1);
+    if (results.length > 0)
+    {
+      return results[0];
+    }
+    return null;
+  }
+
+  /**
+   * Highlights chosen terms in a text, extracting the most relevant sections.
+   * This is a convenience method that calls
+   * {@link #getBestFragments(TokenStream, String, int)}
+   *
+   * @param analyzer         the analyzer that will be used to split text into chunks
+   * @param fieldName        the name of the field being highlighted (used by analyzer)
+   * @param text             text to highlight terms in
+   * @param maxNumFragments  the maximum number of fragments
+   *
+   * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+   * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+   */
+  public final String[] getBestFragments(
+    Analyzer analyzer,
+    String fieldName,
+    String text,
+    int maxNumFragments)
+    throws IOException, InvalidTokenOffsetsException
+  {
+    TokenStream tokenStream = analyzer.reusableTokenStream(fieldName, new StringReader(text));
+    return getBestFragments(tokenStream, text, maxNumFragments);
+  }
+
+  /**
+   * Highlights chosen terms in a text, extracting the most relevant sections.
+   * The document text is analysed in chunks to record hit statistics
+   * across the document. After accumulating stats, the fragments with the highest scores
+   * are returned as an array of strings in order of score (contiguous fragments are merged into
+   * one in their original order to improve readability)
+   *
+   * @param tokenStream      a stream of tokens identified in the text, including offset information
+   * @param text             text to highlight terms in
+   * @param maxNumFragments  the maximum number of fragments
+   *
+   * @return highlighted text fragments (between 0 and maxNumFragments number of fragments)
+   * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+   */
+  public final String[] getBestFragments(
+    TokenStream tokenStream,
+    String text,
+    int maxNumFragments)
+    throws IOException, InvalidTokenOffsetsException
+  {
+    maxNumFragments = Math.max(1, maxNumFragments); //sanity check
+
+    TextFragment[] frag = getBestTextFragments(tokenStream, text, true, maxNumFragments);
+
+    //Get text
+    ArrayList<String> fragTexts = new ArrayList<String>();
+    for (int i = 0; i < frag.length; i++)
+    {
+      if ((frag[i] != null) && (frag[i].getScore() > 0))
+      {
+        fragTexts.add(frag[i].toString());
+      }
+    }
+    return fragTexts.toArray(new String[0]);
+  }
+
+  /**
+   * Low level API to get the most relevant (formatted) sections of the document.
+   * This method has been made public to allow visibility of score information held in TextFragment objects.
+   * Thanks to Jason Calabrese for help in redefining the interface.
+   * @param tokenStream               stream of tokens identified in the text, including offset information
+   * @param text                      text to highlight terms in
+   * @param mergeContiguousFragments  true if contiguous fragments should be merged into one
+   * @param maxNumFragments           the maximum number of fragments
+   * @throws IOException if the TokenStream cannot be read
+   * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+   */
+  public final TextFragment[] getBestTextFragments(
+    TokenStream tokenStream,
+    String text,
+    boolean mergeContiguousFragments,
+    int maxNumFragments)
+    throws IOException, InvalidTokenOffsetsException
+  {
+    ArrayList<TextFragment> docFrags = new ArrayList<TextFragment>();
+    StringBuilder newText = new StringBuilder();
+
+    CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
+    OffsetAttribute offsetAtt = tokenStream.addAttribute(OffsetAttribute.class);
+    tokenStream.addAttribute(PositionIncrementAttribute.class);
+    tokenStream.reset();
+
+    TextFragment currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
+
+    if (fragmentScorer instanceof QueryScorer) {
+      ((QueryScorer) fragmentScorer).setMaxDocCharsToAnalyze(maxDocCharsToAnalyze);
+    }
+
+    TokenStream newStream = fragmentScorer.init(tokenStream);
+    if (newStream != null) {
+      tokenStream = newStream;
+    }
+    fragmentScorer.startFragment(currentFrag);
+    docFrags.add(currentFrag);
+
+    FragmentQueue fragQueue = new FragmentQueue(maxNumFragments);
+
+    try
+    {
+
+      String tokenText;
+      int startOffset;
+      int endOffset;
+      int lastEndOffset = 0;
+      textFragmenter.start(text, tokenStream);
+
+      TokenGroup tokenGroup = new TokenGroup(tokenStream);
+
+      for (boolean next = tokenStream.incrementToken(); next && (offsetAtt.startOffset() < maxDocCharsToAnalyze);
+          next = tokenStream.incrementToken())
+      {
+        if ((offsetAtt.endOffset() > text.length())
+            ||
+            (offsetAtt.startOffset() > text.length())
+            )
+        {
+          throw new InvalidTokenOffsetsException("Token " + termAtt.toString()
+              + " exceeds length of provided text sized " + text.length());
+        }
+        if ((tokenGroup.numTokens > 0) && (tokenGroup.isDistinct()))
+        {
+          //the current token is distinct from previous tokens -
+          // mark up the cached token group info
+          startOffset = tokenGroup.matchStartOffset;
+          endOffset = tokenGroup.matchEndOffset;
+          tokenText = text.substring(startOffset, endOffset);
+          String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
+          //store any whitespace etc from between this and last group
+          if (startOffset > lastEndOffset)
+            newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
+          newText.append(markedUpText);
+          lastEndOffset = Math.max(endOffset, lastEndOffset);
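+          //the token group has now been written to newText; clear the cached
+          // tokens so the next distinct token starts a fresh group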
+          tokenGroup.clear();
+
+          //check if current token marks the start of a new fragment
+          if (textFragmenter.isNewFragment())
+          {
+            currentFrag.setScore(fragmentScorer.getFragmentScore());
+            //record stats for a new fragment
+            currentFrag.textEndPos = newText.length();
+            currentFrag = new TextFragment(newText, newText.length(), docFrags.size());
+            fragmentScorer.startFragment(currentFrag);
+            docFrags.add(currentFrag);
+          }
+        }
+
+        tokenGroup.addToken(fragmentScorer.getTokenScore());
+
+//        if(lastEndOffset>maxDocBytesToAnalyze)
+//        {
+//          break;
+//        }
+      }
+      currentFrag.setScore(fragmentScorer.getFragmentScore());
+
+      if (tokenGroup.numTokens > 0)
+      {
+        //flush the accumulated text (same code as in above loop)
+        startOffset = tokenGroup.matchStartOffset;
+        endOffset = tokenGroup.matchEndOffset;
+        tokenText = text.substring(startOffset, endOffset);
+        String markedUpText = formatter.highlightTerm(encoder.encodeText(tokenText), tokenGroup);
+        //store any whitespace etc from between this and last group
+        if (startOffset > lastEndOffset)
+          newText.append(encoder.encodeText(text.substring(lastEndOffset, startOffset)));
+        newText.append(markedUpText);
+        lastEndOffset = Math.max(lastEndOffset, endOffset);
+      }
+
+      //Test what remains of the original text beyond the point where we stopped analyzing
+      if (
+//          if there is text beyond the last token considered..
+          (lastEndOffset < text.length())
+          &&
+//          and that text is not too large...
+          (text.length() <= maxDocCharsToAnalyze)
+          )
+      {
+        //append it to the last fragment
+        newText.append(encoder.encodeText(text.substring(lastEndOffset)));
+      }
+
+      currentFrag.textEndPos = newText.length();
+
+      //sort the most relevant sections of the text
+      for (Iterator<TextFragment> i = docFrags.iterator(); i.hasNext();)
+      {
+        currentFrag = i.next();
+
+        //If you are running with a version of Lucene before 11th Sept 03
+        // you do not have PriorityQueue.insert() - so uncomment the code below
+        /*
+        if (currentFrag.getScore() >= minScore)
+        {
+          fragQueue.put(currentFrag);
+          if (fragQueue.size() > maxNumFragments)
+          { // if hit queue overfull
+            fragQueue.pop(); // remove lowest in hit queue
+            minScore = ((TextFragment) fragQueue.top()).getScore(); // reset minScore
+          }
+        }
+        */
+        //The above code caused a problem as a result of Christoph Goller's 11th Sept 03
+        //fix to PriorityQueue. The correct method to use here is the new "insert" method
+        // USE ABOVE CODE IF THIS DOES NOT COMPILE!
+        fragQueue.insertWithOverflow(currentFrag);
+      }
+
+      //return the most relevant fragments
+      TextFragment frag[] = new TextFragment[fragQueue.size()];
+      for (int i = frag.length - 1; i >= 0; i--)
+      {
+        frag[i] = fragQueue.pop();
+      }
+
+      //merge any contiguous fragments to improve readability
+      if (mergeContiguousFragments)
+      {
+        mergeContiguousFragments(frag);
+        ArrayList<TextFragment> fragTexts = new ArrayList<TextFragment>();
+        for (int i = 0; i < frag.length; i++)
+        {
+          if ((frag[i] != null) && (frag[i].getScore() > 0))
+          {
+            fragTexts.add(frag[i]);
+          }
+        }
+        frag = fragTexts.toArray(new TextFragment[0]);
+      }
+
+      return frag;
+
+    }
+    finally
+    {
+      if (tokenStream != null)
+      {
+        try
+        {
+          tokenStream.end();
+          tokenStream.close();
+        }
+        catch (Exception e)
+        {
+          //ignore - failure to close the stream should not hide the result
+          // or an earlier exception
+        }
+      }
+    }
+  }
+
+  /** Improves readability of a score-sorted list of TextFragments by merging any fragments
+   * that were contiguous in the original text into one larger fragment with the correct order.
+   * This will leave a "null" in the array entry for the lesser scored fragment.
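+   * Merging is repeated until no two of the surviving fragments are contiguous.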
+   *
+   * @param frag An array of document fragments in descending score order
+   */
+  private void mergeContiguousFragments(TextFragment[] frag)
+  {
+    boolean mergingStillBeingDone;
+    if (frag.length > 1)
+      do
+      {
+        mergingStillBeingDone = false; //initialise loop control flag
+        //for each fragment, scan other frags looking for contiguous blocks
+        for (int i = 0; i < frag.length; i++)
+        {
+          if (frag[i] == null)
+          {
+            continue;
+          }
+          //merge any contiguous blocks
+          for (int x = 0; x < frag.length; x++)
+          {
+            if (frag[x] == null)
+            {
+              continue;
+            }
+            if (frag[i] == null)
+            {
+              break;
+            }
+            TextFragment frag1 = null;
+            TextFragment frag2 = null;
+            int frag1Num = 0;
+            int frag2Num = 0;
+            int bestScoringFragNum;
+            int worstScoringFragNum;
+            //if blocks are contiguous....
+            if (frag[i].follows(frag[x]))
+            {
+              frag1 = frag[x];
+              frag1Num = x;
+              frag2 = frag[i];
+              frag2Num = i;
+            }
+            else if (frag[x].follows(frag[i]))
+            {
+              frag1 = frag[i];
+              frag1Num = i;
+              frag2 = frag[x];
+              frag2Num = x;
+            }
+            //merging required..
+            if (frag1 != null)
+            {
+              if (frag1.getScore() > frag2.getScore())
+              {
+                bestScoringFragNum = frag1Num;
+                worstScoringFragNum = frag2Num;
+              }
+              else
+              {
+                bestScoringFragNum = frag2Num;
+                worstScoringFragNum = frag1Num;
+              }
+              frag1.merge(frag2);
+              frag[worstScoringFragNum] = null;
+              mergingStillBeingDone = true;
+              frag[bestScoringFragNum] = frag1;
+            }
+          }
+        }
+      }
+      while (mergingStillBeingDone);
+  }
+
+  /**
+   * Highlights terms in the text, extracting the most relevant sections
+   * and concatenating the chosen fragments with a separator (typically "...").
+   * The document text is analysed in chunks to record hit statistics
+   * across the document. After accumulating stats, the fragments with the highest scores
+   * are returned in order as "separator" delimited strings.
+   *
+   * @param tokenStream      stream of tokens identified in the text, including offset information
+   * @param text             text to highlight terms in
+   * @param maxNumFragments  the maximum number of fragments
+   * @param separator        the separator used to intersperse the document fragments (typically "...")
+   *
+   * @return highlighted text
+   * @throws InvalidTokenOffsetsException thrown if any token's endOffset exceeds the provided text's length
+   */
+  public final String getBestFragments(
+    TokenStream tokenStream,
+    String text,
+    int maxNumFragments,
+    String separator)
+    throws IOException, InvalidTokenOffsetsException
+  {
+    String sections[] = getBestFragments(tokenStream, text, maxNumFragments);
+    StringBuilder result = new StringBuilder();
+    for (int i = 0; i < sections.length; i++)
+    {
+      if (i > 0)
+      {
+        result.append(separator);
+      }
+      result.append(sections[i]);
+    }
+    return result.toString();
+  }
+
+  public int getMaxDocCharsToAnalyze() {
+    return maxDocCharsToAnalyze;
+  }
+
+  public void setMaxDocCharsToAnalyze(int maxDocCharsToAnalyze) {
+    this.maxDocCharsToAnalyze = maxDocCharsToAnalyze;
+  }
+
+  public Fragmenter getTextFragmenter()
+  {
+    return textFragmenter;
+  }
+
+  /**
+   * @param fragmenter  the {@link Fragmenter} used to break the text into fragments
+   */
+  public void setTextFragmenter(Fragmenter fragmenter)
+  {
+    textFragmenter = fragmenter;
+  }
+
+  /**
+   * @return Object used to score each text fragment
+   */
+  public Scorer getFragmentScorer()
+  {
+    return fragmentScorer;
+  }
+
+  /**
+   * @param scorer  the {@link Scorer} used to score each text fragment
+   */
+  public void setFragmentScorer(Scorer scorer)
+  {
+    fragmentScorer = scorer;
+  }
+
+  public Encoder getEncoder()
+  {
+    return encoder;
+  }
+
+  public void setEncoder(Encoder encoder)
+  {
+    this.encoder = encoder;
+  }
+}
+
+class FragmentQueue extends PriorityQueue<TextFragment>
+{
+  public FragmentQueue(int size)
+  {
+    initialize(size);
+  }
+
+  @Override
+  public final boolean lessThan(TextFragment fragA, TextFragment fragB)
+  {
+    if (fragA.getScore() == fragB.getScore())
+      return fragA.fragNum > fragB.fragNum;
+    else
+      return fragA.getScore() < fragB.getScore();
+  }
+}
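Usage note: a minimal sketch of how the class added above is typically driven, assuming a Lucene 3.5 setup in which the Analyzer, the parsed Query, and the stored field text are already available. This sketch is not part of the committed file; the class name HighlighterUsageSketch and the highlight() helper are illustrative only.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.search.Query;
import org.apache.lucene.search.highlight.Highlighter;
import org.apache.lucene.search.highlight.InvalidTokenOffsetsException;
import org.apache.lucene.search.highlight.QueryScorer;
import org.apache.lucene.search.highlight.SimpleHTMLFormatter;
import org.apache.lucene.search.highlight.SimpleSpanFragmenter;

public class HighlighterUsageSketch
{
  //returns up to three "..."-separated fragments, with matching terms wrapped
  // in SimpleHTMLFormatter's default <B>...</B> tags
  static String highlight(Analyzer analyzer, Query query, String field, String text)
    throws IOException, InvalidTokenOffsetsException
  {
    QueryScorer scorer = new QueryScorer(query, field);
    Highlighter highlighter = new Highlighter(new SimpleHTMLFormatter(), scorer);
    //size fragments to roughly 100 chars around the query's matching spans
    highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 100));
    //re-analyse the stored text to obtain offsets, as the javadoc above describes
    TokenStream tokenStream = analyzer.reusableTokenStream(field, new StringReader(text));
    return highlighter.getBestFragments(tokenStream, text, 3, "...");
  }
}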