--- /dev/null
+package org.apache.lucene.search.highlight;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.ByteArrayInputStream;
+import java.io.IOException;
+import java.io.Reader;
+import java.io.StringReader;
+import java.util.ArrayList;
+import java.util.HashMap;
+import java.util.HashSet;
+import java.util.Iterator;
+import java.util.List;
+import java.util.Map;
+import java.util.Set;
+import java.util.StringTokenizer;
+
+import javax.xml.parsers.DocumentBuilder;
+import javax.xml.parsers.DocumentBuilderFactory;
+
+import org.apache.lucene.analysis.Analyzer;
+import org.apache.lucene.analysis.BaseTokenStreamTestCase;
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.LowerCaseTokenizer;
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.analysis.MockTokenizer;
+import org.apache.lucene.analysis.StopAnalyzer;
+import org.apache.lucene.analysis.Token;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.standard.StandardAnalyzer;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.document.NumericField;
+import org.apache.lucene.document.Field.Index;
+import org.apache.lucene.document.Field.Store;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.IndexWriterConfig.OpenMode;
+import org.apache.lucene.queryParser.ParseException;
+import org.apache.lucene.queryParser.QueryParser;
+import org.apache.lucene.search.BooleanQuery;
+import org.apache.lucene.search.FilteredQuery;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.MultiPhraseQuery;
+import org.apache.lucene.search.MultiSearcher;
+import org.apache.lucene.search.MultiTermQuery;
+import org.apache.lucene.search.NumericRangeQuery;
+import org.apache.lucene.search.PhraseQuery;
+import org.apache.lucene.search.Query;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.search.TermRangeFilter;
+import org.apache.lucene.search.TopDocs;
+import org.apache.lucene.search.WildcardQuery;
+import org.apache.lucene.search.BooleanClause.Occur;
+import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
+import org.apache.lucene.search.regex.RegexQuery;
+import org.apache.lucene.search.regex.SpanRegexQuery;
+import org.apache.lucene.search.spans.SpanNearQuery;
+import org.apache.lucene.search.spans.SpanNotQuery;
+import org.apache.lucene.search.spans.SpanOrQuery;
+import org.apache.lucene.search.spans.SpanQuery;
+import org.apache.lucene.search.spans.SpanTermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.LuceneTestCase;
+import org.w3c.dom.Element;
+import org.w3c.dom.NodeList;
+
+/**
+ * JUnit Test for Highlighter class.
+ *
+ */
+public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
+
  // Fixture state: reader/searcher over a RAM index built elsewhere in this file (setUp not shown).
  private IndexReader reader;
  static final String FIELD_NAME = "contents";
  private static final String NUMERIC_FIELD_NAME = "nfield";
  // Last query run by doSearching(); reused by the highlighting loops.
  private Query query;
  Directory ramDir;
  public IndexSearcher searcher = null;
  // Running count of highlighted terms; presumably bumped by this class's Formatter
  // callback (highlightTerm, defined outside this chunk) — tests assert against it.
  int numHighlights = 0;
  final Analyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
  TopDocs hits;

  // Corpus indexed into FIELD_NAME; the expected highlight counts in the tests
  // below are derived from these exact strings.
  String[] texts = {
      "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
      "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
      "JFK has been shot", "John Kennedy has been shot",
      "This text has a typo in referring to Keneddy",
      "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" };
+
  /**
   * Highlights every hit for the phrase "very long" using a QueryScorer plus
   * SimpleSpanFragmenter; no assertions — just checks nothing throws.
   */
  public void testQueryScorerHits() throws Exception {
    // Local analyzer deliberately shadows the field: this test tokenizes with
    // MockAnalyzer rather than the fixture's StandardAnalyzer.
    Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.SIMPLE, true);
    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer);
    query = qp.parse("\"very long\"");
    searcher = new IndexSearcher(reader);
    TopDocs hits = searcher.search(query, 10);

    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(scorer);


    for (int i = 0; i < hits.scoreDocs.length; i++) {
      Document doc = searcher.doc(hits.scoreDocs[i].doc);
      String storedField = doc.get(FIELD_NAME);

      // Re-analyzes the stored field (or uses term vectors if present).
      TokenStream stream = TokenSources.getAnyTokenStream(searcher
          .getIndexReader(), hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);

      Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);

      highlighter.setTextFragmenter(fragmenter);

      String fragment = highlighter.getBestFragment(stream, storedField);

      if (VERBOSE) System.out.println(fragment);
    }
    searcher.close();
  }
+
+ public void testHighlightingWithDefaultField() throws Exception {
+
+ String s1 = "I call our world Flatland, not because we call it so,";
+
+ QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, new StandardAnalyzer(TEST_VERSION_CURRENT));
+
+ // Verify that a query against the default field results in text being
+ // highlighted
+ // regardless of the field name.
+ Query q = parser.parse("\"world Flatland\"~3");
+ String expected = "I call our <B>world</B> <B>Flatland</B>, not because we call it so,";
+ String observed = highlightField(q, "SOME_FIELD_NAME", s1);
+ if (VERBOSE) System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed);
+ assertEquals("Query in the default field results in text for *ANY* field being highlighted",
+ expected, observed);
+
+ // Verify that a query against a named field does not result in any
+ // highlighting
+ // when the query field name differs from the name of the field being
+ // highlighted,
+ // which in this example happens to be the default field name.
+ q = parser.parse("text:\"world Flatland\"~3");
+ expected = s1;
+ observed = highlightField(q, FIELD_NAME, s1);
+ if (VERBOSE) System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed);
+ assertEquals(
+ "Query in a named field does not result in highlighting when that field isn't in the query",
+ s1, highlightField(q, FIELD_NAME, s1));
+ }
+
+ /**
+ * This method intended for use with <tt>testHighlightingWithDefaultField()</tt>
+ * @throws InvalidTokenOffsetsException
+ */
+ private static String highlightField(Query query, String fieldName, String text)
+ throws IOException, InvalidTokenOffsetsException {
+ TokenStream tokenStream = new StandardAnalyzer(TEST_VERSION_CURRENT).tokenStream(fieldName, new StringReader(text));
+ // Assuming "<B>", "</B>" used to highlight
+ SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
+ QueryScorer scorer = new QueryScorer(query, fieldName, FIELD_NAME);
+ Highlighter highlighter = new Highlighter(formatter, scorer);
+ highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
+
+ String rv = highlighter.getBestFragments(tokenStream, text, 1, "(FIELD TEXT TRUNCATED)");
+ return rv.length() == 0 ? text : rv;
+ }
+
  /**
   * Smoke test: highlights all "Kennedy" hits with a span-aware QueryScorer and
   * a SimpleFragmenter; only verifies that no exception is thrown.
   */
  public void testSimpleSpanHighlighter() throws Exception {
    doSearching("Kennedy");

    int maxNumFragmentsRequired = 2;

    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
          new StringReader(text));
      highlighter.setTextFragmenter(new SimpleFragmenter(40));

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          "...");
      if (VERBOSE) System.out.println("\t" + result);
    }

    // Not sure we can assert anything here - just running to check we dont
    // throw any exceptions
  }
+
  // LUCENE-1752: repeated terms across multiple boolean/phrase clauses must each
  // be highlighted; builds (f1:ph1 OR f2:ph1) AND (f1:ph2 OR f2:ph2) and expects
  // 7 highlighted terms in the f1 content.
  public void testRepeatingTermsInMultBooleans() throws Exception {
    String content = "x y z a b c d e f g b c g";
    String ph1 = "\"a b c d\"";
    String ph2 = "\"b c g\"";
    String f1 = "f1";
    String f2 = "f2";
    String f1c = f1 + ":";
    String f2c = f2 + ":";
    String q = "(" + f1c + ph1 + " OR " + f2c + ph1 + ") AND (" + f1c + ph2
        + " OR " + f2c + ph2 + ")";
    Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
    QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, f1, analyzer);
    Query query = qp.parse(q);

    QueryScorer scorer = new QueryScorer(query, f1);
    // Keep multi-term queries unexpanded for this scenario.
    scorer.setExpandMultiTermQuery(false);

    Highlighter h = new Highlighter(this, scorer);

    h.getBestFragment(analyzer, f1, content);

    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
        numHighlights == 7);
  }
+
+ public void testSimpleQueryScorerPhraseHighlighting() throws Exception {
+ doSearching("\"very long and contains\"");
+
+ int maxNumFragmentsRequired = 2;
+
+ QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
+ Highlighter highlighter = new Highlighter(this, scorer);
+
+ for (int i = 0; i < hits.totalHits; i++) {
+ String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+ TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+
+ highlighter.setTextFragmenter(new SimpleFragmenter(40));
+
+ String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
+ "...");
+ if (VERBOSE) System.out.println("\t" + result);
+ }
+
+ assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+ numHighlights == 3);
+
+ numHighlights = 0;
+ doSearching("\"This piece of text refers to Kennedy\"");
+
+ maxNumFragmentsRequired = 2;
+
+ scorer = new QueryScorer(query, FIELD_NAME);
+ highlighter = new Highlighter(this, scorer);
+
+ for (int i = 0; i < hits.totalHits; i++) {
+ String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+ TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+
+ highlighter.setTextFragmenter(new SimpleFragmenter(40));
+
+ String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
+ "...");
+ if (VERBOSE) System.out.println("\t" + result);
+ }
+
+ assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+ numHighlights == 4);
+
+ numHighlights = 0;
+ doSearching("\"lets is a the lets is a the lets is a the lets\"");
+
+ maxNumFragmentsRequired = 2;
+
+ scorer = new QueryScorer(query, FIELD_NAME);
+ highlighter = new Highlighter(this, scorer);
+
+ for (int i = 0; i < hits.totalHits; i++) {
+ String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+ TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+
+ highlighter.setTextFragmenter(new SimpleFragmenter(40));
+
+ String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
+ "...");
+ if (VERBOSE) System.out.println("\t" + result);
+ }
+
+ assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+ numHighlights == 4);
+
+ }
+
  /**
   * Highlights hits of a SpanRegexQuery ("ken.*" wrapped in a SpanOrQuery) and
   * asserts five terms get highlighted across the corpus.
   */
  public void testSpanRegexQuery() throws Exception {
    query = new SpanOrQuery(new SpanQuery [] {
        new SpanRegexQuery(new Term(FIELD_NAME, "ken.*")) });
    searcher = new IndexSearcher(reader);
    hits = searcher.search(query, 100);
    int maxNumFragmentsRequired = 2;

    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(this, scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

      highlighter.setTextFragmenter(new SimpleFragmenter(40));

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          "...");
      if (VERBOSE) System.out.println("\t" + result);
    }

    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
        numHighlights == 5);
  }
+
+ public void testRegexQuery() throws Exception {
+ query = new RegexQuery(new Term(FIELD_NAME, "ken.*"));
+ searcher = new IndexSearcher(reader);
+ hits = searcher.search(query, 100);
+ int maxNumFragmentsRequired = 2;
+
+ QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
+ Highlighter highlighter = new Highlighter(this, scorer);
+
+ for (int i = 0; i < hits.totalHits; i++) {
+ String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+ TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+
+ highlighter.setTextFragmenter(new SimpleFragmenter(40));
+
+ String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
+ "...");
+ if (VERBOSE) System.out.println("\t" + result);
+ }
+
+ assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+ numHighlights == 5);
+ }
+
  /**
   * NumericRangeQuery is not expected to highlight anything; this only checks
   * that running the highlighter over its hits does not throw.
   */
  public void testNumericRangeQuery() throws Exception {
    // doesn't currently highlight, but make sure it doesn't cause exception either
    query = NumericRangeQuery.newIntRange(NUMERIC_FIELD_NAME, 2, 6, true, true);
    searcher = new IndexSearcher(reader);
    hits = searcher.search(query, 100);
    int maxNumFragmentsRequired = 2;

    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(this, scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      // Note: text comes from the numeric field but is tokenized as FIELD_NAME.
      String text = searcher.doc(hits.scoreDocs[i].doc).get(NUMERIC_FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

      highlighter.setTextFragmenter(new SimpleFragmenter(40));

//    String result =
      highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,"...");
      //if (VERBOSE) System.out.println("\t" + result);
    }


  }
+
  /**
   * Sloppy-phrase highlighting: "text piece long"~5 should highlight six terms
   * across the matching documents.
   */
  public void testSimpleQueryScorerPhraseHighlighting2() throws Exception {
    doSearching("\"text piece long\"~5");

    int maxNumFragmentsRequired = 2;

    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(this,scorer);
    highlighter.setTextFragmenter(new SimpleFragmenter(40));

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          "...");
      if (VERBOSE) System.out.println("\t" + result);
    }

    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
        numHighlights == 6);
  }
+
  /**
   * Exact-phrase highlighting of "x y z"; a fresh scorer/highlighter pair is
   * built per hit and three highlighted terms are expected inside the loop.
   */
  public void testSimpleQueryScorerPhraseHighlighting3() throws Exception {
    doSearching("\"x y z\"");

    int maxNumFragmentsRequired = 2;

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
      QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
      Highlighter highlighter = new Highlighter(this, scorer);

      highlighter.setTextFragmenter(new SimpleFragmenter(40));

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          "...");
      if (VERBOSE) System.out.println("\t" + result);

      // Asserted inside the loop: count is cumulative over the hits seen so far.
      assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
          numHighlights == 3);
    }
  }
+
  /**
   * Exercises SimpleSpanFragmenter with two phrase searches and two fragment
   * sizes (5 and 20 chars); output-only, no assertions.
   */
  public void testSimpleSpanFragmenter() throws Exception {
    doSearching("\"piece of text that is very long\"");

    int maxNumFragmentsRequired = 2;

    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(this, scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

      highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 5));

      String result = highlighter.getBestFragments(tokenStream, text,
          maxNumFragmentsRequired, "...");
      if (VERBOSE) System.out.println("\t" + result);

    }

    doSearching("\"been shot\"");

    maxNumFragmentsRequired = 2;

    scorer = new QueryScorer(query, FIELD_NAME);
    highlighter = new Highlighter(this, scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

      highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 20));

      String result = highlighter.getBestFragments(tokenStream, text,
          maxNumFragmentsRequired, "...");
      if (VERBOSE) System.out.println("\t" + result);

    }
  }
+
  // position sensitive query added after position insensitive query
  /**
   * Mixes a plain term ("y") with a phrase ("x y z") in one query; expects four
   * highlighted terms (asserted cumulatively inside the hit loop).
   */
  public void testPosTermStdTerm() throws Exception {
    doSearching("y \"x y z\"");

    int maxNumFragmentsRequired = 2;

    QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
    Highlighter highlighter = new Highlighter(this,scorer);

    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,new StringReader(text));

      highlighter.setTextFragmenter(new SimpleFragmenter(40));

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          "...");
      if (VERBOSE) System.out.println("\t" + result);

      assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
          numHighlights == 4);
    }
  }
+
+ public void testQueryScorerMultiPhraseQueryHighlighting() throws Exception {
+ MultiPhraseQuery mpq = new MultiPhraseQuery();
+
+ mpq.add(new Term[] { new Term(FIELD_NAME, "wordx"), new Term(FIELD_NAME, "wordb") });
+ mpq.add(new Term(FIELD_NAME, "wordy"));
+
+ doSearching(mpq);
+
+ final int maxNumFragmentsRequired = 2;
+ assertExpectedHighlightCount(maxNumFragmentsRequired, 6);
+ }
+
  /**
   * MultiPhraseQuery with explicit positions (wordz at 2, wordx at 0) to create
   * a positional gap; expects two highlighted terms in a single fragment.
   */
  public void testQueryScorerMultiPhraseQueryHighlightingWithGap() throws Exception {
    MultiPhraseQuery mpq = new MultiPhraseQuery();

    /*
     * The toString of MultiPhraseQuery doesn't work so well with these
     * out-of-order additions, but the Query itself seems to match accurately.
     */

    mpq.add(new Term[] { new Term(FIELD_NAME, "wordz") }, 2);
    mpq.add(new Term[] { new Term(FIELD_NAME, "wordx") }, 0);

    doSearching(mpq);

    final int maxNumFragmentsRequired = 1;
    final int expectedHighlights = 2;

    assertExpectedHighlightCount(maxNumFragmentsRequired, expectedHighlights);
  }
+
  /**
   * SpanNearQuery("beginning", "kennedy", slop 3, unordered): the standard
   * highlight run should highlight exactly two terms.
   */
  public void testNearSpanSimpleQuery() throws Exception {
    doSearching(new SpanNearQuery(new SpanQuery[] {
        new SpanTermQuery(new Term(FIELD_NAME, "beginning")),
        new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) }, 3, false));

    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        mode = QUERY;
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
      }
    };

    // run() (not start()) — executes the QUERY mode once, synchronously.
    helper.run();

    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
        numHighlights == 2);
  }
+
  /**
   * Smoke test of the position-insensitive QueryTermScorer over "Kennedy" hits;
   * no assertions, just checks nothing throws.
   */
  public void testSimpleQueryTermScorerHighlighter() throws Exception {
    doSearching("Kennedy");
    Highlighter highlighter = new Highlighter(new QueryTermScorer(query));
    highlighter.setTextFragmenter(new SimpleFragmenter(40));
    int maxNumFragmentsRequired = 2;
    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          "...");
      if (VERBOSE) System.out.println("\t" + result);
    }
    // Not sure we can assert anything here - just running to check we dont
    // throw any exceptions
  }
+
  /**
   * Two SpanNearQueries OR-ed in a BooleanQuery (wordx~wordy, wordy~wordc);
   * the standard highlight run should highlight seven terms.
   */
  public void testSpanHighlighting() throws Exception {
    Query query1 = new SpanNearQuery(new SpanQuery[] {
        new SpanTermQuery(new Term(FIELD_NAME, "wordx")),
        new SpanTermQuery(new Term(FIELD_NAME, "wordy")) }, 1, false);
    Query query2 = new SpanNearQuery(new SpanQuery[] {
        new SpanTermQuery(new Term(FIELD_NAME, "wordy")),
        new SpanTermQuery(new Term(FIELD_NAME, "wordc")) }, 1, false);
    BooleanQuery bquery = new BooleanQuery();
    bquery.add(query1, Occur.SHOULD);
    bquery.add(query2, Occur.SHOULD);
    doSearching(bquery);
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        mode = QUERY;
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
      }
    };

    helper.run();
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
        numHighlights == 7);
  }
+
  /**
   * SpanNotQuery: (shot near kennedy, slop 3) NOT containing "john"; expects
   * four highlighted terms from the standard highlight run.
   */
  public void testNotSpanSimpleQuery() throws Exception {
    doSearching(new SpanNotQuery(new SpanNearQuery(new SpanQuery[] {
        new SpanTermQuery(new Term(FIELD_NAME, "shot")),
        new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) }, 3, false), new SpanTermQuery(
        new Term(FIELD_NAME, "john"))));
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        mode = QUERY;
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
      }
    };

    helper.run();
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
        numHighlights == 4);
  }
+
  /**
   * Plain term query "Kennedy": the standard highlight run should highlight
   * four terms. helper.start() runs all highlighter modes (defined elsewhere).
   */
  public void testGetBestFragmentsSimpleQuery() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        numHighlights = 0;
        doSearching("Kennedy");
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
            numHighlights == 4);
      }
    };

    helper.start();
  }
+
  /**
   * Fuzzy query "Kinnedy~" should highlight five terms (expandMT flag passed
   * true so the multi-term query is expanded before highlighting).
   */
  public void testGetFuzzyFragments() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        numHighlights = 0;
        doSearching("Kinnedy~");
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this, true);
        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
            numHighlights == 5);
      }
    };

    helper.start();
  }
+
  /** Single-character wildcard "K?nnedy" should highlight four terms. */
  public void testGetWildCardFragments() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        numHighlights = 0;
        doSearching("K?nnedy");
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
            numHighlights == 4);
      }
    };

    helper.start();
  }
+
  /** Mid-word wildcard "K*dy" (matches Kennedy and Keneddy) — five highlights. */
  public void testGetMidWildCardFragments() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        numHighlights = 0;
        doSearching("K*dy");
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
            numHighlights == 5);
      }
    };

    helper.start();
  }
+
  /**
   * Term-range query [kannedy TO kznnedy]: rewritten to a scoring BooleanQuery
   * so the individual terms can be highlighted; expects five highlights.
   */
  public void testGetRangeFragments() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        numHighlights = 0;
        String queryString = FIELD_NAME + ":[kannedy TO kznnedy]";

        // Need to explicitly set the QueryParser property to use TermRangeQuery
        // rather
        // than RangeFilters
        QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer);
        parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
        query = parser.parse(queryString);
        doSearching(query);

        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
            numHighlights == 5);
      }
    };

    helper.start();
  }
+
  /**
   * Highlights a constant-score-rewritten WildcardQuery ("ken*") three times,
   * varying only the QueryScorer's field arguments: the matching field, a null
   * field, and a non-matching field with FIELD_NAME as default. Each pass must
   * highlight five terms.
   */
  public void testConstantScoreMultiTermQuery() throws Exception {

    numHighlights = 0;

    query = new WildcardQuery(new Term(FIELD_NAME, "ken*"));
    ((WildcardQuery)query).setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
    searcher = new IndexSearcher(reader);
    // can't rewrite ConstantScore if you want to highlight it -
    // it rewrites to ConstantScoreQuery which cannot be highlighted
    // query = unReWrittenQuery.rewrite(reader);
    if (VERBOSE) System.out.println("Searching for: " + query.toString(FIELD_NAME));
    hits = searcher.search(query, null, 1000);

    // Pass 1: scorer bound to the matching field.
    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
      int maxNumFragmentsRequired = 2;
      String fragmentSeparator = "...";
      QueryScorer scorer = null;
      TokenStream tokenStream = null;

      tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));

      scorer = new QueryScorer(query, HighlighterTest.FIELD_NAME);

      Highlighter highlighter = new Highlighter(this, scorer);

      highlighter.setTextFragmenter(new SimpleFragmenter(20));

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          fragmentSeparator);
      if (VERBOSE) System.out.println("\t" + result);
    }
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
        numHighlights == 5);

    // try null field

    hits = searcher.search(query, null, 1000);

    numHighlights = 0;

    // Pass 2: null field — scorer matches terms from any field.
    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
      int maxNumFragmentsRequired = 2;
      String fragmentSeparator = "...";
      QueryScorer scorer = null;
      TokenStream tokenStream = null;

      tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));

      scorer = new QueryScorer(query, null);

      Highlighter highlighter = new Highlighter(this, scorer);

      highlighter.setTextFragmenter(new SimpleFragmenter(20));

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          fragmentSeparator);
      if (VERBOSE) System.out.println("\t" + result);
    }
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
        numHighlights == 5);

    // try default field

    hits = searcher.search(query, null, 1000);

    numHighlights = 0;

    // Pass 3: non-matching field with FIELD_NAME supplied as the default field.
    for (int i = 0; i < hits.totalHits; i++) {
      String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
      int maxNumFragmentsRequired = 2;
      String fragmentSeparator = "...";
      QueryScorer scorer = null;
      TokenStream tokenStream = null;

      tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));

      scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME);

      Highlighter highlighter = new Highlighter(this, scorer);

      highlighter.setTextFragmenter(new SimpleFragmenter(20));

      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
          fragmentSeparator);
      if (VERBOSE) System.out.println("\t" + result);
    }
    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
        numHighlights == 5);
  }
+
  /** Phrase "John Kennedy" — each term is highlighted separately (two total). */
  public void testGetBestFragmentsPhrase() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        numHighlights = 0;
        doSearching("\"John Kennedy\"");
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
        // Currently highlights "John" and "Kennedy" separately
        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
            numHighlights == 2);
      }
    };

    helper.start();
  }
+
  /** Ordered SpanNearQuery(john, kennedy, slop 1) — two separate highlights. */
  public void testGetBestFragmentsQueryScorer() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        numHighlights = 0;
        SpanQuery clauses[] = { new SpanTermQuery(new Term("contents", "john")),
            new SpanTermQuery(new Term("contents", "kennedy")), };

        SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
        doSearching(snq);
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
        // Currently highlights "John" and "Kennedy" separately
        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
            numHighlights == 2);
      }
    };

    helper.start();
  }
+
+ public void testOffByOne() throws Exception {
+ TestHighlightRunner helper = new TestHighlightRunner() {
+
+ @Override
+ public void run() throws Exception {
+ TermQuery query = new TermQuery(new Term("data", "help"));
+ Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), new QueryTermScorer(query));
+ hg.setTextFragmenter(new NullFragmenter());
+
+ String match = null;
+ match = hg.getBestFragment(analyzer, "data", "help me [54-65]");
+ assertEquals("<B>help</B> me [54-65]", match);
+
+ }
+ };
+
+ helper.start();
+ }
+
  /**
   * FilteredQuery wrapping a SpanNearQuery, filtered to docs whose contents
   * range-match "john"; the wrapped span terms still highlight (two total).
   */
  public void testGetBestFragmentsFilteredQuery() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        numHighlights = 0;
        TermRangeFilter rf = new TermRangeFilter("contents", "john", "john", true, true);
        SpanQuery clauses[] = { new SpanTermQuery(new Term("contents", "john")),
            new SpanTermQuery(new Term("contents", "kennedy")), };
        SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
        FilteredQuery fq = new FilteredQuery(snq, rf);

        doSearching(fq);
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
        // Currently highlights "John" and "Kennedy" separately
        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
            numHighlights == 2);
      }
    };

    helper.start();
  }
+
  /**
   * Same as the filtered-span case but wrapping a PhraseQuery instead; the
   * phrase terms still highlight through the FilteredQuery (two total).
   */
  public void testGetBestFragmentsFilteredPhraseQuery() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        numHighlights = 0;
        TermRangeFilter rf = new TermRangeFilter("contents", "john", "john", true, true);
        PhraseQuery pq = new PhraseQuery();
        pq.add(new Term("contents", "john"));
        pq.add(new Term("contents", "kennedy"));
        FilteredQuery fq = new FilteredQuery(pq, rf);

        doSearching(fq);
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
        // Currently highlights "John" and "Kennedy" separately
        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
            numHighlights == 2);
      }
    };

    helper.start();
  }
+
  /** Term + prefix query "John Kenn*" — five highlighted terms expected. */
  public void testGetBestFragmentsMultiTerm() throws Exception {
    TestHighlightRunner helper = new TestHighlightRunner() {

      @Override
      public void run() throws Exception {
        numHighlights = 0;
        doSearching("John Kenn*");
        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
            numHighlights == 5);
      }
    };

    helper.start();
  }
+
+  // Boolean OR query: highlights are expected for either disjunct.
+  public void testGetBestFragmentsWithOr() throws Exception {
+    TestHighlightRunner helper = new TestHighlightRunner() {
+
+      @Override
+      public void run() throws Exception {
+        numHighlights = 0;
+        doSearching("JFK OR Kennedy");
+        doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
+        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+            numHighlights == 5);
+      }
+    };
+    helper.start();
+  }
+
+  // Exercises the three getBestFragment(s) entry points (explicit TokenStream,
+  // analyzer-based single fragment, analyzer-based multi fragment) and checks
+  // each produces the same highlight count for the query "Kennedy".
+  public void testGetBestSingleFragment() throws Exception {
+
+    TestHighlightRunner helper = new TestHighlightRunner() {
+
+      @Override
+      public void run() throws Exception {
+        doSearching("Kennedy");
+        numHighlights = 0;
+        // 1) caller supplies the TokenStream explicitly
+        for (int i = 0; i < hits.totalHits; i++) {
+          String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+          TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+
+          Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
+              HighlighterTest.this);
+          highlighter.setTextFragmenter(new SimpleFragmenter(40));
+          String result = highlighter.getBestFragment(tokenStream, text);
+          if (VERBOSE) System.out.println("\t" + result);
+        }
+        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+            numHighlights == 4);
+
+        // 2) highlighter builds the TokenStream from the analyzer itself
+        numHighlights = 0;
+        for (int i = 0; i < hits.totalHits; i++) {
+          String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+          TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+          Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
+              HighlighterTest.this);
+          highlighter.getBestFragment(analyzer, FIELD_NAME, text);
+        }
+        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+            numHighlights == 4);
+
+        // 3) multi-fragment variant; count should be unchanged
+        numHighlights = 0;
+        for (int i = 0; i < hits.totalHits; i++) {
+          String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+
+          TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+          Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
+              HighlighterTest.this);
+          highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10);
+        }
+        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+            numHighlights == 4);
+
+      }
+
+    };
+
+    helper.start();
+
+  }
+
+  // Manually-weighted span terms: fragment selection must follow the weights,
+  // and changing a weight must change which fragment wins.
+  public void testGetBestSingleFragmentWithWeights() throws Exception {
+
+    TestHighlightRunner helper = new TestHighlightRunner() {
+
+      @Override
+      public void run() throws Exception {
+        WeightedSpanTerm[] wTerms = new WeightedSpanTerm[2];
+        wTerms[0] = new WeightedSpanTerm(10f, "hello");
+
+        List<PositionSpan> positionSpans = new ArrayList<PositionSpan>();
+        positionSpans.add(new PositionSpan(0, 0));
+        wTerms[0].addPositionSpans(positionSpans);
+
+        wTerms[1] = new WeightedSpanTerm(1f, "kennedy");
+        positionSpans = new ArrayList<PositionSpan>();
+        positionSpans.add(new PositionSpan(14, 14));
+        wTerms[1].addPositionSpans(positionSpans);
+
+        Highlighter highlighter = getHighlighter(wTerms, HighlighterTest.this);// new
+        // Highlighter(new
+        // QueryTermScorer(wTerms));
+        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
+        // tiny fragment size forces a single-term fragment
+        highlighter.setTextFragmenter(new SimpleFragmenter(2));
+
+        String result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
+        assertTrue("Failed to find best section using weighted terms. Found: [" + result + "]",
+            "<B>Hello</B>".equals(result));
+
+        // readjust weights: "kennedy" now outweighs "hello"
+        wTerms[1].setWeight(50f);
+        tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
+        highlighter = getHighlighter(wTerms, HighlighterTest.this);
+        highlighter.setTextFragmenter(new SimpleFragmenter(2));
+
+        result = highlighter.getBestFragment(tokenStream, texts[0]).trim();
+        assertTrue("Failed to find best section using weighted terms. Found: " + result,
+            "<B>kennedy</B>".equals(result));
+      }
+
+    };
+
+    helper.start();
+
+  }
+
+ // tests a "complex" analyzer that produces multiple
+ // overlapping tokens
+  // tests a "complex" analyzer that produces multiple
+  // overlapping tokens
+  public void testOverlapAnalyzer() throws Exception {
+    TestHighlightRunner helper = new TestHighlightRunner() {
+
+      @Override
+      public void run() throws Exception {
+        // "football" expands to the synonyms "soccer" and "footie"
+        HashMap<String,String> synonyms = new HashMap<String,String>();
+        synonyms.put("football", "soccer,footie");
+        Analyzer analyzer = new SynonymAnalyzer(synonyms);
+        String srchkey = "football";
+
+        String s = "football-soccer in the euro 2004 footie competition";
+        QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "bookid", analyzer);
+        Query query = parser.parse(srchkey);
+
+        TokenStream tokenStream = analyzer.tokenStream(null, new StringReader(s));
+
+        Highlighter highlighter = getHighlighter(query, null, tokenStream, HighlighterTest.this);
+
+        // Get 3 best fragments and separate with a "..."
+        tokenStream = analyzer.tokenStream(null, new StringReader(s));
+
+        String result = highlighter.getBestFragments(tokenStream, s, 3, "...");
+        // all three synonym tokens should be highlighted, with no duplication
+        String expectedResult = "<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
+        assertTrue("overlapping analyzer should handle highlights OK, expected:" + expectedResult
+            + " actual:" + result, expectedResult.equals(result));
+      }
+
+    };
+
+    helper.start();
+
+  }
+
+  // Basic smoke test: each hit for "Kennedy" yields a highlighted fragment.
+  public void testGetSimpleHighlight() throws Exception {
+    TestHighlightRunner helper = new TestHighlightRunner() {
+
+      @Override
+      public void run() throws Exception {
+        numHighlights = 0;
+        doSearching("Kennedy");
+        // new Highlighter(HighlighterTest.this, new QueryTermScorer(query));
+
+        for (int i = 0; i < hits.totalHits; i++) {
+          String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+          TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+          Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
+              HighlighterTest.this);
+          String result = highlighter.getBestFragment(tokenStream, text);
+          if (VERBOSE) System.out.println("\t" + result);
+        }
+        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+            numHighlights == 4);
+      }
+    };
+    helper.start();
+  }
+
+  // Checks that getBestTextFragments and getBestFragments agree: same number
+  // of fragments, and each TextFragment stringifies to the string variant.
+  public void testGetTextFragments() throws Exception {
+    TestHighlightRunner helper = new TestHighlightRunner() {
+
+      @Override
+      public void run() throws Exception {
+
+        doSearching("Kennedy");
+
+        for (int i = 0; i < hits.totalHits; i++) {
+          String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+          TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+
+          Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
+              HighlighterTest.this);// new Highlighter(this, new
+          // QueryTermScorer(query));
+          highlighter.setTextFragmenter(new SimpleFragmenter(20));
+          String stringResults[] = highlighter.getBestFragments(tokenStream, text, 10);
+
+          // fresh stream: the first call consumed the previous one
+          tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+          TextFragment fragmentResults[] = highlighter.getBestTextFragments(tokenStream, text,
+              true, 10);
+
+          assertTrue("Failed to find correct number of text Fragments: " + fragmentResults.length
+              + " vs " + stringResults.length, fragmentResults.length == stringResults.length);
+          for (int j = 0; j < stringResults.length; j++) {
+            if (VERBOSE) System.out.println(fragmentResults[j]);
+            assertTrue("Failed to find same text Fragments: " + fragmentResults[j] + " found",
+                fragmentResults[j].toString().equals(stringResults[j]));
+
+          }
+
+        }
+      }
+    };
+    helper.start();
+  }
+
+  // setMaxDocCharsToAnalyze(30) should truncate analysis before the match for
+  // "meat" is reached, so no highlights are produced.
+  public void testMaxSizeHighlight() throws Exception {
+    // local analyzer intentionally shadows the field so checks can be disabled
+    final MockAnalyzer analyzer = new MockAnalyzer(random, MockTokenizer.SIMPLE, true, (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
+    // we disable MockTokenizer checks because we will forcefully limit the
+    // tokenstream and call end() before incrementToken() returns false.
+    analyzer.setEnableChecks(false);
+    TestHighlightRunner helper = new TestHighlightRunner() {
+
+      @Override
+      public void run() throws Exception {
+        numHighlights = 0;
+        doSearching("meat");
+        TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
+        Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
+            HighlighterTest.this);// new Highlighter(this, new
+        // QueryTermScorer(query));
+        highlighter.setMaxDocCharsToAnalyze(30);
+
+        highlighter.getBestFragment(tokenStream, texts[0]);
+        assertTrue("Setting MaxDocBytesToAnalyze should have prevented "
+            + "us from finding matches for this record: " + numHighlights + " found",
+            numHighlights == 0);
+      }
+    };
+
+    helper.start();
+  }
+
+  // With a NullFragmenter, the returned "fragment" is the analyzed prefix of
+  // the document; it must never exceed maxDocCharsToAnalyze even when matching
+  // tokens exist far beyond that limit.
+  public void testMaxSizeHighlightTruncates() throws Exception {
+    TestHighlightRunner helper = new TestHighlightRunner() {
+
+      @Override
+      public void run() throws Exception {
+        String goodWord = "goodtoken";
+        Set<String> stopWords = new HashSet<String>(1);
+        stopWords.add("stoppedtoken");
+
+        TermQuery query = new TermQuery(new Term("data", goodWord));
+
+        String match = null;
+        // one good token followed by 10000 copies of the single stopword
+        StringBuilder sb = new StringBuilder();
+        sb.append(goodWord);
+        for (int i = 0; i < 10000; i++) {
+          sb.append(" ");
+          // only one stopword
+          sb.append(stopWords.iterator().next());
+        }
+        SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
+        Highlighter hg = getHighlighter(query, "data", new StandardAnalyzer(TEST_VERSION_CURRENT, stopWords).tokenStream(
+            "data", new StringReader(sb.toString())), fm);// new Highlighter(fm,
+        // new
+        // QueryTermScorer(query));
+        hg.setTextFragmenter(new NullFragmenter());
+        hg.setMaxDocCharsToAnalyze(100);
+        match = hg.getBestFragment(new StandardAnalyzer(TEST_VERSION_CURRENT, stopWords), "data", sb.toString());
+        assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg
+            .getMaxDocCharsToAnalyze());
+
+        // add another tokenized word to the overrall length - but set way
+        // beyond
+        // the length of text under consideration (after a large slug of stop
+        // words
+        // + whitespace)
+        sb.append(" ");
+        sb.append(goodWord);
+        match = hg.getBestFragment(new StandardAnalyzer(TEST_VERSION_CURRENT, stopWords), "data", sb.toString());
+        assertTrue("Matched text should be no more than 100 chars in length ", match.length() < hg
+            .getMaxDocCharsToAnalyze());
+      }
+    };
+
+    helper.start();
+
+  }
+
+  // When maxDocCharsToAnalyze covers the whole (short) text, the fragment
+  // should still include the trailing text after the highlighted term.
+  public void testMaxSizeEndHighlight() throws Exception {
+    TestHighlightRunner helper = new TestHighlightRunner() {
+      @Override
+      public void run() throws Exception {
+        Set<String> stopWords = new HashSet<String>();
+        stopWords.add("in");
+        stopWords.add("it");
+        TermQuery query = new TermQuery(new Term("text", "searchterm"));
+
+        String text = "this is a text with searchterm in it";
+        SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
+        Highlighter hg = getHighlighter(query, "text", new StandardAnalyzer(TEST_VERSION_CURRENT,
+            stopWords).tokenStream("text", new StringReader(text)), fm);
+        hg.setTextFragmenter(new NullFragmenter());
+        // 36 == full length of the text above
+        hg.setMaxDocCharsToAnalyze(36);
+        String match = hg.getBestFragment(new StandardAnalyzer(TEST_VERSION_CURRENT, stopWords), "text", text);
+        assertTrue(
+            "Matched text should contain remainder of text after highlighted query ",
+            match.endsWith("in it"));
+      }
+    };
+    helper.start();
+  }
+
+  // Demonstrates that an UN-rewritten multi-term query (wildcards) produces
+  // zero highlights: rewriting is required for highlighting expanded terms.
+  public void testUnRewrittenQuery() throws Exception {
+    final TestHighlightRunner helper = new TestHighlightRunner() {
+
+      @Override
+      public void run() throws Exception {
+        numHighlights = 0;
+        // test to show how rewritten query can still be used
+        if (searcher != null) searcher.close();
+        searcher = new IndexSearcher(reader);
+        Analyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
+
+        QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer);
+        Query query = parser.parse("JF? or Kenned*");
+        if (VERBOSE) System.out.println("Searching with primitive query");
+        // forget to set this and...
+        // query=query.rewrite(reader);
+        TopDocs hits = searcher.search(query, null, 1000);
+
+        // create an instance of the highlighter with the tags used to surround
+        // highlighted text
+        // QueryHighlightExtractor highlighter = new
+        // QueryHighlightExtractor(this,
+        // query, new StandardAnalyzer(TEST_VERSION));
+
+        int maxNumFragmentsRequired = 3;
+
+        for (int i = 0; i < hits.totalHits; i++) {
+          String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+          TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+          // final boolean arg = false -> do NOT expand multi-term queries
+          Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream, HighlighterTest.this, false);
+
+          highlighter.setTextFragmenter(new SimpleFragmenter(40));
+
+          String highlightedText = highlighter.getBestFragments(tokenStream, text,
+              maxNumFragmentsRequired, "...");
+
+          if (VERBOSE) System.out.println(highlightedText);
+        }
+        // We expect to have zero highlights if the query is multi-terms and is
+        // not
+        // rewritten!
+        assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+            numHighlights == 0);
+      }
+    };
+
+    helper.start();
+  }
+
+  // A query matching nothing must yield a null best fragment for every text.
+  public void testNoFragments() throws Exception {
+    TestHighlightRunner helper = new TestHighlightRunner() {
+
+      @Override
+      public void run() throws Exception {
+        doSearching("AnInvalidQueryWhichShouldYieldNoResults");
+
+        for (int i = 0; i < texts.length; i++) {
+          String text = texts[i];
+          TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+          Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
+              HighlighterTest.this);
+          String result = highlighter.getBestFragment(tokenStream, text);
+          assertNull("The highlight result should be null for text with no query terms", result);
+        }
+      }
+    };
+
+    helper.start();
+  }
+
+  /**
+   * Demonstrates creation of an XHTML compliant doc using new encoding facilities.
+   *
+   * @throws Exception if highlighting or XML parsing fails
+   */
+  public void testEncoding() throws Exception {
+
+    String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article";
+    // run the highlighter on the raw content (scorer does not score any tokens
+    // for
+    // highlighting but scores a single fragment for selection
+    Highlighter highlighter = new Highlighter(this, new SimpleHTMLEncoder(), new Scorer() {
+      public void startFragment(TextFragment newFragment) {
+      }
+
+      public float getTokenScore() {
+        return 0; // no individual token is highlighted
+      }
+
+      public float getFragmentScore() {
+        return 1; // every fragment scores equally; the first is chosen
+      }
+
+      public TokenStream init(TokenStream tokenStream) {
+        return null;
+      }
+    });
+    highlighter.setTextFragmenter(new SimpleFragmenter(2000));
+    TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(rawDocContent));
+
+    String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent, 1, "");
+    // An ugly bit of XML creation:
+    String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
+        + "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"
+        + "<head>\n" + "<title>My Test HTML Document</title>\n" + "</head>\n" + "<body>\n" + "<h2>"
+        + encodedSnippet + "</h2>\n" + "</body>\n" + "</html>";
+    // now an ugly bit of XML parsing to test the snippet is encoded OK
+    DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
+    DocumentBuilder db = dbf.newDocumentBuilder();
+    // encode explicitly as UTF-8: the XML prolog declares UTF-8, so using the
+    // platform default charset (the previous bare getBytes()) would corrupt
+    // any non-ASCII content before the parser ever saw it
+    org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes("UTF-8")));
+    Element root = doc.getDocumentElement();
+    NodeList nodes = root.getElementsByTagName("body");
+    Element body = (Element) nodes.item(0);
+    nodes = body.getElementsByTagName("h2");
+    Element h2 = (Element) nodes.item(0);
+    String decodedSnippet = h2.getFirstChild().getNodeValue();
+    // round-trip: encode -> embed -> parse must recover the original text
+    assertEquals("XHTML Encoding should have worked:", rawDocContent, decodedSnippet);
+  }
+
+  // Highlighting across a MultiSearcher: the wildcard query must be rewritten
+  // against each sub-reader and recombined before the highlighter can see the
+  // expanded terms. Close order of readers/searchers/dirs is significant.
+  public void testMultiSearcher() throws Exception {
+    // setup index 1
+    Directory ramDir1 = newDirectory();
+    IndexWriter writer1 = new IndexWriter(ramDir1, newIndexWriterConfig(
+        TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
+    Document d = new Document();
+    Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES, Field.Index.ANALYZED);
+    d.add(f);
+    writer1.addDocument(d);
+    writer1.optimize();
+    writer1.close();
+    IndexReader reader1 = IndexReader.open(ramDir1, true);
+
+    // setup index 2
+    Directory ramDir2 = newDirectory();
+    IndexWriter writer2 = new IndexWriter(ramDir2, newIndexWriterConfig(
+        TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
+    d = new Document();
+    f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES, Field.Index.ANALYZED);
+    d.add(f);
+    writer2.addDocument(d);
+    writer2.optimize();
+    writer2.close();
+    IndexReader reader2 = IndexReader.open(ramDir2, true);
+
+    IndexSearcher searchers[] = new IndexSearcher[2];
+    searchers[0] = new IndexSearcher(ramDir1, true);
+    searchers[1] = new IndexSearcher(ramDir2, true);
+    MultiSearcher multiSearcher = new MultiSearcher(searchers);
+    QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, new StandardAnalyzer(TEST_VERSION_CURRENT));
+    parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
+    query = parser.parse("multi*");
+    if (VERBOSE) System.out.println("Searching for: " + query.toString(FIELD_NAME));
+    // at this point the multisearcher calls combine(query[])
+    hits = multiSearcher.search(query, null, 1000);
+
+    // query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer(TEST_VERSION));
+    // rewrite per sub-reader, then combine into one query for highlighting
+    Query expandedQueries[] = new Query[2];
+    expandedQueries[0] = query.rewrite(reader1);
+    expandedQueries[1] = query.rewrite(reader2);
+    query = query.combine(expandedQueries);
+
+    // create an instance of the highlighter with the tags used to surround
+    // highlighted text
+    Highlighter highlighter = new Highlighter(this, new QueryTermScorer(query));
+
+    for (int i = 0; i < hits.totalHits; i++) {
+      String text = multiSearcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+      String highlightedText = highlighter.getBestFragment(tokenStream, text);
+      if (VERBOSE) System.out.println(highlightedText);
+    }
+    // one highlight per index ("multiOne" and "multiTwo")
+    assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+        numHighlights == 2);
+    reader1.close();
+    reader2.close();
+    searchers[0].close();
+    searchers[1].close();
+    ramDir1.close();
+    ramDir2.close();
+  }
+
+  // Field-aware vs field-agnostic scorers: a scorer bound to FIELD_NAME must
+  // ignore the category:people clause; an unbound scorer highlights both terms.
+  public void testFieldSpecificHighlighting() throws Exception {
+    TestHighlightRunner helper = new TestHighlightRunner() {
+
+      @Override
+      public void run() throws Exception {
+        String docMainText = "fred is one of the people";
+        QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer);
+        Query query = parser.parse("fred category:people");
+
+        // highlighting respects fieldnames used in query
+
+        Scorer fieldSpecificScorer = null;
+        if (mode == TestHighlightRunner.QUERY) {
+          fieldSpecificScorer = new QueryScorer(query, FIELD_NAME);
+        } else if (mode == TestHighlightRunner.QUERY_TERM) {
+          fieldSpecificScorer = new QueryTermScorer(query, "contents");
+        }
+        Highlighter fieldSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(),
+            fieldSpecificScorer);
+        fieldSpecificHighlighter.setTextFragmenter(new NullFragmenter());
+        String result = fieldSpecificHighlighter.getBestFragment(analyzer, FIELD_NAME, docMainText);
+        assertEquals("Should match", result, "<B>fred</B> is one of the people");
+
+        // highlighting does not respect fieldnames used in query
+        Scorer fieldInSpecificScorer = null;
+        if (mode == TestHighlightRunner.QUERY) {
+          fieldInSpecificScorer = new QueryScorer(query, null);
+        } else if (mode == TestHighlightRunner.QUERY_TERM) {
+          fieldInSpecificScorer = new QueryTermScorer(query);
+        }
+
+        Highlighter fieldInSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(),
+            fieldInSpecificScorer);
+        fieldInSpecificHighlighter.setTextFragmenter(new NullFragmenter());
+        result = fieldInSpecificHighlighter.getBestFragment(analyzer, FIELD_NAME, docMainText);
+        assertEquals("Should match", result, "<B>fred</B> is one of the <B>people</B>");
+
+        reader.close();
+      }
+    };
+
+    helper.start();
+
+  }
+
+  // Hand-built TokenStream for "Hi-Speed10 foo" that emits overlapping tokens:
+  // "hi" then "hispeed" (new position) then "speed" (posIncr 0, same position
+  // as "hispeed"), then "10" and "foo". Used to test overlap handling.
+  protected TokenStream getTS2() {
+    // String s = "Hi-Speed10 foo";
+    return new TokenStream() {
+      Iterator<Token> iter;
+      List<Token> lst;
+      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+      private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+      private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+      // instance initializer builds the fixed token list once
+      {
+        lst = new ArrayList<Token>();
+        Token t;
+        t = createToken("hi", 0, 2);
+        t.setPositionIncrement(1);
+        lst.add(t);
+        t = createToken("hispeed", 0, 8);
+        t.setPositionIncrement(1);
+        lst.add(t);
+        t = createToken("speed", 3, 8);
+        t.setPositionIncrement(0);
+        lst.add(t);
+        t = createToken("10", 8, 10);
+        t.setPositionIncrement(1);
+        lst.add(t);
+        t = createToken("foo", 11, 14);
+        t.setPositionIncrement(1);
+        lst.add(t);
+        iter = lst.iterator();
+      }
+
+      @Override
+      public boolean incrementToken() throws IOException {
+        if(iter.hasNext()) {
+          Token token = iter.next();
+          clearAttributes();
+          termAtt.setEmpty().append(token);
+          posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+          offsetAtt.setOffset(token.startOffset(), token.endOffset());
+          return true;
+        }
+        return false;
+      }
+
+    };
+  }
+
+ // same token-stream as above, but the bigger token comes first this time
+  // same token-stream as above, but the bigger token comes first this time
+  // ("hispeed" at posIncr 1, then "hi" at posIncr 0 sharing its position).
+  protected TokenStream getTS2a() {
+    // String s = "Hi-Speed10 foo";
+    return new TokenStream() {
+      Iterator<Token> iter;
+      List<Token> lst;
+      private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+      private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
+      private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+      // instance initializer builds the fixed token list once
+      {
+        lst = new ArrayList<Token>();
+        Token t;
+        t = createToken("hispeed", 0, 8);
+        t.setPositionIncrement(1);
+        lst.add(t);
+        t = createToken("hi", 0, 2);
+        t.setPositionIncrement(0);
+        lst.add(t);
+        t = createToken("speed", 3, 8);
+        t.setPositionIncrement(1);
+        lst.add(t);
+        t = createToken("10", 8, 10);
+        t.setPositionIncrement(1);
+        lst.add(t);
+        t = createToken("foo", 11, 14);
+        t.setPositionIncrement(1);
+        lst.add(t);
+        iter = lst.iterator();
+      }
+
+      @Override
+      public boolean incrementToken() throws IOException {
+        if(iter.hasNext()) {
+          Token token = iter.next();
+          clearAttributes();
+          termAtt.setEmpty().append(token);
+          posIncrAtt.setPositionIncrement(token.getPositionIncrement());
+          offsetAtt.setOffset(token.startOffset(), token.endOffset());
+          return true;
+        }
+        return false;
+      }
+    };
+  }
+
+ public void testOverlapAnalyzer2() throws Exception {
+ TestHighlightRunner helper = new TestHighlightRunner() {
+
+ @Override
+ public void run() throws Exception {
+ String s = "Hi-Speed10 foo";
+
+ Query query;
+ Highlighter highlighter;
+ String result;
+
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("foo");
+ highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+ assertEquals("Hi-Speed10 <B>foo</B>", result);
+
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("10");
+ highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+ assertEquals("Hi-Speed<B>10</B> foo", result);
+
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hi");
+ highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+ assertEquals("<B>Hi</B>-Speed10 foo", result);
+
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("speed");
+ highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+ assertEquals("Hi-<B>Speed</B>10 foo", result);
+
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hispeed");
+ highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+ assertEquals("<B>Hi-Speed</B>10 foo", result);
+
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hi speed");
+ highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2(), s, 3, "...");
+ assertEquals("<B>Hi-Speed</B>10 foo", result);
+
+ // ///////////////// same tests, just put the bigger overlapping token
+ // first
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("foo");
+ highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+ assertEquals("Hi-Speed10 <B>foo</B>", result);
+
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("10");
+ highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+ assertEquals("Hi-Speed<B>10</B> foo", result);
+
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hi");
+ highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+ assertEquals("<B>Hi</B>-Speed10 foo", result);
+
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("speed");
+ highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+ assertEquals("Hi-<B>Speed</B>10 foo", result);
+
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hispeed");
+ highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+ assertEquals("<B>Hi-Speed</B>10 foo", result);
+
+ query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hi speed");
+ highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
+ result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
+ assertEquals("<B>Hi-Speed</B>10 foo", result);
+ }
+ };
+
+ helper.start();
+ }
+
+  // index directory used by the deleted-documents highlighting tests below
+  private Directory dir;
+  // whitespace analyzer shared by makeIndex/deleteDocument/searchIndex
+  private Analyzer a = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
+
+  // End-to-end: index docs, delete some (leaving possibly-negative idf), then
+  // confirm highlighting still works on the surviving documents.
+  public void testWeightedTermsWithDeletes() throws IOException, ParseException, InvalidTokenOffsetsException {
+    makeIndex();
+    deleteDocument();
+    searchIndex();
+  }
+
+  /** Builds a one-field document: stored and analyzed value {@code v} in field {@code f}. */
+  private Document doc( String f, String v ){
+    Document result = new Document();
+    result.add(new Field(f, v, Store.YES, Index.ANALYZED));
+    return result;
+  }
+
+  // Writes four docs to 'dir'; three carry the marker term "del" so they can
+  // be removed by deleteDocument(), leaving one surviving document.
+  private void makeIndex() throws IOException {
+    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)));
+    writer.addDocument( doc( "t_text1", "random words for highlighting tests del" ) );
+    writer.addDocument( doc( "t_text1", "more random words for second field del" ) );
+    writer.addDocument( doc( "t_text1", "random words for highlighting tests del" ) );
+    writer.addDocument( doc( "t_text1", "more random words for second field" ) );
+    writer.forceMerge(1);
+    writer.close();
+  }
+
+  // Deletes every doc containing the marker term "del"; intentionally does NOT
+  // merge afterwards so the reader can observe negative idf.
+  private void deleteDocument() throws IOException {
+    IndexWriter writer = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).setOpenMode(OpenMode.APPEND));
+    writer.deleteDocuments( new Term( "t_text1", "del" ) );
+    // To see negative idf, keep comment the following line
+    //writer.forceMerge(1);
+    writer.close();
+  }
+
+  // Searches the post-delete index and asserts the surviving doc highlights
+  // correctly even though QueryTermScorer may compute negative idf here.
+  private void searchIndex() throws IOException, ParseException, InvalidTokenOffsetsException {
+    String q = "t_text1:random";
+    QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "t_text1", a );
+    Query query = parser.parse( q );
+    IndexReader reader = IndexReader.open(dir);
+    IndexSearcher searcher = new IndexSearcher(reader);
+    // This scorer can return negative idf -> null fragment
+    Scorer scorer = new QueryTermScorer( query, searcher.getIndexReader(), "t_text1" );
+    // This scorer doesn't use idf (patch version)
+    //Scorer scorer = new QueryTermScorer( query, "t_text1" );
+    Highlighter h = new Highlighter( scorer );
+
+    TopDocs hits = searcher.search(query, null, 10);
+    for( int i = 0; i < hits.totalHits; i++ ){
+      Document doc = searcher.doc( hits.scoreDocs[i].doc );
+      String result = h.getBestFragment( a, "t_text1", doc.get( "t_text1" ));
+      if (VERBOSE) System.out.println("result:" + result);
+      assertEquals("more <B>random</B> words for second field", result);
+    }
+    searcher.close();
+    reader.close();
+  }
+
+ /*
+ *
+ * public void testBigramAnalyzer() throws IOException, ParseException {
+ * //test to ensure analyzers with non-consecutive start/end offsets //don't
+ * double-highlight text //setup index 1 RAMDirectory ramDir = new
+ * RAMDirectory(); Analyzer bigramAnalyzer=new CJKAnalyzer(); IndexWriter
+ * writer = new IndexWriter(ramDir,bigramAnalyzer , true); Document d = new
+ * Document(); Field f = new Field(FIELD_NAME, "java abc def", true, true,
+ * true); d.add(f); writer.addDocument(d); writer.close(); IndexReader reader =
+ * IndexReader.open(ramDir, true);
+ *
+ * IndexSearcher searcher=new IndexSearcher(reader); query =
+ * QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);
+ * System.out.println("Searching for: " + query.toString(FIELD_NAME)); hits =
+ * searcher.search(query);
+ *
+ * Highlighter highlighter = new Highlighter(this,new
+ * QueryFragmentScorer(query));
+ *
+ * for (int i = 0; i < hits.totalHits; i++) { String text =
+ * searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream
+ * tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text));
+ * String highlightedText = highlighter.getBestFragment(tokenStream,text);
+ * System.out.println(highlightedText); } }
+ */
+
+  /**
+   * Formatter callback: wraps positively-scored terms in &lt;B&gt; tags and
+   * bumps the highlight counter the assertions check; other text is untouched.
+   */
+  public String highlightTerm(String originalText, TokenGroup group) {
+    if (group.getTotalScore() > 0) {
+      numHighlights++; // update stats used in assertions
+      return "<B>" + originalText + "</B>";
+    }
+    return originalText;
+  }
+
+  // Parses queryString (position increments + scoring boolean rewrite enabled
+  // so multi-term queries expand) and delegates to doSearching(Query).
+  public void doSearching(String queryString) throws Exception {
+    QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer);
+    parser.setEnablePositionIncrements(true);
+    parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
+    query = parser.parse(queryString);
+    doSearching(query);
+  }
+
+  // Rewrites the query against the shared reader (required for multi-term
+  // highlighting), runs the search, and stores 'query' and 'hits' in fields.
+  public void doSearching(Query unReWrittenQuery) throws Exception {
+    if (searcher != null) searcher.close();
+    searcher = new IndexSearcher(reader);
+    // for any multi-term queries to work (prefix, wildcard, range,fuzzy etc)
+    // you must use a rewritten query!
+    query = unReWrittenQuery.rewrite(reader);
+    if (VERBOSE) System.out.println("Searching for: " + query.toString(FIELD_NAME));
+    hits = searcher.search(query, null, 1000);
+  }
+
+  // Highlights every current hit with a QueryScorer and asserts the running
+  // numHighlights counter equals expectedHighlights after each document.
+  public void assertExpectedHighlightCount(final int maxNumFragmentsRequired,
+      final int expectedHighlights) throws Exception {
+    for (int i = 0; i < hits.totalHits; i++) {
+      String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
+      TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
+      QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
+      Highlighter highlighter = new Highlighter(this, scorer);
+
+      highlighter.setTextFragmenter(new SimpleFragmenter(40));
+
+      String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
+          "...");
+      if (VERBOSE) System.out.println("\t" + result);
+
+      assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
+          numHighlights == expectedHighlights);
+    }
+  }
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ dir = newDirectory();
+ ramDir = newDirectory();
+ IndexWriter writer = new IndexWriter(ramDir, newIndexWriterConfig(
+ TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
+ for (int i = 0; i < texts.length; i++) {
+ addDoc(writer, texts[i]);
+ }
+ Document doc = new Document();
+ NumericField nfield = new NumericField(NUMERIC_FIELD_NAME, Store.YES, true);
+ nfield.setIntValue(1);
+ doc.add(nfield);
+ writer.addDocument(doc, analyzer);
+ nfield = new NumericField(NUMERIC_FIELD_NAME, Store.YES, true);
+ nfield.setIntValue(3);
+ doc = new Document();
+ doc.add(nfield);
+ writer.addDocument(doc, analyzer);
+ nfield = new NumericField(NUMERIC_FIELD_NAME, Store.YES, true);
+ nfield.setIntValue(5);
+ doc = new Document();
+ doc.add(nfield);
+ writer.addDocument(doc, analyzer);
+ nfield = new NumericField(NUMERIC_FIELD_NAME, Store.YES, true);
+ nfield.setIntValue(7);
+ doc = new Document();
+ doc.add(nfield);
+ writer.addDocument(doc, analyzer);
+ writer.forceMerge(1);
+ writer.close();
+ reader = IndexReader.open(ramDir, true);
+ numHighlights = 0;
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ if (searcher != null) searcher.close();
+ reader.close();
+ dir.close();
+ ramDir.close();
+ super.tearDown();
+ }
+ private void addDoc(IndexWriter writer, String text) throws IOException {
+ Document d = new Document();
+ Field f = new Field(FIELD_NAME, text, Field.Store.YES, Field.Index.ANALYZED);
+ d.add(f);
+ writer.addDocument(d);
+
+ }
+
+ private static Token createToken(String term, int start, int offset)
+ {
+ return new Token(term, start, offset);
+ }
+
+}
+
+// ===================================================================
+// ========== BEGIN TEST SUPPORTING CLASSES
// ========== IT LOOKS LIKE, WITH SOME MORE EFFORT, THESE COULD BE
// ========== MADE MORE GENERALLY USEFUL.
+// TODO - make synonyms all interchangeable with each other and produce
+// a version that does hyponyms - the "is a specialised type of ...."
// so that car = audi, bmw and volkswagen but bmw != audi, giving different
// behaviour from plain synonyms
+// ===================================================================
+
+final class SynonymAnalyzer extends Analyzer {
+ private Map<String,String> synonyms;
+
+ public SynonymAnalyzer(Map<String,String> synonyms) {
+ this.synonyms = synonyms;
+ }
+
+ /*
+ * (non-Javadoc)
+ *
+ * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String,
+ * java.io.Reader)
+ */
+ @Override
+ public TokenStream tokenStream(String arg0, Reader arg1) {
+ LowerCaseTokenizer stream = new LowerCaseTokenizer(LuceneTestCase.TEST_VERSION_CURRENT, arg1);
+ stream.addAttribute(CharTermAttribute.class);
+ stream.addAttribute(PositionIncrementAttribute.class);
+ stream.addAttribute(OffsetAttribute.class);
+ try {
+ stream.reset();
+ } catch (IOException e) {
+ throw new RuntimeException(e);
+ }
+ return new SynonymTokenizer(stream, synonyms);
+ }
+}
+
+/**
+ * Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer)
+ *
+ */
final class SynonymTokenizer extends TokenStream {
  // Underlying stream producing the "real" tokens to be expanded.
  private final TokenStream realStream;
  // Non-null while the synonyms of the last real token are still being emitted.
  private Token currentRealToken = null;
  // Maps a term to a comma-separated list of its synonyms.
  private final Map<String, String> synonyms;
  // Walks the comma-separated synonym list of the current real token.
  private StringTokenizer st = null;
  // Attribute views on the wrapped stream's token...
  private final CharTermAttribute realTermAtt;
  private final PositionIncrementAttribute realPosIncrAtt;
  private final OffsetAttribute realOffsetAtt;
  // ...and on this stream's own token.
  private final CharTermAttribute termAtt;
  private final PositionIncrementAttribute posIncrAtt;
  private final OffsetAttribute offsetAtt;

  public SynonymTokenizer(TokenStream realStream, Map<String, String> synonyms) {
    this.realStream = realStream;
    this.synonyms = synonyms;
    realTermAtt = realStream.addAttribute(CharTermAttribute.class);
    realPosIncrAtt = realStream.addAttribute(PositionIncrementAttribute.class);
    realOffsetAtt = realStream.addAttribute(OffsetAttribute.class);

    termAtt = addAttribute(CharTermAttribute.class);
    posIncrAtt = addAttribute(PositionIncrementAttribute.class);
    offsetAtt = addAttribute(OffsetAttribute.class);
  }

  @Override
  public boolean incrementToken() throws IOException {

    if (currentRealToken == null) {
      // Not in the middle of an expansion: pull the next real token.
      boolean next = realStream.incrementToken();
      if (!next) {
        return false;
      }
      clearAttributes();
      // Mirror the real token's term, offsets and position increment.
      termAtt.copyBuffer(realTermAtt.buffer(), 0, realTermAtt.length());
      offsetAtt.setOffset(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
      posIncrAtt.setPositionIncrement(realPosIncrAtt.getPositionIncrement());

      String expansions = synonyms.get(realTermAtt.toString());
      if (expansions == null) {
        return true;
      }
      // Remember the real token so subsequent calls can stack its synonyms
      // at the same offsets with a zero position increment.
      st = new StringTokenizer(expansions, ",");
      if (st.hasMoreTokens()) {
        currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
        currentRealToken.copyBuffer(realTermAtt.buffer(), 0, realTermAtt.length());
      }

      return true;
    } else {
      // Emit the next synonym stacked on top of the current real token.
      String tok = st.nextToken();
      clearAttributes();
      termAtt.setEmpty().append(tok);
      offsetAtt.setOffset(currentRealToken.startOffset(), currentRealToken.endOffset());
      posIncrAtt.setPositionIncrement(0); // same position as the real token
      if (!st.hasMoreTokens()) {
        // Expansion exhausted; resume reading from the real stream.
        currentRealToken = null;
        st = null;
      }
      return true;
    }

  }

  @Override
  public void reset() throws IOException {
    super.reset();
    // Discard any half-finished synonym expansion.
    this.currentRealToken = null;
    this.st = null;
  }

  /**
   * Harness that runs each highlighting scenario twice: first with
   * {@link QueryScorer} (mode {@link #QUERY}) and then with
   * {@link QueryTermScorer} (mode {@link #QUERY_TERM}).
   */
  static abstract class TestHighlightRunner {
    static final int QUERY = 0;
    static final int QUERY_TERM = 1;

    // Which scorer family the current run uses; flipped by start().
    int mode = QUERY;
    Fragmenter frag = new SimpleFragmenter(20);

    public Highlighter getHighlighter(Query query, String fieldName, TokenStream stream, Formatter formatter) {
      return getHighlighter(query, fieldName, stream, formatter, true);
    }

    public Highlighter getHighlighter(Query query, String fieldName, TokenStream stream, Formatter formatter, boolean expanMultiTerm) {
      Scorer scorer = null;
      if (mode == QUERY) {
        scorer = new QueryScorer(query, fieldName);
        if(!expanMultiTerm) {
          ((QueryScorer)scorer).setExpandMultiTermQuery(false);
        }
      } else if (mode == QUERY_TERM) {
        scorer = new QueryTermScorer(query);
      } else {
        throw new RuntimeException("Unknown highlight mode");
      }

      return new Highlighter(formatter, scorer);
    }

    Highlighter getHighlighter(WeightedTerm[] weightedTerms, Formatter formatter) {
      if (mode == QUERY) {
        return new Highlighter(formatter, new QueryScorer((WeightedSpanTerm[]) weightedTerms));
      } else if (mode == QUERY_TERM) {
        return new Highlighter(formatter, new QueryTermScorer(weightedTerms));

      } else {
        throw new RuntimeException("Unknown highlight mode");
      }
    }

    void doStandardHighlights(Analyzer analyzer, IndexSearcher searcher, TopDocs hits, Query query, Formatter formatter)
    throws Exception {
      doStandardHighlights(analyzer, searcher, hits, query, formatter, false);
    }

    // Highlights the best fragments of every hit, printing them when verbose.
    void doStandardHighlights(Analyzer analyzer, IndexSearcher searcher, TopDocs hits, Query query, Formatter formatter, boolean expandMT)
    throws Exception {

      for (int i = 0; i < hits.totalHits; i++) {
        String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
        int maxNumFragmentsRequired = 2;
        String fragmentSeparator = "...";
        Scorer scorer = null;
        TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));
        if (mode == QUERY) {
          scorer = new QueryScorer(query);
        } else if (mode == QUERY_TERM) {
          scorer = new QueryTermScorer(query);
        }
        Highlighter highlighter = new Highlighter(formatter, scorer);
        highlighter.setTextFragmenter(frag);

        String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
            fragmentSeparator);
        if (HighlighterTest.VERBOSE) System.out.println("\t" + result);
      }
    }

    // Implemented by each test with the scenario to run under both modes.
    abstract void run() throws Exception;

    // Runs the scenario once per scorer mode.
    void start() throws Exception {
      if (HighlighterTest.VERBOSE) System.out.println("Run QueryScorer");
      run();
      if (HighlighterTest.VERBOSE) System.out.println("Run QueryTermScorer");
      mode = QUERY_TERM;
      run();
    }
  }
}