+++ /dev/null
-package org.apache.lucene.search.highlight;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.ByteArrayInputStream;
-import java.io.IOException;
-import java.io.Reader;
-import java.io.StringReader;
-import java.util.ArrayList;
-import java.util.HashMap;
-import java.util.HashSet;
-import java.util.Iterator;
-import java.util.List;
-import java.util.Map;
-import java.util.Set;
-import java.util.StringTokenizer;
-
-import javax.xml.parsers.DocumentBuilder;
-import javax.xml.parsers.DocumentBuilderFactory;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.BaseTokenStreamTestCase;
-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.LowerCaseTokenizer;
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.MockTokenizer;
-import org.apache.lucene.analysis.SimpleAnalyzer;
-import org.apache.lucene.analysis.StopAnalyzer;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.document.NumericField;
-import org.apache.lucene.document.Field.Index;
-import org.apache.lucene.document.Field.Store;
-import org.apache.lucene.index.IndexReader;
-import org.apache.lucene.index.IndexWriter;
-import org.apache.lucene.index.IndexWriterConfig;
-import org.apache.lucene.index.Term;
-import org.apache.lucene.index.IndexWriterConfig.OpenMode;
-import org.apache.lucene.queryParser.ParseException;
-import org.apache.lucene.queryParser.QueryParser;
-import org.apache.lucene.search.BooleanQuery;
-import org.apache.lucene.search.FilteredQuery;
-import org.apache.lucene.search.IndexSearcher;
-import org.apache.lucene.search.MultiPhraseQuery;
-import org.apache.lucene.search.MultiSearcher;
-import org.apache.lucene.search.MultiTermQuery;
-import org.apache.lucene.search.NumericRangeQuery;
-import org.apache.lucene.search.PhraseQuery;
-import org.apache.lucene.search.Query;
-import org.apache.lucene.search.TermQuery;
-import org.apache.lucene.search.TermRangeFilter;
-import org.apache.lucene.search.TopDocs;
-import org.apache.lucene.search.WildcardQuery;
-import org.apache.lucene.search.BooleanClause.Occur;
-import org.apache.lucene.search.highlight.SynonymTokenizer.TestHighlightRunner;
-import org.apache.lucene.search.regex.RegexQuery;
-import org.apache.lucene.search.regex.SpanRegexQuery;
-import org.apache.lucene.search.spans.SpanNearQuery;
-import org.apache.lucene.search.spans.SpanNotQuery;
-import org.apache.lucene.search.spans.SpanOrQuery;
-import org.apache.lucene.search.spans.SpanQuery;
-import org.apache.lucene.search.spans.SpanTermQuery;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.util.LuceneTestCase;
-import org.w3c.dom.Element;
-import org.w3c.dom.NodeList;
-
-/**
- * JUnit Test for Highlighter class.
- *
- */
-public class HighlighterTest extends BaseTokenStreamTestCase implements Formatter {
-
- // NOTE(review): not used by any test visible in this part of the file -
- // presumably managed by setup/teardown code; confirm.
- private IndexReader reader;
- // Name of the text field that the tests below search and highlight.
- static final String FIELD_NAME = "contents";
- // Name of the numeric field queried by testNumericRangeQuery().
- private static final String NUMERIC_FIELD_NAME = "nfield";
- // Query under test; assigned by each test (directly or through doSearching)
- // and read when building scorers.
- private Query query;
- // Directory the tests open their IndexSearcher on.
- // NOTE(review): presumably populated from 'texts' in setup code - confirm.
- Directory ramDir;
- // Searcher used to fetch stored field text for every hit.
- public IndexSearcher searcher = null;
- // Counter compared against the expected highlight count in the assertions.
- // NOTE(review): presumably incremented by this class's Formatter callback - confirm.
- int numHighlights = 0;
- // Analyzer used to re-tokenize stored text before highlighting.
- final Analyzer analyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
- // Results of the most recent search.
- TopDocs hits;
-
- // Sample documents the queries in this class are written against.
- String[] texts = {
- "Hello this is a piece of text that is very long and contains too much preamble and the meat is really here which says kennedy has been shot",
- "This piece of text refers to Kennedy at the beginning then has a longer piece of text that is very long in the middle and finally ends with another reference to Kennedy",
- "JFK has been shot", "John Kennedy has been shot",
- "This text has a typo in referring to Keneddy",
- "wordx wordy wordz wordx wordy wordx worda wordb wordy wordc", "y z x y z a b", "lets is a the lets is a the lets is a the lets" };
-
- public void testQueryScorerHits() throws Exception {
- Analyzer analyzer = new MockAnalyzer(random, MockTokenizer.SIMPLE, true);
- QueryParser qp = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer);
- query = qp.parse("\"very long\"");
- searcher = new IndexSearcher(ramDir, true);
- TopDocs hits = searcher.search(query, 10);
-
- QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
- Highlighter highlighter = new Highlighter(scorer);
-
-
- for (int i = 0; i < hits.scoreDocs.length; i++) {
- Document doc = searcher.doc(hits.scoreDocs[i].doc);
- String storedField = doc.get(FIELD_NAME);
-
- TokenStream stream = TokenSources.getAnyTokenStream(searcher
- .getIndexReader(), hits.scoreDocs[i].doc, FIELD_NAME, doc, analyzer);
-
- Fragmenter fragmenter = new SimpleSpanFragmenter(scorer);
-
- highlighter.setTextFragmenter(fragmenter);
-
- String fragment = highlighter.getBestFragment(stream, storedField);
-
- if (VERBOSE) System.out.println(fragment);
- }
- searcher.close();
- }
-
- /**
-  * Checks how the default field passed to the scorer affects highlighting:
-  * a query on the default field highlights text of any field, while a query
-  * on an explicitly named, different field highlights nothing.
-  */
- public void testHighlightingWithDefaultField() throws Exception {
-
-   String s1 = "I call our world Flatland, not because we call it so,";
-
-   QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, new StandardAnalyzer(TEST_VERSION_CURRENT));
-
-   // Verify that a query against the default field results in text being
-   // highlighted regardless of the field name.
-   Query q = parser.parse("\"world Flatland\"~3");
-   String expected = "I call our <B>world</B> <B>Flatland</B>, not because we call it so,";
-   String observed = highlightField(q, "SOME_FIELD_NAME", s1);
-   if (VERBOSE) System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed);
-   assertEquals("Query in the default field results in text for *ANY* field being highlighted",
-       expected, observed);
-
-   // Verify that a query against a named field does not result in any
-   // highlighting when the query field name differs from the name of the
-   // field being highlighted, which in this example happens to be the
-   // default field name.
-   q = parser.parse("text:\"world Flatland\"~3");
-   expected = s1;
-   observed = highlightField(q, FIELD_NAME, s1);
-   if (VERBOSE) System.out.println("Expected: \"" + expected + "\n" + "Observed: \"" + observed);
-   // Assert on the values computed (and logged) above instead of running the
-   // highlighter a second time.
-   assertEquals(
-       "Query in a named field does not result in highlighting when that field isn't in the query",
-       expected, observed);
- }
-
- /**
-  * Helper for <tt>testHighlightingWithDefaultField()</tt>: highlights
-  * {@code text} as the content of field {@code fieldName}, using
-  * {@link #FIELD_NAME} as the scorer's default field.
-  *
-  * @param query     query whose matches should be highlighted
-  * @param fieldName name of the field the text is treated as belonging to
-  * @param text      text to analyze and highlight
-  * @return the highlighted text, or {@code text} itself when the highlighter
-  *         returns an empty string
-  * @throws IOException                  propagated from the highlighter
-  * @throws InvalidTokenOffsetsException propagated from the highlighter
-  */
- private static String highlightField(Query query, String fieldName, String text)
-     throws IOException, InvalidTokenOffsetsException {
-   Analyzer standardAnalyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
-   TokenStream fieldTokens = standardAnalyzer.tokenStream(fieldName, new StringReader(text));
-
-   // The expected strings in the caller assume "<B>", "</B>" mark a highlight.
-   Highlighter highlighter =
-       new Highlighter(new SimpleHTMLFormatter(), new QueryScorer(query, fieldName, FIELD_NAME));
-   highlighter.setTextFragmenter(new SimpleFragmenter(Integer.MAX_VALUE));
-
-   String highlighted = highlighter.getBestFragments(fieldTokens, text, 1, "(FIELD TEXT TRUNCATED)");
-   if (highlighted.length() == 0) {
-     return text;
-   }
-   return highlighted;
- }
-
- /**
-  * Smoke test: searches for "Kennedy" and highlights every hit with a
-  * {@link QueryScorer}-backed highlighter. Nothing is asserted; the test only
-  * checks that no exception is thrown.
-  */
- public void testSimpleSpanHighlighter() throws Exception {
-   doSearching("Kennedy");
-
-   int maxNumFragmentsRequired = 2;
-
-   QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
-   Highlighter highlighter = new Highlighter(scorer);
-
-   // Bound by scoreDocs.length: that is the array indexed below.
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME,
-         new StringReader(text));
-     highlighter.setTextFragmenter(new SimpleFragmenter(40));
-
-     String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
-         "...");
-     if (VERBOSE) System.out.println("\t" + result);
-   }
-
-   // Not sure we can assert anything here - just running to check we dont
-   // throw any exceptions
- }
-
- // LUCENE-1752
- /**
-  * Highlights a text against a boolean query that repeats the same two
-  * phrases across fields f1 and f2, and expects exactly 7 highlights.
-  */
- public void testRepeatingTermsInMultBooleans() throws Exception {
-   final String content = "x y z a b c d e f g b c g";
-   final String firstPhrase = "\"a b c d\"";
-   final String secondPhrase = "\"b c g\"";
-   final String f1 = "f1";
-   final String f2 = "f2";
-
-   // (f1:<phrase> OR f2:<phrase>) for each phrase, joined with AND.
-   String firstGroup = "(" + f1 + ":" + firstPhrase + " OR " + f2 + ":" + firstPhrase + ")";
-   String secondGroup = "(" + f1 + ":" + secondPhrase + " OR " + f2 + ":" + secondPhrase + ")";
-   String queryText = firstGroup + " AND " + secondGroup;
-
-   Analyzer whitespaceAnalyzer = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
-   Query parsed = new QueryParser(TEST_VERSION_CURRENT, f1, whitespaceAnalyzer).parse(queryText);
-
-   QueryScorer scorer = new QueryScorer(parsed, f1);
-   scorer.setExpandMultiTermQuery(false);
-
-   // This test instance acts as the Formatter.
-   Highlighter h = new Highlighter(this, scorer);
-   h.getBestFragment(whitespaceAnalyzer, f1, content);
-
-   assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-       numHighlights == 7);
- }
-
- /**
-  * Runs three phrase queries and checks the number of highlights produced for
-  * each one (3, 4 and 4 respectively).
-  */
- public void testSimpleQueryScorerPhraseHighlighting() throws Exception {
-   int maxNumFragmentsRequired = 2;
-
-   doSearching("\"very long and contains\"");
-   highlightHitsInFortyCharFragments(maxNumFragmentsRequired);
-   assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-       numHighlights == 3);
-
-   numHighlights = 0;
-   doSearching("\"This piece of text refers to Kennedy\"");
-   highlightHitsInFortyCharFragments(maxNumFragmentsRequired);
-   assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-       numHighlights == 4);
-
-   numHighlights = 0;
-   doSearching("\"lets is a the lets is a the lets is a the lets\"");
-   highlightHitsInFortyCharFragments(maxNumFragmentsRequired);
-   assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-       numHighlights == 4);
- }
-
- /**
-  * Highlights every hit of the current search with a {@link QueryScorer} on
-  * {@link #FIELD_NAME}, a 40-character {@link SimpleFragmenter} and this test
-  * instance as the formatter.
-  *
-  * @param maxNumFragments maximum number of fragments requested per hit
-  */
- private void highlightHitsInFortyCharFragments(int maxNumFragments) throws Exception {
-   QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
-   Highlighter highlighter = new Highlighter(this, scorer);
-
-   // Bound by scoreDocs.length: that is the array indexed below.
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-
-     highlighter.setTextFragmenter(new SimpleFragmenter(40));
-
-     String result = highlighter.getBestFragments(tokenStream, text, maxNumFragments, "...");
-     if (VERBOSE) System.out.println("\t" + result);
-   }
- }
-
- /**
-  * Highlights the hits of a {@link SpanRegexQuery} for {@code ken.*} wrapped
-  * in a {@link SpanOrQuery} and expects 5 highlights.
-  */
- public void testSpanRegexQuery() throws Exception {
-   query = new SpanOrQuery(new SpanQuery [] {
-       new SpanRegexQuery(new Term(FIELD_NAME, "ken.*")) });
-   searcher = new IndexSearcher(ramDir, true);
-   hits = searcher.search(query, 100);
-   int maxNumFragmentsRequired = 2;
-
-   QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
-   Highlighter highlighter = new Highlighter(this, scorer);
-
-   // Bound by scoreDocs.length: that is the array indexed below.
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-
-     highlighter.setTextFragmenter(new SimpleFragmenter(40));
-
-     String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
-         "...");
-     if (VERBOSE) System.out.println("\t" + result);
-   }
-
-   assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-       numHighlights == 5);
- }
-
- /**
-  * Highlights the hits of a {@link RegexQuery} for {@code ken.*} and expects
-  * 5 highlights.
-  */
- public void testRegexQuery() throws Exception {
-   query = new RegexQuery(new Term(FIELD_NAME, "ken.*"));
-   searcher = new IndexSearcher(ramDir, true);
-   hits = searcher.search(query, 100);
-   int maxNumFragmentsRequired = 2;
-
-   QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
-   Highlighter highlighter = new Highlighter(this, scorer);
-
-   // Bound by scoreDocs.length: that is the array indexed below.
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-
-     highlighter.setTextFragmenter(new SimpleFragmenter(40));
-
-     String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
-         "...");
-     if (VERBOSE) System.out.println("\t" + result);
-   }
-
-   assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-       numHighlights == 5);
- }
-
- /**
-  * Runs the highlighter over the hits of a {@link NumericRangeQuery}.
-  * Numeric range queries are not highlighted at present; the test only makes
-  * sure that attempting to highlight them does not throw.
-  */
- public void testNumericRangeQuery() throws Exception {
-   query = NumericRangeQuery.newIntRange(NUMERIC_FIELD_NAME, 2, 6, true, true);
-   searcher = new IndexSearcher(ramDir, true);
-   hits = searcher.search(query, 100);
-   int maxNumFragmentsRequired = 2;
-
-   QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
-   Highlighter highlighter = new Highlighter(this, scorer);
-
-   // Bound by scoreDocs.length: that is the array indexed below.
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     // The text being highlighted is the stored value of the numeric field.
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(NUMERIC_FIELD_NAME);
-     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-
-     highlighter.setTextFragmenter(new SimpleFragmenter(40));
-
-     // Result intentionally ignored: only the absence of exceptions matters.
-     highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired, "...");
-   }
- }
-
- /**
-  * Highlights the hits of the sloppy phrase query {@code "text piece long"~5}
-  * and expects 6 highlights.
-  */
- public void testSimpleQueryScorerPhraseHighlighting2() throws Exception {
-   doSearching("\"text piece long\"~5");
-
-   int maxNumFragmentsRequired = 2;
-
-   QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
-   Highlighter highlighter = new Highlighter(this, scorer);
-   highlighter.setTextFragmenter(new SimpleFragmenter(40));
-
-   // Bound by scoreDocs.length: that is the array indexed below.
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-
-     String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
-         "...");
-     if (VERBOSE) System.out.println("\t" + result);
-   }
-
-   assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-       numHighlights == 6);
- }
-
- /**
-  * Highlights the hits of the phrase query {@code "x y z"} and expects the
-  * highlight counter to be 3 after each highlighted hit.
-  */
- public void testSimpleQueryScorerPhraseHighlighting3() throws Exception {
-   doSearching("\"x y z\"");
-
-   int maxNumFragmentsRequired = 2;
-
-   // Bound by scoreDocs.length: that is the array indexed below.
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-     QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
-     Highlighter highlighter = new Highlighter(this, scorer);
-
-     highlighter.setTextFragmenter(new SimpleFragmenter(40));
-
-     String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
-         "...");
-     if (VERBOSE) System.out.println("\t" + result);
-
-     // Checked per hit; the counter is not reset between hits.
-     assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-         numHighlights == 3);
-   }
- }
-
- /**
-  * Smoke test for {@link SimpleSpanFragmenter}: highlights the hits of two
-  * phrase queries using fragment sizes 5 and 20. Nothing is asserted.
-  */
- public void testSimpleSpanFragmenter() throws Exception {
-   doSearching("\"piece of text that is very long\"");
-
-   int maxNumFragmentsRequired = 2;
-
-   QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
-   Highlighter highlighter = new Highlighter(this, scorer);
-
-   // Bound by scoreDocs.length: that is the array indexed below.
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-
-     highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 5));
-
-     String result = highlighter.getBestFragments(tokenStream, text,
-         maxNumFragmentsRequired, "...");
-     if (VERBOSE) System.out.println("\t" + result);
-   }
-
-   doSearching("\"been shot\"");
-
-   maxNumFragmentsRequired = 2;
-
-   scorer = new QueryScorer(query, FIELD_NAME);
-   highlighter = new Highlighter(this, scorer);
-
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-
-     highlighter.setTextFragmenter(new SimpleSpanFragmenter(scorer, 20));
-
-     String result = highlighter.getBestFragments(tokenStream, text,
-         maxNumFragmentsRequired, "...");
-     if (VERBOSE) System.out.println("\t" + result);
-   }
- }
-
- /**
-  * Position sensitive query added after position insensitive query:
-  * highlights the hits of {@code y "x y z"} and expects the highlight counter
-  * to be 4 after each highlighted hit.
-  */
- public void testPosTermStdTerm() throws Exception {
-   doSearching("y \"x y z\"");
-
-   int maxNumFragmentsRequired = 2;
-
-   QueryScorer scorer = new QueryScorer(query, FIELD_NAME);
-   Highlighter highlighter = new Highlighter(this, scorer);
-
-   // Bound by scoreDocs.length: that is the array indexed below.
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-
-     highlighter.setTextFragmenter(new SimpleFragmenter(40));
-
-     String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
-         "...");
-     if (VERBOSE) System.out.println("\t" + result);
-
-     // Checked per hit; the counter is not reset between hits.
-     assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-         numHighlights == 4);
-   }
- }
-
- public void testQueryScorerMultiPhraseQueryHighlighting() throws Exception {
- MultiPhraseQuery mpq = new MultiPhraseQuery();
-
- mpq.add(new Term[] { new Term(FIELD_NAME, "wordx"), new Term(FIELD_NAME, "wordb") });
- mpq.add(new Term(FIELD_NAME, "wordy"));
-
- doSearching(mpq);
-
- final int maxNumFragmentsRequired = 2;
- assertExpectedHighlightCount(maxNumFragmentsRequired, 6);
- }
-
- public void testQueryScorerMultiPhraseQueryHighlightingWithGap() throws Exception {
- MultiPhraseQuery mpq = new MultiPhraseQuery();
-
- /*
- * The toString of MultiPhraseQuery doesn't work so well with these
- * out-of-order additions, but the Query itself seems to match accurately.
- */
-
- mpq.add(new Term[] { new Term(FIELD_NAME, "wordz") }, 2);
- mpq.add(new Term[] { new Term(FIELD_NAME, "wordx") }, 0);
-
- doSearching(mpq);
-
- final int maxNumFragmentsRequired = 1;
- final int expectedHighlights = 2;
-
- assertExpectedHighlightCount(maxNumFragmentsRequired, expectedHighlights);
- }
-
- public void testNearSpanSimpleQuery() throws Exception {
- doSearching(new SpanNearQuery(new SpanQuery[] {
- new SpanTermQuery(new Term(FIELD_NAME, "beginning")),
- new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) }, 3, false));
-
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- mode = QUERY;
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- }
- };
-
- helper.run();
-
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 2);
- }
-
- /**
-  * Smoke test: searches for "Kennedy" and highlights every hit with a
-  * {@link QueryTermScorer}-backed highlighter. Nothing is asserted; the test
-  * only checks that no exception is thrown.
-  */
- public void testSimpleQueryTermScorerHighlighter() throws Exception {
-   doSearching("Kennedy");
-   Highlighter highlighter = new Highlighter(new QueryTermScorer(query));
-   highlighter.setTextFragmenter(new SimpleFragmenter(40));
-   int maxNumFragmentsRequired = 2;
-   // Bound by scoreDocs.length: that is the array indexed below.
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-     TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-
-     String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
-         "...");
-     if (VERBOSE) System.out.println("\t" + result);
-   }
-   // Not sure we can assert anything here - just running to check we dont
-   // throw any exceptions
- }
-
- public void testSpanHighlighting() throws Exception {
- Query query1 = new SpanNearQuery(new SpanQuery[] {
- new SpanTermQuery(new Term(FIELD_NAME, "wordx")),
- new SpanTermQuery(new Term(FIELD_NAME, "wordy")) }, 1, false);
- Query query2 = new SpanNearQuery(new SpanQuery[] {
- new SpanTermQuery(new Term(FIELD_NAME, "wordy")),
- new SpanTermQuery(new Term(FIELD_NAME, "wordc")) }, 1, false);
- BooleanQuery bquery = new BooleanQuery();
- bquery.add(query1, Occur.SHOULD);
- bquery.add(query2, Occur.SHOULD);
- doSearching(bquery);
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- mode = QUERY;
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- }
- };
-
- helper.run();
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 7);
- }
-
- public void testNotSpanSimpleQuery() throws Exception {
- doSearching(new SpanNotQuery(new SpanNearQuery(new SpanQuery[] {
- new SpanTermQuery(new Term(FIELD_NAME, "shot")),
- new SpanTermQuery(new Term(FIELD_NAME, "kennedy")) }, 3, false), new SpanTermQuery(
- new Term(FIELD_NAME, "john"))));
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- mode = QUERY;
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- }
- };
-
- helper.run();
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 4);
- }
-
- public void testGetBestFragmentsSimpleQuery() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- doSearching("Kennedy");
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 4);
- }
- };
-
- helper.start();
- }
-
- public void testGetFuzzyFragments() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- doSearching("Kinnedy~");
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this, true);
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 5);
- }
- };
-
- helper.start();
- }
-
- public void testGetWildCardFragments() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- doSearching("K?nnedy");
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 4);
- }
- };
-
- helper.start();
- }
-
- public void testGetMidWildCardFragments() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- doSearching("K*dy");
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 5);
- }
- };
-
- helper.start();
- }
-
- public void testGetRangeFragments() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- String queryString = FIELD_NAME + ":[kannedy TO kznnedy]";
-
- // Need to explicitly set the QueryParser property to use TermRangeQuery
- // rather
- // than RangeFilters
- QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer);
- parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
- query = parser.parse(queryString);
- doSearching(query);
-
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 5);
- }
- };
-
- helper.start();
- }
-
- /**
-  * Highlights the hits of a constant-score {@link WildcardQuery} for
-  * {@code ken*} three times, each with a differently configured
-  * {@link QueryScorer}: scoped to {@link #FIELD_NAME}, with a null field, and
-  * with an unrelated field plus {@link #FIELD_NAME} as default field.
-  * Every pass is expected to yield 5 highlights.
-  */
- public void testConstantScoreMultiTermQuery() throws Exception {
-
-   numHighlights = 0;
-
-   query = new WildcardQuery(new Term(FIELD_NAME, "ken*"));
-   ((WildcardQuery)query).setRewriteMethod(MultiTermQuery.CONSTANT_SCORE_FILTER_REWRITE);
-   searcher = new IndexSearcher(ramDir, true);
-   // can't rewrite ConstantScore if you want to highlight it -
-   // it rewrites to ConstantScoreQuery which cannot be highlighted
-   // query = unReWrittenQuery.rewrite(reader);
-   if (VERBOSE) System.out.println("Searching for: " + query.toString(FIELD_NAME));
-   hits = searcher.search(query, null, 1000);
-
-   // Bound by scoreDocs.length (here and in the loops below): that is the
-   // array actually indexed.
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
-     int maxNumFragmentsRequired = 2;
-     String fragmentSeparator = "...";
-
-     TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));
-
-     QueryScorer scorer = new QueryScorer(query, HighlighterTest.FIELD_NAME);
-
-     Highlighter highlighter = new Highlighter(this, scorer);
-
-     highlighter.setTextFragmenter(new SimpleFragmenter(20));
-
-     String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
-         fragmentSeparator);
-     if (VERBOSE) System.out.println("\t" + result);
-   }
-   assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-       numHighlights == 5);
-
-   // try null field
-
-   hits = searcher.search(query, null, 1000);
-
-   numHighlights = 0;
-
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
-     int maxNumFragmentsRequired = 2;
-     String fragmentSeparator = "...";
-
-     TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));
-
-     QueryScorer scorer = new QueryScorer(query, null);
-
-     Highlighter highlighter = new Highlighter(this, scorer);
-
-     highlighter.setTextFragmenter(new SimpleFragmenter(20));
-
-     String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
-         fragmentSeparator);
-     if (VERBOSE) System.out.println("\t" + result);
-   }
-   assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-       numHighlights == 5);
-
-   // try default field
-
-   hits = searcher.search(query, null, 1000);
-
-   numHighlights = 0;
-
-   for (int i = 0; i < hits.scoreDocs.length; i++) {
-     String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
-     int maxNumFragmentsRequired = 2;
-     String fragmentSeparator = "...";
-
-     TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));
-
-     QueryScorer scorer = new QueryScorer(query, "random_field", HighlighterTest.FIELD_NAME);
-
-     Highlighter highlighter = new Highlighter(this, scorer);
-
-     highlighter.setTextFragmenter(new SimpleFragmenter(20));
-
-     String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
-         fragmentSeparator);
-     if (VERBOSE) System.out.println("\t" + result);
-   }
-   assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
-       numHighlights == 5);
- }
-
- public void testGetBestFragmentsPhrase() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- doSearching("\"John Kennedy\"");
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- // Currently highlights "John" and "Kennedy" separately
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 2);
- }
- };
-
- helper.start();
- }
-
- public void testGetBestFragmentsQueryScorer() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- SpanQuery clauses[] = { new SpanTermQuery(new Term("contents", "john")),
- new SpanTermQuery(new Term("contents", "kennedy")), };
-
- SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
- doSearching(snq);
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- // Currently highlights "John" and "Kennedy" separately
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 2);
- }
- };
-
- helper.start();
- }
-
- /**
-  * Highlights the term "help" in {@code "help me [54-65]"} with a
-  * {@link NullFragmenter} and checks that only "help" is wrapped in the
-  * highlight tags. Execution is driven through TestHighlightRunner.start().
-  */
- public void testOffByOne() throws Exception {
-   new TestHighlightRunner() {
-     @Override
-     public void run() throws Exception {
-       TermQuery helpQuery = new TermQuery(new Term("data", "help"));
-       QueryTermScorer termScorer = new QueryTermScorer(helpQuery);
-       Highlighter hg = new Highlighter(new SimpleHTMLFormatter(), termScorer);
-       hg.setTextFragmenter(new NullFragmenter());
-
-       String match = hg.getBestFragment(analyzer, "data", "help me [54-65]");
-       assertEquals("<B>help</B> me [54-65]", match);
-     }
-   }.start();
- }
-
- public void testGetBestFragmentsFilteredQuery() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- TermRangeFilter rf = new TermRangeFilter("contents", "john", "john", true, true);
- SpanQuery clauses[] = { new SpanTermQuery(new Term("contents", "john")),
- new SpanTermQuery(new Term("contents", "kennedy")), };
- SpanNearQuery snq = new SpanNearQuery(clauses, 1, true);
- FilteredQuery fq = new FilteredQuery(snq, rf);
-
- doSearching(fq);
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- // Currently highlights "John" and "Kennedy" separately
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 2);
- }
- };
-
- helper.start();
- }
-
- public void testGetBestFragmentsFilteredPhraseQuery() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- TermRangeFilter rf = new TermRangeFilter("contents", "john", "john", true, true);
- PhraseQuery pq = new PhraseQuery();
- pq.add(new Term("contents", "john"));
- pq.add(new Term("contents", "kennedy"));
- FilteredQuery fq = new FilteredQuery(pq, rf);
-
- doSearching(fq);
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- // Currently highlights "John" and "Kennedy" separately
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 2);
- }
- };
-
- helper.start();
- }
-
- /**
- * Searches for a term plus a prefix query ("John Kenn*") and expects five
- * highlights from the standard highlighting pass.
- */
- public void testGetBestFragmentsMultiTerm() throws Exception {
- new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- doSearching("John Kenn*");
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- final boolean countMatches = numHighlights == 5;
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- countMatches);
- }
- }.start();
- }
-
- /**
- * Searches for "JFK OR Kennedy" and expects five highlights from the standard
- * highlighting pass.
- */
- public void testGetBestFragmentsWithOr() throws Exception {
- new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- doSearching("JFK OR Kennedy");
- doStandardHighlights(analyzer, searcher, hits, query, HighlighterTest.this);
- final boolean countMatches = numHighlights == 5;
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- countMatches);
- }
- }.start();
- }
-
- public void testGetBestSingleFragment() throws Exception {
-
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- doSearching("Kennedy");
- numHighlights = 0;
- for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-
- Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
- HighlighterTest.this);
- highlighter.setTextFragmenter(new SimpleFragmenter(40));
- String result = highlighter.getBestFragment(tokenStream, text);
- if (VERBOSE) System.out.println("\t" + result);
- }
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 4);
-
- numHighlights = 0;
- for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
- Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
- HighlighterTest.this);
- highlighter.getBestFragment(analyzer, FIELD_NAME, text);
- }
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 4);
-
- numHighlights = 0;
- for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
-
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
- Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
- HighlighterTest.this);
- highlighter.getBestFragments(analyzer, FIELD_NAME, text, 10);
- }
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 4);
-
- }
-
- };
-
- helper.start();
-
- }
-
- /**
- * Checks that the best fragment follows the term weights: with "hello" weighted
- * above "kennedy" the hello fragment wins, and after raising the "kennedy"
- * weight the kennedy fragment wins.
- */
- public void testGetBestSingleFragmentWithWeights() throws Exception {
- new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- WeightedSpanTerm hello = new WeightedSpanTerm(10f, "hello");
- List<PositionSpan> helloSpans = new ArrayList<PositionSpan>();
- helloSpans.add(new PositionSpan(0, 0));
- hello.addPositionSpans(helloSpans);
-
- WeightedSpanTerm kennedy = new WeightedSpanTerm(1f, "kennedy");
- List<PositionSpan> kennedySpans = new ArrayList<PositionSpan>();
- kennedySpans.add(new PositionSpan(14, 14));
- kennedy.addPositionSpans(kennedySpans);
-
- WeightedSpanTerm[] weighted = { hello, kennedy };
-
- Highlighter hl = getHighlighter(weighted, HighlighterTest.this);
- TokenStream stream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
- hl.setTextFragmenter(new SimpleFragmenter(2));
-
- String best = hl.getBestFragment(stream, texts[0]).trim();
- assertTrue("Failed to find best section using weighted terms. Found: [" + best + "]",
- "<B>Hello</B>".equals(best));
-
- // readjust weights so that "kennedy" now outweighs "hello"
- kennedy.setWeight(50f);
- stream = analyzer.tokenStream(FIELD_NAME, new StringReader(texts[0]));
- hl = getHighlighter(weighted, HighlighterTest.this);
- hl.setTextFragmenter(new SimpleFragmenter(2));
-
- best = hl.getBestFragment(stream, texts[0]).trim();
- assertTrue("Failed to find best section using weighted terms. Found: " + best,
- "<B>kennedy</B>".equals(best));
- }
-
- }.start();
- }
-
- /**
- * Tests a "complex" analyzer that produces multiple overlapping tokens:
- * a SynonymAnalyzer mapping "football" to "soccer" and "footie".
- */
- public void testOverlapAnalyzer() throws Exception {
- new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- HashMap<String,String> synonymMap = new HashMap<String,String>();
- synonymMap.put("football", "soccer,footie");
- Analyzer synonymAnalyzer = new SynonymAnalyzer(synonymMap);
-
- String searchKey = "football";
- String content = "football-soccer in the euro 2004 footie competition";
- Query parsed = new QueryParser(TEST_VERSION_CURRENT, "bookid", synonymAnalyzer).parse(searchKey);
-
- TokenStream stream = synonymAnalyzer.tokenStream(null, new StringReader(content));
- Highlighter hl = getHighlighter(parsed, null, stream, HighlighterTest.this);
-
- // Get 3 best fragments and separate with a "..."
- stream = synonymAnalyzer.tokenStream(null, new StringReader(content));
- String actual = hl.getBestFragments(stream, content, 3, "...");
-
- String expectedResult = "<B>football</B>-<B>soccer</B> in the euro 2004 <B>footie</B> competition";
- assertTrue("overlapping analyzer should handle highlights OK, expected:" + expectedResult
- + " actual:" + actual, expectedResult.equals(actual));
- }
-
- }.start();
- }
-
- public void testGetSimpleHighlight() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- doSearching("Kennedy");
- // new Highlighter(HighlighterTest.this, new QueryTermScorer(query));
-
- for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
- Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
- HighlighterTest.this);
- String result = highlighter.getBestFragment(tokenStream, text);
- if (VERBOSE) System.out.println("\t" + result);
- }
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 4);
- }
- };
- helper.start();
- }
-
- public void testGetTextFragments() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
-
- doSearching("Kennedy");
-
- for (int i = 0; i < hits.totalHits; i++) {
- String text = searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
-
- Highlighter highlighter = getHighlighter(query, FIELD_NAME, tokenStream,
- HighlighterTest.this);// new Highlighter(this, new
- // QueryTermScorer(query));
- highlighter.setTextFragmenter(new SimpleFragmenter(20));
- String stringResults[] = highlighter.getBestFragments(tokenStream, text, 10);
-
- tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
- TextFragment fragmentResults[] = highlighter.getBestTextFragments(tokenStream, text,
- true, 10);
-
- assertTrue("Failed to find correct number of text Fragments: " + fragmentResults.length
- + " vs " + stringResults.length, fragmentResults.length == stringResults.length);
- for (int j = 0; j < stringResults.length; j++) {
- if (VERBOSE) System.out.println(fragmentResults[j]);
- assertTrue("Failed to find same text Fragments: " + fragmentResults[j] + " found",
- fragmentResults[j].toString().equals(stringResults[j]));
-
- }
-
- }
- }
- };
- helper.start();
- }
-
- /**
- * Limits analysis to 30 characters via setMaxDocCharsToAnalyze and expects no
- * highlights for "meat" in the first test text.
- */
- public void testMaxSizeHighlight() throws Exception {
- final MockAnalyzer analyzer = new MockAnalyzer(random, MockTokenizer.SIMPLE, true, (CharArraySet) StopAnalyzer.ENGLISH_STOP_WORDS_SET, true);
- // we disable MockTokenizer checks because we will forcefully limit the
- // tokenstream and call end() before incrementToken() returns false.
- analyzer.setEnableChecks(false);
-
- new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- doSearching("meat");
-
- String firstText = texts[0];
- TokenStream stream = analyzer.tokenStream(FIELD_NAME, new StringReader(firstText));
- Highlighter hl = getHighlighter(query, FIELD_NAME, stream, HighlighterTest.this);
- hl.setMaxDocCharsToAnalyze(30);
- hl.getBestFragment(stream, firstText);
-
- assertTrue("Setting MaxDocBytesToAnalyze should have prevented "
- + "us from finding matches for this record: " + numHighlights + " found",
- numHighlights == 0);
- }
- }.start();
- }
-
- /**
- * Builds a document of one matching word followed by 10000 stop words and
- * checks that, with setMaxDocCharsToAnalyze(100), the returned fragment stays
- * shorter than that limit, also after a second matching word is appended at the
- * very end.
- */
- public void testMaxSizeHighlightTruncates() throws Exception {
- new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- String goodWord = "goodtoken";
- Set<String> stopWords = new HashSet<String>(1);
- stopWords.add("stoppedtoken");
- // only one stopword
- String onlyStopWord = stopWords.iterator().next();
-
- TermQuery termQuery = new TermQuery(new Term("data", goodWord));
-
- StringBuilder body = new StringBuilder();
- body.append(goodWord);
- for (int n = 0; n < 10000; n++) {
- body.append(" ").append(onlyStopWord);
- }
-
- SimpleHTMLFormatter formatter = new SimpleHTMLFormatter();
- TokenStream stream = new StandardAnalyzer(TEST_VERSION_CURRENT, stopWords).tokenStream(
- "data", new StringReader(body.toString()));
- Highlighter hg = getHighlighter(termQuery, "data", stream, formatter);
- hg.setTextFragmenter(new NullFragmenter());
- hg.setMaxDocCharsToAnalyze(100);
-
- String match = hg.getBestFragment(new StandardAnalyzer(TEST_VERSION_CURRENT, stopWords), "data", body.toString());
- assertTrue("Matched text should be no more than 100 chars in length ",
- match.length() < hg.getMaxDocCharsToAnalyze());
-
- // add another tokenized word to the overall length - but set way beyond
- // the length of text under consideration (after a large slug of stop
- // words + whitespace)
- body.append(" ").append(goodWord);
- match = hg.getBestFragment(new StandardAnalyzer(TEST_VERSION_CURRENT, stopWords), "data", body.toString());
- assertTrue("Matched text should be no more than 100 chars in length ",
- match.length() < hg.getMaxDocCharsToAnalyze());
- }
- }.start();
- }
-
- public void testMaxSizeEndHighlight() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
- @Override
- public void run() throws Exception {
- Set<String> stopWords = new HashSet<String>();
- stopWords.add("in");
- stopWords.add("it");
- TermQuery query = new TermQuery(new Term("text", "searchterm"));
-
- String text = "this is a text with searchterm in it";
- SimpleHTMLFormatter fm = new SimpleHTMLFormatter();
- Highlighter hg = getHighlighter(query, "text", new StandardAnalyzer(TEST_VERSION_CURRENT,
- stopWords).tokenStream("text", new StringReader(text)), fm);
- hg.setTextFragmenter(new NullFragmenter());
- hg.setMaxDocCharsToAnalyze(36);
- String match = hg.getBestFragment(new StandardAnalyzer(TEST_VERSION_CURRENT, stopWords), "text", text);
- assertTrue(
- "Matched text should contain remainder of text after highlighted query ",
- match.endsWith("in it"));
- }
- };
- helper.start();
- }
-
- /**
- * Searches with a multi-term query that is deliberately NOT rewritten before
- * being handed to the highlighter, and expects zero highlights as a result.
- */
- public void testUnRewrittenQuery() throws Exception {
- new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- numHighlights = 0;
- // open a fresh searcher, releasing any one left over from an earlier search
- if (searcher != null) {
- searcher.close();
- }
- searcher = new IndexSearcher(ramDir, true);
- Analyzer standardAnalyzer = new StandardAnalyzer(TEST_VERSION_CURRENT);
-
- Query unrewritten = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, standardAnalyzer)
- .parse("JF? or Kenned*");
- if (VERBOSE) {
- System.out.println("Searching with primitive query");
- }
- // the query is intentionally not rewritten against the reader here
- TopDocs found = searcher.search(unrewritten, null, 1000);
-
- int maxNumFragmentsRequired = 3;
-
- for (int hit = 0; hit < found.totalHits; hit++) {
- int docId = found.scoreDocs[hit].doc;
- String stored = searcher.doc(docId).get(FIELD_NAME);
- TokenStream stream = standardAnalyzer.tokenStream(FIELD_NAME, new StringReader(stored));
- Highlighter hl = getHighlighter(unrewritten, FIELD_NAME, stream, HighlighterTest.this, false);
- hl.setTextFragmenter(new SimpleFragmenter(40));
-
- String highlightedText = hl.getBestFragments(stream, stored, maxNumFragmentsRequired, "...");
- if (VERBOSE) {
- System.out.println(highlightedText);
- }
- }
- // We expect to have zero highlights if the query is multi-terms and is
- // not rewritten!
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 0);
- }
- }.start();
- }
-
- /**
- * Uses a query that matches none of the test texts and checks that
- * getBestFragment returns null for each of them.
- */
- public void testNoFragments() throws Exception {
- new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- doSearching("AnInvalidQueryWhichShouldYieldNoResults");
-
- for (String candidate : texts) {
- TokenStream stream = analyzer.tokenStream(FIELD_NAME, new StringReader(candidate));
- Highlighter hl = getHighlighter(query, FIELD_NAME, stream, HighlighterTest.this);
- String fragment = hl.getBestFragment(stream, candidate);
- assertNull("The highlight result should be null for text with no query terms", fragment);
- }
- }
- }.start();
- }
-
- /**
- * Demonstrates creation of an XHTML compliant doc using new encoding facilities.
- * The raw text contains quote, ampersand and angle-bracket characters; it is run
- * through a Highlighter configured with a SimpleHTMLEncoder, embedded in an XHTML
- * document, parsed back with a DOM parser and compared with the original text.
- *
- * @throws Exception
- */
- public void testEncoding() throws Exception {
-
- String rawDocContent = "\"Smith & sons' prices < 3 and >4\" claims article";
- // run the highlighter on the raw content (scorer does not score any tokens
- // for highlighting but scores a single fragment for selection)
- Highlighter highlighter = new Highlighter(this, new SimpleHTMLEncoder(), new Scorer() {
- public void startFragment(TextFragment newFragment) {
- }
-
- public float getTokenScore() {
- return 0;
- }
-
- public float getFragmentScore() {
- return 1;
- }
-
- public TokenStream init(TokenStream tokenStream) {
- return null;
- }
- });
- highlighter.setTextFragmenter(new SimpleFragmenter(2000));
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(rawDocContent));
-
- String encodedSnippet = highlighter.getBestFragments(tokenStream, rawDocContent, 1, "");
- // An ugly bit of XML creation:
- String xhtml = "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n"
- + "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" lang=\"en\">\n"
- + "<head>\n" + "<title>My Test HTML Document</title>\n" + "</head>\n" + "<body>\n" + "<h2>"
- + encodedSnippet + "</h2>\n" + "</body>\n" + "</html>";
- // now an ugly bit of XML parsing to test the snippet is encoded OK
- DocumentBuilderFactory dbf = DocumentBuilderFactory.newInstance();
- DocumentBuilder db = dbf.newDocumentBuilder();
- // The prolog declares UTF-8, so encode the bytes as UTF-8 explicitly instead of
- // relying on the platform default charset.
- org.w3c.dom.Document doc = db.parse(new ByteArrayInputStream(xhtml.getBytes("UTF-8")));
- Element root = doc.getDocumentElement();
- NodeList nodes = root.getElementsByTagName("body");
- Element body = (Element) nodes.item(0);
- nodes = body.getElementsByTagName("h2");
- Element h2 = (Element) nodes.item(0);
- String decodedSnippet = h2.getFirstChild().getNodeValue();
- assertEquals("XHTML Encoding should have worked:", rawDocContent, decodedSnippet);
- }
-
- public void testMultiSearcher() throws Exception { // highlights a "multi*" query run through a MultiSearcher over two one-document indexes
- // setup index 1: a single stored, analyzed document containing "multiOne"
- Directory ramDir1 = newDirectory();
- IndexWriter writer1 = new IndexWriter(ramDir1, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
- Document d = new Document();
- Field f = new Field(FIELD_NAME, "multiOne", Field.Store.YES, Field.Index.ANALYZED);
- d.add(f);
- writer1.addDocument(d);
- writer1.optimize();
- writer1.close();
- IndexReader reader1 = IndexReader.open(ramDir1, true);
-
- // setup index 2: a single stored, analyzed document containing "multiTwo"
- Directory ramDir2 = newDirectory();
- IndexWriter writer2 = new IndexWriter(ramDir2, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
- d = new Document();
- f = new Field(FIELD_NAME, "multiTwo", Field.Store.YES, Field.Index.ANALYZED);
- d.add(f);
- writer2.addDocument(d);
- writer2.optimize();
- writer2.close();
- IndexReader reader2 = IndexReader.open(ramDir2, true);
- // one IndexSearcher per directory, combined under a single MultiSearcher
- IndexSearcher searchers[] = new IndexSearcher[2];
- searchers[0] = new IndexSearcher(ramDir1, true);
- searchers[1] = new IndexSearcher(ramDir2, true);
- MultiSearcher multiSearcher = new MultiSearcher(searchers);
- QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, new StandardAnalyzer(TEST_VERSION_CURRENT));
- parser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
- query = parser.parse("multi*");
- if (VERBOSE) System.out.println("Searching for: " + query.toString(FIELD_NAME));
- // at this point the multisearcher calls combine(query[])
- hits = multiSearcher.search(query, null, 1000);
-
- // query = QueryParser.parse("multi*", FIELD_NAME, new StandardAnalyzer(TEST_VERSION));
- Query expandedQueries[] = new Query[2]; // the query rewritten once per index reader
- expandedQueries[0] = query.rewrite(reader1);
- expandedQueries[1] = query.rewrite(reader2);
- query = query.combine(expandedQueries);
-
- // create an instance of the highlighter with the tags used to surround
- // highlighted text
- Highlighter highlighter = new Highlighter(this, new QueryTermScorer(query));
-
- for (int i = 0; i < hits.totalHits; i++) {
- String text = multiSearcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME);
- TokenStream tokenStream = analyzer.tokenStream(FIELD_NAME, new StringReader(text));
- String highlightedText = highlighter.getBestFragment(tokenStream, text);
- if (VERBOSE) System.out.println(highlightedText);
- }
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- numHighlights == 2);
- reader1.close(); // release the readers, searchers and directories opened by this test
- reader2.close();
- searchers[0].close();
- searchers[1].close();
- ramDir1.close();
- ramDir2.close();
- }
-
- /**
- * Parses "fred category:people" and checks two scorer configurations: a scorer
- * bound to a field name highlights only "fred", while a scorer without a field
- * name also highlights "people".
- */
- public void testFieldSpecificHighlighting() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- String docMainText = "fred is one of the people";
- QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer);
- Query query = parser.parse("fred category:people");
-
- // highlighting respects fieldnames used in query
-
- Scorer fieldSpecificScorer = null;
- if (mode == TestHighlightRunner.QUERY) {
- fieldSpecificScorer = new QueryScorer(query, FIELD_NAME);
- } else if (mode == TestHighlightRunner.QUERY_TERM) {
- fieldSpecificScorer = new QueryTermScorer(query, "contents");
- }
- Highlighter fieldSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(),
- fieldSpecificScorer);
- fieldSpecificHighlighter.setTextFragmenter(new NullFragmenter());
- String result = fieldSpecificHighlighter.getBestFragment(analyzer, FIELD_NAME, docMainText);
- // expected value first, actual second, so a failure message is labelled correctly
- assertEquals("Should match", "<B>fred</B> is one of the people", result);
-
- // highlighting does not respect fieldnames used in query
- Scorer fieldInSpecificScorer = null;
- if (mode == TestHighlightRunner.QUERY) {
- fieldInSpecificScorer = new QueryScorer(query, null);
- } else if (mode == TestHighlightRunner.QUERY_TERM) {
- fieldInSpecificScorer = new QueryTermScorer(query);
- }
-
- Highlighter fieldInSpecificHighlighter = new Highlighter(new SimpleHTMLFormatter(),
- fieldInSpecificScorer);
- fieldInSpecificHighlighter.setTextFragmenter(new NullFragmenter());
- result = fieldInSpecificHighlighter.getBestFragment(analyzer, FIELD_NAME, docMainText);
- assertEquals("Should match", "<B>fred</B> is one of the <B>people</B>", result);
-
- // NOTE(review): this closes the reader from inside run(); confirm that closing
- // it here, in addition to tearDown(), is intended.
- reader.close();
- }
- };
-
- helper.start();
-
- }
-
- /**
- * Returns a hand-built token stream for the text "Hi-Speed10 foo" in which the
- * shorter token "hi" precedes the overlapping token "hispeed".
- */
- protected TokenStream getTS2() {
- return new TokenStream() {
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- List<Token> lst;
- Iterator<Token> iter;
- {
- // term, start offset, end offset and position increment of each token, in emit order
- String[] terms = { "hi", "hispeed", "speed", "10", "foo" };
- int[] starts = { 0, 0, 3, 8, 11 };
- int[] ends = { 2, 8, 8, 10, 14 };
- int[] increments = { 1, 1, 0, 1, 1 };
-
- lst = new ArrayList<Token>();
- for (int k = 0; k < terms.length; k++) {
- Token t = createToken(terms[k], starts[k], ends[k]);
- t.setPositionIncrement(increments[k]);
- lst.add(t);
- }
- iter = lst.iterator();
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (!iter.hasNext()) {
- return false;
- }
- Token next = iter.next();
- clearAttributes();
- termAtt.setEmpty().append(next);
- posIncrAtt.setPositionIncrement(next.getPositionIncrement());
- offsetAtt.setOffset(next.startOffset(), next.endOffset());
- return true;
- }
-
- };
- }
-
- /**
- * Returns a hand-built token stream for the text "Hi-Speed10 foo" in which the
- * bigger overlapping token "hispeed" comes first, followed by "hi".
- */
- protected TokenStream getTS2a() {
- return new TokenStream() {
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- List<Token> lst;
- Iterator<Token> iter;
- {
- // term, start offset, end offset and position increment of each token, in emit order
- String[] terms = { "hispeed", "hi", "speed", "10", "foo" };
- int[] starts = { 0, 0, 3, 8, 11 };
- int[] ends = { 8, 2, 8, 10, 14 };
- int[] increments = { 1, 0, 1, 1, 1 };
-
- lst = new ArrayList<Token>();
- for (int k = 0; k < terms.length; k++) {
- Token t = createToken(terms[k], starts[k], ends[k]);
- t.setPositionIncrement(increments[k]);
- lst.add(t);
- }
- iter = lst.iterator();
- }
-
- @Override
- public boolean incrementToken() throws IOException {
- if (!iter.hasNext()) {
- return false;
- }
- Token next = iter.next();
- clearAttributes();
- termAtt.setEmpty().append(next);
- posIncrAtt.setPositionIncrement(next.getPositionIncrement());
- offsetAtt.setOffset(next.startOffset(), next.endOffset());
- return true;
- }
- };
- }
-
- public void testOverlapAnalyzer2() throws Exception {
- TestHighlightRunner helper = new TestHighlightRunner() {
-
- @Override
- public void run() throws Exception {
- String s = "Hi-Speed10 foo";
-
- Query query;
- Highlighter highlighter;
- String result;
-
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("foo");
- highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2(), s, 3, "...");
- assertEquals("Hi-Speed10 <B>foo</B>", result);
-
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("10");
- highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2(), s, 3, "...");
- assertEquals("Hi-Speed<B>10</B> foo", result);
-
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hi");
- highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2(), s, 3, "...");
- assertEquals("<B>Hi</B>-Speed10 foo", result);
-
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("speed");
- highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2(), s, 3, "...");
- assertEquals("Hi-<B>Speed</B>10 foo", result);
-
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hispeed");
- highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2(), s, 3, "...");
- assertEquals("<B>Hi-Speed</B>10 foo", result);
-
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hi speed");
- highlighter = getHighlighter(query, "text", getTS2(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2(), s, 3, "...");
- assertEquals("<B>Hi-Speed</B>10 foo", result);
-
- // ///////////////// same tests, just put the bigger overlapping token
- // first
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("foo");
- highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
- assertEquals("Hi-Speed10 <B>foo</B>", result);
-
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("10");
- highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
- assertEquals("Hi-Speed<B>10</B> foo", result);
-
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hi");
- highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
- assertEquals("<B>Hi</B>-Speed10 foo", result);
-
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("speed");
- highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
- assertEquals("Hi-<B>Speed</B>10 foo", result);
-
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hispeed");
- highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
- assertEquals("<B>Hi-Speed</B>10 foo", result);
-
- query = new QueryParser(TEST_VERSION_CURRENT, "text", new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)).parse("hi speed");
- highlighter = getHighlighter(query, "text", getTS2a(), HighlighterTest.this);
- result = highlighter.getBestFragments(getTS2a(), s, 3, "...");
- assertEquals("<B>Hi-Speed</B>10 foo", result);
- }
- };
-
- helper.start();
- }
-
- private Directory dir;
- private Analyzer a = new MockAnalyzer(random, MockTokenizer.WHITESPACE, false);
-
- /**
- * Builds a small index, deletes the documents containing "del" without
- * optimizing, then highlights the remaining hits with an idf-aware scorer.
- */
- public void testWeightedTermsWithDeletes() throws IOException, ParseException, InvalidTokenOffsetsException {
- // step 1: index four documents, three of which contain the term "del"
- makeIndex();
- // step 2: delete by term "del"
- deleteDocument();
- // step 3: search and verify the highlighted fragment of every hit
- searchIndex();
- }
-
- /**
- * Creates a document holding one stored, analyzed field.
- *
- * @param f field name
- * @param v field value
- * @return the new single-field document
- */
- private Document doc( String f, String v ){
- Field field = new Field( f, v, Store.YES, Index.ANALYZED );
- Document single = new Document();
- single.add( field );
- return single;
- }
-
- /**
- * Writes four "t_text1" documents into {@code dir}; the first three end with the
- * term "del", the last one does not. The index is optimized before closing.
- */
- private void makeIndex() throws IOException {
- String[] contents = {
- "random words for highlighting tests del",
- "more random words for second field del",
- "random words for highlighting tests del",
- "more random words for second field",
- };
- IndexWriterConfig config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
- IndexWriter indexWriter = new IndexWriter(dir, config);
- for (String content : contents) {
- indexWriter.addDocument( doc( "t_text1", content ) );
- }
- indexWriter.optimize();
- indexWriter.close();
- }
-
- /**
- * Reopens the index in {@code dir} in append mode and deletes every document
- * containing the term "del" in "t_text1". The index is deliberately not
- * optimized afterwards.
- */
- private void deleteDocument() throws IOException {
- IndexWriterConfig config = new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false));
- IndexWriter indexWriter = new IndexWriter(dir, config.setOpenMode(OpenMode.APPEND));
- Term delTerm = new Term( "t_text1", "del" );
- indexWriter.deleteDocuments( delTerm );
- // To see negative idf, do not optimize here
- indexWriter.close();
- }
-
- /**
- * Searches {@code dir} for "t_text1:random" and checks that every hit is
- * highlighted as "more &lt;B&gt;random&lt;/B&gt; words for second field", using a
- * QueryTermScorer that is given the index reader.
- * The searcher is closed in a finally block so it is released even when
- * highlighting or an assertion fails.
- */
- private void searchIndex() throws IOException, ParseException, InvalidTokenOffsetsException {
- String q = "t_text1:random";
- QueryParser parser = new QueryParser(TEST_VERSION_CURRENT, "t_text1", a );
- Query query = parser.parse( q );
- IndexSearcher searcher = new IndexSearcher( dir, true );
- try {
- // This scorer can return negative idf -> null fragment
- Scorer scorer = new QueryTermScorer( query, searcher.getIndexReader(), "t_text1" );
- // This scorer doesn't use idf (patch version)
- //Scorer scorer = new QueryTermScorer( query, "t_text1" );
- Highlighter h = new Highlighter( scorer );
-
- TopDocs hits = searcher.search(query, null, 10);
- for( int i = 0; i < hits.totalHits; i++ ){
- Document doc = searcher.doc( hits.scoreDocs[i].doc );
- String result = h.getBestFragment( a, "t_text1", doc.get( "t_text1" ));
- if (VERBOSE) System.out.println("result:" + result);
- assertEquals("more <B>random</B> words for second field", result);
- }
- } finally {
- searcher.close();
- }
- }
-
- /*
- *
- * public void testBigramAnalyzer() throws IOException, ParseException {
- * //test to ensure analyzers with none-consecutive start/end offsets //dont
- * double-highlight text //setup index 1 RAMDirectory ramDir = new
- * RAMDirectory(); Analyzer bigramAnalyzer=new CJKAnalyzer(); IndexWriter
- * writer = new IndexWriter(ramDir,bigramAnalyzer , true); Document d = new
- * Document(); Field f = new Field(FIELD_NAME, "java abc def", true, true,
- * true); d.add(f); writer.addDocument(d); writer.close(); IndexReader reader =
- * IndexReader.open(ramDir, true);
- *
- * IndexSearcher searcher=new IndexSearcher(reader); query =
- * QueryParser.parse("abc", FIELD_NAME, bigramAnalyzer);
- * System.out.println("Searching for: " + query.toString(FIELD_NAME)); hits =
- * searcher.search(query);
- *
- * Highlighter highlighter = new Highlighter(this,new
- * QueryFragmentScorer(query));
- *
- * for (int i = 0; i < hits.totalHits; i++) { String text =
- * searcher.doc(hits.scoreDocs[i].doc).get(FIELD_NAME); TokenStream
- * tokenStream=bigramAnalyzer.tokenStream(FIELD_NAME,new StringReader(text));
- * String highlightedText = highlighter.getBestFragment(tokenStream,text);
- * System.out.println(highlightedText); } }
- */
-
- public String highlightTerm(String originalText, TokenGroup group) {
- if (group.getTotalScore() <= 0) {
- return originalText;
- }
- numHighlights++; // update stats used in assertions
- return "<B>" + originalText + "</B>";
- }
-
- /**
- * Parses the query string against FIELD_NAME (position increments enabled,
- * scoring boolean rewrite for multi-term queries), stores it in {@code query}
- * and runs the search.
- *
- * @param queryString the query in QueryParser syntax
- */
- public void doSearching(String queryString) throws Exception {
- QueryParser queryParser = new QueryParser(TEST_VERSION_CURRENT, FIELD_NAME, analyzer);
- queryParser.setEnablePositionIncrements(true);
- queryParser.setMultiTermRewriteMethod(MultiTermQuery.SCORING_BOOLEAN_QUERY_REWRITE);
- Query parsed = queryParser.parse(queryString);
- query = parsed;
- doSearching(parsed);
- }
-
- /**
- * Opens a fresh searcher on {@code ramDir}, rewrites the given query against
- * {@code reader}, and stores the rewritten query and its top 1000 hits in the
- * {@code query} and {@code hits} fields.
- *
- * @param unReWrittenQuery the query to rewrite and execute
- */
- public void doSearching(Query unReWrittenQuery) throws Exception {
- if (searcher != null) {
- searcher.close();
- }
- searcher = new IndexSearcher(ramDir, true);
- // for any multi-term queries to work (prefix, wildcard, range,fuzzy etc)
- // you must use a rewritten query!
- query = unReWrittenQuery.rewrite(reader);
- if (VERBOSE) {
- System.out.println("Searching for: " + query.toString(FIELD_NAME));
- }
- hits = searcher.search(query, null, 1000);
- }
-
- /**
- * Highlights every current hit with a QueryScorer-based highlighter
- * (SimpleFragmenter(40), fragments joined by "...") and, after each hit, asserts
- * that {@code numHighlights} equals the expected value. The counter is not reset
- * between hits.
- *
- * @param maxNumFragmentsRequired maximum number of fragments requested per hit
- * @param expectedHighlights value numHighlights must have after each hit
- */
- public void assertExpectedHighlightCount(final int maxNumFragmentsRequired,
- final int expectedHighlights) throws Exception {
- for (int hit = 0; hit < hits.totalHits; hit++) {
- int docId = hits.scoreDocs[hit].doc;
- String stored = searcher.doc(docId).get(FIELD_NAME);
- TokenStream stream = analyzer.tokenStream(FIELD_NAME, new StringReader(stored));
- QueryScorer queryScorer = new QueryScorer(query, FIELD_NAME);
- Highlighter hl = new Highlighter(this, queryScorer);
- hl.setTextFragmenter(new SimpleFragmenter(40));
-
- String fragments = hl.getBestFragments(stream, stored, maxNumFragmentsRequired, "...");
- if (VERBOSE) {
- System.out.println("\t" + fragments);
- }
-
- boolean countMatches = numHighlights == expectedHighlights;
- assertTrue("Failed to find correct number of highlights " + numHighlights + " found",
- countMatches);
- }
- }
-
- /**
-  * Builds the test index in {@code ramDir}: one text document per entry of {@code texts},
-  * followed by four documents that each hold a single stored numeric field (values 1, 3, 5
-  * and 7). Afterwards opens the shared reader and zeroes the highlight counter.
-  */
- @Override
- public void setUp() throws Exception {
-   super.setUp();
-   dir = newDirectory();
-   ramDir = newDirectory();
-
-   final IndexWriter indexWriter = new IndexWriter(ramDir, newIndexWriterConfig(
-       TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
-   for (String text : texts) {
-     addDoc(indexWriter, text);
-   }
-
-   // Numeric-only documents, added with the test's own analyzer.
-   final int[] numericValues = {1, 3, 5, 7};
-   for (int value : numericValues) {
-     final NumericField numericField = new NumericField(NUMERIC_FIELD_NAME, Store.YES, true);
-     numericField.setIntValue(value);
-     final Document numericDoc = new Document();
-     numericDoc.add(numericField);
-     indexWriter.addDocument(numericDoc, analyzer);
-   }
-
-   indexWriter.optimize();
-   indexWriter.close();
-
-   reader = IndexReader.open(ramDir, true);
-   numHighlights = 0;
- }
-
- /**
-  * Closes the searcher (when one was opened), the reader and both directories, then runs
-  * the superclass tear-down.
-  */
- @Override
- public void tearDown() throws Exception {
-   // A searcher only exists if one of the doSearching() variants ran.
-   if (searcher != null) {
-     searcher.close();
-   }
-   reader.close();
-   dir.close();
-   ramDir.close();
-   super.tearDown();
- }
- private void addDoc(IndexWriter writer, String text) throws IOException {
- Document d = new Document();
- Field f = new Field(FIELD_NAME, text, Field.Store.YES, Field.Index.ANALYZED);
- d.add(f);
- writer.addDocument(d);
-
- }
-
- private static Token createToken(String term, int start, int offset)
- {
- return new Token(term, start, offset);
- }
-
-}
-
-// ===================================================================
-// ========== BEGIN TEST SUPPORTING CLASSES
-// ========== THESE LOOK LIKE THEY COULD, WITH SOME MORE EFFORT, BE
-// ========== MADE MORE GENERALLY USEFUL.
-// TODO - make synonyms all interchangeable with each other and produce
-// a version that does hyponyms - the "is a specialised type of ...."
-// so that car = audi, bmw and volkswagen but bmw != audi so different
-// behaviour to synonyms
-// ===================================================================
-
-/**
- * Analyzer that runs its input through a {@link LowerCaseTokenizer} and wraps the result
- * in a {@link SynonymTokenizer} driven by the synonym map given at construction.
- */
-final class SynonymAnalyzer extends Analyzer {
- /** Synonym table handed unchanged to every SynonymTokenizer created by this analyzer. */
- private final Map<String, String> synonyms;
-
- /**
-  * @param synonyms the synonym table to use for every token stream
-  */
- public SynonymAnalyzer(Map<String, String> synonyms) {
-   this.synonyms = synonyms;
- }
-
- /**
-  * Creates the synonym-expanding stream for one input.
-  *
-  * @param arg0 the field name; not used by this analyzer
-  * @param arg1 the reader supplying the text to tokenize
-  * @return a {@link SynonymTokenizer} over an already-reset lower-casing tokenizer
-  * @throws RuntimeException wrapping any {@link IOException} raised while resetting
-  * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
-  */
- @Override
- public TokenStream tokenStream(String arg0, Reader arg1) {
-   final LowerCaseTokenizer lowerCased =
-       new LowerCaseTokenizer(LuceneTestCase.TEST_VERSION_CURRENT, arg1);
-   lowerCased.addAttribute(CharTermAttribute.class);
-   lowerCased.addAttribute(PositionIncrementAttribute.class);
-   lowerCased.addAttribute(OffsetAttribute.class);
-   try {
-     lowerCased.reset();
-   } catch (IOException e) {
-     // tokenStream() declares no checked exception, so rethrow unchecked with the cause.
-     throw new RuntimeException(e);
-   }
-   return new SynonymTokenizer(lowerCased, synonyms);
- }
-}
-
-/**
- * Expands a token stream with synonyms (TODO - make the synonyms analyzed by choice of analyzer).
- *
- * <p>Every token of the wrapped stream is emitted unchanged. When the synonym map has an
- * entry for the token's term, the comma-separated expansions of that entry are emitted on
- * the following calls, one per call, each with a position increment of 0 and the offsets of
- * the real token.
- */
-final class SynonymTokenizer extends TokenStream {
- private final TokenStream realStream;
- /** Holds the offsets of the real token while its synonyms are pending; null otherwise. */
- private Token currentRealToken = null;
- private final Map<String, String> synonyms;
- /** Remaining expansions of the current real token; null when none are pending. */
- private StringTokenizer st = null;
- private final CharTermAttribute realTermAtt;
- private final PositionIncrementAttribute realPosIncrAtt;
- private final OffsetAttribute realOffsetAtt;
- private final CharTermAttribute termAtt;
- private final PositionIncrementAttribute posIncrAtt;
- private final OffsetAttribute offsetAtt;
-
- /**
-  * @param realStream the stream whose tokens are expanded
-  * @param synonyms maps a term to its comma-separated expansions
-  */
- public SynonymTokenizer(TokenStream realStream, Map<String, String> synonyms) {
-   this.realStream = realStream;
-   this.synonyms = synonyms;
-   // Attributes of the wrapped stream (read side).
-   realTermAtt = realStream.addAttribute(CharTermAttribute.class);
-   realPosIncrAtt = realStream.addAttribute(PositionIncrementAttribute.class);
-   realOffsetAtt = realStream.addAttribute(OffsetAttribute.class);
-
-   // Attributes of this stream (write side).
-   termAtt = addAttribute(CharTermAttribute.class);
-   posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-   offsetAtt = addAttribute(OffsetAttribute.class);
- }
-
- /**
-  * Emits either the next pending synonym or, when none is pending, the next real token.
-  *
-  * @return false once the wrapped stream is exhausted and no synonym is pending
-  */
- @Override
- public boolean incrementToken() throws IOException {
-   if (currentRealToken != null) {
-     emitNextSynonym();
-     return true;
-   }
-   return emitNextRealToken();
- }
-
- /** Copies the next token of the wrapped stream and queues its synonyms, if it has any. */
- private boolean emitNextRealToken() throws IOException {
-   if (!realStream.incrementToken()) {
-     return false;
-   }
-   clearAttributes();
-   termAtt.copyBuffer(realTermAtt.buffer(), 0, realTermAtt.length());
-   offsetAtt.setOffset(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
-   posIncrAtt.setPositionIncrement(realPosIncrAtt.getPositionIncrement());
-
-   String expansions = synonyms.get(realTermAtt.toString());
-   if (expansions != null) {
-     st = new StringTokenizer(expansions, ",");
-     // Only remember the real token when there is at least one expansion to emit.
-     if (st.hasMoreTokens()) {
-       currentRealToken = new Token(realOffsetAtt.startOffset(), realOffsetAtt.endOffset());
-       currentRealToken.copyBuffer(realTermAtt.buffer(), 0, realTermAtt.length());
-     }
-   }
-   return true;
- }
-
- /** Emits the next queued synonym at the same position and offsets as its real token. */
- private void emitNextSynonym() {
-   String tok = st.nextToken();
-   clearAttributes();
-   termAtt.setEmpty().append(tok);
-   offsetAtt.setOffset(currentRealToken.startOffset(), currentRealToken.endOffset());
-   posIncrAtt.setPositionIncrement(0);
-   if (!st.hasMoreTokens()) {
-     currentRealToken = null;
-     st = null;
-   }
- }
-
- /** Drops any pending synonyms; the wrapped stream itself is not reset here. */
- @Override
- public void reset() throws IOException {
-   super.reset();
-   this.currentRealToken = null;
-   this.st = null;
- }
-
- /** Closes the wrapped stream as well, so that its resources are released with this one. */
- @Override
- public void close() throws IOException {
-   try {
-     realStream.close();
-   } finally {
-     super.close();
-   }
- }
-
- /**
-  * Runs a highlighting scenario twice: {@link #start()} calls {@link #run()} once in
-  * {@code QUERY} mode (QueryScorer) and once in {@code QUERY_TERM} mode (QueryTermScorer).
-  */
- static abstract class TestHighlightRunner {
-   static final int QUERY = 0;
-   static final int QUERY_TERM = 1;
-
-   int mode = QUERY;
-   Fragmenter frag = new SimpleFragmenter(20);
-
-   /**
-    * Same as the five-argument overload with multi-term expansion left enabled.
-    */
-   public Highlighter getHighlighter(Query query, String fieldName, TokenStream stream, Formatter formatter) {
-     return getHighlighter(query, fieldName, stream, formatter, true);
-   }
-
-   /**
-    * Creates a highlighter whose scorer matches the current mode.
-    *
-    * @param query the query to score against
-    * @param fieldName the field passed to QueryScorer; not used in QUERY_TERM mode
-    * @param stream not used by this method
-    * @param formatter the formatter for the highlighter
-    * @param expanMultiTerm when false, multi-term expansion is switched off on the
-    *        QueryScorer; has no effect in QUERY_TERM mode
-    * @throws RuntimeException if the mode is neither QUERY nor QUERY_TERM
-    */
-   public Highlighter getHighlighter(Query query, String fieldName, TokenStream stream, Formatter formatter, boolean expanMultiTerm) {
-     Scorer scorer = null;
-     if (mode == QUERY) {
-       scorer = new QueryScorer(query, fieldName);
-       if (!expanMultiTerm) {
-         ((QueryScorer) scorer).setExpandMultiTermQuery(false);
-       }
-     } else if (mode == QUERY_TERM) {
-       scorer = new QueryTermScorer(query);
-     } else {
-       throw new RuntimeException("Unknown highlight mode");
-     }
-
-     return new Highlighter(formatter, scorer);
-   }
-
-   /**
-    * Creates a highlighter from pre-weighted terms. In QUERY mode the array is cast to
-    * {@code WeightedSpanTerm[]}, so the caller must pass an array of that runtime type.
-    *
-    * @throws RuntimeException if the mode is neither QUERY nor QUERY_TERM
-    */
-   Highlighter getHighlighter(WeightedTerm[] weightedTerms, Formatter formatter) {
-     if (mode == QUERY) {
-       return new Highlighter(formatter, new QueryScorer((WeightedSpanTerm[]) weightedTerms));
-     } else if (mode == QUERY_TERM) {
-       return new Highlighter(formatter, new QueryTermScorer(weightedTerms));
-     } else {
-       throw new RuntimeException("Unknown highlight mode");
-     }
-   }
-
-   /** Same as the six-argument overload with {@code expandMT} set to false. */
-   void doStandardHighlights(Analyzer analyzer, IndexSearcher searcher, TopDocs hits, Query query, Formatter formatter)
-       throws Exception {
-     doStandardHighlights(analyzer, searcher, hits, query, formatter, false);
-   }
-
-   /**
-    * Highlights the {@code HighlighterTest.FIELD_NAME} text of every returned hit, asking
-    * for at most two fragments joined by "...", using a scorer for the current mode and
-    * the fragmenter in {@code frag}.
-    *
-    * @param expandMT currently not consulted; the scorer is always built with its defaults
-    * @throws RuntimeException if the mode is neither QUERY nor QUERY_TERM
-    */
-   void doStandardHighlights(Analyzer analyzer, IndexSearcher searcher, TopDocs hits, Query query, Formatter formatter, boolean expandMT)
-       throws Exception {
-     final int maxNumFragmentsRequired = 2;
-     final String fragmentSeparator = "...";
-
-     // Bound by the documents actually present in scoreDocs; totalHits counts all matches
-     // and may be larger than the number of returned entries.
-     for (int i = 0; i < hits.scoreDocs.length; i++) {
-       String text = searcher.doc(hits.scoreDocs[i].doc).get(HighlighterTest.FIELD_NAME);
-       TokenStream tokenStream = analyzer.tokenStream(HighlighterTest.FIELD_NAME, new StringReader(text));
-       Highlighter highlighter = new Highlighter(formatter, newScorerForMode(query));
-       highlighter.setTextFragmenter(frag);
-
-       String result = highlighter.getBestFragments(tokenStream, text, maxNumFragmentsRequired,
-           fragmentSeparator);
-       if (HighlighterTest.VERBOSE) {
-         System.out.println("\t" + result);
-       }
-     }
-   }
-
-   /** Builds the default scorer for the current mode, rejecting unknown modes. */
-   private Scorer newScorerForMode(Query query) {
-     if (mode == QUERY) {
-       return new QueryScorer(query);
-     } else if (mode == QUERY_TERM) {
-       return new QueryTermScorer(query);
-     }
-     // Fail like getHighlighter() instead of handing a null scorer to Highlighter.
-     throw new RuntimeException("Unknown highlight mode");
-   }
-
-   /** The scenario to execute; called once per mode by {@link #start()}. */
-   abstract void run() throws Exception;
-
-   /**
-    * Executes {@link #run()} in QUERY mode and then in QUERY_TERM mode. The mode is left
-    * at QUERY_TERM afterwards.
-    */
-   void start() throws Exception {
-     if (HighlighterTest.VERBOSE) {
-       System.out.println("Run QueryScorer");
-     }
-     // Select the mode explicitly so that a repeated start() begins with QUERY again.
-     mode = QUERY;
-     run();
-     if (HighlighterTest.VERBOSE) {
-       System.out.println("Run QueryTermScorer");
-     }
-     mode = QUERY_TERM;
-     run();
-   }
- }
-}