1 package org.apache.lucene.search.highlight;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.Reader;
23 import org.apache.lucene.analysis.Analyzer;
24 import org.apache.lucene.analysis.Token;
25 import org.apache.lucene.analysis.TokenStream;
26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
28 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
29 import org.apache.lucene.document.Document;
30 import org.apache.lucene.document.Field;
31 import org.apache.lucene.document.Field.TermVector;
32 import org.apache.lucene.index.CorruptIndexException;
33 import org.apache.lucene.index.IndexReader;
34 import org.apache.lucene.index.IndexWriter;
35 import org.apache.lucene.index.Term;
36 import org.apache.lucene.index.TermPositionVector;
37 import org.apache.lucene.search.DisjunctionMaxQuery;
38 import org.apache.lucene.search.IndexSearcher;
39 import org.apache.lucene.search.Query;
40 import org.apache.lucene.search.TopDocs;
41 import org.apache.lucene.search.spans.SpanNearQuery;
42 import org.apache.lucene.search.spans.SpanQuery;
43 import org.apache.lucene.search.spans.SpanTermQuery;
44 import org.apache.lucene.store.Directory;
45 import org.apache.lucene.store.LockObtainFailedException;
46 import org.apache.lucene.util.LuceneTestCase;
/**
 * Tests that highlight a text using token streams obtained through
 * TokenSources from a document's stored term vector. Built on
 * LuceneTestCase.
 */
49 public class TokenSourcesTest extends LuceneTestCase {
// Name of the field that the tests in this class index, query and highlight.
50 private static final String FIELD = "text";
/**
 * Analyzer whose tokenStream method ignores both the field name and the
 * reader it is given and always returns a freshly constructed
 * TokenStreamOverlap.
 */
52 private static final class OverlapAnalyzer extends Analyzer {
// Neither fieldName nor reader is consulted; the returned stream is canned.
55 public TokenStream tokenStream(String fieldName, Reader reader) {
56 return new TokenStreamOverlap();
/**
 * TokenStream that replays a hard-coded array of Token objects instead of
 * analysing any input. The array holds "the", "{fox}", "fox", "did", "not"
 * and "jump"; the second token's position increment is set to 0.
 *
 * NOTE(review): several lines of this class (the declaration of the index
 * {@code i}, the constructor body, the branch bodies of incrementToken and
 * the header of the method that fills {@code tokens}) are not visible in
 * this excerpt, so the comments below describe only what is shown.
 */
60 private static final class TokenStreamOverlap extends TokenStream {
// Canned tokens replayed by incrementToken; assigned further down.
61 private Token[] tokens;
// Attributes that incrementToken fills from the current canned token.
65 private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
66 private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
67 private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
// NOTE(review): constructor body is not visible here; presumably it
// triggers the token array initialisation below - confirm.
69 public TokenStreamOverlap() {
// Copies term text, offsets and position increment of tokens[i] into the
// stream's attributes.
74 public boolean incrementToken() throws IOException {
// Bounds check against the canned array. What happens when it is true, and
// where i is advanced, is on lines not shown (presumably "return false").
76 if (this.i >= this.tokens.length) {
// Replace the term text with the text of the current token.
80 termAttribute.setEmpty().append(this.tokens[i]);
// Carry over the token's start/end character offsets.
81 offsetAttribute.setOffset(this.tokens[i].startOffset(),
82 this.tokens[i].endOffset());
// Carry over the token's position increment.
83 positionIncrementAttribute.setPositionIncrement(this.tokens[i]
84 .getPositionIncrement());
// Token arguments appear to be (term buffer, buffer offset, buffer length,
// start offset, end offset) - verify against the Token constructor docs.
// Under that reading "{fox}" spans offsets 0-7 and so overlaps both "the"
// (0-3) and "fox" (4-7); the remaining offsets match the words of
// "the fox did not jump".
91 this.tokens = new Token[] {
92 new Token(new char[] {'t', 'h', 'e'}, 0, 3, 0, 3),
93 new Token(new char[] {'{', 'f', 'o', 'x', '}'}, 0, 5, 0, 7),
94 new Token(new char[] {'f', 'o', 'x'}, 0, 3, 4, 7),
95 new Token(new char[] {'d', 'i', 'd'}, 0, 3, 8, 11),
96 new Token(new char[] {'n', 'o', 't'}, 0, 3, 12, 15),
97 new Token(new char[] {'j', 'u', 'm', 'p'}, 0, 4, 16, 20)};
// A position increment of 0 on "{fox}" (index 1) makes it share the
// position of the preceding token rather than advance past it.
98 this.tokens[1].setPositionIncrement(0);
/**
 * Indexes one document whose FIELD is fed by a TokenStreamOverlap with
 * TermVector.WITH_OFFSETS, searches it with a DisjunctionMaxQuery over the
 * terms "{fox}" and "fox", and asserts that the best highlighted fragment
 * of TEXT is "<B>the fox</B> did not jump".
 */
102 public void testOverlapWithOffset() throws CorruptIndexException,
103 LockObtainFailedException, IOException, InvalidTokenOffsetsException {
// Plain text passed to the highlighter; the index itself is fed from the
// canned token stream, not from this string.
104 final String TEXT = "the fox did not jump";
105 final Directory directory = newDirectory();
106 final IndexWriter indexWriter = new IndexWriter(directory,
107 newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
// Single document; the field value is supplied directly as a token stream
// and its term vector is stored with offsets only.
109 final Document document = new Document();
110 document.add(new Field(FIELD, new TokenStreamOverlap(),
111 TermVector.WITH_OFFSETS));
112 indexWriter.addDocument(document);
116 final IndexReader indexReader = IndexReader.open(directory, true);
// Sanity check: exactly the one document added above is in the index.
118 assertEquals(1, indexReader.numDocs());
119 final IndexSearcher indexSearcher = newSearcher(indexReader);
// Query matching either the overlapping "{fox}" token or the plain "fox".
121 final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
122 query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
123 query.add(new SpanTermQuery(new Term(FIELD, "fox")));
124 // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
125 // new SpanTermQuery(new Term(FIELD, "{fox}")),
126 // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
// The single indexed document must be found.
128 TopDocs hits = indexSearcher.search(query, 1);
129 assertEquals(1, hits.totalHits);
130 final Highlighter highlighter = new Highlighter(
131 new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
132 new QueryScorer(query));
// Token stream derived from the term vector of document 0.
// NOTE(review): the TokenSources method name and the remaining argument(s)
// are on lines not visible in this excerpt.
133 final TokenStream tokenStream = TokenSources
135 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
// "the fox" is expected to be highlighted as one span, which corresponds
// to the 0-7 offsets given to the "{fox}" token in TokenStreamOverlap.
137 assertEquals("<B>the fox</B> did not jump",
138 highlighter.getBestFragment(tokenStream, TEXT));
140 indexSearcher.close();
/**
 * Indexes one document whose FIELD is fed by a TokenStreamOverlap with
 * TermVector.WITH_POSITIONS_OFFSETS, searches it with a DisjunctionMaxQuery
 * over the terms "{fox}" and "fox", and asserts that the best highlighted
 * fragment of TEXT is "<B>the fox</B> did not jump".
 */
148 public void testOverlapWithPositionsAndOffset() throws CorruptIndexException,
149 LockObtainFailedException, IOException, InvalidTokenOffsetsException {
// Plain text passed to the highlighter; the index itself is fed from the
// canned token stream, not from this string.
150 final String TEXT = "the fox did not jump";
151 final Directory directory = newDirectory();
152 final IndexWriter indexWriter = new IndexWriter(directory,
153 newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
// Single document; the term vector is stored with both positions and
// offsets.
155 final Document document = new Document();
156 document.add(new Field(FIELD, new TokenStreamOverlap(),
157 TermVector.WITH_POSITIONS_OFFSETS));
158 indexWriter.addDocument(document);
162 final IndexReader indexReader = IndexReader.open(directory, true);
// Sanity check: exactly the one document added above is in the index.
164 assertEquals(1, indexReader.numDocs());
165 final IndexSearcher indexSearcher = newSearcher(indexReader);
// Query matching either the overlapping "{fox}" token or the plain "fox".
167 final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
168 query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
169 query.add(new SpanTermQuery(new Term(FIELD, "fox")));
170 // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
171 // new SpanTermQuery(new Term(FIELD, "{fox}")),
172 // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
// The single indexed document must be found.
174 TopDocs hits = indexSearcher.search(query, 1);
175 assertEquals(1, hits.totalHits);
176 final Highlighter highlighter = new Highlighter(
177 new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
178 new QueryScorer(query));
// Token stream derived from the term vector of document 0.
// NOTE(review): the TokenSources method name and the remaining argument(s)
// are on lines not visible in this excerpt.
179 final TokenStream tokenStream = TokenSources
181 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
// "the fox" is expected to be highlighted as one span, which corresponds
// to the 0-7 offsets given to the "{fox}" token in TokenStreamOverlap.
183 assertEquals("<B>the fox</B> did not jump",
184 highlighter.getBestFragment(tokenStream, TEXT));
186 indexSearcher.close();
/**
 * Indexes one document whose FIELD is fed by a TokenStreamOverlap with
 * TermVector.WITH_OFFSETS, searches it with a SpanNearQuery over the terms
 * "the" and "fox", and asserts that the best highlighted fragment of TEXT
 * is "<B>the fox</B> did not jump".
 */
194 public void testOverlapWithOffsetExactPhrase() throws CorruptIndexException,
195 LockObtainFailedException, IOException, InvalidTokenOffsetsException {
// Plain text passed to the highlighter; the index itself is fed from the
// canned token stream, not from this string.
196 final String TEXT = "the fox did not jump";
197 final Directory directory = newDirectory();
198 final IndexWriter indexWriter = new IndexWriter(directory,
199 newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
// Single document; the field value is supplied directly as a token stream
// and its term vector is stored with offsets only.
201 final Document document = new Document();
202 document.add(new Field(FIELD, new TokenStreamOverlap(),
203 TermVector.WITH_OFFSETS));
204 indexWriter.addDocument(document);
208 final IndexReader indexReader = IndexReader.open(directory, true);
// Sanity check: exactly the one document added above is in the index.
210 assertEquals(1, indexReader.numDocs());
211 final IndexSearcher indexSearcher = newSearcher(indexReader);
213 // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
214 // query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
215 // query.add(new SpanTermQuery(new Term(FIELD, "fox")));
// Span query for "the" followed by "fox"; the trailing (0, true) arguments
// are presumably slop 0 and in-order matching - see SpanNearQuery docs.
216 final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
217 new SpanTermQuery(new Term(FIELD, "the")),
218 new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
// The single indexed document must be found.
220 TopDocs hits = indexSearcher.search(phraseQuery, 1);
221 assertEquals(1, hits.totalHits);
222 final Highlighter highlighter = new Highlighter(
223 new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
224 new QueryScorer(phraseQuery));
// Token stream derived from the term vector of document 0.
// NOTE(review): the TokenSources method name and the remaining argument(s)
// are on lines not visible in this excerpt.
225 final TokenStream tokenStream = TokenSources
227 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
// The matched phrase "the fox" is expected to come back as a single
// highlighted span.
229 assertEquals("<B>the fox</B> did not jump",
230 highlighter.getBestFragment(tokenStream, TEXT));
232 indexSearcher.close();
/**
 * Indexes one document whose FIELD is fed by a TokenStreamOverlap with
 * TermVector.WITH_POSITIONS_OFFSETS, searches it with a SpanNearQuery over
 * the terms "the" and "fox", and asserts that the best highlighted fragment
 * of TEXT is "<B>the fox</B> did not jump".
 */
240 public void testOverlapWithPositionsAndOffsetExactPhrase()
241 throws CorruptIndexException, LockObtainFailedException, IOException,
242 InvalidTokenOffsetsException {
// Plain text passed to the highlighter; the index itself is fed from the
// canned token stream, not from this string.
243 final String TEXT = "the fox did not jump";
244 final Directory directory = newDirectory();
245 final IndexWriter indexWriter = new IndexWriter(directory,
246 newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
// Single document; the term vector is stored with both positions and
// offsets.
248 final Document document = new Document();
249 document.add(new Field(FIELD, new TokenStreamOverlap(),
250 TermVector.WITH_POSITIONS_OFFSETS));
251 indexWriter.addDocument(document);
255 final IndexReader indexReader = IndexReader.open(directory, true);
// Sanity check: exactly the one document added above is in the index.
257 assertEquals(1, indexReader.numDocs());
258 final IndexSearcher indexSearcher = newSearcher(indexReader);
260 // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
261 // query.add(new SpanTermQuery(new Term(FIELD, "the")));
262 // query.add(new SpanTermQuery(new Term(FIELD, "fox")));
// Span query for "the" followed by "fox"; the trailing (0, true) arguments
// are presumably slop 0 and in-order matching - see SpanNearQuery docs.
263 final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
264 new SpanTermQuery(new Term(FIELD, "the")),
265 new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
// The single indexed document must be found.
267 TopDocs hits = indexSearcher.search(phraseQuery, 1);
268 assertEquals(1, hits.totalHits);
269 final Highlighter highlighter = new Highlighter(
270 new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
271 new QueryScorer(phraseQuery));
// Token stream derived from the term vector of document 0.
// NOTE(review): the TokenSources method name and the remaining argument(s)
// are on lines not visible in this excerpt.
272 final TokenStream tokenStream = TokenSources
274 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
// The matched phrase "the fox" is expected to come back as a single
// highlighted span.
276 assertEquals("<B>the fox</B> did not jump",
277 highlighter.getBestFragment(tokenStream, TEXT));
279 indexSearcher.close();