1 package org.apache.lucene.search.highlight;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
22 import org.apache.lucene.analysis.MockAnalyzer;
23 import org.apache.lucene.analysis.MockTokenizer;
24 import org.apache.lucene.analysis.Token;
25 import org.apache.lucene.analysis.TokenStream;
26 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
27 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
28 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
29 import org.apache.lucene.document.Document;
30 import org.apache.lucene.document.Field;
31 import org.apache.lucene.document.Field.Index;
32 import org.apache.lucene.document.Field.Store;
33 import org.apache.lucene.document.Field.TermVector;
34 import org.apache.lucene.index.CorruptIndexException;
35 import org.apache.lucene.index.IndexReader;
36 import org.apache.lucene.index.IndexWriter;
37 import org.apache.lucene.index.Term;
38 import org.apache.lucene.index.TermPositionVector;
39 import org.apache.lucene.search.Collector;
40 import org.apache.lucene.search.IndexSearcher;
41 import org.apache.lucene.search.PhraseQuery;
42 import org.apache.lucene.search.Query;
43 import org.apache.lucene.search.TopDocs;
45 import org.apache.lucene.search.spans.SpanNearQuery;
46 import org.apache.lucene.search.spans.SpanQuery;
47 import org.apache.lucene.search.spans.SpanTermQuery;
48 import org.apache.lucene.store.Directory;
49 import org.apache.lucene.store.LockObtainFailedException;
50 import org.apache.lucene.util.LuceneTestCase;
51 import org.apache.lucene.util.FixedBitSet;
53 public class HighlighterPhraseTest extends LuceneTestCase {
54 private static final String FIELD = "text";
// Indexes one doc from TokenStreamConcurrent ("the fox jumped" with "jump"
// stacked at the same position as "jumped"), runs an exact PhraseQuery for
// "fox jumped", and asserts the highlighter produces the same best fragment
// from a fresh copy of the stream and from the stored term vector.
55 public void testConcurrentPhrase() throws CorruptIndexException,
56 LockObtainFailedException, IOException, InvalidTokenOffsetsException {
57 final String TEXT = "the fox jumped";
58 final Directory directory = newDirectory();
59 final IndexWriter indexWriter = new IndexWriter(directory,
60 newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)));
// WITH_POSITIONS_OFFSETS is required so TokenSources can rebuild a
// TokenStream from the term vector further down.
62 final Document document = new Document();
63 document.add(new Field(FIELD, new TokenStreamConcurrent(),
64 TermVector.WITH_POSITIONS_OFFSETS));
65 indexWriter.addDocument(document);
// NOTE(review): lines elided from this view here — presumably the writer is
// closed/committed before the reader opens; confirm against the full file.
69 final IndexReader indexReader = IndexReader.open(directory, true);
71 assertEquals(1, indexReader.numDocs());
72 final IndexSearcher indexSearcher = newSearcher(indexReader);
// Exact phrase (slop 0) "fox jumped" must match the single indexed doc.
74 final PhraseQuery phraseQuery = new PhraseQuery();
75 phraseQuery.add(new Term(FIELD, "fox"));
76 phraseQuery.add(new Term(FIELD, "jumped"));
77 phraseQuery.setSlop(0);
78 TopDocs hits = indexSearcher.search(phraseQuery, 1);
79 assertEquals(1, hits.totalHits);
80 final Highlighter highlighter = new Highlighter(
81 new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
82 new QueryScorer(phraseQuery));
// Rebuild a TokenStream from the stored term vector of the hit document.
// NOTE(review): the trailing arguments of getTermFreqVector(...) and the
// closing parentheses are elided in this view.
84 final TokenStream tokenStream = TokenSources
85 .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
// The fragment from the original stream and the term-vector-derived stream
// must be identical.
87 assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
88 TEXT), highlighter.getBestFragment(tokenStream, TEXT));
91 indexSearcher.close();
// Same overlapping-token scenario as testConcurrentPhrase, but driven by a
// SpanNearQuery and a hand-written Collector that records matching doc ids
// in a FixedBitSet instead of using TopDocs.
99 public void testConcurrentSpan() throws CorruptIndexException,
100 LockObtainFailedException, IOException, InvalidTokenOffsetsException {
101 final String TEXT = "the fox jumped";
102 final Directory directory = newDirectory();
103 final IndexWriter indexWriter = new IndexWriter(directory,
104 newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)));
106 final Document document = new Document();
107 document.add(new Field(FIELD, new TokenStreamConcurrent(),
108 TermVector.WITH_POSITIONS_OFFSETS));
109 indexWriter.addDocument(document);
// NOTE(review): writer close/commit lines elided from this view.
113 final IndexReader indexReader = IndexReader.open(directory, true);
115 assertEquals(1, indexReader.numDocs());
116 final IndexSearcher indexSearcher = newSearcher(indexReader);
// Ordered span "fox" followed immediately by "jumped" (slop 0, inOrder=true).
118 final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
119 new SpanTermQuery(new Term(FIELD, "fox")),
120 new SpanTermQuery(new Term(FIELD, "jumped")) }, 0, true);
121 final FixedBitSet bitset = new FixedBitSet(indexReader.maxDoc());
// Collect every hit into the bitset, offset by the per-segment doc base.
// NOTE(review): the anonymous Collector's bodies are partially elided in
// this view (return values, baseDoc bookkeeping in setNextReader, braces).
122 indexSearcher.search(phraseQuery, new Collector() {
126 public boolean acceptsDocsOutOfOrder() {
131 public void collect(int i) throws IOException {
// i is segment-relative; baseDoc rebases it to a top-level doc id.
132 bitset.set(this.baseDoc + i);
136 public void setNextReader(IndexReader indexreader, int i)
142 public void setScorer(org.apache.lucene.search.Scorer scorer)
// Exactly one document (doc 0) must have matched.
147 assertEquals(1, bitset.cardinality());
148 final int maxDoc = indexReader.maxDoc();
149 final Highlighter highlighter = new Highlighter(
150 new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
151 new QueryScorer(phraseQuery));
// Walk the set bits; the "position < maxDoc-1" bound keeps the subsequent
// nextSetBit(position + 1) argument within the bitset's length.
152 for (int position = bitset.nextSetBit(0); position >= 0 && position < maxDoc-1; position = bitset
153 .nextSetBit(position + 1)) {
154 assertEquals(0, position);
// NOTE(review): trailing arguments of getTermFreqVector(...) elided here.
155 final TokenStream tokenStream = TokenSources.getTokenStream(
156 (TermPositionVector) indexReader.getTermFreqVector(position,
158 assertEquals(highlighter.getBestFragment(new TokenStreamConcurrent(),
159 TEXT), highlighter.getBestFragment(tokenStream, TEXT));
162 indexSearcher.close();
// Indexes one doc from TokenStreamSparse ("the fox did not jump" with "not"
// omitted, leaving a position hole before "jump"). A slop-0 PhraseQuery for
// "did jump" must therefore NOT match, and highlighting from the stored term
// vector must agree with highlighting a fresh copy of the stream.
170 public void testSparsePhrase() throws CorruptIndexException,
171 LockObtainFailedException, IOException, InvalidTokenOffsetsException {
172 final String TEXT = "the fox did not jump";
173 final Directory directory = newDirectory();
174 final IndexWriter indexWriter = new IndexWriter(directory,
175 newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)));
177 final Document document = new Document();
178 document.add(new Field(FIELD, new TokenStreamSparse(),
179 TermVector.WITH_POSITIONS_OFFSETS));
180 indexWriter.addDocument(document);
// NOTE(review): writer close/commit lines elided from this view.
184 final IndexReader indexReader = IndexReader.open(directory, true);
186 assertEquals(1, indexReader.numDocs());
187 final IndexSearcher indexSearcher = newSearcher(indexReader);
// "did" and "jump" are two positions apart in the index, so slop 0 fails.
189 final PhraseQuery phraseQuery = new PhraseQuery();
190 phraseQuery.add(new Term(FIELD, "did"));
191 phraseQuery.add(new Term(FIELD, "jump"));
192 phraseQuery.setSlop(0);
193 TopDocs hits = indexSearcher.search(phraseQuery, 1);
194 assertEquals(0, hits.totalHits);
195 final Highlighter highlighter = new Highlighter(
196 new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
197 new QueryScorer(phraseQuery));
// NOTE(review): trailing arguments of getTermFreqVector(...) and the
// surrounding assertEquals(...) head are elided in this view.
198 final TokenStream tokenStream = TokenSources
199 .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
202 highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
203 highlighter.getBestFragment(tokenStream, TEXT));
205 indexSearcher.close();
// Indexes the analyzed text with a term vector that stores OFFSETS ONLY (no
// positions) and checks that a slop-1 phrase "did ... jump" both matches and
// highlights correctly from the offsets-only term vector.
213 public void testSparsePhraseWithNoPositions() throws CorruptIndexException,
214 LockObtainFailedException, IOException, InvalidTokenOffsetsException {
215 final String TEXT = "the fox did not jump";
216 final Directory directory = newDirectory();
217 final IndexWriter indexWriter = new IndexWriter(directory,
218 newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)));
// Unlike the other tests, this field is analyzed from plain text and its
// term vector deliberately lacks positions (WITH_OFFSETS).
220 final Document document = new Document();
221 document.add(new Field(FIELD, TEXT, Store.YES, Index.ANALYZED,
222 TermVector.WITH_OFFSETS));
223 indexWriter.addDocument(document);
// NOTE(review): writer close/commit lines elided from this view.
227 final IndexReader indexReader = IndexReader.open(directory, true);
229 assertEquals(1, indexReader.numDocs());
230 final IndexSearcher indexSearcher = newSearcher(indexReader);
// "did not jump" has one intervening term, so slop 1 is enough to match.
232 final PhraseQuery phraseQuery = new PhraseQuery();
233 phraseQuery.add(new Term(FIELD, "did"));
234 phraseQuery.add(new Term(FIELD, "jump"));
235 phraseQuery.setSlop(1);
236 TopDocs hits = indexSearcher.search(phraseQuery, 1);
237 assertEquals(1, hits.totalHits);
238 final Highlighter highlighter = new Highlighter(
239 new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
240 new QueryScorer(phraseQuery));
// NOTE(review): the boolean flag presumably tells TokenSources the token
// order/positions situation for the positionless vector — confirm against
// the TokenSources.getTokenStream(TermPositionVector, boolean) javadoc.
241 final TokenStream tokenStream = TokenSources.getTokenStream(
242 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD), true);
// Both phrase terms must be wrapped by the default <B> formatter.
243 assertEquals("the fox <B>did</B> not <B>jump</B>", highlighter
244 .getBestFragment(tokenStream, TEXT));
246 indexSearcher.close();
// Span-query variant of testSparsePhrase: an ordered SpanNearQuery for
// "did jump" with slop 0 must not match the sparse stream (position hole
// where "not" was), and highlighting from the term vector must agree with
// highlighting a fresh TokenStreamSparse.
254 public void testSparseSpan() throws CorruptIndexException,
255 LockObtainFailedException, IOException, InvalidTokenOffsetsException {
256 final String TEXT = "the fox did not jump";
257 final Directory directory = newDirectory();
258 final IndexWriter indexWriter = new IndexWriter(directory,
259 newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random, MockTokenizer.WHITESPACE, false)));
261 final Document document = new Document();
262 document.add(new Field(FIELD, new TokenStreamSparse(),
263 TermVector.WITH_POSITIONS_OFFSETS));
264 indexWriter.addDocument(document);
// NOTE(review): writer close/commit lines elided from this view.
268 final IndexReader indexReader = IndexReader.open(directory, true);
270 assertEquals(1, indexReader.numDocs());
271 final IndexSearcher indexSearcher = newSearcher(indexReader);
273 final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
274 new SpanTermQuery(new Term(FIELD, "did")),
275 new SpanTermQuery(new Term(FIELD, "jump")) }, 0, true);
// The position gap makes the slop-0 span miss.
277 TopDocs hits = indexSearcher.search(phraseQuery, 1);
278 assertEquals(0, hits.totalHits);
279 final Highlighter highlighter = new Highlighter(
280 new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
281 new QueryScorer(phraseQuery));
// NOTE(review): trailing arguments of getTermFreqVector(...) and the
// surrounding assertEquals(...) head are elided in this view.
282 final TokenStream tokenStream = TokenSources
283 .getTokenStream((TermPositionVector) indexReader.getTermFreqVector(
286 highlighter.getBestFragment(new TokenStreamSparse(), TEXT),
287 highlighter.getBestFragment(tokenStream, TEXT));
289 indexSearcher.close();
// Hand-built TokenStream over "the fox did not jump" that never emits "not":
// "jump" carries positionIncrement 2, leaving a position hole exactly like a
// stop-word filter would.
297 private static final class TokenStreamSparse extends TokenStream {
// Pre-built tokens, (re)created in reset().
298 private Token[] tokens;
302 private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
303 private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
304 private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
306 public TokenStreamSparse() {
// Copies the current pre-built token's term text, offsets, and position
// increment into this stream's attributes.
// NOTE(review): the cursor field `i`, its increment, and the return
// statements are elided in this view.
311 public boolean incrementToken() throws IOException {
313 if (this.i >= this.tokens.length) {
317 termAttribute.setEmpty().append(this.tokens[i]);
318 offsetAttribute.setOffset(this.tokens[i].startOffset(), this.tokens[i]
320 positionIncrementAttribute.setPositionIncrement(this.tokens[i]
321 .getPositionIncrement());
// Rebuilds the token array so the stream is reusable.
326 public void reset() {
328 this.tokens = new Token[] {
329 new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
330 new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
331 new Token(new char[] { 'd', 'i', 'd' }, 0, 3, 8, 11),
// "jump" starts at offset 16, skipping the characters of the unemitted
// "not" (12..15); the increment of 2 below skips its position too.
332 new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 16, 20) };
333 this.tokens[3].setPositionIncrement(2);
// Hand-built TokenStream over "the fox jumped" that emits both "jump" and
// "jumped" with identical offsets, the latter at positionIncrement 0 — i.e.
// two tokens stacked at the same position, like synonym/stem expansion.
337 private static final class TokenStreamConcurrent extends TokenStream {
// Pre-built tokens, (re)created in reset().
338 private Token[] tokens;
342 private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
343 private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
344 private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
346 public TokenStreamConcurrent() {
// Copies the current pre-built token's term text, offsets, and position
// increment into this stream's attributes.
// NOTE(review): the cursor field `i`, its increment, and the return
// statements are elided in this view.
351 public boolean incrementToken() throws IOException {
353 if (this.i >= this.tokens.length) {
357 termAttribute.setEmpty().append(this.tokens[i]);
358 offsetAttribute.setOffset(this.tokens[i].startOffset(), this.tokens[i]
360 positionIncrementAttribute.setPositionIncrement(this.tokens[i]
361 .getPositionIncrement());
// Rebuilds the token array so the stream is reusable.
366 public void reset() {
368 this.tokens = new Token[] {
369 new Token(new char[] { 't', 'h', 'e' }, 0, 3, 0, 3),
370 new Token(new char[] { 'f', 'o', 'x' }, 0, 3, 4, 7),
// "jump" and "jumped" share offsets 8..14; the increment of 0 below
// stacks "jumped" on the same position as "jump".
371 new Token(new char[] { 'j', 'u', 'm', 'p' }, 0, 4, 8, 14),
372 new Token(new char[] { 'j', 'u', 'm', 'p', 'e', 'd' }, 0, 6, 8, 14) };
373 this.tokens[3].setPositionIncrement(0);