pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / highlighter / src / test / org / apache / lucene / search / highlight / TokenSourcesTest.java
1 package org.apache.lucene.search.highlight;
2
3 /**
4  * Licensed to the Apache Software Foundation (ASF) under one or more
5  * contributor license agreements.  See the NOTICE file distributed with
6  * this work for additional information regarding copyright ownership.
7  * The ASF licenses this file to You under the Apache License, Version 2.0
8  * (the "License"); you may not use this file except in compliance with
9  * the License.  You may obtain a copy of the License at
10  *
11  *     http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19
20 import java.io.IOException;
21 import java.io.Reader;
22
23 import org.apache.lucene.analysis.Analyzer;
24 import org.apache.lucene.analysis.Token;
25 import org.apache.lucene.analysis.TokenStream;
26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
27 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
28 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
29 import org.apache.lucene.document.Document;
30 import org.apache.lucene.document.Field;
31 import org.apache.lucene.document.Field.TermVector;
32 import org.apache.lucene.index.CorruptIndexException;
33 import org.apache.lucene.index.IndexReader;
34 import org.apache.lucene.index.IndexWriter;
35 import org.apache.lucene.index.Term;
36 import org.apache.lucene.index.TermPositionVector;
37 import org.apache.lucene.search.DisjunctionMaxQuery;
38 import org.apache.lucene.search.IndexSearcher;
39 import org.apache.lucene.search.Query;
40 import org.apache.lucene.search.TopDocs;
41 import org.apache.lucene.search.spans.SpanNearQuery;
42 import org.apache.lucene.search.spans.SpanQuery;
43 import org.apache.lucene.search.spans.SpanTermQuery;
44 import org.apache.lucene.store.Directory;
45 import org.apache.lucene.store.LockObtainFailedException;
46 import org.apache.lucene.util.LuceneTestCase;
47
48 // LUCENE-2874
49 public class TokenSourcesTest extends LuceneTestCase {
50   private static final String FIELD = "text";
51
52   private static final class OverlapAnalyzer extends Analyzer {
53
54     @Override
55     public TokenStream tokenStream(String fieldName, Reader reader) {
56       return new TokenStreamOverlap();
57     }
58   }
59
60   private static final class TokenStreamOverlap extends TokenStream {
61     private Token[] tokens;
62
63     private int i = -1;
64
65     private final CharTermAttribute termAttribute = addAttribute(CharTermAttribute.class);
66     private final OffsetAttribute offsetAttribute = addAttribute(OffsetAttribute.class);
67     private final PositionIncrementAttribute positionIncrementAttribute = addAttribute(PositionIncrementAttribute.class);
68
69     public TokenStreamOverlap() {
70       reset();
71     }
72
73     @Override
74     public boolean incrementToken() throws IOException {
75       this.i++;
76       if (this.i >= this.tokens.length) {
77         return false;
78       }
79       clearAttributes();
80       termAttribute.setEmpty().append(this.tokens[i]);
81       offsetAttribute.setOffset(this.tokens[i].startOffset(),
82           this.tokens[i].endOffset());
83       positionIncrementAttribute.setPositionIncrement(this.tokens[i]
84           .getPositionIncrement());
85       return true;
86     }
87
88     @Override
89     public void reset() {
90       this.i = -1;
91       this.tokens = new Token[] {
92           new Token(new char[] {'t', 'h', 'e'}, 0, 3, 0, 3),
93           new Token(new char[] {'{', 'f', 'o', 'x', '}'}, 0, 5, 0, 7),
94           new Token(new char[] {'f', 'o', 'x'}, 0, 3, 4, 7),
95           new Token(new char[] {'d', 'i', 'd'}, 0, 3, 8, 11),
96           new Token(new char[] {'n', 'o', 't'}, 0, 3, 12, 15),
97           new Token(new char[] {'j', 'u', 'm', 'p'}, 0, 4, 16, 20)};
98       this.tokens[1].setPositionIncrement(0);
99     }
100   }
101
102   public void testOverlapWithOffset() throws CorruptIndexException,
103       LockObtainFailedException, IOException, InvalidTokenOffsetsException {
104     final String TEXT = "the fox did not jump";
105     final Directory directory = newDirectory();
106     final IndexWriter indexWriter = new IndexWriter(directory,
107         newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
108     try {
109       final Document document = new Document();
110       document.add(new Field(FIELD, new TokenStreamOverlap(),
111           TermVector.WITH_OFFSETS));
112       indexWriter.addDocument(document);
113     } finally {
114       indexWriter.close();
115     }
116     final IndexReader indexReader = IndexReader.open(directory, true);
117     try {
118       assertEquals(1, indexReader.numDocs());
119       final IndexSearcher indexSearcher = newSearcher(indexReader);
120       try {
121         final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
122         query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
123         query.add(new SpanTermQuery(new Term(FIELD, "fox")));
124         // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
125         // new SpanTermQuery(new Term(FIELD, "{fox}")),
126         // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
127
128         TopDocs hits = indexSearcher.search(query, 1);
129         assertEquals(1, hits.totalHits);
130         final Highlighter highlighter = new Highlighter(
131             new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
132             new QueryScorer(query));
133         final TokenStream tokenStream = TokenSources
134             .getTokenStream(
135                 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
136                 false);
137         assertEquals("<B>the fox</B> did not jump",
138             highlighter.getBestFragment(tokenStream, TEXT));
139       } finally {
140         indexSearcher.close();
141       }
142     } finally {
143       indexReader.close();
144       directory.close();
145     }
146   }
147
148   public void testOverlapWithPositionsAndOffset() throws CorruptIndexException,
149       LockObtainFailedException, IOException, InvalidTokenOffsetsException {
150     final String TEXT = "the fox did not jump";
151     final Directory directory = newDirectory();
152     final IndexWriter indexWriter = new IndexWriter(directory,
153         newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
154     try {
155       final Document document = new Document();
156       document.add(new Field(FIELD, new TokenStreamOverlap(),
157           TermVector.WITH_POSITIONS_OFFSETS));
158       indexWriter.addDocument(document);
159     } finally {
160       indexWriter.close();
161     }
162     final IndexReader indexReader = IndexReader.open(directory, true);
163     try {
164       assertEquals(1, indexReader.numDocs());
165       final IndexSearcher indexSearcher = newSearcher(indexReader);
166       try {
167         final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
168         query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
169         query.add(new SpanTermQuery(new Term(FIELD, "fox")));
170         // final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
171         // new SpanTermQuery(new Term(FIELD, "{fox}")),
172         // new SpanTermQuery(new Term(FIELD, "fox")) }, 0, true);
173
174         TopDocs hits = indexSearcher.search(query, 1);
175         assertEquals(1, hits.totalHits);
176         final Highlighter highlighter = new Highlighter(
177             new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
178             new QueryScorer(query));
179         final TokenStream tokenStream = TokenSources
180             .getTokenStream(
181                 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
182                 false);
183         assertEquals("<B>the fox</B> did not jump",
184             highlighter.getBestFragment(tokenStream, TEXT));
185       } finally {
186         indexSearcher.close();
187       }
188     } finally {
189       indexReader.close();
190       directory.close();
191     }
192   }
193
194   public void testOverlapWithOffsetExactPhrase() throws CorruptIndexException,
195       LockObtainFailedException, IOException, InvalidTokenOffsetsException {
196     final String TEXT = "the fox did not jump";
197     final Directory directory = newDirectory();
198     final IndexWriter indexWriter = new IndexWriter(directory,
199         newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
200     try {
201       final Document document = new Document();
202       document.add(new Field(FIELD, new TokenStreamOverlap(),
203           TermVector.WITH_OFFSETS));
204       indexWriter.addDocument(document);
205     } finally {
206       indexWriter.close();
207     }
208     final IndexReader indexReader = IndexReader.open(directory, true);
209     try {
210       assertEquals(1, indexReader.numDocs());
211       final IndexSearcher indexSearcher = newSearcher(indexReader);
212       try {
213         // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
214         // query.add(new SpanTermQuery(new Term(FIELD, "{fox}")));
215         // query.add(new SpanTermQuery(new Term(FIELD, "fox")));
216         final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
217             new SpanTermQuery(new Term(FIELD, "the")),
218             new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
219
220         TopDocs hits = indexSearcher.search(phraseQuery, 1);
221         assertEquals(1, hits.totalHits);
222         final Highlighter highlighter = new Highlighter(
223             new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
224             new QueryScorer(phraseQuery));
225         final TokenStream tokenStream = TokenSources
226             .getTokenStream(
227                 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
228                 false);
229         assertEquals("<B>the fox</B> did not jump",
230             highlighter.getBestFragment(tokenStream, TEXT));
231       } finally {
232         indexSearcher.close();
233       }
234     } finally {
235       indexReader.close();
236       directory.close();
237     }
238   }
239
240   public void testOverlapWithPositionsAndOffsetExactPhrase()
241       throws CorruptIndexException, LockObtainFailedException, IOException,
242       InvalidTokenOffsetsException {
243     final String TEXT = "the fox did not jump";
244     final Directory directory = newDirectory();
245     final IndexWriter indexWriter = new IndexWriter(directory,
246         newIndexWriterConfig(TEST_VERSION_CURRENT, new OverlapAnalyzer()));
247     try {
248       final Document document = new Document();
249       document.add(new Field(FIELD, new TokenStreamOverlap(),
250           TermVector.WITH_POSITIONS_OFFSETS));
251       indexWriter.addDocument(document);
252     } finally {
253       indexWriter.close();
254     }
255     final IndexReader indexReader = IndexReader.open(directory, true);
256     try {
257       assertEquals(1, indexReader.numDocs());
258       final IndexSearcher indexSearcher = newSearcher(indexReader);
259       try {
260         // final DisjunctionMaxQuery query = new DisjunctionMaxQuery(1);
261         // query.add(new SpanTermQuery(new Term(FIELD, "the")));
262         // query.add(new SpanTermQuery(new Term(FIELD, "fox")));
263         final Query phraseQuery = new SpanNearQuery(new SpanQuery[] {
264             new SpanTermQuery(new Term(FIELD, "the")),
265             new SpanTermQuery(new Term(FIELD, "fox"))}, 0, true);
266
267         TopDocs hits = indexSearcher.search(phraseQuery, 1);
268         assertEquals(1, hits.totalHits);
269         final Highlighter highlighter = new Highlighter(
270             new SimpleHTMLFormatter(), new SimpleHTMLEncoder(),
271             new QueryScorer(phraseQuery));
272         final TokenStream tokenStream = TokenSources
273             .getTokenStream(
274                 (TermPositionVector) indexReader.getTermFreqVector(0, FIELD),
275                 false);
276         assertEquals("<B>the fox</B> did not jump",
277             highlighter.getBestFragment(tokenStream, TEXT));
278       } finally {
279         indexSearcher.close();
280       }
281     } finally {
282       indexReader.close();
283       directory.close();
284     }
285   }
286
287 }