1 package org.apache.lucene.index;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.IOException;
21 import java.io.Reader;
22 import java.util.Arrays;
23 import java.util.Iterator;
25 import java.util.SortedSet;
27 import org.apache.lucene.analysis.Analyzer;
28 import org.apache.lucene.analysis.TokenStream;
29 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
30 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
31 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
32 import org.apache.lucene.document.Document;
33 import org.apache.lucene.document.Field;
34 import org.apache.lucene.store.Directory;
35 import org.apache.lucene.util.LuceneTestCase;
37 public class TestTermVectorsReader extends LuceneTestCase {
38 //Must be lexicographically sorted, will do in setup, versus trying to maintain here
// Fixture fields: parallel arrays describing four test fields and, per field,
// whether term-vector positions and offsets are stored (all combinations).
39 private String[] testFields = {"f1", "f2", "f3", "f4"};
40 private boolean[] testFieldsStorePos = {true, false, true, false};
41 private boolean[] testFieldsStoreOff = {true, false, false, true};
// Terms indexed into every field; sorted lexicographically in setUp().
42 private String[] testTerms = {"this", "is", "a", "test"};
// Expected positions/offsets per term, one row per term, filled in setUp().
43 private int[][] positions = new int[testTerms.length][];
44 private TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[testTerms.length][];
45 private Directory dir;
47 private FieldInfos fieldInfos = new FieldInfos();
// Every term occurs this many times in each field.
48 private static int TERM_FREQ = 3;
// Simple token record (term text, position, start/end offset) that orders
// itself by position so the replaying token stream can emit tokens in
// increasing-position order. Field declarations are elided in this view.
50 private class TestToken implements Comparable<TestToken> {
55 public int compareTo(TestToken other) {
// Ascending by position. NOTE(review): int subtraction can overflow for
// extreme values, but test positions here are small (< a few hundred).
56 return pos - other.pos;
// All tokens for one document: every test term repeated TERM_FREQ times.
60 TestToken[] tokens = new TestToken[testTerms.length * TERM_FREQ];
// Fixture setup: registers the field infos, generates the expected
// positions/offsets for every term (random but increasing), builds the
// TestToken array replayed by MyTokenStream, then writes documents whose
// term vectors those tokens produce, and records the segment name.
// NOTE(review): this view of the file is elided (closing braces and some
// statements are not shown); comments describe only what is visible.
63 public void setUp() throws Exception {
// Register each field with term vectors enabled plus its pos/off flags.
66 for (int i = 0; i < testFields.length; i++) {
67 fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
// Terms must be sorted because term vectors enumerate terms in order.
71 Arrays.sort(testTerms);
73 for (int i = 0; i < testTerms.length; i++) {
74 positions[i] = new int[TERM_FREQ];
75 offsets[i] = new TermVectorOffsetInfo[TERM_FREQ];
76 // first position must be 0
77 for (int j = 0; j < TERM_FREQ; j++) {
78 // positions are always sorted in increasing order
// Occurrence j lands somewhere in [j*10, j*10+10) — strictly increasing in j.
79 positions[i][j] = (int) (j * 10 + Math.random() * 10);
80 // offsets are always sorted in increasing order
81 offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
// Record the token so MyTokenStream replays exactly these attributes.
82 TestToken token = tokens[tokenUpto++] = new TestToken();
83 token.text = testTerms[i];
84 token.pos = positions[i][j];
85 token.startOffset = offsets[i][j].getStartOffset();
86 token.endOffset = offsets[i][j].getEndOffset();
// Writer uses MyAnalyzer so indexing replays the precomputed tokens;
// merge policy/buffering tuned so documents land predictably in one segment.
92 IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MyAnalyzer()).setMaxBufferedDocs(-1).setMergePolicy(newLogMergePolicy(false, 10)));
94 Document doc = new Document();
// Each field gets the TermVector mode matching its pos/off flags.
95 for(int i=0;i<testFields.length;i++) {
96 final Field.TermVector tv;
97 if (testFieldsStorePos[i] && testFieldsStoreOff[i])
98 tv = Field.TermVector.WITH_POSITIONS_OFFSETS;
99 else if (testFieldsStorePos[i] && !testFieldsStoreOff[i])
100 tv = Field.TermVector.WITH_POSITIONS;
101 else if (!testFieldsStorePos[i] && testFieldsStoreOff[i])
102 tv = Field.TermVector.WITH_OFFSETS;
104 tv = Field.TermVector.YES;
// Field value is empty: the analyzer supplies the tokens, not the text.
105 doc.add(new Field(testFields[i], "", Field.Store.NO, Field.Index.ANALYZED, tv));
108 //Create 5 documents for testing, they all have the same
111 writer.addDocument(doc);
113 seg = writer.newestSegment().name;
// Re-read FieldInfos from the segment file so tests see what was written.
116 fieldInfos = new FieldInfos(dir, IndexFileNames.segmentFileName(seg, IndexFileNames.FIELD_INFOS_EXTENSION));
// Fixture teardown; body not visible in this elided view (presumably closes
// dir and calls super.tearDown() — TODO confirm against the full file).
120 public void tearDown() throws Exception {
// TokenStream that replays the precomputed TestToken array, restoring the
// exact term text, offsets and positions that setUp() generated.
125 private class MyTokenStream extends TokenStream {
128 CharTermAttribute termAtt;
129 PositionIncrementAttribute posIncrAtt;
130 OffsetAttribute offsetAtt;
132 public MyTokenStream() {
133 termAtt = addAttribute(CharTermAttribute.class);
134 posIncrAtt = addAttribute(PositionIncrementAttribute.class);
135 offsetAtt = addAttribute(OffsetAttribute.class);
// Emits the next precomputed token, or signals exhaustion when all
// tokens have been replayed.
139 public boolean incrementToken() {
140 if (tokenUpto >= tokens.length)
143 final TestToken testToken = tokens[tokenUpto++];
145 termAtt.append(testToken.text);
146 offsetAtt.setOffset(testToken.startOffset, testToken.endOffset);
// Increment is the delta from the previous token's position; for the
// first token it is pos+1 (absolute positions are 0-based, increments
// 1-based). NOTE(review): the if/else wrapping these two statements is
// not visible in this elided view — confirm against the full file.
148 posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
150 posIncrAtt.setPositionIncrement(testToken.pos+1);
// Analyzer that ignores the field text entirely and hands back a fresh
// MyTokenStream, so indexing reproduces the precomputed token sequence.
157 private class MyAnalyzer extends Analyzer {
159 public TokenStream tokenStream(String fieldName, Reader reader) {
160 return new MyTokenStream();
// Sanity check: setUp() must have written the term-vector documents and
// index files for the segment.
164 public void test() throws IOException {
165 //Check to see the files were created properly in setup
166 assertTrue(dir.fileExists(IndexFileNames.segmentFileName(seg, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION)));
167 assertTrue(dir.fileExists(IndexFileNames.segmentFileName(seg, IndexFileNames.VECTORS_INDEX_EXTENSION)));
// Reads field f1's term vector for the first 5 documents and verifies the
// terms come back exactly as the (sorted) testTerms array.
170 public void testReader() throws IOException {
171 TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
172 for (int j = 0; j < 5; j++) {
173 TermFreqVector vector = reader.get(j, testFields[0]);
174 assertTrue(vector != null);
175 String[] terms = vector.getTerms();
176 assertTrue(terms != null);
177 assertTrue(terms.length == testTerms.length);
178 for (int i = 0; i < terms.length; i++) {
179 String term = terms[i];
180 //System.out.println("Term: " + term);
181 assertTrue(term.equals(testTerms[i]));
// Field f1 (positions + offsets stored) must come back as a
// TermPositionVector whose positions/offsets match the fixture exactly;
// field f2 (neither stored) must NOT be a TermPositionVector.
187 public void testPositionReader() throws IOException {
188 TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
189 TermPositionVector vector;
191 vector = (TermPositionVector) reader.get(0, testFields[0]);
192 assertTrue(vector != null);
193 terms = vector.getTerms();
194 assertTrue(terms != null);
195 assertTrue(terms.length == testTerms.length);
196 for (int i = 0; i < terms.length; i++) {
197 String term = terms[i];
198 //System.out.println("Term: " + term);
199 assertTrue(term.equals(testTerms[i]));
// Positions must round-trip exactly as generated in setUp().
200 int[] positions = vector.getTermPositions(i);
201 assertTrue(positions != null);
202 assertTrue(positions.length == this.positions[i].length);
203 for (int j = 0; j < positions.length; j++) {
204 int position = positions[j];
205 assertTrue(position == this.positions[i][j]);
// Offsets must round-trip as well (compared via TermVectorOffsetInfo.equals).
207 TermVectorOffsetInfo[] offset = vector.getOffsets(i);
208 assertTrue(offset != null);
209 assertTrue(offset.length == this.offsets[i].length);
210 for (int j = 0; j < offset.length; j++) {
211 TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
212 assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
// f2 stores neither positions nor offsets: plain TermFreqVector only.
216 TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset
217 assertTrue(freqVector != null);
218 assertTrue(freqVector instanceof TermPositionVector == false);
219 terms = freqVector.getTerms();
220 assertTrue(terms != null);
221 assertTrue(terms.length == testTerms.length);
222 for (int i = 0; i < terms.length; i++) {
223 String term = terms[i];
224 //System.out.println("Term: " + term);
225 assertTrue(term.equals(testTerms[i]));
// Same round-trip check as testPositionReader for field f1: terms,
// positions and offsets read back must equal the fixture values.
230 public void testOffsetReader() throws IOException {
231 TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
232 TermPositionVector vector = (TermPositionVector) reader.get(0, testFields[0]);
233 assertTrue(vector != null);
234 String[] terms = vector.getTerms();
235 assertTrue(terms != null);
236 assertTrue(terms.length == testTerms.length);
237 for (int i = 0; i < terms.length; i++) {
238 String term = terms[i];
239 //System.out.println("Term: " + term);
240 assertTrue(term.equals(testTerms[i]));
// Positions must match the generated fixture exactly.
241 int[] positions = vector.getTermPositions(i);
242 assertTrue(positions != null);
243 assertTrue(positions.length == this.positions[i].length);
244 for (int j = 0; j < positions.length; j++) {
245 int position = positions[j];
246 assertTrue(position == this.positions[i][j]);
// Offsets must match as well.
248 TermVectorOffsetInfo[] offset = vector.getOffsets(i);
249 assertTrue(offset != null);
250 assertTrue(offset.length == this.offsets[i].length);
251 for (int j = 0; j < offset.length; j++) {
252 TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
253 assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
// Exercises the TermVectorMapper API in four stages: (1) a
// SortedTermVectorMapper aggregating all fields of doc 0, (2) the same for
// doc 1, (3) a FieldSortedTermVectorMapper grouping entries per field, and
// (4) the same mapper configured to ignore positions/offsets. Finally
// verifies setDocumentNumber() is invoked via DocNumAwareMapper.
259 public void testMapper() throws IOException {
260 TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
261 SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
262 reader.get(0, mapper);
263 SortedSet<TermVectorEntry> set = mapper.getTermVectorEntrySet();
264 assertTrue("set is null and it shouldn't be", set != null);
265 //three fields, 4 terms, all terms are the same
266 assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
267 //Check offsets and positions
268 for (Iterator<TermVectorEntry> iterator = set.iterator(); iterator.hasNext();) {
269 TermVectorEntry tve = iterator.next();
270 assertTrue("tve is null and it shouldn't be", tve != null);
// Aggregated across fields, so every entry carries offsets and positions.
271 assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
272 assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
// Stage 2: repeat for document 1 with a fresh mapper.
276 mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
277 reader.get(1, mapper);
278 set = mapper.getTermVectorEntrySet();
279 assertTrue("set is null and it shouldn't be", set != null);
280 //three fields, 4 terms, all terms are the same
281 assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
282 //Should have offsets and positions b/c we are munging all the fields together
283 for (Iterator<TermVectorEntry> iterator = set.iterator(); iterator.hasNext();) {
284 TermVectorEntry tve = iterator.next();
285 assertTrue("tve is null and it shouldn't be", tve != null);
286 assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
287 assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
// Stage 3: field-sorted mapper — one sorted set of 4 entries per field,
// with offsets/positions present only where the field stored them.
292 FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
293 reader.get(0, fsMapper);
294 Map<String,SortedSet<TermVectorEntry>> map = fsMapper.getFieldToTerms();
295 assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
296 for (Map.Entry<String,SortedSet<TermVectorEntry>> entry : map.entrySet()) {
297 SortedSet<TermVectorEntry> sortedSet = entry.getValue();
298 assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
299 for (final TermVectorEntry tve : sortedSet) {
300 assertTrue("tve is null and it shouldn't be", tve != null);
301 //Check offsets and positions.
302 assertTrue("tve is null and it shouldn't be", tve != null);
303 String field = tve.getField();
304 if (field.equals(testFields[0])) {
305 //should have offsets
307 assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
308 assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
310 else if (field.equals(testFields[1])) {
311 //should not have offsets
313 assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
314 assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
318 //Try mapper that ignores offs and positions
// Stage 4: ignoringPositions=true, ignoringOffsets=true — every entry,
// including f1's, must now come back with null offsets and positions.
319 fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
320 reader.get(0, fsMapper);
321 map = fsMapper.getFieldToTerms();
322 assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
323 for (final Map.Entry<String,SortedSet<TermVectorEntry>> entry : map.entrySet()) {
324 SortedSet<TermVectorEntry> sortedSet = entry.getValue();
325 assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
326 for (final TermVectorEntry tve : sortedSet) {
327 assertTrue("tve is null and it shouldn't be", tve != null);
328 //Check offsets and positions.
329 assertTrue("tve is null and it shouldn't be", tve != null);
330 String field = tve.getField();
331 if (field.equals(testFields[0])) {
332 //should have offsets
// NOTE(review): the assertion messages below say "is null and it
// shouldn't be" but the checks correctly assert == null (the mapper is
// ignoring offsets/positions) — the message text is a copy-paste
// artifact, not a logic error.
334 assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() == null);
335 assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() == null);
337 else if (field.equals(testFields[1])) {
338 //should not have offsets
340 assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
341 assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
346 // test setDocumentNumber()
// Each getTermFreqVector call must push the requested doc number into the
// mapper before mapping; the mapper is reset to -1 between calls.
347 IndexReader ir = IndexReader.open(dir, true);
348 DocNumAwareMapper docNumAwareMapper = new DocNumAwareMapper();
349 assertEquals(-1, docNumAwareMapper.getDocumentNumber());
351 ir.getTermFreqVector(0, docNumAwareMapper);
352 assertEquals(0, docNumAwareMapper.getDocumentNumber());
353 docNumAwareMapper.setDocumentNumber(-1);
355 ir.getTermFreqVector(1, docNumAwareMapper);
356 assertEquals(1, docNumAwareMapper.getDocumentNumber());
357 docNumAwareMapper.setDocumentNumber(-1);
359 ir.getTermFreqVector(0, "f1", docNumAwareMapper);
360 assertEquals(0, docNumAwareMapper.getDocumentNumber());
361 docNumAwareMapper.setDocumentNumber(-1);
363 ir.getTermFreqVector(1, "f2", docNumAwareMapper);
364 assertEquals(1, docNumAwareMapper.getDocumentNumber());
365 docNumAwareMapper.setDocumentNumber(-1);
367 ir.getTermFreqVector(0, "f1", docNumAwareMapper);
368 assertEquals(0, docNumAwareMapper.getDocumentNumber());
/**
376 * Make sure exceptions and bad params are handled appropriately
 */
// Out-of-range document numbers must raise IOException; an unknown field
// name must yield a null vector rather than throwing.
// NOTE(review): the `try {` openers and `fail(...)` lines of each
// try/catch are elided in this view; comments reflect the visible shape.
378 public void testBadParams() throws IOException {
379 TermVectorsReader reader = null;
381 reader = new TermVectorsReader(dir, seg, fieldInfos);
382 //Bad document number, good field number
383 reader.get(50, testFields[0]);
385 } catch (IOException e) {
386 // expected exception
391 reader = new TermVectorsReader(dir, seg, fieldInfos);
392 //Bad document number, no field
395 } catch (IOException e) {
396 // expected exception
401 reader = new TermVectorsReader(dir, seg, fieldInfos);
402 //good document number, bad field number
// Unknown field: get() returns null instead of throwing.
403 TermFreqVector vector = reader.get(0, "f50");
404 assertTrue(vector == null);
406 } catch (IOException e) {
// TermVectorMapper used to verify that the reader calls setDocumentNumber()
// before delivering expectations or term mappings: both callbacks throw if
// the document number is still the -1 sentinel.
414 public static class DocNumAwareMapper extends TermVectorMapper {
416 public DocNumAwareMapper() {
// -1 = "not yet set by the reader".
419 private int documentNumber = -1;
// Invoked once per field; must happen after setDocumentNumber().
422 public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
423 if (documentNumber == -1) {
424 throw new RuntimeException("Documentnumber should be set at this point!");
// Invoked once per term; must happen after setDocumentNumber().
429 public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
430 if (documentNumber == -1) {
431 throw new RuntimeException("Documentnumber should be set at this point!");
435 public int getDocumentNumber() {
436 return documentNumber;
440 public void setDocumentNumber(int documentNumber) {
441 this.documentNumber = documentNumber;