1 package org.apache.lucene.index;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.Reader;
22 import java.util.Arrays;
23 import java.util.Iterator;
25 import java.util.SortedSet;
27 import org.apache.lucene.analysis.*;
28 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
29 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
30 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
31 import org.apache.lucene.document.Document;
32 import org.apache.lucene.document.Field;
33 import org.apache.lucene.store.Directory;
34 import org.apache.lucene.util.LuceneTestCase;
36 public class TestTermVectorsReader extends LuceneTestCase {
// Test fixture state. NOTE(review): this is an elided view of the original
// file -- several members referenced below (e.g. seg, tokenUpto) are declared
// in lines not visible here; confirm against the full source.
37 //Must be lexicographically sorted, will do in setup, versus trying to maintain here
38 private String[] testFields = {"f1", "f2", "f3", "f4"};
// Parallel arrays: whether each test field stores positions / offsets in its
// term vectors (drives the Field.TermVector mode chosen in setUp()).
39 private boolean[] testFieldsStorePos = {true, false, true, false};
40 private boolean[] testFieldsStoreOff = {true, false, false, true};
// The terms indexed into every field; sorted lexicographically in setUp().
41 private String[] testTerms = {"this", "is", "a", "test"};
// Expected per-term positions and offsets, filled in setUp() and asserted
// against what TermVectorsReader returns in the tests below.
42 private int[][] positions = new int[testTerms.length][];
43 private TermVectorOffsetInfo[][] offsets = new TermVectorOffsetInfo[testTerms.length][];
44 private Directory dir;
46 private FieldInfos fieldInfos = new FieldInfos();
// Every test term occurs TERM_FREQ times in each field.
47 private static int TERM_FREQ = 3;
// Token record consumed by MyTokenStream; Comparable by position so tokens
// can be ordered by increasing position.
// NOTE(review): the fields (text, pos, startOffset, endOffset) and the class's
// closing brace are in elided lines -- confirm against the full file.
49 private class TestToken implements Comparable<TestToken> {
54 public int compareTo(TestToken other) {
// int-subtraction comparator; acceptable here because this test only uses
// small non-negative positions (no overflow), though Integer.compare would
// be safer in general.
55 return pos - other.pos;
// One entry per (term, occurrence): testTerms.length * TERM_FREQ tokens,
// populated in setUp() and replayed by MyTokenStream at indexing time.
59 TestToken[] tokens = new TestToken[testTerms.length * TERM_FREQ];
// Builds the fixture: registers FieldInfos, sorts the terms, generates the
// expected positions/offsets, indexes documents through MyAnalyzer, and
// finally reloads FieldInfos from the segment that was written.
// NOTE(review): several lines (super.setUp(), dir creation, writer.close(),
// some braces) are elided in this view of the file.
62 public void setUp() throws Exception {
65 for (int i = 0; i < testFields.length; i++) {
66 fieldInfos.add(testFields[i], true, true, testFieldsStorePos[i], testFieldsStoreOff[i]);
// Terms must be lexicographically sorted so they match the order in which
// the term vectors reader returns them.
70 Arrays.sort(testTerms);
72 for (int i = 0; i < testTerms.length; i++) {
73 positions[i] = new int[TERM_FREQ];
74 offsets[i] = new TermVectorOffsetInfo[TERM_FREQ];
75 // first position must be 0
76 for (int j = 0; j < TERM_FREQ; j++) {
77 // positions are always sorted in increasing order
// Random-but-increasing: occurrence j gets a position in [j*10, j*10+10).
78 positions[i][j] = (int) (j * 10 + Math.random() * 10);
79 // offsets are always sorted in increasing order
80 offsets[i][j] = new TermVectorOffsetInfo(j * 10, j * 10 + testTerms[i].length());
// Record the token so MyTokenStream can replay exactly these
// positions/offsets at indexing time.
81 TestToken token = tokens[tokenUpto++] = new TestToken();
82 token.text = testTerms[i];
83 token.pos = positions[i][j];
84 token.startOffset = offsets[i][j].getStartOffset();
85 token.endOffset = offsets[i][j].getEndOffset();
// maxBufferedDocs=-1 disables flushing by doc count; the log merge policy is
// created with useCompoundFile=false so the segment's individual files stay
// directly inspectable by the tests.
91 IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MyAnalyzer()).setMaxBufferedDocs(-1).setMergePolicy(newLogMergePolicy(false, 10)));
93 Document doc = new Document();
94 for(int i=0;i<testFields.length;i++) {
// Pick the term-vector mode matching this field's configured
// store-positions / store-offsets flags.
95 final Field.TermVector tv;
96 if (testFieldsStorePos[i] && testFieldsStoreOff[i])
97 tv = Field.TermVector.WITH_POSITIONS_OFFSETS;
98 else if (testFieldsStorePos[i] && !testFieldsStoreOff[i])
99 tv = Field.TermVector.WITH_POSITIONS;
100 else if (!testFieldsStorePos[i] && testFieldsStoreOff[i])
101 tv = Field.TermVector.WITH_OFFSETS;
103 tv = Field.TermVector.YES;
// The field text is empty on purpose: MyAnalyzer ignores the reader and
// emits the pre-built fixture tokens instead.
104 doc.add(new Field(testFields[i], "", Field.Store.NO, Field.Index.ANALYZED, tv));
107 //Create 5 documents for testing, they all have the same
110 writer.addDocument(doc);
// Remember the newest segment's name so the tests can open its files.
112 seg = writer.newestSegment().name;
// Re-read FieldInfos from what was actually written to disk.
115 fieldInfos = new FieldInfos(dir, IndexFileNames.segmentFileName(seg, IndexFileNames.FIELD_INFOS_EXTENSION));
// Releases the fixture resources; the body (dir.close(), super.tearDown())
// is in lines elided from this view of the file.
119 public void tearDown() throws Exception {
// TokenStream that replays the pre-built tokens[] fixture, reproducing the
// exact terms, positions, and offsets recorded in setUp().
124 private class MyTokenStream extends TokenStream {
// Index of the next fixture token to emit.
125 private int tokenUpto;
127 private final CharTermAttribute termAtt;
128 private final PositionIncrementAttribute posIncrAtt;
129 private final OffsetAttribute offsetAtt;
131 public MyTokenStream() {
132 termAtt = addAttribute(CharTermAttribute.class);
133 posIncrAtt = addAttribute(PositionIncrementAttribute.class);
134 offsetAtt = addAttribute(OffsetAttribute.class);
// Emits the next fixture token, converting its absolute position into the
// position increment the indexer expects.
138 public boolean incrementToken() {
139 if (tokenUpto >= tokens.length)
142 final TestToken testToken = tokens[tokenUpto++];
144 termAtt.append(testToken.text);
145 offsetAtt.setOffset(testToken.startOffset, testToken.endOffset);
// Delta-encode against the previous token's absolute position
// (tokens[tokenUpto-2] is the token emitted just before this one).
// NOTE(review): the branch condition selecting between these two increment
// computations is in an elided line; presumably the pos+1 form applies to
// the first token of the stream (0-based positions, 1-based increments).
147 posIncrAtt.setPositionIncrement(testToken.pos - tokens[tokenUpto-2].pos);
149 posIncrAtt.setPositionIncrement(testToken.pos+1);
// Resets replay to the first fixture token (body partially elided).
156 public void reset() throws IOException {
// Analyzer that ignores the field name and reader entirely and instead
// replays the fixture tokens via MyTokenStream.
162 private class MyAnalyzer extends Analyzer {
164 public TokenStream tokenStream(String fieldName, Reader reader) {
165 return new MyTokenStream();
// Sanity check: setUp() must have written the term-vector document and index
// files for the segment.
169 public void test() throws IOException {
170 //Check to see the files were created properly in setup
171 assertTrue(dir.fileExists(IndexFileNames.segmentFileName(seg, IndexFileNames.VECTORS_DOCUMENTS_EXTENSION)));
172 assertTrue(dir.fileExists(IndexFileNames.segmentFileName(seg, IndexFileNames.VECTORS_INDEX_EXTENSION)));
// Verifies that each document's term vector for testFields[0] contains
// exactly the expected terms, in sorted order.
175 public void testReader() throws IOException {
176 TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
177 for (int j = 0; j < 5; j++) {
178 TermFreqVector vector = reader.get(j, testFields[0]);
179 assertTrue(vector != null);
180 String[] terms = vector.getTerms();
181 assertTrue(terms != null);
182 assertTrue(terms.length == testTerms.length);
183 for (int i = 0; i < terms.length; i++) {
184 String term = terms[i];
185 //System.out.println("Term: " + term);
// Terms must come back in the same lexicographic order as testTerms.
186 assertTrue(term.equals(testTerms[i]));
// Verifies positions and offsets for a field that stores both (testFields[0]),
// then verifies that a field storing neither (testFields[1]) comes back as a
// plain TermFreqVector rather than a TermPositionVector.
192 public void testPositionReader() throws IOException {
193 TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
194 TermPositionVector vector;
// testFields[0] stores positions and offsets, so the cast must succeed.
196 vector = (TermPositionVector) reader.get(0, testFields[0]);
197 assertTrue(vector != null);
198 terms = vector.getTerms();
199 assertTrue(terms != null);
200 assertTrue(terms.length == testTerms.length);
201 for (int i = 0; i < terms.length; i++) {
202 String term = terms[i];
203 //System.out.println("Term: " + term);
204 assertTrue(term.equals(testTerms[i]));
// Positions must exactly match what setUp() generated for this term.
205 int[] positions = vector.getTermPositions(i);
206 assertTrue(positions != null);
207 assertTrue(positions.length == this.positions[i].length);
208 for (int j = 0; j < positions.length; j++) {
209 int position = positions[j];
210 assertTrue(position == this.positions[i][j]);
// Offsets must also round-trip unchanged.
212 TermVectorOffsetInfo[] offset = vector.getOffsets(i);
213 assertTrue(offset != null);
214 assertTrue(offset.length == this.offsets[i].length);
215 for (int j = 0; j < offset.length; j++) {
216 TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
217 assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
// testFields[1] stores neither positions nor offsets: the reader must hand
// back a frequency-only vector, not a TermPositionVector.
221 TermFreqVector freqVector = reader.get(0, testFields[1]); //no pos, no offset
222 assertTrue(freqVector != null);
223 assertTrue(freqVector instanceof TermPositionVector == false);
224 terms = freqVector.getTerms();
225 assertTrue(terms != null);
226 assertTrue(terms.length == testTerms.length);
227 for (int i = 0; i < terms.length; i++) {
228 String term = terms[i];
229 //System.out.println("Term: " + term);
230 assertTrue(term.equals(testTerms[i]));
// Verifies that both positions and offsets read back from testFields[0]
// (which stores both) match the values generated in setUp().
235 public void testOffsetReader() throws IOException {
236 TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
237 TermPositionVector vector = (TermPositionVector) reader.get(0, testFields[0]);
238 assertTrue(vector != null);
239 String[] terms = vector.getTerms();
240 assertTrue(terms != null);
241 assertTrue(terms.length == testTerms.length);
242 for (int i = 0; i < terms.length; i++) {
243 String term = terms[i];
244 //System.out.println("Term: " + term);
245 assertTrue(term.equals(testTerms[i]));
// Positions round-trip exactly.
246 int[] positions = vector.getTermPositions(i);
247 assertTrue(positions != null);
248 assertTrue(positions.length == this.positions[i].length);
249 for (int j = 0; j < positions.length; j++) {
250 int position = positions[j];
251 assertTrue(position == this.positions[i][j]);
// Offsets round-trip exactly as well.
253 TermVectorOffsetInfo[] offset = vector.getOffsets(i);
254 assertTrue(offset != null);
255 assertTrue(offset.length == this.offsets[i].length);
256 for (int j = 0; j < offset.length; j++) {
257 TermVectorOffsetInfo termVectorOffsetInfo = offset[j];
258 assertTrue(termVectorOffsetInfo.equals(offsets[i][j]));
// Exercises the TermVectorMapper API: a SortedTermVectorMapper that merges
// all fields, a FieldSortedTermVectorMapper keyed by field (with and without
// ignoring positions/offsets), and a mapper that tracks the document number.
264 public void testMapper() throws IOException {
265 TermVectorsReader reader = new TermVectorsReader(dir, seg, fieldInfos);
266 SortedTermVectorMapper mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
267 reader.get(0, mapper);
268 SortedSet<TermVectorEntry> set = mapper.getTermVectorEntrySet();
269 assertTrue("set is null and it shouldn't be", set != null);
270 //three fields, 4 terms, all terms are the same
// Terms are identical across fields, so the merged set collapses to one
// entry per distinct term.
271 assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
272 //Check offsets and positions
273 for (Iterator<TermVectorEntry> iterator = set.iterator(); iterator.hasNext();) {
274 TermVectorEntry tve = iterator.next();
275 assertTrue("tve is null and it shouldn't be", tve != null);
276 assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
277 assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
// Same check against a different document.
281 mapper = new SortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
282 reader.get(1, mapper);
283 set = mapper.getTermVectorEntrySet();
284 assertTrue("set is null and it shouldn't be", set != null);
285 //three fields, 4 terms, all terms are the same
286 assertTrue("set Size: " + set.size() + " is not: " + 4, set.size() == 4);
287 //Should have offsets and positions b/c we are munging all the fields together
288 for (Iterator<TermVectorEntry> iterator = set.iterator(); iterator.hasNext();) {
289 TermVectorEntry tve = iterator.next();
290 assertTrue("tve is null and it shouldn't be", tve != null);
291 assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
292 assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
// Field-keyed mapper: entries are grouped per field, so per-field
// position/offset storage settings become observable again.
297 FieldSortedTermVectorMapper fsMapper = new FieldSortedTermVectorMapper(new TermVectorEntryFreqSortedComparator());
298 reader.get(0, fsMapper);
299 Map<String,SortedSet<TermVectorEntry>> map = fsMapper.getFieldToTerms();
300 assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
301 for (Map.Entry<String,SortedSet<TermVectorEntry>> entry : map.entrySet()) {
302 SortedSet<TermVectorEntry> sortedSet = entry.getValue();
303 assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
304 for (final TermVectorEntry tve : sortedSet) {
305 assertTrue("tve is null and it shouldn't be", tve != null);
306 //Check offsets and positions.
307 assertTrue("tve is null and it shouldn't be", tve != null);
308 String field = tve.getField();
309 if (field.equals(testFields[0])) {
310 //should have offsets
312 assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() != null);
313 assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() != null);
315 else if (field.equals(testFields[1])) {
316 //should not have offsets
318 assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
319 assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
323 //Try mapper that ignores offs and positions
// With ignoringPositions/ignoringOffsets both true, every entry must come
// back without positions or offsets, even for fields that store them.
324 fsMapper = new FieldSortedTermVectorMapper(true, true, new TermVectorEntryFreqSortedComparator());
325 reader.get(0, fsMapper);
326 map = fsMapper.getFieldToTerms();
327 assertTrue("map Size: " + map.size() + " is not: " + testFields.length, map.size() == testFields.length);
328 for (final Map.Entry<String,SortedSet<TermVectorEntry>> entry : map.entrySet()) {
329 SortedSet<TermVectorEntry> sortedSet = entry.getValue();
330 assertTrue("sortedSet Size: " + sortedSet.size() + " is not: " + 4, sortedSet.size() == 4);
331 for (final TermVectorEntry tve : sortedSet) {
332 assertTrue("tve is null and it shouldn't be", tve != null);
333 //Check offsets and positions.
334 assertTrue("tve is null and it shouldn't be", tve != null);
335 String field = tve.getField();
336 if (field.equals(testFields[0])) {
337 //should have offsets
// NOTE(review): despite the assertion-message text ("is null and it
// shouldn't be"), the condition correctly expects null because this
// mapper ignores offsets and positions -- the messages are misleading.
339 assertTrue("tve.getOffsets() is null and it shouldn't be", tve.getOffsets() == null);
340 assertTrue("tve.getPositions() is null and it shouldn't be", tve.getPositions() == null);
342 else if (field.equals(testFields[1])) {
343 //should not have offsets
345 assertTrue("tve.getOffsets() is not null and it shouldn't be", tve.getOffsets() == null);
346 assertTrue("tve.getPositions() is not null and it shouldn't be", tve.getPositions() == null);
351 // test setDocumentNumber()
// DocNumAwareMapper records the doc number passed by IndexReader; after each
// call the observed number must equal the requested doc, then is reset to -1.
352 IndexReader ir = IndexReader.open(dir, true);
353 DocNumAwareMapper docNumAwareMapper = new DocNumAwareMapper();
354 assertEquals(-1, docNumAwareMapper.getDocumentNumber());
356 ir.getTermFreqVector(0, docNumAwareMapper);
357 assertEquals(0, docNumAwareMapper.getDocumentNumber());
358 docNumAwareMapper.setDocumentNumber(-1);
360 ir.getTermFreqVector(1, docNumAwareMapper);
361 assertEquals(1, docNumAwareMapper.getDocumentNumber());
362 docNumAwareMapper.setDocumentNumber(-1);
364 ir.getTermFreqVector(0, "f1", docNumAwareMapper);
365 assertEquals(0, docNumAwareMapper.getDocumentNumber());
366 docNumAwareMapper.setDocumentNumber(-1);
368 ir.getTermFreqVector(1, "f2", docNumAwareMapper);
369 assertEquals(1, docNumAwareMapper.getDocumentNumber());
370 docNumAwareMapper.setDocumentNumber(-1);
372 ir.getTermFreqVector(0, "f1", docNumAwareMapper);
373 assertEquals(0, docNumAwareMapper.getDocumentNumber());
381 * Make sure exceptions and bad params are handled appropriately
383 public void testBadParams() throws IOException {
384 TermVectorsReader reader = null;
// NOTE(review): the try/fail() scaffolding around each case is in elided
// lines; each catch below marks the expected failure path.
386 reader = new TermVectorsReader(dir, seg, fieldInfos);
387 //Bad document number, good field number
// An out-of-range doc number must raise an IOException.
388 reader.get(50, testFields[0]);
390 } catch (IOException e) {
391 // expected exception
396 reader = new TermVectorsReader(dir, seg, fieldInfos);
397 //Bad document number, no field
400 } catch (IOException e) {
401 // expected exception
406 reader = new TermVectorsReader(dir, seg, fieldInfos);
407 //good document number, bad field number
// An unknown field is not an error: the reader returns null instead of
// throwing.
408 TermFreqVector vector = reader.get(0, "f50");
409 assertTrue(vector == null);
411 } catch (IOException e) {
// TermVectorMapper used by testMapper() to verify that IndexReader sets the
// document number on the mapper before delivering any term-vector callbacks:
// both callbacks throw if documentNumber is still the -1 sentinel.
419 public static class DocNumAwareMapper extends TermVectorMapper {
421 public DocNumAwareMapper() {
// -1 means "no document number has been set yet".
424 private int documentNumber = -1;
427 public void setExpectations(String field, int numTerms, boolean storeOffsets, boolean storePositions) {
428 if (documentNumber == -1) {
429 throw new RuntimeException("Documentnumber should be set at this point!");
434 public void map(String term, int frequency, TermVectorOffsetInfo[] offsets, int[] positions) {
435 if (documentNumber == -1) {
436 throw new RuntimeException("Documentnumber should be set at this point!");
// Returns the last document number set (or -1 if none).
440 public int getDocumentNumber() {
441 return documentNumber;
445 public void setDocumentNumber(int documentNumber) {
446 this.documentNumber = documentNumber;