package org.apache.lucene.index;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.CachingTokenFilter;
import org.apache.lucene.analysis.MockAnalyzer;
import org.apache.lucene.analysis.TeeSinkTokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceAnalyzer;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.store.Directory;
import org.apache.lucene.store.MockDirectoryWrapper;
import org.apache.lucene.store.RAMDirectory;
import org.apache.lucene.util.LuceneTestCase;

/** tests for writing term vectors */
public class TestTermVectorsWriter extends LuceneTestCase {
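
  // Adds the same field instance several times, with an empty value in between,
  // and checks that term-vector offsets accumulate across field instances
  // instead of being double-counted.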
  public void testDoubleOffsetCounting() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    Field f2 = newField("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f2);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);

    // Token "" occurred once
    assertEquals(1, termOffsets.length);
    assertEquals(8, termOffsets[0].getStartOffset());
    assertEquals(8, termOffsets[0].getEndOffset());

    // Token "abcd" occurred three times
    termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(1);
    assertEquals(3, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(4, termOffsets[1].getStartOffset());
    assertEquals(8, termOffsets[1].getEndOffset());
    assertEquals(8, termOffsets[2].getStartOffset());
    assertEquals(12, termOffsets[2].getEndOffset());
    r.close();
    dir.close();
  }
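
  // Same check with an analyzed field: the analyzer's offset gap should place
  // the second "abcd" at offsets 5-9.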
  public void testDoubleOffsetCounting2() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(5, termOffsets[1].getStartOffset());
    assertEquals(9, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }
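
  // The field value has trailing whitespace; the tokenizer's final offset must
  // include it, so the second "abcd" starts at offset 8 rather than 5.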
  public void testEndOffsetPositionCharAnalyzer() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd   ", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }
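
  // Same scenario, but the tokens are consumed through a CachingTokenFilter,
  // which must preserve the end-offset bookkeeping.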
  public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random);
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    TokenStream stream = analyzer.tokenStream("field", new StringReader("abcd   "));
    stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
    stream = new CachingTokenFilter(stream);
    Field f = new Field("field", stream, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }
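
  // Same scenario again, with the tokens routed to two field instances through a
  // TeeSinkTokenFilter (tee + sink); the recorded offsets must be unaffected.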
  public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
    MockDirectoryWrapper dir = newDirectory();
    Analyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
    IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer));
    Document doc = new Document();
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd   ")));
    TokenStream sink = tee.newSinkTokenStream();
    Field f1 = new Field("field", tee, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = new Field("field", sink, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f1);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(8, termOffsets[1].getStartOffset());
    assertEquals(12, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }
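
  // StandardAnalyzer drops the stop word "the", but its characters must still
  // count toward the offset of the next field instance (second "abcd" at 9-13).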
  public void testEndOffsetPositionStopFilter() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
    Document doc = new Document();
    Field f = newField("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0);
    assertEquals(2, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    assertEquals(9, termOffsets[1].getStartOffset());
    assertEquals(13, termOffsets[1].getEndOffset());
    r.close();
    dir.close();
  }
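
  // Two different values for one field: terms from the second value
  // ("crunch man") must be offset past the trailing whitespace of the first.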
  public void testEndOffsetPositionStandard() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "abcd the  ", Field.Store.NO,
        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = newField("field", "crunch man", Field.Store.NO,
        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(11, termOffsets[0].getStartOffset());
    assertEquals(17, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(2);
    assertEquals(18, termOffsets[0].getStartOffset());
    assertEquals(21, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }
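
  // An empty first value still contributes an offset gap, so "crunch" from the
  // second value starts at offset 1 rather than 0.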
  public void testEndOffsetPositionStandardEmptyField() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();
    Field f = newField("field", "", Field.Store.NO,
        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    Field f2 = newField("field", "crunch man", Field.Store.NO,
        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(1, termOffsets[0].getStartOffset());
    assertEquals(7, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(8, termOffsets[0].getStartOffset());
    assertEquals(11, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }
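
  // Here the empty value sits between two non-empty values; "crunch" must start
  // at offset 6 (after "abcd" plus the two offset gaps).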
  public void testEndOffsetPositionStandardEmptyField2() throws Exception {
    Directory dir = newDirectory();
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document doc = new Document();

    Field f = newField("field", "abcd", Field.Store.NO,
        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f);
    doc.add(newField("field", "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS));

    Field f2 = newField("field", "crunch", Field.Store.NO,
        Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = IndexReader.open(dir, true);
    TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
    TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
    assertEquals(1, termOffsets.length);
    assertEquals(0, termOffsets[0].getStartOffset());
    assertEquals(4, termOffsets[0].getEndOffset());
    termOffsets = tpv.getOffsets(1);
    assertEquals(6, termOffsets[0].getStartOffset());
    assertEquals(12, termOffsets[0].getEndOffset());
    r.close();
    dir.close();
  }
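
  // Term-vector corruption check: mixes documents without term vectors and
  // documents with them, forces merges and re-adds the index via addIndexes,
  // then verifies every document's term vectors can still be read.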
  public void testTermVectorCorruption() throws IOException {

    Directory dir = newDirectory();
    for(int iter=0;iter<2;iter++) {
      IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
          TEST_VERSION_CURRENT, new MockAnalyzer(random))
          .setMaxBufferedDocs(2).setRAMBufferSizeMB(
              IndexWriterConfig.DISABLE_AUTO_FLUSH).setMergeScheduler(
              new SerialMergeScheduler()).setMergePolicy(
              new LogDocMergePolicy()));

      Document document = new Document();

      Field storedField = newField("stored", "stored", Field.Store.YES,
                                   Field.Index.NO);
      document.add(storedField);
      writer.addDocument(document);
      writer.addDocument(document);

      document = new Document();
      document.add(storedField);
      Field termVectorField = newField("termVector", "termVector",
                                       Field.Store.NO, Field.Index.NOT_ANALYZED,
                                       Field.TermVector.WITH_POSITIONS_OFFSETS);

      document.add(termVectorField);
      writer.addDocument(document);
      writer.forceMerge(1);
      writer.close();

      IndexReader reader = IndexReader.open(dir, true);
      for(int i=0;i<reader.numDocs();i++) {
        reader.document(i);
        reader.getTermFreqVectors(i);
      }
      reader.close();

      writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
          new MockAnalyzer(random)).setMaxBufferedDocs(2)
          .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
          .setMergeScheduler(new SerialMergeScheduler()).setMergePolicy(
              new LogDocMergePolicy()));

      Directory[] indexDirs = {new MockDirectoryWrapper(random, new RAMDirectory(dir))};
      writer.addIndexes(indexDirs);
      writer.forceMerge(1);
      writer.close();
    }
    dir.close();
  }
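
  // Variant of the previous check: the reader asserts directly that the first two
  // documents (stored field only) have no term vectors while the third one does.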
  public void testTermVectorCorruption2() throws IOException {
    Directory dir = newDirectory();
    for(int iter=0;iter<2;iter++) {
      IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
          TEST_VERSION_CURRENT, new MockAnalyzer(random))
          .setMaxBufferedDocs(2).setRAMBufferSizeMB(
              IndexWriterConfig.DISABLE_AUTO_FLUSH).setMergeScheduler(
              new SerialMergeScheduler()).setMergePolicy(
              new LogDocMergePolicy()));

      Document document = new Document();

      Field storedField = newField("stored", "stored", Field.Store.YES,
                                   Field.Index.NO);
      document.add(storedField);
      writer.addDocument(document);
      writer.addDocument(document);

      document = new Document();
      document.add(storedField);
      Field termVectorField = newField("termVector", "termVector",
                                       Field.Store.NO, Field.Index.NOT_ANALYZED,
                                       Field.TermVector.WITH_POSITIONS_OFFSETS);
      document.add(termVectorField);
      writer.addDocument(document);
      writer.forceMerge(1);
      writer.close();

      IndexReader reader = IndexReader.open(dir, true);
      assertTrue(reader.getTermFreqVectors(0)==null);
      assertTrue(reader.getTermFreqVectors(1)==null);
      assertTrue(reader.getTermFreqVectors(2)!=null);
      reader.close();
    }
    dir.close();
  }
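
  // Another corruption check: one writer adds ten documents with term vectors,
  // a second writer adds more and forces a merge; vectors for the first ten
  // documents must remain readable.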
  public void testTermVectorCorruption3() throws IOException {
    Directory dir = newDirectory();
    IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random))
        .setMaxBufferedDocs(2).setRAMBufferSizeMB(
            IndexWriterConfig.DISABLE_AUTO_FLUSH).setMergeScheduler(
            new SerialMergeScheduler()).setMergePolicy(new LogDocMergePolicy()));

    Document document = new Document();

    document = new Document();
    Field storedField = newField("stored", "stored", Field.Store.YES,
                                 Field.Index.NO);
    document.add(storedField);
    Field termVectorField = newField("termVector", "termVector",
                                     Field.Store.NO, Field.Index.NOT_ANALYZED,
                                     Field.TermVector.WITH_POSITIONS_OFFSETS);
    document.add(termVectorField);
    for(int i=0;i<10;i++)
      writer.addDocument(document);
    writer.close();

    writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
        new MockAnalyzer(random)).setMaxBufferedDocs(2)
        .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .setMergeScheduler(new SerialMergeScheduler()).setMergePolicy(
            new LogDocMergePolicy()));
    for(int i=0;i<6;i++)
      writer.addDocument(document);

    writer.forceMerge(1);
    writer.close();

    IndexReader reader = IndexReader.open(dir, true);
    for(int i=0;i<10;i++) {
      reader.getTermFreqVectors(i);
      reader.document(i);
    }
    reader.close();
    dir.close();
  }
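
  // Within a single index, a document without term vectors may follow one that
  // has them; a later document with vectors must still index correctly.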
  public void testNoTermVectorAfterTermVector() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document document = new Document();
    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);
    document = new Document();
    document.add(newField("tvtest", "x y z", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.NO));
    iw.addDocument(document);
    // Make first segment
    iw.commit();

    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);
    // Make 2nd segment
    iw.commit();
    iw.forceMerge(1);

    iw.close();
    dir.close();
  }
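
  // Same scenario, with commits and a forced merge in between, so the merger must
  // handle segments that mix documents with and without term vectors.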
  public void testNoTermVectorAfterTermVectorMerge() throws IOException {
    Directory dir = newDirectory();
    IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(
        TEST_VERSION_CURRENT, new MockAnalyzer(random)));
    Document document = new Document();
    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);
    iw.commit();

    document = new Document();
    document.add(newField("tvtest", "x y z", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.NO));
    iw.addDocument(document);
    // Make first segment
    iw.commit();
    iw.forceMerge(1);

    document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
        Field.TermVector.YES));
    iw.addDocument(document);