+++ /dev/null
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.io.StringReader;
-
-import org.apache.lucene.analysis.Analyzer;
-import org.apache.lucene.analysis.CachingTokenFilter;
-import org.apache.lucene.analysis.MockAnalyzer;
-import org.apache.lucene.analysis.TeeSinkTokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.WhitespaceAnalyzer;
-import org.apache.lucene.analysis.standard.StandardAnalyzer;
-import org.apache.lucene.document.Document;
-import org.apache.lucene.document.Field;
-import org.apache.lucene.store.Directory;
-import org.apache.lucene.store.MockDirectoryWrapper;
-import org.apache.lucene.store.RAMDirectory;
-import org.apache.lucene.util.LuceneTestCase;
-
-/** Tests for writing term vectors. */
-public class TestTermVectorsWriter extends LuceneTestCase {
- // LUCENE-1442: adds the same NOT_ANALYZED field instance three times, plus one empty-valued field, to one document and checks the term-vector offsets.
- public void testDoubleOffsetCounting() throws Exception {
- Directory dir = newDirectory();
- IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random)));
- Document doc = new Document();
- Field f = newField("field", "abcd", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
- doc.add(f);
- doc.add(f); // same Field instance again
- Field f2 = newField("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); // empty value under the same field name
- doc.add(f2);
- doc.add(f); // third "abcd" value, added after the empty one
- w.addDocument(doc);
- w.close();
-
- IndexReader r = IndexReader.open(dir, true);
- TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0); // offsets of term 0 in doc 0
-
- // Token "" occurred once; expected as a zero-length span at offset 8
- assertEquals(1, termOffsets.length);
- assertEquals(8, termOffsets[0].getStartOffset());
- assertEquals(8, termOffsets[0].getEndOffset());
-
- // Token "abcd" occurred three times; expected spans are 0-4, 4-8 and 8-12
- termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(1);
- assertEquals(3, termOffsets.length);
- assertEquals(0, termOffsets[0].getStartOffset());
- assertEquals(4, termOffsets[0].getEndOffset());
- assertEquals(4, termOffsets[1].getStartOffset());
- assertEquals(8, termOffsets[1].getEndOffset());
- assertEquals(8, termOffsets[2].getStartOffset());
- assertEquals(12, termOffsets[2].getEndOffset());
- r.close();
- dir.close();
- }
-
- // LUCENE-1442: same check for an ANALYZED field whose single instance is added twice to one document.
- public void testDoubleOffsetCounting2() throws Exception {
- Directory dir = newDirectory();
- IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random)));
- Document doc = new Document();
- Field f = newField("field", "abcd", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
- doc.add(f);
- doc.add(f); // same Field instance again
- w.addDocument(doc);
- w.close();
-
- IndexReader r = IndexReader.open(dir, true);
- TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0); // offsets of term 0 in doc 0
- assertEquals(2, termOffsets.length); // one entry per added value
- assertEquals(0, termOffsets[0].getStartOffset());
- assertEquals(4, termOffsets[0].getEndOffset());
- assertEquals(5, termOffsets[1].getStartOffset()); // second value is expected to start one past the end of the first (4)
- assertEquals(9, termOffsets[1].getEndOffset());
- r.close();
- dir.close();
- }
-
- // LUCENE-1448: adds an ANALYZED field whose value ends in whitespace twice and checks the offsets of both occurrences.
- public void testEndOffsetPositionCharAnalyzer() throws Exception {
- Directory dir = newDirectory();
- IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, new MockAnalyzer(random)));
- Document doc = new Document();
- Field f = newField("field", "abcd ", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); // note the trailing spaces in the value
- doc.add(f);
- doc.add(f); // same Field instance again
- w.addDocument(doc);
- w.close();
-
- IndexReader r = IndexReader.open(dir, true);
- TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0); // offsets of term 0 in doc 0
- assertEquals(2, termOffsets.length);
- assertEquals(0, termOffsets[0].getStartOffset());
- assertEquals(4, termOffsets[0].getEndOffset());
- assertEquals(8, termOffsets[1].getStartOffset()); // second occurrence expected at 8-12
- assertEquals(12, termOffsets[1].getEndOffset());
- r.close();
- dir.close();
- }
-
- // LUCENE-1448: builds the field from a TokenStream wrapped in a CachingTokenFilter and adds that one Field twice.
- public void testEndOffsetPositionWithCachingTokenFilter() throws Exception {
- Directory dir = newDirectory();
- Analyzer analyzer = new MockAnalyzer(random);
- IndexWriter w = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT, analyzer));
- Document doc = new Document();
- TokenStream stream = analyzer.tokenStream("field", new StringReader("abcd "));
- stream.reset(); // TODO: weird to reset before wrapping with CachingTokenFilter... correct?
- stream = new CachingTokenFilter(stream);
- Field f = new Field("field", stream, Field.TermVector.WITH_POSITIONS_OFFSETS); // field value is the token stream itself, not a String
- doc.add(f);
- doc.add(f); // second add reuses the same Field, and therefore the same wrapped stream object
- w.addDocument(doc);
- w.close();
-
- IndexReader r = IndexReader.open(dir, true);
- TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0); // offsets of term 0 in doc 0
- assertEquals(2, termOffsets.length);
- assertEquals(0, termOffsets[0].getStartOffset());
- assertEquals(4, termOffsets[0].getEndOffset());
- assertEquals(8, termOffsets[1].getStartOffset()); // second occurrence expected at 8-12
- assertEquals(12, termOffsets[1].getEndOffset());
- r.close();
- dir.close();
- }
-
- // LUCENE-1448: indexes a TeeSinkTokenFilter and a sink stream created from it as two instances of the same field.
- public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
- MockDirectoryWrapper dir = newDirectory();
- Analyzer analyzer = new WhitespaceAnalyzer(TEST_VERSION_CURRENT);
- IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(TEST_VERSION_CURRENT, analyzer)); // NOTE(review): plain IndexWriterConfig constructor here; the other tests in this class call newIndexWriterConfig
- Document doc = new Document();
- TeeSinkTokenFilter tee = new TeeSinkTokenFilter(analyzer.tokenStream("field", new StringReader("abcd ")));
- TokenStream sink = tee.newSinkTokenStream(); // sink obtained from the tee
- Field f1 = new Field("field", tee, Field.TermVector.WITH_POSITIONS_OFFSETS);
- Field f2 = new Field("field", sink, Field.TermVector.WITH_POSITIONS_OFFSETS);
- doc.add(f1); // tee first, then its sink
- doc.add(f2);
- w.addDocument(doc);
- w.close();
-
- IndexReader r = IndexReader.open(dir, true);
- TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0); // offsets of term 0 in doc 0
- assertEquals(2, termOffsets.length);
- assertEquals(0, termOffsets[0].getStartOffset());
- assertEquals(4, termOffsets[0].getEndOffset());
- assertEquals(8, termOffsets[1].getStartOffset()); // second occurrence expected at 8-12
- assertEquals(12, termOffsets[1].getEndOffset());
- r.close();
- dir.close();
- }
-
- // LUCENE-1448: uses StandardAnalyzer on the value "abcd the", added twice; only the offsets of term 0 are checked.
- public void testEndOffsetPositionStopFilter() throws Exception {
- Directory dir = newDirectory();
- IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new StandardAnalyzer(TEST_VERSION_CURRENT)));
- Document doc = new Document();
- Field f = newField("field", "abcd the", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS); // presumably "the" is removed as a stop word (per the test name) - confirm against StandardAnalyzer
- doc.add(f);
- doc.add(f); // same Field instance again
- w.addDocument(doc);
- w.close();
-
- IndexReader r = IndexReader.open(dir, true);
- TermVectorOffsetInfo[] termOffsets = ((TermPositionVector) r.getTermFreqVector(0, "field")).getOffsets(0); // offsets of term 0 in doc 0
- assertEquals(2, termOffsets.length);
- assertEquals(0, termOffsets[0].getStartOffset());
- assertEquals(4, termOffsets[0].getEndOffset());
- assertEquals(9, termOffsets[1].getStartOffset()); // second occurrence expected at 9-13
- assertEquals(13, termOffsets[1].getEndOffset());
- r.close();
- dir.close();
- }
-
- // LUCENE-1448: two different values ("abcd the " and "crunch man") for one field; checks offsets of terms 0, 1 and 2. NOTE(review): despite the method name, the analyzer used here is MockAnalyzer.
- public void testEndOffsetPositionStandard() throws Exception {
- Directory dir = newDirectory();
- IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random)));
- Document doc = new Document();
- Field f = newField("field", "abcd the ", Field.Store.NO,
- Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
- Field f2 = newField("field", "crunch man", Field.Store.NO,
- Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
- doc.add(f);
- doc.add(f2);
- w.addDocument(doc);
- w.close();
-
- IndexReader r = IndexReader.open(dir, true);
- TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
- TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0); // term 0: expected once, at 0-4
- assertEquals(1, termOffsets.length);
- assertEquals(0, termOffsets[0].getStartOffset());
- assertEquals(4, termOffsets[0].getEndOffset());
- termOffsets = tpv.getOffsets(1); // term 1: span 11-17 expected (length 6; presumably "crunch")
- assertEquals(11, termOffsets[0].getStartOffset());
- assertEquals(17, termOffsets[0].getEndOffset());
- termOffsets = tpv.getOffsets(2); // term 2: span 18-21 expected (length 3; presumably "man")
- assertEquals(18, termOffsets[0].getStartOffset());
- assertEquals(21, termOffsets[0].getEndOffset());
- r.close();
- dir.close();
- }
-
- // LUCENE-1448: an empty first value followed by "crunch man" for the same field; checks offsets of terms 0 and 1.
- public void testEndOffsetPositionStandardEmptyField() throws Exception {
- Directory dir = newDirectory();
- IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random)));
- Document doc = new Document();
- Field f = newField("field", "", Field.Store.NO, // empty first value
- Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
- Field f2 = newField("field", "crunch man", Field.Store.NO,
- Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
- doc.add(f);
- doc.add(f2);
- w.addDocument(doc);
- w.close();
-
- IndexReader r = IndexReader.open(dir, true);
- TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
- TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0);
- assertEquals(1, termOffsets.length);
- assertEquals(1, termOffsets[0].getStartOffset()); // expected start is 1, not 0, even though the preceding value is empty
- assertEquals(7, termOffsets[0].getEndOffset());
- termOffsets = tpv.getOffsets(1); // term 1: span 8-11 expected
- assertEquals(8, termOffsets[0].getStartOffset());
- assertEquals(11, termOffsets[0].getEndOffset());
- r.close();
- dir.close();
- }
-
- // LUCENE-1448: an empty value placed between "abcd" and "crunch" for the same field; checks offsets of terms 0 and 1.
- public void testEndOffsetPositionStandardEmptyField2() throws Exception {
- Directory dir = newDirectory();
- IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random)));
- Document doc = new Document();
-
- Field f = newField("field", "abcd", Field.Store.NO,
- Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
- doc.add(f);
- doc.add(newField("field", "", Field.Store.NO, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)); // empty middle value
-
- Field f2 = newField("field", "crunch", Field.Store.NO,
- Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS);
- doc.add(f2);
-
- w.addDocument(doc);
- w.close();
-
- IndexReader r = IndexReader.open(dir, true);
- TermPositionVector tpv = ((TermPositionVector) r.getTermFreqVector(0, "field"));
- TermVectorOffsetInfo[] termOffsets = tpv.getOffsets(0); // term 0: expected once, at 0-4
- assertEquals(1, termOffsets.length);
- assertEquals(0, termOffsets[0].getStartOffset());
- assertEquals(4, termOffsets[0].getEndOffset());
- termOffsets = tpv.getOffsets(1); // term 1: span 6-12 expected (length 6; presumably "crunch")
- assertEquals(6, termOffsets[0].getStartOffset());
- assertEquals(12, termOffsets[0].getEndOffset());
- r.close();
- dir.close();
- }
-
- // LUCENE-1168: runs twice over one directory: index two stored-only docs and one doc with a term vector, optimize, read everything back, then addIndexes + optimize. Contains no assertions; it passes if nothing throws.
- public void testTermVectorCorruption() throws IOException {
-
- Directory dir = newDirectory();
- for(int iter=0;iter<2;iter++) { // second pass runs against the index left behind by the first
- IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random))
- .setMaxBufferedDocs(2).setRAMBufferSizeMB(
- IndexWriterConfig.DISABLE_AUTO_FLUSH).setMergeScheduler(
- new SerialMergeScheduler()).setMergePolicy(
- new LogDocMergePolicy()));
-
- Document document = new Document();
-
- Field storedField = newField("stored", "stored", Field.Store.YES,
- Field.Index.NO);
- document.add(storedField);
- writer.addDocument(document);
- writer.addDocument(document); // same stored-only document indexed twice
-
- document = new Document(); // third document: stored field plus a term-vector field
- document.add(storedField);
- Field termVectorField = newField("termVector", "termVector",
- Field.Store.NO, Field.Index.NOT_ANALYZED,
- Field.TermVector.WITH_POSITIONS_OFFSETS);
-
- document.add(termVectorField);
- writer.addDocument(document);
- writer.optimize();
- writer.close();
-
- IndexReader reader = IndexReader.open(dir, true);
- for(int i=0;i<reader.numDocs();i++) { // load stored fields and term vectors of every doc; results are discarded
- reader.document(i);
- reader.getTermFreqVectors(i);
- }
- reader.close();
-
- writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
- new MockAnalyzer(random)).setMaxBufferedDocs(2)
- .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
- .setMergeScheduler(new SerialMergeScheduler()).setMergePolicy(
- new LogDocMergePolicy()));
-
- Directory[] indexDirs = {new MockDirectoryWrapper(random, new RAMDirectory(dir))}; // presumably an in-memory copy of dir - confirm RAMDirectory(Directory) semantics
- writer.addIndexes(indexDirs);
- writer.optimize();
- writer.close();
- }
- dir.close();
- }
-
- // LUCENE-1168: same indexing sequence as testTermVectorCorruption (without the addIndexes step), but asserts which documents have term vectors after optimize.
- public void testTermVectorCorruption2() throws IOException {
- Directory dir = newDirectory();
- for(int iter=0;iter<2;iter++) { // second pass runs against the index left behind by the first
- IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random))
- .setMaxBufferedDocs(2).setRAMBufferSizeMB(
- IndexWriterConfig.DISABLE_AUTO_FLUSH).setMergeScheduler(
- new SerialMergeScheduler()).setMergePolicy(
- new LogDocMergePolicy()));
-
- Document document = new Document();
-
- Field storedField = newField("stored", "stored", Field.Store.YES,
- Field.Index.NO);
- document.add(storedField);
- writer.addDocument(document);
- writer.addDocument(document); // same stored-only document indexed twice
-
- document = new Document(); // third document: stored field plus a term-vector field
- document.add(storedField);
- Field termVectorField = newField("termVector", "termVector",
- Field.Store.NO, Field.Index.NOT_ANALYZED,
- Field.TermVector.WITH_POSITIONS_OFFSETS);
- document.add(termVectorField);
- writer.addDocument(document);
- writer.optimize();
- writer.close();
-
- IndexReader reader = IndexReader.open(dir, true);
- assertTrue(reader.getTermFreqVectors(0)==null); // docs 0 and 1 are expected to have no term vectors
- assertTrue(reader.getTermFreqVectors(1)==null);
- assertTrue(reader.getTermFreqVectors(2)!=null); // doc 2 is expected to have term vectors
- reader.close();
- }
- dir.close();
- }
-
- // LUCENE-1168: indexes the same stored + term-vector document 10 times, reopens the writer to add it 6 more times, optimizes, then reads back docs 0-9. Contains no assertions; it passes if nothing throws.
- public void testTermVectorCorruption3() throws IOException {
- Directory dir = newDirectory();
- IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random))
- .setMaxBufferedDocs(2).setRAMBufferSizeMB(
- IndexWriterConfig.DISABLE_AUTO_FLUSH).setMergeScheduler(
- new SerialMergeScheduler()).setMergePolicy(new LogDocMergePolicy()));
-
- Document document = new Document(); // NOTE(review): this instance is replaced by the assignment below before it is ever used
-
- document = new Document();
- Field storedField = newField("stored", "stored", Field.Store.YES,
- Field.Index.NO);
- document.add(storedField);
- Field termVectorField = newField("termVector", "termVector",
- Field.Store.NO, Field.Index.NOT_ANALYZED,
- Field.TermVector.WITH_POSITIONS_OFFSETS);
- document.add(termVectorField);
- for(int i=0;i<10;i++) // first batch: 10 copies
- writer.addDocument(document);
- writer.close();
-
- writer = new IndexWriter(dir, newIndexWriterConfig( TEST_VERSION_CURRENT,
- new MockAnalyzer(random)).setMaxBufferedDocs(2)
- .setRAMBufferSizeMB(IndexWriterConfig.DISABLE_AUTO_FLUSH)
- .setMergeScheduler(new SerialMergeScheduler()).setMergePolicy(
- new LogDocMergePolicy()));
- for(int i=0;i<6;i++) // second batch: 6 more copies through a fresh writer
- writer.addDocument(document);
-
- writer.optimize();
- writer.close();
-
- IndexReader reader = IndexReader.open(dir, true);
- for(int i=0;i<10;i++) { // NOTE(review): covers only the first 10 of the 16 documents added
- reader.getTermFreqVectors(i);
- reader.document(i);
- }
- reader.close();
- dir.close();
- }
-
- // LUCENE-1008: indexes a doc with a term vector, then one without, commits, then re-adds the second doc with an extra term-vector field, commits and optimizes. Contains no assertions; it passes if nothing throws.
- public void testNoTermVectorAfterTermVector() throws IOException {
- Directory dir = newDirectory();
- IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random)));
- Document document = new Document();
- document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
- Field.TermVector.YES));
- iw.addDocument(document);
- document = new Document();
- document.add(newField("tvtest", "x y z", Field.Store.NO, Field.Index.ANALYZED,
- Field.TermVector.NO));
- iw.addDocument(document);
- // Make first segment
- iw.commit();
-
- document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED, // appended to the second Document, which still holds the TermVector.NO field
- Field.TermVector.YES));
- iw.addDocument(document);
- // Make 2nd segment
- iw.commit();
-
- iw.optimize();
- iw.close();
- dir.close();
- }
-
- // LUCENE-1010: like testNoTermVectorAfterTermVector, but commits after each of the first two documents and optimizes before the mixed document is added. Contains no assertions; it passes if nothing throws.
- public void testNoTermVectorAfterTermVectorMerge() throws IOException {
- Directory dir = newDirectory();
- IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(
- TEST_VERSION_CURRENT, new MockAnalyzer(random)));
- Document document = new Document();
- document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED,
- Field.TermVector.YES));
- iw.addDocument(document);
- iw.commit();
-
- document = new Document();
- document.add(newField("tvtest", "x y z", Field.Store.NO, Field.Index.ANALYZED,
- Field.TermVector.NO));
- iw.addDocument(document);
- // Make first segment
- iw.commit();
-
- iw.optimize(); // optimize once before the mixed document is added
-
- document.add(newField("tvtest", "a b c", Field.Store.NO, Field.Index.ANALYZED, // appended to the second Document, which still holds the TermVector.NO field
- Field.TermVector.YES));
- iw.addDocument(document);
- // Make 2nd segment
- iw.commit();
- iw.optimize(); // and once more after it has been committed
-
- iw.close();
- dir.close();
- }
-}