1 package org.apache.lucene.index;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.util.ArrayList;
22 import java.util.Arrays;
23 import java.util.Collections;
24 import java.util.HashSet;
25 import java.util.List;
28 import org.apache.lucene.analysis.MockAnalyzer;
29 import org.apache.lucene.document.Document;
30 import org.apache.lucene.document.Field;
31 import org.apache.lucene.store.Directory;
32 import org.apache.lucene.util.BytesRef;
33 import org.apache.lucene.util.LineFileDocs;
34 import org.apache.lucene.util.LuceneTestCase;
35 import org.apache.lucene.util._TestUtil;
37 public class TestTermsEnum extends LuceneTestCase {
// Randomized check of term enumeration over the "body" field. Each iteration
// either takes the "next" path or opens a fresh enum at a target term, and the
// term the enum reports is compared with the expected entry of the terms list.
// NOTE(review): several statements of this method are not visible in this
// view; the comments below describe only the lines shown.
39 public void test() throws Exception {
40 final LineFileDocs docs = new LineFileDocs(random);
41 final Directory d = newDirectory();
42 final RandomIndexWriter w = new RandomIndexWriter(random, d);
// Index at least 10 documents pulled from the LineFileDocs source.
43 final int numDocs = atLeast(10);
44 for(int docCount=0;docCount<numDocs;docCount++) {
45 w.addDocument(docs.nextDoc());
47 final IndexReader r = w.getReader();
// Reference list of "body" terms; presumably filled by the loop below
// (the add call is not visible here).
50 final List<Term> terms = new ArrayList<Term>();
// The enum's current term is read before next() is first called (do/while shape).
51 TermEnum termEnum = r.terms(new Term("body"));
53 Term term = termEnum.term();
// A null term, or a term from another field, is treated as the end of "body".
54 if (term == null || !"body".equals(term.field())) {
58 } while (termEnum.next());
61 System.out.println("TEST: " + terms.size() + " terms");
// Main loop: at least 200 randomly chosen steps.
65 final int iters = atLeast(200);
66 for(int iter=0;iter<iters;iter++) {
// The "next" path is only eligible when upto is not -1.
68 if (upto != -1 && random.nextBoolean()) {
71 System.out.println("TEST: iter next");
// Same end-of-field test as the one used while collecting terms.
74 isEnd = termEnum.term() == null || !"body".equals(termEnum.term().field());
78 System.out.println(" end");
// At the end, the cursor must equal the number of collected terms.
80 assertEquals(upto, terms.size());
84 System.out.println(" got term=" + termEnum.term() + " expected=" + terms.get(upto));
86 assertTrue(upto < terms.size());
87 assertEquals(terms.get(upto), termEnum.term());
// Seek path: the target is a random string (simple or realistic Unicode)
// or a term taken from the collected list.
93 if (random.nextBoolean()) {
95 if (random.nextBoolean()) {
96 target = new Term("body",
97 _TestUtil.randomSimpleString(random));
99 target = new Term("body",
100 _TestUtil.randomRealisticUnicodeString(random));
102 exists = "likely not";
105 target = terms.get(random.nextInt(terms.size()));
// Collections.binarySearch returns the index when found, else -(insertionPoint) - 1.
// NOTE(review): the handling of a negative result is not visible here.
109 upto = Collections.binarySearch(terms, target);
112 System.out.println("TEST: iter seekCeil target=" + target + " exists=" + exists);
// The seek is performed by opening a new enum at the target term.
114 termEnum = r.terms(target);
115 final Term actualTerm = termEnum.term();
118 System.out.println(" got term=" + actualTerm);
// Past the last collected term the enum must report null or a non-"body" term;
// otherwise it must report terms.get(upto).
123 if (upto >= terms.size()) {
124 assertTrue(actualTerm == null || !"body".equals(actualTerm.field()));
127 assertTrue(actualTerm != null && "body".equals(actualTerm.field()));
128 assertEquals(terms.get(upto), actualTerm);
131 assertEquals(terms.get(upto), actualTerm);
// Reader used by the tests below; they assign it from makeIndex(...).
141 private IndexReader r;
// Field name used by makeIndex, docFreq and testRandomSeeks.
143 private final String FIELD = "field";
// Builds an index from the given term strings; the signature returns an
// IndexReader. For each term a Document and an un-stored, un-analyzed FIELD
// field are created.
// NOTE(review): the directory creation, the minTermsInBlock/maxTermsInBlock
// declarations and the tail of the method are not visible in this view.
145 private IndexReader makeIndex(String... terms) throws Exception {
147 IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
// Replace the registered "Standard" codec with a StandardCodec built from the
// chosen block sizes, and make "Standard" the default field codec.
150 CoreCodecProvider cp = new CoreCodecProvider();
151 cp.unregister(cp.lookup("Standard"));
152 cp.register(new StandardCodec(minTermsInBlock, maxTermsInBlock));
153 cp.setDefaultFieldCodec("Standard");
154 iwc.setCodecProvider(cp);
157 final RandomIndexWriter w = new RandomIndexWriter(random, d, iwc);
// Writer diagnostics go to System.out only when VERBOSE is set.
158 w.w.setInfoStream(VERBOSE ? System.out : null);
159 for(String term : terms) {
160 Document doc = new Document();
161 Field f = newField(FIELD, term, Field.Store.NO, Field.Index.NOT_ANALYZED);
// NOTE(review): only the first statement of this method is visible in this view.
173 private void close() throws Exception {
// Obtain the Directory from the first sequential sub-reader of r, cast to SegmentReader.
174 final Directory d = ((SegmentReader) r.getSequentialSubReaders()[0]).directory();
// Returns the reader's document frequency for the given term text in FIELD.
179 private int docFreq(IndexReader r, String term) throws Exception {
180 return r.docFreq(new Term(FIELD, term));
// Issues a fixed sequence of docFreq lookups, for present and absent terms,
// against a nine-term index. The inline comments describe how each lookup
// relates to the one before it.
183 public void testEasy() throws Exception {
185 r = makeIndex("aa0", "aa1", "aa2", "aa3", "bb0", "bb1", "bb2", "bb3", "aa");
187 // First term in block:
188 assertEquals(1, docFreq(r, "aa0"));
190 // Scan forward to another term in same block
191 assertEquals(1, docFreq(r, "aa2"));
193 assertEquals(1, docFreq(r, "aa"));
195 // Reset same block then scan forwards
196 assertEquals(1, docFreq(r, "aa1"));
198 // Not found, in same block
199 assertEquals(0, docFreq(r, "aa5"));
201 // Found, in same block
202 assertEquals(1, docFreq(r, "aa2"));
204 // Not found in index:
205 assertEquals(0, docFreq(r, "b0"));
208 assertEquals(1, docFreq(r, "aa2"));
211 assertEquals(1, docFreq(r, "aa0"));
// The same lookup pattern is repeated for the "bb" terms.
214 // First term in block:
215 assertEquals(1, docFreq(r, "bb0"));
217 // Scan forward to another term in same block
218 assertEquals(1, docFreq(r, "bb2"));
220 // Reset same block then scan forwards
221 assertEquals(1, docFreq(r, "bb1"));
223 // Not found, in same block
224 assertEquals(0, docFreq(r, "bb5"));
226 // Found, in same block
227 assertEquals(1, docFreq(r, "bb2"));
229 // Not found in index:
230 assertEquals(0, docFreq(r, "b0"));
233 assertEquals(1, docFreq(r, "bb2"));
236 assertEquals(1, docFreq(r, "bb0"));
242 // - test same prefix has non-floor block and floor block (i.e., has 2 long outputs on same term prefix)
243 // - term that's entirely in the index
// Runs a fixed sequence of docFreq lookups over a 12-term index (ten "aaN"
// terms plus "aa" and "xx"), then opens an enum at FIELD and finishes with
// testRandomSeeks over the same terms.
// NOTE(review): the statements that consume the enum `te` are not visible in this view.
245 public void testFloorBlocks() throws Exception {
246 final String[] terms = new String[] {"aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9", "aa", "xx"};
247 r = makeIndex(terms);
248 //r = makeIndex("aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9");
250 // First term in first block:
251 assertEquals(1, docFreq(r, "aa0"));
252 assertEquals(1, docFreq(r, "aa4"));
// Absent term, then back to a present one.
255 assertEquals(0, docFreq(r, "bb0"));
258 assertEquals(1, docFreq(r, "aa4"));
260 // Backwards to prior floor block:
261 assertEquals(1, docFreq(r, "aa0"));
263 // Forwards to last floor block:
264 assertEquals(1, docFreq(r, "aa9"));
// Alternate between the absent prefix "a" and the present term "aa".
266 assertEquals(0, docFreq(r, "a"));
267 assertEquals(1, docFreq(r, "aa"));
268 assertEquals(0, docFreq(r, "a"));
269 assertEquals(1, docFreq(r, "aa"));
271 // Forwards to last floor block:
272 assertEquals(1, docFreq(r, "xx"));
273 assertEquals(1, docFreq(r, "aa1"));
274 assertEquals(0, docFreq(r, "yy"));
276 assertEquals(1, docFreq(r, "xx"));
277 assertEquals(1, docFreq(r, "aa9"));
279 assertEquals(1, docFreq(r, "xx"));
280 assertEquals(1, docFreq(r, "aa4"));
282 final TermEnum te = r.terms(new Term(FIELD));
284 //System.out.println("TEST: next term=" + te.term().utf8ToString());
287 testRandomSeeks(r, terms);
// Checks enumeration of "field" after a delete-by-term on Term("field", "one"):
// the reader must report numDocs and maxDoc of 1, and the enum opened at
// "field" must either have no next term or move to a term of another field.
// NOTE(review): the directory creation and the addDocument calls are not
// visible in this view.
291 public void testZeroTerms() throws Exception {
293 final RandomIndexWriter w = new RandomIndexWriter(random, d);
294 w.w.setInfoStream(VERBOSE ? System.out : null);
// Two documents are built: one with "field", one with "field2", same text.
295 Document doc = new Document();
296 doc.add(newField("field", "one two three", Field.Store.NO, Field.Index.ANALYZED));
297 doc = new Document();
298 doc.add(newField("field2", "one two three", Field.Store.NO, Field.Index.ANALYZED));
// Delete by a term in "field" only.
301 w.deleteDocuments(new Term("field", "one"));
303 IndexReader r = w.getReader();
305 assertEquals(1, r.numDocs());
306 assertEquals(1, r.maxDoc());
307 TermEnum terms = r.terms(new Term("field"));
309 assertTrue(!terms.next() || !"field".equals(terms.term().field()));
// Returns a random string from _TestUtil.randomRealisticUnicodeString; the
// simple-string alternative is kept commented out.
315 private String getRandomString() {
316 //return _TestUtil.randomSimpleString(random);
317 return _TestUtil.randomRealisticUnicodeString(random);
// Generates distinct random term strings, indexes them with makeIndex and
// runs testRandomSeeks against the resulting reader.
// NOTE(review): the statements that record terms in `seen` and the handling
// of short prefixes are not visible in this view.
320 public void testRandomTerms() throws Exception {
// Array length is chosen by _TestUtil.nextInt(random, 1, atLeast(1000)).
321 final String[] terms = new String[_TestUtil.nextInt(random, 1, atLeast(1000))];
322 final Set<String> seen = new HashSet<String>();
// Coin flip deciding whether the empty string may be used as a term.
324 final boolean allowEmptyString = random.nextBoolean();
// One time in ten (and only with more than 2 terms) take the shared-prefix path.
326 if (random.nextInt(10) == 7 && terms.length > 2) {
327 // Sometimes add a bunch of terms sharing a longish common prefix:
328 final int numTermsSamePrefix = random.nextInt(terms.length/2);
329 if (numTermsSamePrefix > 0) {
332 prefix = getRandomString();
333 if (prefix.length() < 5) {
// Fill slots with prefix + random suffix until numTermsSamePrefix distinct terms exist.
339 while(seen.size() < numTermsSamePrefix) {
340 final String t = prefix + getRandomString();
341 if (!seen.contains(t)) {
342 terms[seen.size()] = t;
// Fill the remaining slots with unrelated random terms; empty strings are
// accepted only when allowEmptyString is set.
349 while(seen.size() < terms.length) {
350 final String t = getRandomString();
351 if (!seen.contains(t) && (allowEmptyString || t.length() != 0)) {
352 terms[seen.size()] = t;
356 r = makeIndex(terms);
357 testRandomSeeks(r, terms);
// Produces a random BytesRef and tests, via binary search, that it is absent
// from the given array (a negative result means "not found").
// NOTE(review): the surrounding loop and the return are not visible in this view.
// NOTE(review): this search uses BytesRef's natural ordering, whereas
// testRandomSeeks sorts the array with BytesRef.getUTF8SortedAsUTF16Comparator();
// confirm the two orderings agree for the generated terms, since binarySearch
// requires the array to be sorted by the ordering it searches with.
361 private BytesRef getNonExistTerm(BytesRef[] terms) {
364 final String ts = getRandomString();
365 t = new BytesRef(ts);
366 if (Arrays.binarySearch(terms, t) < 0) {
// Seeks to random valid and invalid terms of FIELD by opening an enum at the
// target, checks the term the enum lands on against a sorted copy of the valid
// terms, then performs a random number of next() calls and checks each result.
// NOTE(review): several statements (branch headers, loc adjustments) are not
// visible in this view; comments describe only the lines shown.
372 private void testRandomSeeks(IndexReader r, String... validTermStrings) throws IOException {
373 final BytesRef[] validTerms = new BytesRef[validTermStrings.length];
374 for(int termIDX=0;termIDX<validTermStrings.length;termIDX++) {
375 validTerms[termIDX] = new BytesRef(validTermStrings[termIDX]);
// Sort with the UTF8-sorted-as-UTF16 comparator; the same comparator is used
// for the binary search below.
377 Arrays.sort(validTerms, BytesRef.getUTF8SortedAsUTF16Comparator());
379 System.out.println("TEST: " + validTerms.length + " terms:");
380 for(int idx=0;idx<validTerms.length;idx++) {
381 System.out.println(" " + idx + ": " + validTerms[idx]);
// binarySearch result for a key whose insertion point is the array length,
// i.e. a key that sorts after every valid term.
385 final int END_LOC = -validTerms.length-1;
387 for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
// One time in six, seek to a term that is not in the index.
391 if (random.nextInt(6) == 4) {
392 // pick term that doesn't exist:
393 t = getNonExistTerm(validTerms);
395 System.out.println("\nTEST: invalid term=" + t.utf8ToString());
397 loc = Arrays.binarySearch(validTerms, t, BytesRef.getUTF8SortedAsUTF16Comparator());
// Otherwise seek to a copy of a randomly chosen valid term.
400 loc = random.nextInt(validTerms.length);
401 t = new BytesRef(validTerms[loc]);
403 System.out.println("\nTEST: valid term=" + t.utf8ToString());
406 final Term targetTerm = new Term(FIELD, t.utf8ToString());
409 System.out.println(" seek term=" + targetTerm);
// The seek is performed by opening a new enum at the target term.
412 final TermEnum te = r.terms(targetTerm);
413 Term actualTerm = te.term();
415 System.out.println(" got " + actualTerm);
419 // assertEquals(TermsEnum.SeekStatus.FOUND, result);
// Target sorts past every valid term: the enum must be exhausted or in another field.
420 } else if (loc == END_LOC) {
421 assertTrue(actualTerm == null || !FIELD.equals(actualTerm.field()));
423 assert loc >= -validTerms.length;
424 assertTrue(actualTerm != null && FIELD.equals(actualTerm.field()));
425 //assertEquals(TermsEnum.SeekStatus.NOT_FOUND, result);
429 assertEquals(targetTerm, actualTerm);
430 } else if (loc == END_LOC) {
434 assertEquals(new Term(FIELD, validTerms[loc].utf8ToString()), actualTerm);
437 // Do a bunch of next's after the seek
438 final int numNext = random.nextInt(validTerms.length);
441 System.out.println("\nTEST: numNext=" + numNext);
444 for(int nextCount=0;nextCount<numNext;nextCount++) {
446 System.out.println("\nTEST: next loc=" + loc + " of " + validTerms.length);
448 boolean result = te.next();
449 actualTerm = te.term();
// Once loc reaches the number of valid terms the enum must be exhausted or in
// another field; otherwise it must report validTerms[loc].
452 if (loc == validTerms.length) {
454 System.out.println(" actual=null");
457 assertTrue(actualTerm == null || !FIELD.equals(actualTerm.field()));
461 System.out.println(" actual=" + new BytesRef(actualTerm.text()));
464 assertTrue(actualTerm != null && FIELD.equals(actualTerm.field()));
465 assertEquals(validTerms[loc], new BytesRef(actualTerm.text()));