lucene-java-3.5.0/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java

   1 package org.apache.lucene.index;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.ArrayList;
  22 import java.util.Arrays;
  23 import java.util.Collections;
  24 import java.util.HashSet;
  25 import java.util.List;
  26 import java.util.Set;
  27
  28 import org.apache.lucene.analysis.MockAnalyzer;
  29 import org.apache.lucene.document.Document;
  30 import org.apache.lucene.document.Field;
  31 import org.apache.lucene.store.Directory;
  32 import org.apache.lucene.util.BytesRef;
  33 import org.apache.lucene.util.LineFileDocs;
  34 import org.apache.lucene.util.LuceneTestCase;
  35 import org.apache.lucene.util._TestUtil;
  36
  37 public class TestTermsEnum extends LuceneTestCase {
  38
  39   public void test() throws Exception {
  40     final LineFileDocs docs = new LineFileDocs(random);
  41     final Directory d = newDirectory();
  42     final RandomIndexWriter w = new RandomIndexWriter(random, d);
  43     final int numDocs = atLeast(10);
  44     for(int docCount=0;docCount<numDocs;docCount++) {
  45       w.addDocument(docs.nextDoc());
  46     }
  47     final IndexReader r = w.getReader();
  48     w.close();
  49
  50     final List<Term> terms = new ArrayList<Term>();
  51     TermEnum termEnum = r.terms(new Term("body"));
  52     do {
  53       Term term = termEnum.term();
  54       if (term == null || !"body".equals(term.field())) {
  55         break;
  56       }
  57       terms.add(term);
  58     } while (termEnum.next());
  59
  60     if (VERBOSE) {
  61       System.out.println("TEST: " + terms.size() + " terms");
  62     }
  63
  64     int upto = -1;
  65     final int iters = atLeast(200);
  66     for(int iter=0;iter<iters;iter++) {
  67       final boolean isEnd;
  68       if (upto != -1 && random.nextBoolean()) {
  69         // next
  70         if (VERBOSE) {
  71           System.out.println("TEST: iter next");
  72         }
  73         termEnum.next();
  74         isEnd = termEnum.term() == null || !"body".equals(termEnum.term().field());
  75         upto++;
  76         if (isEnd) {
  77           if (VERBOSE) {
  78             System.out.println("  end");
  79           }
  80           assertEquals(upto, terms.size());
  81           upto = -1;
  82         } else {
  83           if (VERBOSE) {
  84             System.out.println("  got term=" + termEnum.term() + " expected=" + terms.get(upto));
  85           }
  86           assertTrue(upto < terms.size());
  87           assertEquals(terms.get(upto), termEnum.term());
  88         }
  89       } else {
  90
  91         final Term target;
  92         final String exists;
  93         if (random.nextBoolean()) {
  94           // likely fake term
  95           if (random.nextBoolean()) {
  96             target = new Term("body",
  97                               _TestUtil.randomSimpleString(random));
  98           } else {
  99             target = new Term("body",
 100                               _TestUtil.randomRealisticUnicodeString(random));
 101           }
 102           exists = "likely not";
 103         } else {
 104           // real term
 105           target = terms.get(random.nextInt(terms.size()));
 106           exists = "yes";
 107         }
 108
 109         upto = Collections.binarySearch(terms, target);
 110
 111         if (VERBOSE) {
 112           System.out.println("TEST: iter seekCeil target=" + target + " exists=" + exists);
 113         }
 114         termEnum = r.terms(target);
 115         final Term actualTerm = termEnum.term();
 116
 117         if (VERBOSE) {
 118           System.out.println("  got term=" + actualTerm);
 119         }
 120
 121         if (upto < 0) {
 122           upto = -(upto+1);
 123           if (upto >= terms.size()) {
 124             assertTrue(actualTerm == null || !"body".equals(actualTerm.field()));
 125             upto = -1;
 126           } else {
 127             assertTrue(actualTerm != null && "body".equals(actualTerm.field()));
 128             assertEquals(terms.get(upto), actualTerm);
 129           }
 130         } else {
 131           assertEquals(terms.get(upto), actualTerm);
 132         }
 133       }
 134     }
 135
 136     r.close();
 137     d.close();
 138   }
 139
 140   private Directory d;
 141   private IndexReader r;
 142
 143   private final String FIELD = "field";
 144
 145   private IndexReader makeIndex(String... terms) throws Exception {
 146     d = newDirectory();
 147     IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
 148
 149     /*
 150     CoreCodecProvider cp = new CoreCodecProvider();
 151     cp.unregister(cp.lookup("Standard"));
 152     cp.register(new StandardCodec(minTermsInBlock, maxTermsInBlock));
 153     cp.setDefaultFieldCodec("Standard");
 154     iwc.setCodecProvider(cp);
 155     */
 156
 157     final RandomIndexWriter w = new RandomIndexWriter(random, d, iwc);
 158     w.w.setInfoStream(VERBOSE ? System.out : null);
 159     for(String term : terms) {
 160       Document doc = new Document();
 161       Field f = newField(FIELD, term, Field.Store.NO, Field.Index.NOT_ANALYZED);
 162       doc.add(f);
 163       w.addDocument(doc);
 164     }
 165     if (r != null) {
 166       close();
 167     }
 168     r = w.getReader();
 169     w.close();
 170     return r;
 171   }
 172
 173   private void close() throws Exception {
 174     final Directory d = ((SegmentReader) r.getSequentialSubReaders()[0]).directory();
 175     r.close();
 176     d.close();
 177   }
 178
 179   private int docFreq(IndexReader r, String term) throws Exception {
 180     return r.docFreq(new Term(FIELD, term));
 181   }
 182
 183   public void testEasy() throws Exception {
 184     // No floor arcs:
 185     r = makeIndex("aa0", "aa1", "aa2", "aa3", "bb0", "bb1", "bb2", "bb3", "aa");
 186
 187     // First term in block:
 188     assertEquals(1, docFreq(r, "aa0"));
 189
 190     // Scan forward to another term in same block
 191     assertEquals(1, docFreq(r, "aa2"));
 192
 193     assertEquals(1, docFreq(r, "aa"));
 194
 195     // Reset same block then scan forwards
 196     assertEquals(1, docFreq(r, "aa1"));
 197
 198     // Not found, in same block
 199     assertEquals(0, docFreq(r, "aa5"));
 200
 201     // Found, in same block
 202     assertEquals(1, docFreq(r, "aa2"));
 203
 204     // Not found in index:
 205     assertEquals(0, docFreq(r, "b0"));
 206
 207     // Found:
 208     assertEquals(1, docFreq(r, "aa2"));
 209
 210     // Found, rewind:
 211     assertEquals(1, docFreq(r, "aa0"));
 212
 213
 214     // First term in block:
 215     assertEquals(1, docFreq(r, "bb0"));
 216
 217     // Scan forward to another term in same block
 218     assertEquals(1, docFreq(r, "bb2"));
 219
 220     // Reset same block then scan forwards
 221     assertEquals(1, docFreq(r, "bb1"));
 222
 223     // Not found, in same block
 224     assertEquals(0, docFreq(r, "bb5"));
 225
 226     // Found, in same block
 227     assertEquals(1, docFreq(r, "bb2"));
 228
 229     // Not found in index:
 230     assertEquals(0, docFreq(r, "b0"));
 231
 232     // Found:
 233     assertEquals(1, docFreq(r, "bb2"));
 234
 235     // Found, rewind:
 236     assertEquals(1, docFreq(r, "bb0"));
 237
 238     close();
 239   }
 240
 241   // tests:
 242   //   - test same prefix has non-floor block and floor block (ie, has 2 long outputs on same term prefix)
 243   //   - term that's entirely in the index
 244
 245   public void testFloorBlocks() throws Exception {
 246     final String[] terms = new String[] {"aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9", "aa", "xx"};
 247     r = makeIndex(terms);
 248     //r = makeIndex("aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9");
 249
 250     // First term in first block:
 251     assertEquals(1, docFreq(r, "aa0"));
 252     assertEquals(1, docFreq(r, "aa4"));
 253
 254     // No block
 255     assertEquals(0, docFreq(r, "bb0"));
 256
 257     // Second block
 258     assertEquals(1, docFreq(r, "aa4"));
 259
 260     // Backwards to prior floor block:
 261     assertEquals(1, docFreq(r, "aa0"));
 262
 263     // Forwards to last floor block:
 264     assertEquals(1, docFreq(r, "aa9"));
 265
 266     assertEquals(0, docFreq(r, "a"));
 267     assertEquals(1, docFreq(r, "aa"));
 268     assertEquals(0, docFreq(r, "a"));
 269     assertEquals(1, docFreq(r, "aa"));
 270
 271     // Forwards to last floor block:
 272     assertEquals(1, docFreq(r, "xx"));
 273     assertEquals(1, docFreq(r, "aa1"));
 274     assertEquals(0, docFreq(r, "yy"));
 275
 276     assertEquals(1, docFreq(r, "xx"));
 277     assertEquals(1, docFreq(r, "aa9"));
 278
 279     assertEquals(1, docFreq(r, "xx"));
 280     assertEquals(1, docFreq(r, "aa4"));
 281
 282     final TermEnum te = r.terms(new Term(FIELD));
 283     while(te.next()) {
 284       //System.out.println("TEST: next term=" + te.term().utf8ToString());
 285     }
 286
 287     testRandomSeeks(r, terms);
 288     close();
 289   }
 290
 291   public void testZeroTerms() throws Exception {
 292     d = newDirectory();
 293     final RandomIndexWriter w = new RandomIndexWriter(random, d);
 294     w.w.setInfoStream(VERBOSE ? System.out : null);
 295     Document doc = new Document();
 296     doc.add(newField("field", "one two three", Field.Store.NO, Field.Index.ANALYZED));
 297     doc = new Document();
 298     doc.add(newField("field2", "one two three", Field.Store.NO, Field.Index.ANALYZED));
 299     w.addDocument(doc);
 300     w.commit();
 301     w.deleteDocuments(new Term("field", "one"));
 302     w.forceMerge(1);
 303     IndexReader r = w.getReader();
 304     w.close();
 305     assertEquals(1, r.numDocs());
 306     assertEquals(1, r.maxDoc());
 307     TermEnum terms = r.terms(new Term("field"));
 308     if (terms != null) {
 309       assertTrue(!terms.next() || !"field".equals(terms.term().field()));
 310     }
 311     r.close();
 312     d.close();
 313   }
 314
 315   private String getRandomString() {
 316     //return _TestUtil.randomSimpleString(random);
 317     return _TestUtil.randomRealisticUnicodeString(random);
 318   }
 319
 320   public void testRandomTerms() throws Exception {
 321     final String[] terms = new String[_TestUtil.nextInt(random, 1, atLeast(1000))];
 322     final Set<String> seen = new HashSet<String>();
 323
 324     final boolean allowEmptyString = random.nextBoolean();
 325
 326     if (random.nextInt(10) == 7 && terms.length > 2) {
 327       // Sometimes add a bunch of terms sharing a longish common prefix:
 328       final int numTermsSamePrefix = random.nextInt(terms.length/2);
 329       if (numTermsSamePrefix > 0) {
 330         String prefix;
 331         while(true) {
 332           prefix = getRandomString();
 333           if (prefix.length() < 5) {
 334             continue;
 335           } else {
 336             break;
 337           }
 338         }
 339         while(seen.size() < numTermsSamePrefix) {
 340           final String t = prefix + getRandomString();
 341           if (!seen.contains(t)) {
 342             terms[seen.size()] = t;
 343             seen.add(t);
 344           }
 345         }
 346       }
 347     }
 348
 349     while(seen.size() < terms.length) {
 350       final String t = getRandomString();
 351       if (!seen.contains(t) && (allowEmptyString || t.length() != 0)) {
 352         terms[seen.size()] = t;
 353         seen.add(t);
 354       }
 355     }
 356     r = makeIndex(terms);
 357     testRandomSeeks(r, terms);
 358     close();
 359   }
 360
 361   private BytesRef getNonExistTerm(BytesRef[] terms) {
 362     BytesRef t = null;
 363     while(true) {
 364       final String ts = getRandomString();
 365       t = new BytesRef(ts);
 366       if (Arrays.binarySearch(terms, t) < 0) {
 367         return t;
 368       }
 369     }
 370   }
 371
 372   private void testRandomSeeks(IndexReader r, String... validTermStrings) throws IOException {
 373     final BytesRef[] validTerms = new BytesRef[validTermStrings.length];
 374     for(int termIDX=0;termIDX<validTermStrings.length;termIDX++) {
 375       validTerms[termIDX] = new BytesRef(validTermStrings[termIDX]);
 376     }
 377     Arrays.sort(validTerms, BytesRef.getUTF8SortedAsUTF16Comparator());
 378     if (VERBOSE) {
 379       System.out.println("TEST: " + validTerms.length + " terms:");
 380       for(int idx=0;idx<validTerms.length;idx++) {
 381         System.out.println("  " + idx + ": " + validTerms[idx]);
 382       }
 383     }
 384
 385     final int END_LOC = -validTerms.length-1;
 386
 387     for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
 388
 389       final BytesRef t;
 390       int loc;
 391       if (random.nextInt(6) == 4) {
 392         // pick term that doens't exist:
 393         t = getNonExistTerm(validTerms);
 394         if (VERBOSE) {
 395           System.out.println("\nTEST: invalid term=" + t.utf8ToString());
 396         }
 397         loc = Arrays.binarySearch(validTerms, t, BytesRef.getUTF8SortedAsUTF16Comparator());
 398       } else {
 399         // pick valid term
 400         loc = random.nextInt(validTerms.length);
 401         t = new BytesRef(validTerms[loc]);
 402         if (VERBOSE) {
 403           System.out.println("\nTEST: valid term=" + t.utf8ToString());
 404         }
 405       }
 406       final Term targetTerm = new Term(FIELD, t.utf8ToString());
 407
 408       if (VERBOSE) {
 409         System.out.println("  seek term=" + targetTerm);
 410       }
 411
 412       final TermEnum te = r.terms(targetTerm);
 413       Term actualTerm = te.term();
 414       if (VERBOSE) {
 415         System.out.println("  got " + actualTerm);
 416       }
 417
 418       if (loc >= 0) {
 419         // assertEquals(TermsEnum.SeekStatus.FOUND, result);
 420       } else if (loc == END_LOC) {
 421         assertTrue(actualTerm == null || !FIELD.equals(actualTerm.field()));
 422       } else {
 423         assert loc >= -validTerms.length;
 424         assertTrue(actualTerm != null && FIELD.equals(actualTerm.field()));
 425         //assertEquals(TermsEnum.SeekStatus.NOT_FOUND, result);
 426       }
 427
 428       if (loc >= 0) {
 429         assertEquals(targetTerm, actualTerm);
 430       } else if (loc == END_LOC) {
 431         continue;
 432       } else {
 433         loc = -loc-1;
 434         assertEquals(new Term(FIELD, validTerms[loc].utf8ToString()), actualTerm);
 435       }
 436
 437       // Do a bunch of next's after the seek
 438       final int numNext = random.nextInt(validTerms.length);
 439
 440       if (VERBOSE) {
 441         System.out.println("\nTEST: numNext=" + numNext);
 442       }
 443
 444       for(int nextCount=0;nextCount<numNext;nextCount++) {
 445         if (VERBOSE) {
 446           System.out.println("\nTEST: next loc=" + loc + " of " + validTerms.length);
 447         }
 448         boolean result = te.next();
 449         actualTerm = te.term();
 450         loc++;
 451
 452         if (loc == validTerms.length) {
 453           if (VERBOSE) {
 454             System.out.println("  actual=null");
 455           }
 456           assertFalse(result);
 457           assertTrue(actualTerm == null || !FIELD.equals(actualTerm.field()));
 458           break;
 459         } else {
 460           if (VERBOSE) {
 461             System.out.println("  actual=" + new BytesRef(actualTerm.text()));
 462           }
 463           assertTrue(result);
 464           assertTrue(actualTerm != null && FIELD.equals(actualTerm.field()));
 465           assertEquals(validTerms[loc], new BytesRef(actualTerm.text()));
 466         }
 467       }
 468     }
 469   }
 470 }