pylucene 3.5.0-3

[pylucene.git] / lucene-java-3.5.0 / lucene / src / test / org / apache / lucene / index / TestTermsEnum.java
diff --git a/lucene-java-3.5.0/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java b/lucene-java-3.5.0/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java

new file mode 100644 (file)

index 0000000..6e2e569
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/src/test/org/apache/lucene/index/TestTermsEnum.java
@@ -0,0 +1,470 @@
+package org.apache.lucene.index;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.ArrayList;
+import java.util.Arrays;
+import java.util.Collections;
+import java.util.HashSet;
+import java.util.List;
+import java.util.Set;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.LineFileDocs;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util._TestUtil;
+
+public class TestTermsEnum extends LuceneTestCase {
+
+  public void test() throws Exception {
+    final LineFileDocs docs = new LineFileDocs(random);
+    final Directory d = newDirectory();
+    final RandomIndexWriter w = new RandomIndexWriter(random, d);
+    final int numDocs = atLeast(10);
+    for(int docCount=0;docCount<numDocs;docCount++) {
+      w.addDocument(docs.nextDoc());
+    }
+    final IndexReader r = w.getReader();
+    w.close();
+
+    final List<Term> terms = new ArrayList<Term>();
+    TermEnum termEnum = r.terms(new Term("body"));
+    do {
+      Term term = termEnum.term();
+      if (term == null || !"body".equals(term.field())) {
+        break;
+      }
+      terms.add(term);
+    } while (termEnum.next());
+
+    if (VERBOSE) {
+      System.out.println("TEST: " + terms.size() + " terms");
+    }
+
+    int upto = -1;
+    final int iters = atLeast(200);
+    for(int iter=0;iter<iters;iter++) {
+      final boolean isEnd;
+      if (upto != -1 && random.nextBoolean()) {
+        // next
+        if (VERBOSE) {
+          System.out.println("TEST: iter next");
+        }
+        termEnum.next();
+        isEnd = termEnum.term() == null || !"body".equals(termEnum.term().field());
+        upto++;
+        if (isEnd) {
+          if (VERBOSE) {
+            System.out.println("  end");
+          }
+          assertEquals(upto, terms.size());
+          upto = -1;
+        } else {
+          if (VERBOSE) {
+            System.out.println("  got term=" + termEnum.term() + " expected=" + terms.get(upto));
+          }
+          assertTrue(upto < terms.size());
+          assertEquals(terms.get(upto), termEnum.term());
+        }
+      } else {
+
+        final Term target;
+        final String exists;
+        if (random.nextBoolean()) {
+          // likely fake term
+          if (random.nextBoolean()) {
+            target = new Term("body",
+                              _TestUtil.randomSimpleString(random));
+          } else {
+            target = new Term("body",
+                              _TestUtil.randomRealisticUnicodeString(random));
+          }
+          exists = "likely not";
+        } else {
+          // real term 
+          target = terms.get(random.nextInt(terms.size()));
+          exists = "yes";
+        }
+
+        upto = Collections.binarySearch(terms, target);
+
+        if (VERBOSE) {
+          System.out.println("TEST: iter seekCeil target=" + target + " exists=" + exists);
+        }
+        termEnum = r.terms(target);
+        final Term actualTerm = termEnum.term();
+
+        if (VERBOSE) {
+          System.out.println("  got term=" + actualTerm);
+        }
+          
+        if (upto < 0) {
+          upto = -(upto+1);
+          if (upto >= terms.size()) {
+            assertTrue(actualTerm == null || !"body".equals(actualTerm.field()));
+            upto = -1;
+          } else {
+            assertTrue(actualTerm != null && "body".equals(actualTerm.field()));
+            assertEquals(terms.get(upto), actualTerm);
+          }
+        } else {
+          assertEquals(terms.get(upto), actualTerm);
+        }
+      }
+    }
+
+    r.close();
+    d.close();
+  }
+
+  // Directory most recently created by makeIndex() or testZeroTerms().
+  private Directory d;
+  // Reader most recently returned by makeIndex(); closed by close().
+  private IndexReader r;
+
+  // Name of the single field that makeIndex() indexes and the seek helpers query.
+  private final String FIELD = "field";
+
+  private IndexReader makeIndex(String... terms) throws Exception {
+    d = newDirectory();
+    IndexWriterConfig iwc = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random));
+
+    /*
+    CoreCodecProvider cp = new CoreCodecProvider();    
+    cp.unregister(cp.lookup("Standard"));
+    cp.register(new StandardCodec(minTermsInBlock, maxTermsInBlock));
+    cp.setDefaultFieldCodec("Standard");
+    iwc.setCodecProvider(cp);
+    */
+
+    final RandomIndexWriter w = new RandomIndexWriter(random, d, iwc);
+    w.w.setInfoStream(VERBOSE ? System.out : null);
+    for(String term : terms) {
+      Document doc = new Document();
+      Field f = newField(FIELD, term, Field.Store.NO, Field.Index.NOT_ANALYZED);
+      doc.add(f);
+      w.addDocument(doc);
+    }
+    if (r != null) {
+      close();
+    }
+    r = w.getReader();
+    w.close();
+    return r;
+  }
+
+  private void close() throws Exception {
+    final Directory d = ((SegmentReader) r.getSequentialSubReaders()[0]).directory();
+    r.close();
+    d.close();
+  }
+
+  private int docFreq(IndexReader r, String term) throws Exception {
+    return r.docFreq(new Term(FIELD, term));
+  }
+
+  public void testEasy() throws Exception {
+    // No floor arcs:
+    r = makeIndex("aa0", "aa1", "aa2", "aa3", "bb0", "bb1", "bb2", "bb3", "aa");
+
+    // First term in block:
+    assertEquals(1, docFreq(r, "aa0"));
+
+    // Scan forward to another term in same block
+    assertEquals(1, docFreq(r, "aa2"));
+
+    assertEquals(1, docFreq(r, "aa"));
+
+    // Reset same block then scan forwards
+    assertEquals(1, docFreq(r, "aa1"));
+
+    // Not found, in same block
+    assertEquals(0, docFreq(r, "aa5"));
+
+    // Found, in same block
+    assertEquals(1, docFreq(r, "aa2"));
+
+    // Not found in index:
+    assertEquals(0, docFreq(r, "b0"));
+
+    // Found:
+    assertEquals(1, docFreq(r, "aa2"));
+
+    // Found, rewind:
+    assertEquals(1, docFreq(r, "aa0"));
+
+
+    // First term in block:
+    assertEquals(1, docFreq(r, "bb0"));
+
+    // Scan forward to another term in same block
+    assertEquals(1, docFreq(r, "bb2"));
+
+    // Reset same block then scan forwards
+    assertEquals(1, docFreq(r, "bb1"));
+
+    // Not found, in same block
+    assertEquals(0, docFreq(r, "bb5"));
+
+    // Found, in same block
+    assertEquals(1, docFreq(r, "bb2"));
+
+    // Not found in index:
+    assertEquals(0, docFreq(r, "b0"));
+
+    // Found:
+    assertEquals(1, docFreq(r, "bb2"));
+
+    // Found, rewind:
+    assertEquals(1, docFreq(r, "bb0"));
+
+    close();
+  }
+
+  // tests:
+  //   - test same prefix has non-floor block and floor block (ie, has 2 long outputs on same term prefix)
+  //   - term that's entirely in the index
+
+  public void testFloorBlocks() throws Exception {
+    final String[] terms = new String[] {"aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9", "aa", "xx"};
+    r = makeIndex(terms);
+    //r = makeIndex("aa0", "aa1", "aa2", "aa3", "aa4", "aa5", "aa6", "aa7", "aa8", "aa9");
+
+    // First term in first block:
+    assertEquals(1, docFreq(r, "aa0"));
+    assertEquals(1, docFreq(r, "aa4"));
+
+    // No block
+    assertEquals(0, docFreq(r, "bb0"));
+
+    // Second block
+    assertEquals(1, docFreq(r, "aa4"));
+
+    // Backwards to prior floor block:
+    assertEquals(1, docFreq(r, "aa0"));
+
+    // Forwards to last floor block:
+    assertEquals(1, docFreq(r, "aa9"));
+
+    assertEquals(0, docFreq(r, "a"));
+    assertEquals(1, docFreq(r, "aa"));
+    assertEquals(0, docFreq(r, "a"));
+    assertEquals(1, docFreq(r, "aa"));
+
+    // Forwards to last floor block:
+    assertEquals(1, docFreq(r, "xx"));
+    assertEquals(1, docFreq(r, "aa1"));
+    assertEquals(0, docFreq(r, "yy"));
+
+    assertEquals(1, docFreq(r, "xx"));
+    assertEquals(1, docFreq(r, "aa9"));
+
+    assertEquals(1, docFreq(r, "xx"));
+    assertEquals(1, docFreq(r, "aa4"));
+
+    final TermEnum te = r.terms(new Term(FIELD));
+    while(te.next()) {
+      //System.out.println("TEST: next term=" + te.term().utf8ToString());
+    }
+
+    testRandomSeeks(r, terms);
+    close();
+  }
+
+  public void testZeroTerms() throws Exception {
+    d = newDirectory();
+    final RandomIndexWriter w = new RandomIndexWriter(random, d);
+    w.w.setInfoStream(VERBOSE ? System.out : null);
+    Document doc = new Document();
+    doc.add(newField("field", "one two three", Field.Store.NO, Field.Index.ANALYZED));
+    doc = new Document();
+    doc.add(newField("field2", "one two three", Field.Store.NO, Field.Index.ANALYZED));
+    w.addDocument(doc);
+    w.commit();
+    w.deleteDocuments(new Term("field", "one"));
+    w.forceMerge(1);
+    IndexReader r = w.getReader();
+    w.close();
+    assertEquals(1, r.numDocs());
+    assertEquals(1, r.maxDoc());
+    TermEnum terms = r.terms(new Term("field"));
+    if (terms != null) {
+      assertTrue(!terms.next() || !"field".equals(terms.term().field()));
+    }
+    r.close();
+    d.close();
+  }
+
+  private String getRandomString() {
+    //return _TestUtil.randomSimpleString(random);
+    return _TestUtil.randomRealisticUnicodeString(random);
+  }
+
+  public void testRandomTerms() throws Exception {
+    final String[] terms = new String[_TestUtil.nextInt(random, 1, atLeast(1000))];
+    final Set<String> seen = new HashSet<String>();
+
+    final boolean allowEmptyString = random.nextBoolean();
+
+    if (random.nextInt(10) == 7 && terms.length > 2) {
+      // Sometimes add a bunch of terms sharing a longish common prefix:
+      final int numTermsSamePrefix = random.nextInt(terms.length/2);
+      if (numTermsSamePrefix > 0) {
+        String prefix;
+        while(true) {
+          prefix = getRandomString();
+          if (prefix.length() < 5) {
+            continue;
+          } else {
+            break;
+          }
+        }
+        while(seen.size() < numTermsSamePrefix) {
+          final String t = prefix + getRandomString();
+          if (!seen.contains(t)) {
+            terms[seen.size()] = t;
+            seen.add(t);
+          }
+        }
+      }
+    }
+
+    while(seen.size() < terms.length) {
+      final String t = getRandomString();
+      if (!seen.contains(t) && (allowEmptyString || t.length() != 0)) {
+        terms[seen.size()] = t;
+        seen.add(t);
+      }
+    }
+    r = makeIndex(terms);
+    testRandomSeeks(r, terms);
+    close();
+  }
+
+  private BytesRef getNonExistTerm(BytesRef[] terms) {
+    BytesRef t = null;
+    while(true) {
+      final String ts = getRandomString();
+      t = new BytesRef(ts);
+      if (Arrays.binarySearch(terms, t) < 0) {
+        return t;
+      }
+    }
+  }
+
+  private void testRandomSeeks(IndexReader r, String... validTermStrings) throws IOException {
+    final BytesRef[] validTerms = new BytesRef[validTermStrings.length];
+    for(int termIDX=0;termIDX<validTermStrings.length;termIDX++) {
+      validTerms[termIDX] = new BytesRef(validTermStrings[termIDX]);
+    }
+    Arrays.sort(validTerms, BytesRef.getUTF8SortedAsUTF16Comparator());
+    if (VERBOSE) {
+      System.out.println("TEST: " + validTerms.length + " terms:");
+      for(int idx=0;idx<validTerms.length;idx++) {
+        System.out.println("  " + idx + ": " + validTerms[idx]);
+      }
+    }
+
+    final int END_LOC = -validTerms.length-1;
+    
+    for(int iter=0;iter<100*RANDOM_MULTIPLIER;iter++) {
+
+      final BytesRef t;
+      int loc;
+      if (random.nextInt(6) == 4) {
+        // pick term that doens't exist:
+        t = getNonExistTerm(validTerms);
+        if (VERBOSE) {
+          System.out.println("\nTEST: invalid term=" + t.utf8ToString());
+        }
+        loc = Arrays.binarySearch(validTerms, t, BytesRef.getUTF8SortedAsUTF16Comparator());
+      } else {
+        // pick valid term
+        loc = random.nextInt(validTerms.length);
+        t = new BytesRef(validTerms[loc]);
+        if (VERBOSE) {
+          System.out.println("\nTEST: valid term=" + t.utf8ToString());
+        }
+      }
+      final Term targetTerm = new Term(FIELD, t.utf8ToString());
+
+      if (VERBOSE) {
+        System.out.println("  seek term=" + targetTerm);
+      }
+
+      final TermEnum te = r.terms(targetTerm);
+      Term actualTerm = te.term();
+      if (VERBOSE) {
+        System.out.println("  got " + actualTerm);
+      }
+
+      if (loc >= 0) {
+        // assertEquals(TermsEnum.SeekStatus.FOUND, result);
+      } else if (loc == END_LOC) {
+        assertTrue(actualTerm == null || !FIELD.equals(actualTerm.field()));
+      } else {
+        assert loc >= -validTerms.length;
+        assertTrue(actualTerm != null && FIELD.equals(actualTerm.field()));
+        //assertEquals(TermsEnum.SeekStatus.NOT_FOUND, result);
+      }
+
+      if (loc >= 0) {
+        assertEquals(targetTerm, actualTerm);
+      } else if (loc == END_LOC) {
+        continue;
+      } else {
+        loc = -loc-1;
+        assertEquals(new Term(FIELD, validTerms[loc].utf8ToString()), actualTerm);
+      }
+
+      // Do a bunch of next's after the seek
+      final int numNext = random.nextInt(validTerms.length);
+
+      if (VERBOSE) {
+        System.out.println("\nTEST: numNext=" + numNext);
+      }
+
+      for(int nextCount=0;nextCount<numNext;nextCount++) {
+        if (VERBOSE) {
+          System.out.println("\nTEST: next loc=" + loc + " of " + validTerms.length);
+        }
+        boolean result = te.next();
+        actualTerm = te.term();
+        loc++;
+
+        if (loc == validTerms.length) {
+          if (VERBOSE) {
+            System.out.println("  actual=null");
+          }
+          assertFalse(result);
+          assertTrue(actualTerm == null || !FIELD.equals(actualTerm.field()));
+          break;
+        } else {
+          if (VERBOSE) {
+            System.out.println("  actual=" + new BytesRef(actualTerm.text()));
+          }
+          assertTrue(result);
+          assertTrue(actualTerm != null && FIELD.equals(actualTerm.field()));
+          assertEquals(validTerms[loc], new BytesRef(actualTerm.text()));
+        }
+      }
+    }
+  }
+}