+++ /dev/null
-package org.apache.lucene.index;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import org.apache.lucene.util.*;
-import org.apache.lucene.store.*;
-import org.apache.lucene.search.*;
-import org.apache.lucene.analysis.*;
-import org.apache.lucene.analysis.tokenattributes.*;
-import org.apache.lucene.document.*;
-import org.apache.lucene.index.FieldInfo.IndexOptions;
-import java.io.File;
-import java.io.IOException;
-import java.io.UnsupportedEncodingException;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.List;
-import org.junit.Ignore;
-
-// Best to run this test w/ plenty of RAM (because of the
-// terms index):
-//
-// ant compile-test
-//
-// java -server -Xmx8g -d64 -cp .:lib/junit-4.7.jar:./build/classes/test:./build/classes/test-framework:./build/classes/java -Dlucene.version=4.0-dev -Dtests.directory=MMapDirectory -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms
-//
-
-public class Test2BTerms extends LuceneTestCase {
-
- private final class MyTokenStream extends TokenStream {
-
- private final int tokensPerDoc;
- private int tokenCount;
- private final CharTermAttribute charTerm;
- private final static int TOKEN_LEN = 5;
- private final char[] chars;
- public final List<String> savedTerms = new ArrayList<String>();
- private int nextSave;
-
- public MyTokenStream(int tokensPerDoc) {
- super();
- this.tokensPerDoc = tokensPerDoc;
- charTerm = addAttribute(CharTermAttribute.class);
- chars = charTerm.resizeBuffer(TOKEN_LEN);
- charTerm.setLength(TOKEN_LEN);
- nextSave = _TestUtil.nextInt(random, 500000, 1000000);
- }
-
- @Override
- public boolean incrementToken() {
- if (tokenCount >= tokensPerDoc) {
- return false;
- }
- _TestUtil.randomFixedLengthUnicodeString(random, chars, 0, TOKEN_LEN);
- tokenCount++;
- if (--nextSave == 0) {
- final String s = new String(chars, 0, TOKEN_LEN);
- System.out.println("TEST: save term=" + s + " [" + toHexString(s) + "]");
- savedTerms.add(s);
- nextSave = _TestUtil.nextInt(random, 500000, 1000000);
- }
- return true;
- }
-
- @Override
- public void reset() {
- tokenCount = 0;
- }
- }
-
- @Ignore("Takes ~4 hours to run on a fast machine!!")
- public void test2BTerms() throws IOException {
-
- final long TERM_COUNT = ((long) Integer.MAX_VALUE) + 100000000;
-
- final int TERMS_PER_DOC = _TestUtil.nextInt(random, 100000, 1000000);
-
- List<String> savedTerms = null;
-
- MockDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BTerms"));
- dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER);
- dir.setCheckIndexOnClose(false); // don't double-checkindex
- //Directory dir = newFSDirectory(new File("/p/lucene/indices/2bindex"));
-
- if (true) {
-
- IndexWriter w = new IndexWriter(dir,
- new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
- .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
- .setRAMBufferSizeMB(256.0)
- .setMergeScheduler(new ConcurrentMergeScheduler())
- .setMergePolicy(newLogMergePolicy(false, 10))
- .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
-
- MergePolicy mp = w.getConfig().getMergePolicy();
- if (mp instanceof LogByteSizeMergePolicy) {
- // 1 petabyte:
- ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024*1024*1024);
- }
-
- Document doc = new Document();
-
- final MyTokenStream ts = new MyTokenStream(TERMS_PER_DOC);
- Field field = new Field("field", ts);
- field.setIndexOptions(IndexOptions.DOCS_ONLY);
- field.setOmitNorms(true);
- doc.add(field);
- //w.setInfoStream(System.out);
- final int numDocs = (int) (TERM_COUNT/TERMS_PER_DOC);
-
- System.out.println("TERMS_PER_DOC=" + TERMS_PER_DOC);
- System.out.println("numDocs=" + numDocs);
-
- for(int i=0;i<numDocs;i++) {
- final long t0 = System.currentTimeMillis();
- w.addDocument(doc);
- System.out.println(i + " of " + numDocs + " " + (System.currentTimeMillis()-t0) + " msec");
- }
- savedTerms = ts.savedTerms;
-
- System.out.println("TEST: optimize");
- w.optimize();
- System.out.println("TEST: close writer");
- w.close();
- }
-
- System.out.println("TEST: open reader");
- final IndexReader r = IndexReader.open(dir);
- if (savedTerms == null) {
- savedTerms = findTerms(r);
- }
- final int numSavedTerms = savedTerms.size();
- final List<String> bigOrdTerms = new ArrayList<String>(savedTerms.subList(numSavedTerms-10, numSavedTerms));
- System.out.println("TEST: test big ord terms...");
- testSavedTerms(r, bigOrdTerms);
- System.out.println("TEST: test all saved terms...");
- testSavedTerms(r, savedTerms);
- r.close();
-
- System.out.println("TEST: now CheckIndex...");
- CheckIndex.Status status = _TestUtil.checkIndex(dir);
- final long tc = status.segmentInfos.get(0).termIndexStatus.termCount;
- assertTrue("count " + tc + " is not > " + Integer.MAX_VALUE, tc > Integer.MAX_VALUE);
- dir.close();
- }
-
- private List<String> findTerms(IndexReader r) throws IOException {
- System.out.println("TEST: findTerms");
- final TermEnum termEnum = r.terms();
- final List<String> savedTerms = new ArrayList<String>();
- int nextSave = _TestUtil.nextInt(random, 500000, 1000000);
- while(termEnum.next()) {
- if (--nextSave == 0) {
- savedTerms.add(termEnum.term().text());
- System.out.println("TEST: add " + termEnum.term());
- nextSave = _TestUtil.nextInt(random, 500000, 1000000);
- }
- }
- return savedTerms;
- }
-
- private String toHexString(String s) {
- byte[] bytes;
- try {
- bytes = s.getBytes("UTF-8");
- } catch (UnsupportedEncodingException uee) {
- throw new RuntimeException(uee);
- }
- StringBuilder sb = new StringBuilder();
- for(byte b : bytes) {
- if (sb.length() > 0) {
- sb.append(' ');
- }
- sb.append(Integer.toHexString(b&0xFF));
- }
- return sb.toString();
- }
-
- private void testSavedTerms(IndexReader r, List<String> terms) throws IOException {
- System.out.println("TEST: run " + terms.size() + " terms on reader=" + r);
- IndexSearcher s = new IndexSearcher(r);
- Collections.shuffle(terms);
- boolean failed = false;
- for(int iter=0;iter<10*terms.size();iter++) {
- final String term = terms.get(random.nextInt(terms.size()));
- System.out.println("TEST: search " + term + " [" + toHexString(term) + "]");
- final long t0 = System.currentTimeMillis();
- final int count = s.search(new TermQuery(new Term("field", term)), 1).totalHits;
- if (count <= 0) {
- System.out.println(" FAILED: count=" + count);
- failed = true;
- }
- final long t1 = System.currentTimeMillis();
- System.out.println(" took " + (t1-t0) + " millis");
-
- final TermEnum termEnum = r.terms(new Term("field", term));
- final String text = termEnum.term().text();
- if (!term.equals(text)) {
- System.out.println(" FAILED: wrong term: got " + text + " [" + toHexString(text) + "]");
- failed = true;
- }
- }
- assertFalse(failed);
- }
-}