1 package org.apache.lucene.index;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.util.*;
21 import org.apache.lucene.store.*;
22 import org.apache.lucene.search.*;
23 import org.apache.lucene.analysis.*;
24 import org.apache.lucene.analysis.tokenattributes.*;
25 import org.apache.lucene.document.*;
27 import java.io.IOException;
28 import java.io.UnsupportedEncodingException;
29 import java.util.ArrayList;
30 import java.util.Collections;
31 import java.util.List;
32 import org.junit.Ignore;
34 // Best to run this test w/ plenty of RAM (because of the
39 // java -server -Xmx8g -d64 -cp .:lib/junit-4.7.jar:./build/classes/test:./build/classes/test-framework:./build/classes/java -Dlucene.version=4.0-dev -Dtests.directory=MMapDirectory -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms
42 public class Test2BTerms extends LuceneTestCase {
  /**
   * TokenStream that emits {@code tokensPerDoc} random fixed-length
   * (TOKEN_LEN = 5 char) tokens per document.  Occasionally a generated
   * term is recorded into {@link #savedTerms} so the test can later verify
   * those exact terms are findable in the 2B+ term index.
   *
   * NOTE(review): the class body is only partially visible in this chunk;
   * the {@code nextSave} counter field and some method bodies are elided.
   */
  private final class MyTokenStream extends TokenStream {

    private final int tokensPerDoc;                // tokens to emit per document
    private int tokenCount;                        // tokens emitted so far for the current doc
    private final CharTermAttribute charTerm;      // term attribute this stream populates
    private final static int TOKEN_LEN = 5;        // every emitted token is exactly 5 chars
    private final char[] chars;                    // backing buffer of charTerm (resized once)
    public final List<String> savedTerms = new ArrayList<String>();
    /**
     * @param tokensPerDoc number of random tokens to emit before this stream
     *                     reports end-of-document
     *
     * NOTE(review): the constructor's closing brace is elided from this chunk.
     */
    public MyTokenStream(int tokensPerDoc) {
      this.tokensPerDoc = tokensPerDoc;
      charTerm = addAttribute(CharTermAttribute.class);
      // Pre-size the term buffer once up front; all tokens are TOKEN_LEN chars,
      // so the buffer and length never need to change again.
      chars = charTerm.resizeBuffer(TOKEN_LEN);
      charTerm.setLength(TOKEN_LEN);
      // Sample roughly one term out of every 500k-1M emitted into savedTerms.
      nextSave = _TestUtil.nextInt(random, 500000, 1000000);
    /**
     * Fills charTerm with the next random TOKEN_LEN-char token.
     *
     * NOTE(review): several lines are elided from this chunk -- the
     * end-of-document return after the tokensPerDoc check, the
     * savedTerms.add(...) inside the save branch, the tokenCount increment,
     * and the final return.  Comments below mark the visible logic only.
     */
    public boolean incrementToken() {
      // Stop once this document's token budget is exhausted.
      if (tokenCount >= tokensPerDoc) {
      // Overwrite the (fixed-size) term buffer in place with random chars.
      _TestUtil.randomFixedLengthUnicodeString(random, chars, 0, TOKEN_LEN);
      // Periodically remember this exact term so the test can search for it later.
      if (--nextSave == 0) {
        final String s = new String(chars, 0, TOKEN_LEN);
        System.out.println("TEST: save term=" + s + " [" + toHexString(s) + "]");
        // Re-arm the sampling counter for the next saved term.
        nextSave = _TestUtil.nextInt(random, 500000, 1000000);
  /**
   * Indexes slightly more than Integer.MAX_VALUE unique-ish terms
   * (~2.1B + 100M token instances) and then verifies that saved terms are
   * searchable and that CheckIndex reports a term count greater than
   * Integer.MAX_VALUE -- i.e. that term counts do not overflow a signed int.
   *
   * NOTE(review): many lines are elided from this chunk, including the
   * w.addDocument(doc) call inside the indexing loop, the w.optimize() /
   * w.close() calls that the "TEST: optimize" / "TEST: close writer" prints
   * announce, the r.close()/dir.close() cleanup, and several closing braces.
   */
  @Ignore("Takes ~4 hours to run on a fast machine!!")
  public void test2BTerms() throws IOException {

    // Total token instances to index: just past the signed-int boundary.
    final long TERM_COUNT = ((long) Integer.MAX_VALUE) + 100000000;

    // Randomize doc size so the doc count varies run to run.
    final int TERMS_PER_DOC = _TestUtil.nextInt(random, 100000, 1000000);

    // Populated by MyTokenStream while indexing; if the index already existed
    // (null here at verify time) the terms are re-sampled via findTerms.
    List<String> savedTerms = null;

    MockDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BTerms"));
    // Throttling would make an already multi-hour test even slower.
    dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER);
    dir.setCheckIndexOnClose(false); // don't double-checkindex
    //Directory dir = newFSDirectory(new File("/p/lucene/indices/2bindex"));

    // Big RAM buffer + manual flush control: minimize segment count/flushes
    // for this enormous index.
    IndexWriter w = new IndexWriter(dir,
        new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
        .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
        .setRAMBufferSizeMB(256.0)
        .setMergeScheduler(new ConcurrentMergeScheduler())
        .setMergePolicy(newLogMergePolicy(false, 10))
        .setOpenMode(IndexWriterConfig.OpenMode.CREATE));

    // Effectively remove the max-merge-size cap so huge segments still merge.
    MergePolicy mp = w.getConfig().getMergePolicy();
    if (mp instanceof LogByteSizeMergePolicy) {
      ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024*1024*1024);

    Document doc = new Document();
    // Reused token stream; one instance feeds every document.
    final MyTokenStream ts = new MyTokenStream(TERMS_PER_DOC);
    Field field = new Field("field", ts);
    // Only term presence matters here; skip freqs/positions/norms to save space.
    field.setOmitTermFreqAndPositions(true);
    field.setOmitNorms(true);

    //w.setInfoStream(System.out);
    final int numDocs = (int) (TERM_COUNT/TERMS_PER_DOC);

    System.out.println("TERMS_PER_DOC=" + TERMS_PER_DOC);
    System.out.println("numDocs=" + numDocs);

    // Indexing loop; the w.addDocument(doc) call is elided from this chunk.
    for(int i=0;i<numDocs;i++) {
      final long t0 = System.currentTimeMillis();
      System.out.println(i + " of " + numDocs + " " + (System.currentTimeMillis()-t0) + " msec");
    savedTerms = ts.savedTerms;

    System.out.println("TEST: optimize");
    System.out.println("TEST: close writer");

    System.out.println("TEST: open reader");
    final IndexReader r = IndexReader.open(dir);
    // Pre-existing index path: sample terms from the reader instead.
    if (savedTerms == null) {
      savedTerms = findTerms(r);
    final int numSavedTerms = savedTerms.size();
    // The last 10 sampled terms have the highest term ords -- these stress
    // the > Integer.MAX_VALUE ord range specifically.
    final List<String> bigOrdTerms = new ArrayList<String>(savedTerms.subList(numSavedTerms-10, numSavedTerms));
    System.out.println("TEST: test big ord terms...");
    testSavedTerms(r, bigOrdTerms);
    System.out.println("TEST: test all saved terms...");
    testSavedTerms(r, savedTerms);

    System.out.println("TEST: now CheckIndex...");
    CheckIndex.Status status = _TestUtil.checkIndex(dir);
    // The point of the test: the term count must exceed signed-int range.
    final long tc = status.segmentInfos.get(0).termIndexStatus.termCount;
    assertTrue("count " + tc + " is not > " + Integer.MAX_VALUE, tc > Integer.MAX_VALUE);
  /**
   * Walks the reader's full TermEnum and samples roughly one term per
   * 500k-1M enumerated, returning the sampled list.  Used when the test
   * reuses a pre-built index and MyTokenStream never ran.
   *
   * NOTE(review): the loop's closing braces and the
   * {@code return savedTerms;} statement are elided from this chunk.
   */
  private List<String> findTerms(IndexReader r) throws IOException {
    System.out.println("TEST: findTerms");
    final TermEnum termEnum = r.terms();
    final List<String> savedTerms = new ArrayList<String>();
    // Same sampling cadence MyTokenStream uses while indexing.
    int nextSave = _TestUtil.nextInt(random, 500000, 1000000);
    while(termEnum.next()) {
      if (--nextSave == 0) {
        savedTerms.add(termEnum.term().text());
        System.out.println("TEST: add " + termEnum.term());
        nextSave = _TestUtil.nextInt(random, 500000, 1000000);
  /**
   * Renders the UTF-8 bytes of {@code s} as space-separated hex pairs, for
   * diagnostics when printing terms that may contain unprintable Unicode.
   *
   * NOTE(review): this chunk is missing the {@code byte[] bytes;} declaration,
   * the {@code try {} opener matched by the visible catch, the catch block's
   * closing brace, and the {@code sb.append(' ')} separator inside the
   * {@code sb.length() > 0} branch.
   */
  private String toHexString(String s) {
      bytes = s.getBytes("UTF-8");
    } catch (UnsupportedEncodingException uee) {
      // UTF-8 is a mandatory charset; this cannot happen in practice.
      throw new RuntimeException(uee);
    StringBuilder sb = new StringBuilder();
    for(byte b : bytes) {
      // Separator goes here (elided) -- only between pairs, not before the first.
      if (sb.length() > 0) {
      // Mask to 0..255 so negative bytes don't render as sign-extended ints.
      sb.append(Integer.toHexString(b&0xFF));
    return sb.toString();
  /**
   * Repeatedly searches for randomly chosen saved terms and checks, via both
   * a TermQuery hit count and a direct TermEnum seek, that every saved term
   * is present in the index.
   *
   * NOTE(review): this method continues past the end of this chunk; the hit
   * count check's condition, the failure bookkeeping, and the final
   * assertion on {@code failed} are not visible here.
   */
  private void testSavedTerms(IndexReader r, List<String> terms) throws IOException {
    System.out.println("TEST: run " + terms.size() + " terms on reader=" + r);
    IndexSearcher s = new IndexSearcher(r);
    Collections.shuffle(terms);
    // Accumulate failures instead of asserting per term, so one run reports
    // every missing term at once.
    boolean failed = false;
    for(int iter=0;iter<10*terms.size();iter++) {
      final String term = terms.get(random.nextInt(terms.size()));
      System.out.println("TEST: search " + term + " [" + toHexString(term) + "]");
      final long t0 = System.currentTimeMillis();
      // Each saved term was indexed at least once, so totalHits should be > 0.
      final int count = s.search(new TermQuery(new Term("field", term)), 1).totalHits;
        System.out.println("  FAILED: count=" + count);
      final long t1 = System.currentTimeMillis();
      System.out.println("  took " + (t1-t0) + " millis");

      // Cross-check with a direct term seek: the enum positioned at the term
      // must yield exactly that term's text back.
      final TermEnum termEnum = r.terms(new Term("field", term));
      final String text = termEnum.term().text();
      if (!term.equals(text)) {
        System.out.println("  FAILED: wrong term: got " + text + " [" + toHexString(text) + "]");