1 package org.apache.lucene.index;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import org.apache.lucene.util.*;
21 import org.apache.lucene.store.*;
22 import org.apache.lucene.search.*;
23 import org.apache.lucene.analysis.*;
24 import org.apache.lucene.analysis.tokenattributes.*;
25 import org.apache.lucene.document.*;
26 import org.apache.lucene.index.FieldInfo.IndexOptions;
28 import java.io.IOException;
29 import java.io.UnsupportedEncodingException;
30 import java.util.ArrayList;
31 import java.util.Collections;
32 import java.util.List;
33 import org.junit.Ignore;
35 // Best to run this test w/ plenty of RAM (because of the
40 // java -server -Xmx8g -d64 -cp .:lib/junit-4.7.jar:./build/classes/test:./build/classes/test-framework:./build/classes/java -Dlucene.version=4.0-dev -Dtests.directory=MMapDirectory -DtempDir=build -ea org.junit.runner.JUnitCore org.apache.lucene.index.Test2BTerms
43 public class Test2BTerms extends LuceneTestCase {
// Synthetic token stream: emits tokensPerDoc random fixed-length (5-char)
// unicode terms per document, and roughly every 500K-1M tokens samples the
// emitted term into savedTerms so the test can later verify those exact terms
// are retrievable from the 2B+ term index.
// NOTE(review): this listing is gap-sampled — the declaration of nextSave and
// parts of incrementToken() (end-of-stream return, tokenCount increment,
// presumably savedTerms.add(...)) fall in elided lines; confirm against the
// full source.
45 private final class MyTokenStream extends TokenStream {
47 private final int tokensPerDoc; // number of tokens to emit per document
48 private int tokenCount; // tokens emitted so far for the current document
49 private final CharTermAttribute charTerm; // term attribute whose buffer is reused for every token
50 private final static int TOKEN_LEN = 5; // every generated term is exactly 5 chars long
51 private final char[] chars; // direct reference into charTerm's internal buffer
52 public final List<String> savedTerms = new ArrayList<String>(); // randomly sampled terms, checked later by the test
// Pre-sizes the shared term buffer once and schedules the first random
// save point; tokens are then written in place with no per-token allocation.
55 public MyTokenStream(int tokensPerDoc) {
57 this.tokensPerDoc = tokensPerDoc;
58 charTerm = addAttribute(CharTermAttribute.class);
59 chars = charTerm.resizeBuffer(TOKEN_LEN); // buffer is reused for all tokens
60 charTerm.setLength(TOKEN_LEN); // length never changes, so set it once up front
61 nextSave = _TestUtil.nextInt(random, 500000, 1000000); // countdown until the next sampled term
// Produces the next random token directly into the shared buffer;
// returns false once tokensPerDoc tokens have been emitted.
65 public boolean incrementToken() {
66 if (tokenCount >= tokensPerDoc) {
69 _TestUtil.randomFixedLengthUnicodeString(random, chars, 0, TOKEN_LEN); // overwrite buffer in place
71 if (--nextSave == 0) { // sample roughly one term every 500K-1M tokens
72 final String s = new String(chars, 0, TOKEN_LEN);
73 System.out.println("TEST: save term=" + s + " [" + toHexString(s) + "]");
75 nextSave = _TestUtil.nextInt(random, 500000, 1000000); // reschedule the next sample
// Indexes more than Integer.MAX_VALUE unique terms (TERM_COUNT total token
// instances spread over numDocs documents), then verifies that sampled terms
// are searchable and that CheckIndex reports a term count above 2^31-1.
// Disabled by default because of its multi-hour runtime and RAM requirements
// (see the invocation comment at the top of the file).
// NOTE(review): this listing is gap-sampled — the addDocument loop body, the
// optimize/close calls, and the reader close fall in elided lines; confirm
// against the full source.
86 @Ignore("Takes ~4 hours to run on a fast machine!!")
87 public void test2BTerms() throws IOException {
// Just over 2^31 total tokens; with ~1 term collision expected to be rare,
// the index ends up with > Integer.MAX_VALUE terms (asserted at the end).
89 final long TERM_COUNT = ((long) Integer.MAX_VALUE) + 100000000;
91 final int TERMS_PER_DOC = _TestUtil.nextInt(random, 100000, 1000000);
93 List<String> savedTerms = null;
95 MockDirectoryWrapper dir = newFSDirectory(_TestUtil.getTempDir("2BTerms"));
96 dir.setThrottling(MockDirectoryWrapper.Throttling.NEVER); // test is long enough without simulated slowness
97 dir.setCheckIndexOnClose(false); // don't double-checkindex
98 //Directory dir = newFSDirectory(new File("/p/lucene/indices/2bindex"));
// Writer tuned for bulk indexing: flush by RAM only (256 MB), concurrent
// merges, and a log merge policy with large segments.
102 IndexWriter w = new IndexWriter(dir,
103 new IndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random))
104 .setMaxBufferedDocs(IndexWriterConfig.DISABLE_AUTO_FLUSH)
105 .setRAMBufferSizeMB(256.0)
106 .setMergeScheduler(new ConcurrentMergeScheduler())
107 .setMergePolicy(newLogMergePolicy(false, 10))
108 .setOpenMode(IndexWriterConfig.OpenMode.CREATE));
110 MergePolicy mp = w.getConfig().getMergePolicy();
111 if (mp instanceof LogByteSizeMergePolicy) {
// Effectively remove the max-merge-size cap so huge segments still merge.
113 ((LogByteSizeMergePolicy) mp).setMaxMergeMB(1024*1024*1024);
// One reusable document/field pair fed by the synthetic token stream;
// DOCS_ONLY + no norms keeps the index as small as possible.
116 Document doc = new Document();
118 final MyTokenStream ts = new MyTokenStream(TERMS_PER_DOC);
119 Field field = new Field("field", ts);
120 field.setIndexOptions(IndexOptions.DOCS_ONLY);
121 field.setOmitNorms(true);
123 //w.setInfoStream(System.out);
124 final int numDocs = (int) (TERM_COUNT/TERMS_PER_DOC);
126 System.out.println("TERMS_PER_DOC=" + TERMS_PER_DOC);
127 System.out.println("numDocs=" + numDocs);
129 for(int i=0;i<numDocs;i++) {
130 final long t0 = System.currentTimeMillis();
132 System.out.println(i + " of " + numDocs + " " + (System.currentTimeMillis()-t0) + " msec");
// Terms sampled while tokenizing; used below instead of re-scanning the index.
134 savedTerms = ts.savedTerms;
136 System.out.println("TEST: optimize");
138 System.out.println("TEST: close writer");
142 System.out.println("TEST: open reader");
143 final IndexReader r = IndexReader.open(dir);
144 if (savedTerms == null) {
// Fallback path (e.g. reusing a previously built index): sample terms
// by enumerating the index directly.
145 savedTerms = findTerms(r);
// The last 10 saved terms have the highest term ordinals, i.e. they sit
// beyond the 2^31 boundary — exercise those explicitly first.
147 final int numSavedTerms = savedTerms.size();
148 final List<String> bigOrdTerms = new ArrayList<String>(savedTerms.subList(numSavedTerms-10, numSavedTerms));
149 System.out.println("TEST: test big ord terms...");
150 testSavedTerms(r, bigOrdTerms);
151 System.out.println("TEST: test all saved terms...");
152 testSavedTerms(r, savedTerms);
// Final integrity check: CheckIndex must agree the term count exceeds 2^31-1.
155 System.out.println("TEST: now CheckIndex...");
156 CheckIndex.Status status = _TestUtil.checkIndex(dir);
157 final long tc = status.segmentInfos.get(0).termIndexStatus.termCount;
158 assertTrue("count " + tc + " is not > " + Integer.MAX_VALUE, tc > Integer.MAX_VALUE);
// Enumerates every term in the reader and samples roughly one term per
// 500K-1M terms into a list, mirroring the sampling done at indexing time.
// Used when the test reuses an existing index and has no in-memory samples.
// NOTE(review): the return statement and closing braces fall in elided lines
// of this gap-sampled listing; presumably it returns savedTerms — confirm.
162 private List<String> findTerms(IndexReader r) throws IOException {
163 System.out.println("TEST: findTerms");
164 final TermEnum termEnum = r.terms(); // enumerate all terms from the start
165 final List<String> savedTerms = new ArrayList<String>();
166 int nextSave = _TestUtil.nextInt(random, 500000, 1000000); // countdown to next sample
167 while(termEnum.next()) {
168 if (--nextSave == 0) {
169 savedTerms.add(termEnum.term().text());
170 System.out.println("TEST: add " + termEnum.term());
171 nextSave = _TestUtil.nextInt(random, 500000, 1000000); // reschedule next sample
// Renders a string's UTF-8 bytes as hex for logging, so terms containing
// unprintable/random unicode characters can still be identified in output.
// NOTE(review): the try opener, the byte[] declaration, and the separator
// append between hex values fall in elided lines of this gap-sampled listing.
177 private String toHexString(String s) {
180 bytes = s.getBytes("UTF-8");
181 } catch (UnsupportedEncodingException uee) {
// UTF-8 is guaranteed by the JVM spec; rethrow unchecked rather than
// forcing callers to handle an impossible case.
182 throw new RuntimeException(uee);
184 StringBuilder sb = new StringBuilder();
185 for(byte b : bytes) {
186 if (sb.length() > 0) {
189 sb.append(Integer.toHexString(b&0xFF)); // mask to treat the byte as unsigned
191 return sb.toString();
194 private void testSavedTerms(IndexReader r, List<String> terms) throws IOException {
195 System.out.println("TEST: run " + terms.size() + " terms on reader=" + r);
196 IndexSearcher s = new IndexSearcher(r);
197 Collections.shuffle(terms);
198 boolean failed = false;
199 for(int iter=0;iter<10*terms.size();iter++) {
200 final String term = terms.get(random.nextInt(terms.size()));
201 System.out.println("TEST: search " + term + " [" + toHexString(term) + "]");
202 final long t0 = System.currentTimeMillis();
203 final int count = s.search(new TermQuery(new Term("field", term)), 1).totalHits;
205 System.out.println(" FAILED: count=" + count);
208 final long t1 = System.currentTimeMillis();
209 System.out.println(" took " + (t1-t0) + " millis");
211 final TermEnum termEnum = r.terms(new Term("field", term));
212 final String text = termEnum.term().text();
213 if (!term.equals(text)) {
214 System.out.println(" FAILED: wrong term: got " + text + " [" + toHexString(text) + "]");