1 package org.apache.lucene.util.fst;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.BufferedReader;
22 import java.io.FileInputStream;
23 import java.io.FileOutputStream;
24 import java.io.IOException;
25 import java.io.InputStreamReader;
26 import java.io.OutputStreamWriter;
27 import java.io.Writer;
30 import org.apache.lucene.analysis.MockAnalyzer;
31 import org.apache.lucene.document.Document;
32 import org.apache.lucene.document.Field;
33 import org.apache.lucene.index.IndexReader;
34 import org.apache.lucene.index.IndexWriter;
35 import org.apache.lucene.index.IndexWriterConfig;
36 import org.apache.lucene.index.RandomIndexWriter;
37 import org.apache.lucene.index.Term;
38 import org.apache.lucene.index.TermEnum;
39 import org.apache.lucene.search.IndexSearcher;
40 import org.apache.lucene.search.TermQuery;
41 import org.apache.lucene.store.Directory;
42 import org.apache.lucene.store.FSDirectory;
43 import org.apache.lucene.store.IndexInput;
44 import org.apache.lucene.store.IndexOutput;
45 import org.apache.lucene.store.MockDirectoryWrapper;
46 import org.apache.lucene.util.BytesRef;
47 import org.apache.lucene.util.IntsRef;
48 import org.apache.lucene.util.LineFileDocs;
49 import org.apache.lucene.util.LuceneTestCase;
50 import org.apache.lucene.util.UnicodeUtil;
51 import org.apache.lucene.util._TestUtil;
52 import org.apache.lucene.util.fst.FST.Arc;
54 public class TestFSTs extends LuceneTestCase {
56 private MockDirectoryWrapper dir;
59 public void setUp() throws Exception {
62 dir.setPreventDoubleWrite(false);
66 public void tearDown() throws Exception {
71 private static BytesRef toBytesRef(IntsRef ir) {
72 BytesRef br = new BytesRef(ir.length);
73 for(int i=0;i<ir.length;i++) {
74 int x = ir.ints[ir.offset+i];
75 assert x >= 0 && x <= 255;
76 br.bytes[i] = (byte) x;
78 br.length = ir.length;
82 private static IntsRef toIntsRef(String s, int inputMode) {
83 return toIntsRef(s, inputMode, new IntsRef(10));
86 private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
89 return toIntsRef(new BytesRef(s), ir);
92 return toIntsRefUTF32(s, ir);
96 private static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
97 final int charLength = s.length();
100 while(charIdx < charLength) {
101 if (intIdx == ir.ints.length) {
104 final int utf32 = s.codePointAt(charIdx);
105 ir.ints[intIdx] = utf32;
106 charIdx += Character.charCount(utf32);
113 private static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
114 if (br.length > ir.ints.length) {
117 for(int i=0;i<br.length;i++) {
118 ir.ints[i] = br.bytes[br.offset+i]&0xFF;
120 ir.length = br.length;
124 public void testBasicFSA() throws IOException {
125 String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
126 String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"};
127 IntsRef[] terms = new IntsRef[strings.length];
128 IntsRef[] terms2 = new IntsRef[strings2.length];
129 for(int inputMode=0;inputMode<2;inputMode++) {
131 System.out.println("TEST: inputMode=" + inputModeToString(inputMode));
134 for(int idx=0;idx<strings.length;idx++) {
135 terms[idx] = toIntsRef(strings[idx], inputMode);
137 for(int idx=0;idx<strings2.length;idx++) {
138 terms2[idx] = toIntsRef(strings2[idx], inputMode);
142 doTest(inputMode, terms);
144 // Test pre-determined FST sizes to make sure we haven't lost minimality (at least on this trivial set of terms):
148 final Outputs<Object> outputs = NoOutputs.getSingleton();
149 final Object NO_OUTPUT = outputs.getNoOutput();
150 final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms2.length);
151 for(IntsRef term : terms2) {
152 pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
154 FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
156 assertEquals(22, fst.getNodeCount());
157 assertEquals(27, fst.getArcCount());
162 final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
163 final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms2.length);
164 for(int idx=0;idx<terms2.length;idx++) {
165 pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
167 final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
169 assertEquals(22, fst.getNodeCount());
170 assertEquals(27, fst.getArcCount());
173 // FST byte sequence ord
175 final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
176 final BytesRef NO_OUTPUT = outputs.getNoOutput();
177 final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<FSTTester.InputOutput<BytesRef>>(terms2.length);
178 for(int idx=0;idx<terms2.length;idx++) {
179 final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
180 pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
182 final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
184 assertEquals(24, fst.getNodeCount());
185 assertEquals(30, fst.getArcCount());
190 private static String simpleRandomString(Random r) {
191 final int end = r.nextInt(10);
196 final char[] buffer = new char[end];
197 for (int i = 0; i < end; i++) {
198 buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
200 return new String(buffer, 0, end);
203 // given set of terms, test the different outputs for them
204 private void doTest(int inputMode, IntsRef[] terms) throws IOException {
207 // NoOutputs (simple FSA)
209 final Outputs<Object> outputs = NoOutputs.getSingleton();
210 final Object NO_OUTPUT = outputs.getNoOutput();
211 final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
212 for(IntsRef term : terms) {
213 pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
215 new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
218 // PositiveIntOutput (ord)
220 final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
221 final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
222 for(int idx=0;idx<terms.length;idx++) {
223 pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
225 new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
228 // PositiveIntOutput (random monotonically increasing positive number)
230 final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
231 final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
233 for(int idx=0;idx<terms.length;idx++) {
234 final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
236 pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
238 new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
241 // PositiveIntOutput (random positive number)
243 final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
244 final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
245 for(int idx=0;idx<terms.length;idx++) {
246 pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
248 new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
251 // Pair<ord, (random monotonically increasing positive number>
253 final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(random.nextBoolean());
254 final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(random.nextBoolean());
255 final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
256 final List<FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>> pairs = new ArrayList<FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>>(terms.length);
258 for(int idx=0;idx<terms.length;idx++) {
259 final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
261 pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
262 outputs.get(o1.get(idx),
265 new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs).doTest();
270 final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
271 final BytesRef NO_OUTPUT = outputs.getNoOutput();
272 final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<FSTTester.InputOutput<BytesRef>>(terms.length);
273 for(int idx=0;idx<terms.length;idx++) {
274 final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
275 pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output));
277 new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest();
282 final IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
283 final List<FSTTester.InputOutput<IntsRef>> pairs = new ArrayList<FSTTester.InputOutput<IntsRef>>(terms.length);
284 for(int idx=0;idx<terms.length;idx++) {
285 final String s = Integer.toString(idx);
286 final IntsRef output = new IntsRef(s.length());
287 output.length = s.length();
288 for(int idx2=0;idx2<output.length;idx2++) {
289 output.ints[idx2] = s.charAt(idx2);
291 pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output));
293 new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs).doTest();
296 // Up to two positive ints, shared, generally but not
297 // monotonically increasing
300 System.out.println("TEST: now test UpToTwoPositiveIntOutputs");
302 final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true);
303 final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
305 for(int idx=0;idx<terms.length;idx++) {
306 // Sometimes go backwards
307 long value = lastOutput + _TestUtil.nextInt(random, -100, 1000);
309 value = lastOutput + _TestUtil.nextInt(random, -100, 1000);
312 if (random.nextInt(5) == 3) {
313 long value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000);
315 value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000);
317 output = outputs.get(value, value2);
319 output = outputs.get(value);
321 pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
323 new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
327 private static class FSTTester<T> {
330 final List<InputOutput<T>> pairs;
332 final Outputs<T> outputs;
335 public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs) {
336 this.random = random;
338 this.inputMode = inputMode;
340 this.outputs = outputs;
343 private static class InputOutput<T> implements Comparable<InputOutput<T>> {
344 public final IntsRef input;
345 public final T output;
347 public InputOutput(IntsRef input, T output) {
349 this.output = output;
352 public int compareTo(InputOutput<T> other) {
353 if (other instanceof InputOutput) {
354 return input.compareTo((other).input);
356 throw new IllegalArgumentException();
361 public void doTest() throws IOException {
365 if (!(outputs instanceof UpToTwoPositiveIntOutputs)) {
367 doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true);
370 doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true);
374 // runs the term, returning the output, or null if term
375 // isn't accepted. if prefixLength is non-null it must be
376 // length 1 int array; prefixLength[0] is set to the length
377 // of the term prefix that matches
378 private T run(FST<T> fst, IntsRef term, int[] prefixLength) throws IOException {
379 assert prefixLength == null || prefixLength.length == 1;
380 final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
381 final T NO_OUTPUT = fst.outputs.getNoOutput();
382 T output = NO_OUTPUT;
384 for(int i=0;i<=term.length;i++) {
386 if (i == term.length) {
387 label = FST.END_LABEL;
389 label = term.ints[term.offset+i];
391 //System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
392 if (fst.findTargetArc(label, arc, arc) == null) {
393 if (prefixLength != null) {
400 output = fst.outputs.add(output, arc.output);
403 if (prefixLength != null) {
404 prefixLength[0] = term.length;
410 private T randomAcceptedWord(FST<T> fst, IntsRef in) throws IOException {
411 FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
413 final List<FST.Arc<T>> arcs = new ArrayList<FST.Arc<T>>();
416 final T NO_OUTPUT = fst.outputs.getNoOutput();
417 T output = NO_OUTPUT;
421 fst.readFirstTargetArc(arc, arc);
422 arcs.add(new FST.Arc<T>().copyFrom(arc));
423 while(!arc.isLast()) {
424 fst.readNextArc(arc);
425 arcs.add(new FST.Arc<T>().copyFrom(arc));
429 arc = arcs.get(random.nextInt(arcs.size()));
433 output = fst.outputs.add(output, arc.output);
436 if (arc.label == FST.END_LABEL) {
440 if (in.ints.length == in.length) {
441 in.grow(1+in.length);
443 in.ints[in.length++] = arc.label;
450 FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
452 System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
455 final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
457 prune1==0 && prune2==0,
458 allowRandomSuffixSharing ? random.nextBoolean() : true,
459 allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
462 for(InputOutput<T> pair : pairs) {
463 if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
464 final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs;
465 final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output;
466 @SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
467 builderObject.add(pair.input, _outputs.get(twoLongs.first));
468 builderObject.add(pair.input, _outputs.get(twoLongs.second));
470 builder.add(pair.input, pair.output);
473 FST<T> fst = builder.finish();
475 if (random.nextBoolean() && fst != null) {
476 IndexOutput out = dir.createOutput("fst.bin");
479 IndexInput in = dir.openInput("fst.bin");
481 fst = new FST<T>(in, outputs);
484 dir.deleteFile("fst.bin");
488 if (VERBOSE && pairs.size() <= 20 && fst != null) {
489 Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
490 Util.toDot(fst, w, false, false);
492 System.out.println("SAVED out.dot");
497 System.out.println(" fst has 0 nodes (fully pruned)");
499 System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs");
503 if (prune1 == 0 && prune2 == 0) {
504 verifyUnPruned(inputMode, fst);
506 verifyPruned(inputMode, fst, prune1, prune2);
513 private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
515 if (pairs.size() == 0) {
521 System.out.println("TEST: now verify " + pairs.size() + " terms");
522 for(InputOutput<T> pair : pairs) {
524 assertNotNull(pair.input);
525 assertNotNull(pair.output);
526 System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
532 // visit valid paris in order -- make sure all words
533 // are accepted, and FSTEnum's next() steps through
536 System.out.println("TEST: check valid terms/next()");
539 IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
540 for(InputOutput<T> pair : pairs) {
541 IntsRef term = pair.input;
543 System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
545 Object output = run(fst, term, null);
547 assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
548 assertEquals(pair.output, output);
550 // verify enum's next
551 IntsRefFSTEnum.InputOutput<T> t = fstEnum.next();
553 assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input);
554 assertEquals(pair.output, t.output);
556 assertNull(fstEnum.next());
559 final Map<IntsRef,T> termsMap = new HashMap<IntsRef,T>();
560 for(InputOutput<T> pair : pairs) {
561 termsMap.put(pair.input, pair.output);
564 // find random matching word and make sure it's valid
566 System.out.println("TEST: verify random accepted terms");
568 final IntsRef scratch = new IntsRef(10);
569 int num = atLeast(500);
570 for(int iter=0;iter<num;iter++) {
571 T output = randomAcceptedWord(fst, scratch);
572 assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
573 assertEquals(termsMap.get(scratch), output);
576 // test IntsRefFSTEnum.seek:
578 System.out.println("TEST: verify seek");
580 IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
582 for(int iter=0;iter<num;iter++) {
584 System.out.println("TEST: iter=" + iter);
586 if (random.nextBoolean()) {
587 // seek to term that doesn't exist:
589 final IntsRef term = toIntsRef(getRandomString(), inputMode);
590 int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
594 //System.out.println(" seek " + inputToString(inputMode, term));
595 final IntsRefFSTEnum.InputOutput<T> seekResult;
596 if (random.nextBoolean()) {
598 System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term));
600 seekResult = fstEnum.seekFloor(term);
604 System.out.println(" do non-exist seekCeil term=" + inputToString(inputMode, term));
606 seekResult = fstEnum.seekCeil(term);
609 if (pos != -1 && pos < pairs.size()) {
610 //System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output));
611 assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult);
613 System.out.println(" got " + inputToString(inputMode, seekResult.input));
615 assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input);
616 assertEquals(pairs.get(pos).output, seekResult.output);
618 // seeked before start or beyond end
619 //System.out.println("seek=" + seekTerm);
620 assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult);
622 System.out.println(" got null");
630 // seek to term that does exist:
631 InputOutput<T> pair = pairs.get(random.nextInt(pairs.size()));
632 final IntsRefFSTEnum.InputOutput<T> seekResult;
633 if (random.nextBoolean()) {
635 System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input));
637 seekResult = fstEnum.seekFloor(pair.input);
640 System.out.println(" do exists seekCeil " + inputToString(inputMode, pair.input));
642 seekResult = fstEnum.seekCeil(pair.input);
644 assertNotNull(seekResult);
645 assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input);
646 assertEquals(pair.output, seekResult.output);
651 System.out.println("TEST: mixed next/seek");
654 // test mixed next/seek
656 for(int iter=0;iter<num;iter++) {
658 System.out.println("TEST: iter " + iter);
661 fstEnum = new IntsRefFSTEnum<T>(fst);
664 boolean isDone = false;
665 if (upto == pairs.size()-1 || random.nextBoolean()) {
669 System.out.println(" do next");
671 isDone = fstEnum.next() == null;
672 } else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) {
674 for(;attempt<10;attempt++) {
675 IntsRef term = toIntsRef(getRandomString(), inputMode);
676 if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) {
677 int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
681 if (random.nextBoolean()) {
683 assertTrue(upto != -1);
685 System.out.println(" do non-exist seekFloor(" + inputToString(inputMode, term) + ")");
687 isDone = fstEnum.seekFloor(term) == null;
690 System.out.println(" do non-exist seekCeil(" + inputToString(inputMode, term) + ")");
692 isDone = fstEnum.seekCeil(term) == null;
703 final int inc = random.nextInt(pairs.size() - upto - 1);
709 if (random.nextBoolean()) {
711 System.out.println(" do advanceCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")");
713 isDone = fstEnum.seekCeil(pairs.get(upto).input) == null;
716 System.out.println(" do advanceFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")");
718 isDone = fstEnum.seekFloor(pairs.get(upto).input) == null;
723 System.out.println(" got " + inputToString(inputMode, fstEnum.current().input));
725 System.out.println(" got null");
729 if (upto == pairs.size()) {
734 assertEquals(pairs.get(upto).input, fstEnum.current().input);
735 assertEquals(pairs.get(upto).output, fstEnum.current().output);
738 if (upto < pairs.size()-1) {
740 while(tryCount < 10) {
741 final IntsRef t = toIntsRef(getRandomString(), inputMode);
742 if (pairs.get(upto).input.compareTo(t) < 0) {
743 final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
745 System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
747 assertEquals(expected, fstEnum.beforeNext(t));
759 private static class CountMinOutput<T> {
763 boolean isLeaf = true;
768 private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
771 System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
772 for(InputOutput<T> pair : pairs) {
773 System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
777 // To validate the FST, we brute-force compute all prefixes
778 // in the terms, matched to their "common" outputs, prune that
779 // set according to the prune thresholds, then assert the FST
780 // matches that same set.
782 // NOTE: Crazy RAM intensive!!
784 //System.out.println("TEST: tally prefixes");
786 // build all prefixes
787 final Map<IntsRef,CountMinOutput<T>> prefixes = new HashMap<IntsRef,CountMinOutput<T>>();
788 final IntsRef scratch = new IntsRef(10);
789 for(InputOutput<T> pair: pairs) {
790 scratch.copy(pair.input);
791 for(int idx=0;idx<=pair.input.length;idx++) {
792 scratch.length = idx;
793 CountMinOutput<T> cmo = prefixes.get(scratch);
795 cmo = new CountMinOutput<T>();
797 cmo.output = pair.output;
798 prefixes.put(new IntsRef(scratch), cmo);
801 cmo.output = outputs.common(cmo.output, pair.output);
803 if (idx == pair.input.length) {
805 cmo.finalOutput = cmo.output;
811 System.out.println("TEST: now prune");
815 final Iterator<Map.Entry<IntsRef,CountMinOutput<T>>> it = prefixes.entrySet().iterator();
816 while(it.hasNext()) {
817 Map.Entry<IntsRef,CountMinOutput<T>> ent = it.next();
818 final IntsRef prefix = ent.getKey();
819 final CountMinOutput<T> cmo = ent.getValue();
821 System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
825 keep = cmo.count >= prune1;
828 if (prune2 > 1 && cmo.count >= prune2) {
830 } else if (prefix.length > 0) {
831 // consult our parent
832 scratch.length = prefix.length-1;
833 System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length);
834 final CountMinOutput<T> cmo2 = prefixes.get(scratch);
835 //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
836 keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
837 } else if (cmo.count >= prune2) {
846 //System.out.println(" remove");
848 // clear isLeaf for all ancestors
849 //System.out.println(" keep");
850 scratch.copy(prefix);
852 while(scratch.length >= 0) {
853 final CountMinOutput<T> cmo2 = prefixes.get(scratch);
855 //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
863 //System.out.println("TEST: after prune");
865 for(Map.Entry<BytesRef,CountMinOutput> ent : prefixes.entrySet()) {
866 System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
867 if (ent.getValue().isFinal) {
868 System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
873 if (prefixes.size() <= 1) {
880 // make sure FST only enums valid prefixes
882 System.out.println("TEST: check pruned enum");
884 IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
885 IntsRefFSTEnum.InputOutput<T> current;
886 while((current = fstEnum.next()) != null) {
888 System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output));
890 final CountMinOutput cmo = prefixes.get(current.input);
892 assertTrue(cmo.isLeaf || cmo.isFinal);
893 //if (cmo.isFinal && !cmo.isLeaf) {
895 assertEquals(cmo.finalOutput, current.output);
897 assertEquals(cmo.output, current.output);
901 // make sure all non-pruned prefixes are present in the FST
903 System.out.println("TEST: verify all prefixes");
905 final int[] stopNode = new int[1];
906 for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
907 if (ent.getKey().length > 0) {
908 final CountMinOutput<T> cmo = ent.getValue();
909 final T output = run(fst, ent.getKey(), stopNode);
911 System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output));
913 // if (cmo.isFinal && !cmo.isLeaf) {
915 assertEquals(cmo.finalOutput, output);
917 assertEquals(cmo.output, output);
919 assertEquals(ent.getKey().length, stopNode[0]);
925 public void testRandomWords() throws IOException {
926 testRandomWords(1000, atLeast(2));
927 //testRandomWords(20, 100);
930 private String inputModeToString(int mode) {
938 private void testRandomWords(int maxNumWords, int numIter) throws IOException {
939 for(int iter=0;iter<numIter;iter++) {
941 System.out.println("\nTEST: iter " + iter);
943 for(int inputMode=0;inputMode<2;inputMode++) {
944 final int numWords = random.nextInt(maxNumWords+1);
945 Set<IntsRef> termsSet = new HashSet<IntsRef>();
946 IntsRef[] terms = new IntsRef[numWords];
947 while(termsSet.size() < numWords) {
948 final String term = getRandomString();
949 termsSet.add(toIntsRef(term, inputMode));
951 doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
956 static String getRandomString() {
958 if (random.nextBoolean()) {
959 term = _TestUtil.randomRealisticUnicodeString(random);
961 // we want to mix in limited-alphabet symbols so
962 // we get more sharing of the nodes given how few
963 // terms we are testing...
964 term = simpleRandomString(random);
970 public void testBigSet() throws IOException {
971 testRandomWords(_TestUtil.nextInt(random, 50000, 60000), 1);
974 private static String inputToString(int inputMode, IntsRef term) {
975 return inputToString(inputMode, term, true);
978 private static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
979 if (!isValidUnicode) {
980 return term.toString();
981 } else if (inputMode == 0) {
983 return toBytesRef(term).utf8ToString() + " " + term;
986 return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
990 private static IntsRef toIntsRef(String s) {
991 final int charCount = s.length();
992 IntsRef ir = new IntsRef(charCount);
993 for(int charIDX=0;charIDX<charCount;charIDX++) {
994 ir.ints[charIDX] = s.charAt(charIDX);
996 ir.length = charCount;
1000 private static String toString(IntsRef ints) {
1001 char[] chars = new char[ints.length];
1002 for(int charIDX=0;charIDX<ints.length;charIDX++) {
1003 final int ch = ints.ints[ints.offset+charIDX];
1004 assertTrue(ch >= 0 && ch < 65536);
1005 chars[charIDX] = (char) ch;
1007 return new String(chars);
1010 // Build FST for all unique terms in the test line docs
1011 // file, up until a time limit
1012 public void testRealTerms() throws Exception {
1015 if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) {
1017 CodecProvider.getDefault().setDefaultFieldCodec("Standard");
1021 final LineFileDocs docs = new LineFileDocs(random);
1022 final int RUN_TIME_MSEC = atLeast(500);
1023 final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
1024 final File tempDir = _TestUtil.getTempDir("fstlines");
1025 final MockDirectoryWrapper dir = newFSDirectory(tempDir);
1026 final IndexWriter writer = new IndexWriter(dir, conf);
1027 writer.setInfoStream(VERBOSE ? System.out : null);
1028 final long stopTime = System.currentTimeMillis() + RUN_TIME_MSEC;
1031 while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) {
1032 writer.addDocument(doc);
1035 IndexReader r = IndexReader.open(writer, true);
1037 final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
1038 Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, outputs);
1040 boolean storeOrd = false;
1043 System.out.println("FST stores ord");
1045 System.out.println("FST stores docFreq");
1048 TermEnum termEnum = r.terms(new Term("body", ""));
1050 System.out.println("TEST: got termEnum=" + termEnum);
1054 final Term term = termEnum.term();
1055 if (term == null || !"body".equals(term.field())) {
1064 } catch (UnsupportedOperationException uoe) {
1066 System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
1076 output = termEnum.docFreq();
1078 //System.out.println("ADD: " + term.text() + " ch[0]=" + (term.text().length() == 0 ? -1 : term.text().charAt(0)));
1079 builder.add(toIntsRef(term.text()), outputs.get(output));
1081 if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
1082 System.out.println(ord + " terms...");
1086 final FST<Long> fst = builder.finish();
1088 System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes");
1092 // Now confirm BytesRefFSTEnum and TermEnum act the
1094 final IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
1095 int num = atLeast(1000);
1096 for(int iter=0;iter<num;iter++) {
1097 final String randomTerm = getRandomString();
1100 System.out.println("TEST: seek " + randomTerm + " ch[0]=" + (randomTerm.length() == 0 ? -1 : randomTerm.charAt(0)));
1103 termEnum = r.terms(new Term("body", randomTerm));
1104 final IntsRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(toIntsRef(randomTerm));
1106 if (termEnum.term() == null || !"body".equals(termEnum.term().field())) {
1107 assertNull("got " + (fstSeekResult == null ? "null" : toString(fstSeekResult.input) + " but expected null"), fstSeekResult);
1109 assertSame(termEnum, fstEnum, storeOrd);
1110 for(int nextIter=0;nextIter<10;nextIter++) {
1112 System.out.println("TEST: next");
1114 //System.out.println(" ord=" + termEnum.ord());
1118 if (termEnum.term() != null && "body".equals(termEnum.term().field())) {
1120 System.out.println(" term=" + termEnum.term());
1122 assertNotNull(fstEnum.next());
1123 assertSame(termEnum, fstEnum, storeOrd);
1126 System.out.println(" end!");
1128 IntsRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
1129 if (nextResult != null) {
1130 System.out.println("expected null but got: input=" + toString(nextResult.input) + " output=" + outputs.outputToString(nextResult.output));
1144 private void assertSame(TermEnum termEnum, IntsRefFSTEnum fstEnum, boolean storeOrd) throws Exception {
1145 if (termEnum.term() == null || !"body".equals(termEnum.term().field())) {
1146 if (fstEnum.current() != null) {
1147 fail("fstEnum.current().input=" + toString(fstEnum.current().input));
1150 assertNotNull(fstEnum.current());
1151 assertEquals(termEnum.term() + " != " + toString(fstEnum.current().input), termEnum.term().text(), toString(fstEnum.current().input));
1153 // fst stored the ord
1155 // assertEquals(termEnum.ord(), ((Long) fstEnum.current().output).longValue());
1157 // fst stored the docFreq
1158 assertEquals(termEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue()));
1163 private static abstract class VisitTerms<T> {
1164 private final String dirOut;
1165 private final String wordsFileIn;
1166 private int inputMode;
1167 private final Outputs<T> outputs;
1168 private final Builder<T> builder;
1170 public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs) {
1171 this.dirOut = dirOut;
1172 this.wordsFileIn = wordsFileIn;
1173 this.inputMode = inputMode;
1174 this.outputs = outputs;
1176 builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs);
1179 protected abstract T getOutput(IntsRef input, int ord) throws IOException;
1181 public void run(int limit, boolean verify) throws IOException {
1182 BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
1184 final IntsRef intsRef = new IntsRef(10);
1185 long tStart = System.currentTimeMillis();
1188 String w = is.readLine();
1192 toIntsRef(w, inputMode, intsRef);
1193 builder.add(intsRef,
1194 getOutput(intsRef, ord));
1197 if (ord % 500000 == 0) {
1199 String.format(Locale.ENGLISH,
1200 "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord));
1207 assert builder.getTermCount() == ord;
1208 final FST<T> fst = builder.finish();
1210 System.out.println("FST was fully pruned!");
1217 System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
1218 if (fst.getNodeCount() < 100) {
1219 Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
1220 Util.toDot(fst, w, false, false);
1222 System.out.println("Wrote FST to out.dot");
1225 Directory dir = FSDirectory.open(new File(dirOut));
1226 IndexOutput out = dir.createOutput("fst.bin");
1230 System.out.println("Saved FST to fst.bin.");
1236 System.out.println("\nNow verify...");
1239 is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
1242 tStart = System.currentTimeMillis();
1244 String w = is.readLine();
1248 toIntsRef(w, inputMode, intsRef);
1249 T expected = getOutput(intsRef, ord);
1250 T actual = Util.get(fst, intsRef);
1251 if (actual == null) {
1252 throw new RuntimeException("unexpected null output on input=" + w);
1254 if (!actual.equals(expected)) {
1255 throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
1259 if (ord % 500000 == 0) {
1260 System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
1267 double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
1268 System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
1276 // java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
1277 public static void main(String[] args) throws IOException {
1279 int limit = Integer.MAX_VALUE;
1280 int inputMode = 0; // utf8
1281 boolean storeOrds = false;
1282 boolean storeDocFreqs = false;
1283 boolean verify = true;
1285 String wordsFileIn = null;
1286 String dirOut = null;
1289 while (idx < args.length) {
1290 if (args[idx].equals("-prune")) {
1291 prune = Integer.valueOf(args[1 + idx]);
1293 } else if (args[idx].equals("-limit")) {
1294 limit = Integer.valueOf(args[1 + idx]);
1296 } else if (args[idx].equals("-utf8")) {
1298 } else if (args[idx].equals("-utf32")) {
1300 } else if (args[idx].equals("-docFreq")) {
1301 storeDocFreqs = true;
1302 } else if (args[idx].equals("-ords")) {
1304 } else if (args[idx].equals("-noverify")) {
1306 } else if (args[idx].startsWith("-")) {
1307 System.err.println("Unrecognized option: " + args[idx]);
1310 if (wordsFileIn == null) {
1311 wordsFileIn = args[idx];
1312 } else if (dirOut == null) {
1315 System.err.println("Too many arguments, expected: input [output]");
1322 if (wordsFileIn == null) {
1323 System.err.println("No input file.");
1327 // ord benefits from share, docFreqs don't:
1329 if (storeOrds && storeDocFreqs) {
1330 // Store both ord & docFreq:
1331 final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true);
1332 final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false);
1333 final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
1334 new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs) {
1337 public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
1339 rand = new Random(17);
1341 return new PairOutputs.Pair<Long,Long>(o1.get(ord),
1342 o2.get(_TestUtil.nextInt(rand, 1, 5000)));
1344 }.run(limit, verify);
1345 } else if (storeOrds) {
1347 final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
1348 new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
1350 public Long getOutput(IntsRef input, int ord) {
1351 return outputs.get(ord);
1353 }.run(limit, verify);
1354 } else if (storeDocFreqs) {
1355 // Store only docFreq
1356 final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
1357 new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
1360 public Long getOutput(IntsRef input, int ord) {
1362 rand = new Random(17);
1364 return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
1366 }.run(limit, verify);
1369 final NoOutputs outputs = NoOutputs.getSingleton();
1370 final Object NO_OUTPUT = outputs.getNoOutput();
1371 new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs) {
1373 public Object getOutput(IntsRef input, int ord) {
1376 }.run(limit, verify);
1380 public void testSingleString() throws Exception {
1381 final Outputs<Object> outputs = NoOutputs.getSingleton();
1382 final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
1383 b.add(new BytesRef("foobar"), outputs.getNoOutput());
1384 final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish());
1385 assertNull(fstEnum.seekFloor(new BytesRef("foo")));
1386 assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
1389 public void testSimple() throws Exception {
1391 // Get outputs -- passing true means FST will share
1392 // (delta code) the outputs. This should result in
1393 // smaller FST if the outputs grow monotonically. But
1394 // if numbers are "random", false should give smaller
1396 final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
1398 // Build an FST mapping BytesRef -> Long
1399 final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
1401 final BytesRef a = new BytesRef("a");
1402 final BytesRef b = new BytesRef("b");
1403 final BytesRef c = new BytesRef("c");
1405 builder.add(a, outputs.get(17));
1406 builder.add(b, outputs.get(42));
1407 builder.add(c, outputs.get(13824324872317238L));
1409 final FST<Long> fst = builder.finish();
1411 assertEquals(13824324872317238L, (long) Util.get(fst, c));
1412 assertEquals(42, (long) Util.get(fst, b));
1413 assertEquals(17, (long) Util.get(fst, a));
1415 BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
1416 BytesRefFSTEnum.InputOutput<Long> seekResult;
1417 seekResult = fstEnum.seekFloor(a);
1418 assertNotNull(seekResult);
1419 assertEquals(17, (long) seekResult.output);
1422 seekResult = fstEnum.seekFloor(new BytesRef("aa"));
1423 assertNotNull(seekResult);
1424 assertEquals(17, (long) seekResult.output);
1427 seekResult = fstEnum.seekCeil(new BytesRef("aa"));
1428 assertNotNull(seekResult);
1429 assertEquals(b, seekResult.input);
1430 assertEquals(42, (long) seekResult.output);
1433 public void testPrimaryKeys() throws Exception {
1434 Directory dir = newDirectory();
1436 for(int cycle=0;cycle<2;cycle++) {
1438 System.out.println("TEST: cycle=" + cycle);
1440 RandomIndexWriter w = new RandomIndexWriter(random, dir,
1441 newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
1442 Document doc = new Document();
1443 Field idField = newField("id", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
1446 final int NUM_IDS = (int) (1000*RANDOM_MULTIPLIER*(1.0+random.nextDouble()));
1447 //final int NUM_IDS = (int) (377 * (1.0+random.nextDouble()));
1449 System.out.println("TEST: NUM_IDS=" + NUM_IDS);
1451 final Set<String> allIDs = new HashSet<String>();
1452 for(int id=0;id<NUM_IDS;id++) {
1455 // PKs are assigned sequentially
1456 idString = String.format("%07d", id);
1459 final String s = Long.toString(random.nextLong());
1460 if (!allIDs.contains(s)) {
1466 allIDs.add(idString);
1467 idField.setValue(idString);
1473 // turn writer into reader:
1474 final IndexReader r = w.getReader();
1475 final IndexSearcher s = new IndexSearcher(r);
1478 final List<String> allIDsList = new ArrayList<String>(allIDs);
1479 final List<String> sortedAllIDsList = new ArrayList<String>(allIDsList);
1480 Collections.sort(sortedAllIDsList);
1482 // Sprinkle in some non-existent PKs:
1483 Set<String> outOfBounds = new HashSet<String>();
1484 for(int idx=0;idx<NUM_IDS/10;idx++) {
1487 idString = String.format("%07d", (NUM_IDS + idx));
1490 idString = Long.toString(random.nextLong());
1491 if (!allIDs.contains(idString)) {
1496 outOfBounds.add(idString);
1497 allIDsList.add(idString);
1500 // Verify w/ TermQuery
1501 for(int iter=0;iter<2*NUM_IDS;iter++) {
1502 final String id = allIDsList.get(random.nextInt(allIDsList.size()));
1503 final boolean exists = !outOfBounds.contains(id);
1505 System.out.println("TEST: TermQuery " + (exists ? "" : "non-exist ") + " id=" + id);
1507 assertEquals((exists ? "" : "non-exist ") + "id=" + id, exists ? 1 : 0, s.search(new TermQuery(new Term("id", id)), 1).totalHits);
1510 // Verify w/ MultiTermsEnum
1511 for(int iter=0;iter<2*NUM_IDS;iter++) {
1513 final String nextID;
1514 final boolean exists;
1516 if (random.nextBoolean()) {
1517 id = allIDsList.get(random.nextInt(allIDsList.size()));
1518 exists = !outOfBounds.contains(id);
1521 System.out.println("TEST: exactOnly " + (exists ? "" : "non-exist ") + "id=" + id);
1524 // Pick ID between two IDs:
1526 final int idv = random.nextInt(NUM_IDS-1);
1528 id = String.format("%07da", idv);
1529 nextID = String.format("%07d", idv+1);
1531 id = sortedAllIDsList.get(idv) + "a";
1532 nextID = sortedAllIDsList.get(idv+1);
1535 System.out.println("TEST: not exactOnly id=" + id + " nextID=" + nextID);
1539 final boolean useCache = random.nextBoolean();
1541 System.out.println(" useCache=" + useCache);
1544 final Term idTerm = new Term("id", id);
1545 final TermEnum termEnum = r.terms(idTerm);
1546 final Term actual = termEnum.term();
1548 if (nextID != null) {
1549 assertNotNull(actual);
1550 assertTrue(!actual.equals(idTerm));
1551 assertEquals("expected=" + nextID + " actual=" + actual.text(), nextID, actual.text());
1552 } else if (!exists) {
1553 assertTrue(actual == null || !actual.equals(idTerm));
1555 assertEquals(actual, idTerm);
1564 public void testRandomTermLookup() throws Exception {
1565 Directory dir = newDirectory();
1567 RandomIndexWriter w = new RandomIndexWriter(random, dir,
1568 newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
1569 w.w.setInfoStream(VERBOSE ? System.out : null);
1571 Document doc = new Document();
1572 Field f = newField("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
1575 final int NUM_TERMS = (int) (1000*RANDOM_MULTIPLIER * (1+random.nextDouble()));
1577 System.out.println("TEST: NUM_TERMS=" + NUM_TERMS);
1580 final Set<String> allTerms = new HashSet<String>();
1581 while(allTerms.size() < NUM_TERMS) {
1582 allTerms.add(simpleRandomString(random));
1585 for(String term : allTerms) {
1590 // turn writer into reader:
1592 System.out.println("TEST: get reader");
1594 IndexReader r = w.getReader();
1596 System.out.println("TEST: got reader=" + r);
1598 IndexSearcher s = new IndexSearcher(r);
1601 final List<String> allTermsList = new ArrayList<String>(allTerms);
1602 Collections.shuffle(allTermsList, random);
1604 // verify exact lookup
1605 for(String term : allTermsList) {
1607 System.out.println("TEST: term=" + term);
1609 assertEquals("term=" + term, 1, s.search(new TermQuery(new Term("field", term)), 1).totalHits);
1617 * Test state expansion (array format) on close-to-root states. Creates
1618 * synthetic input that has one expanded state on each level.
1620 * @see "https://issues.apache.org/jira/browse/LUCENE-2933"
1622 public void testExpandedCloseToRoot() throws Exception {
1623 class SyntheticData {
1624 FST<Object> compile(String[] lines) throws IOException {
1625 final NoOutputs outputs = NoOutputs.getSingleton();
1626 final Object nothing = outputs.getNoOutput();
1627 final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
1630 final BytesRef term = new BytesRef();
1631 while (line < lines.length) {
1632 String w = lines[line++];
1637 b.add(term, nothing);
1643 void generate(ArrayList<String> out, StringBuilder b, char from, char to,
1645 if (depth == 0 || from == to) {
1646 String seq = b.toString() + "_" + out.size() + "_end";
1649 for (char c = from; c <= to; c++) {
1651 generate(out, b, from, c == to ? to : from, depth - 1);
1652 b.deleteCharAt(b.length() - 1);
1657 public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth)
1658 throws IOException {
1659 if (fst.targetHasArcs(arc)) {
1661 for (arc = fst.readFirstTargetArc(arc, arc);;
1662 arc = fst.readNextArc(arc), childCount++)
1664 boolean expanded = fst.isExpandedTarget(arc);
1665 int children = verifyStateAndBelow(fst, new FST.Arc<Object>().copyFrom(arc), depth + 1);
1669 (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE &&
1670 children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
1671 children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
1672 if (arc.isLast()) break;
1682 assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
1683 assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
1685 SyntheticData s = new SyntheticData();
1687 ArrayList<String> out = new ArrayList<String>();
1688 StringBuilder b = new StringBuilder();
1689 s.generate(out, b, 'a', 'i', 10);
1690 String[] input = out.toArray(new String[out.size()]);
1692 FST<Object> fst = s.compile(input);
1693 FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>());
1694 s.verifyStateAndBelow(fst, arc, 1);
1697 // Make sure raw FST can differentiate between final vs
1698 // non-final end nodes
1699 public void testNonFinalStopNodes() throws Exception {
1700 final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
1701 final Long nothing = outputs.getNoOutput();
1702 final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
1704 final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);
1706 final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);
1708 // Add final stop node
1710 final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
1711 node.isFinal = true;
1712 rootNode.addArc('a', node);
1713 final Builder.CompiledNode frozen = new Builder.CompiledNode();
1714 frozen.address = fst.addNode(node);
1715 rootNode.arcs[0].nextFinalOutput = outputs.get(17);
1716 rootNode.arcs[0].isFinal = true;
1717 rootNode.arcs[0].output = nothing;
1718 rootNode.arcs[0].target = frozen;
1721 // Add non-final stop node
1723 final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
1724 rootNode.addArc('b', node);
1725 final Builder.CompiledNode frozen = new Builder.CompiledNode();
1726 frozen.address = fst.addNode(node);
1727 rootNode.arcs[1].nextFinalOutput = nothing;
1728 rootNode.arcs[1].output = outputs.get(42);
1729 rootNode.arcs[1].target = frozen;
1732 fst.finish(fst.addNode(rootNode));
1734 checkStopNodes(fst, outputs);
1736 // Make sure it still works after save/load:
1737 Directory dir = newDirectory();
1738 IndexOutput out = dir.createOutput("fst");
1742 IndexInput in = dir.openInput("fst");
1743 final FST<Long> fst2 = new FST<Long>(in, outputs);
1744 checkStopNodes(fst2, outputs);
1749 private void checkStopNodes(FST<Long> fst, PositiveIntOutputs outputs) throws Exception {
1750 final Long nothing = outputs.getNoOutput();
1751 FST.Arc<Long> startArc = fst.getFirstArc(new FST.Arc<Long>());
1752 assertEquals(nothing, startArc.output);
1753 assertEquals(nothing, startArc.nextFinalOutput);
1755 FST.Arc<Long> arc = fst.readFirstTargetArc(startArc, new FST.Arc<Long>());
1756 assertEquals('a', arc.label);
1757 assertEquals(17, arc.nextFinalOutput.longValue());
1758 assertTrue(arc.isFinal());
1760 arc = fst.readNextArc(arc);
1761 assertEquals('b', arc.label);
1762 assertFalse(arc.isFinal());
1763 assertEquals(42, arc.output.longValue());