--- /dev/null
+package org.apache.lucene.util.fst;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.util.*;
+
+import org.apache.lucene.analysis.MockAnalyzer;
+import org.apache.lucene.document.Document;
+import org.apache.lucene.document.Field;
+import org.apache.lucene.index.IndexReader;
+import org.apache.lucene.index.IndexWriter;
+import org.apache.lucene.index.IndexWriterConfig;
+import org.apache.lucene.index.RandomIndexWriter;
+import org.apache.lucene.index.Term;
+import org.apache.lucene.index.TermEnum;
+import org.apache.lucene.search.IndexSearcher;
+import org.apache.lucene.search.TermQuery;
+import org.apache.lucene.store.Directory;
+import org.apache.lucene.store.FSDirectory;
+import org.apache.lucene.store.IndexInput;
+import org.apache.lucene.store.IndexOutput;
+import org.apache.lucene.store.MockDirectoryWrapper;
+import org.apache.lucene.util.BytesRef;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.LineFileDocs;
+import org.apache.lucene.util.LuceneTestCase;
+import org.apache.lucene.util.UnicodeUtil;
+import org.apache.lucene.util._TestUtil;
+import org.apache.lucene.util.fst.FST.Arc;
+
+public class TestFSTs extends LuceneTestCase {
+
+ private MockDirectoryWrapper dir;
+
+ @Override
+ public void setUp() throws Exception {
+ super.setUp();
+ dir = newDirectory();
+ dir.setPreventDoubleWrite(false);
+ }
+
+ @Override
+ public void tearDown() throws Exception {
+ dir.close();
+ super.tearDown();
+ }
+
+ private static BytesRef toBytesRef(IntsRef ir) {
+ BytesRef br = new BytesRef(ir.length);
+ for(int i=0;i<ir.length;i++) {
+ int x = ir.ints[ir.offset+i];
+ assert x >= 0 && x <= 255;
+ br.bytes[i] = (byte) x;
+ }
+ br.length = ir.length;
+ return br;
+ }
+
+ private static IntsRef toIntsRef(String s, int inputMode) {
+ return toIntsRef(s, inputMode, new IntsRef(10));
+ }
+
+ private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) {
+ if (inputMode == 0) {
+ // utf8
+ return toIntsRef(new BytesRef(s), ir);
+ } else {
+ // utf32
+ return toIntsRefUTF32(s, ir);
+ }
+ }
+
+ private static IntsRef toIntsRefUTF32(String s, IntsRef ir) {
+ final int charLength = s.length();
+ int charIdx = 0;
+ int intIdx = 0;
+ while(charIdx < charLength) {
+ if (intIdx == ir.ints.length) {
+ ir.grow(intIdx+1);
+ }
+ final int utf32 = s.codePointAt(charIdx);
+ ir.ints[intIdx] = utf32;
+ charIdx += Character.charCount(utf32);
+ intIdx++;
+ }
+ ir.length = intIdx;
+ return ir;
+ }
+
+ private static IntsRef toIntsRef(BytesRef br, IntsRef ir) {
+ if (br.length > ir.ints.length) {
+ ir.grow(br.length);
+ }
+ for(int i=0;i<br.length;i++) {
+ ir.ints[i] = br.bytes[br.offset+i]&0xFF;
+ }
+ ir.length = br.length;
+ return ir;
+ }
+
+ public void testBasicFSA() throws IOException {
+ String[] strings = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation", "stat"};
+ String[] strings2 = new String[] {"station", "commotion", "elation", "elastic", "plastic", "stop", "ftop", "ftation"};
+ IntsRef[] terms = new IntsRef[strings.length];
+ IntsRef[] terms2 = new IntsRef[strings2.length];
+ for(int inputMode=0;inputMode<2;inputMode++) {
+ if (VERBOSE) {
+ System.out.println("TEST: inputMode=" + inputModeToString(inputMode));
+ }
+
+ for(int idx=0;idx<strings.length;idx++) {
+ terms[idx] = toIntsRef(strings[idx], inputMode);
+ }
+ for(int idx=0;idx<strings2.length;idx++) {
+ terms2[idx] = toIntsRef(strings2[idx], inputMode);
+ }
+ Arrays.sort(terms2);
+
+ doTest(inputMode, terms);
+
+ // Test pre-determined FST sizes to make sure we haven't lost minimality (at least on this trivial set of terms):
+
+ // FSA
+ {
+ final Outputs<Object> outputs = NoOutputs.getSingleton();
+ final Object NO_OUTPUT = outputs.getNoOutput();
+ final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms2.length);
+ for(IntsRef term : terms2) {
+ pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
+ }
+ FST<Object> fst = new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
+ assertNotNull(fst);
+ assertEquals(22, fst.getNodeCount());
+ assertEquals(27, fst.getArcCount());
+ }
+
+ // FST ord pos int
+ {
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+ final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms2.length);
+ for(int idx=0;idx<terms2.length;idx++) {
+ pairs.add(new FSTTester.InputOutput<Long>(terms2[idx], outputs.get(idx)));
+ }
+ final FST<Long> fst = new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
+ assertNotNull(fst);
+ assertEquals(22, fst.getNodeCount());
+ assertEquals(27, fst.getArcCount());
+ }
+
+ // FST byte sequence ord
+ {
+ final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+ final BytesRef NO_OUTPUT = outputs.getNoOutput();
+ final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<FSTTester.InputOutput<BytesRef>>(terms2.length);
+ for(int idx=0;idx<terms2.length;idx++) {
+ final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
+ pairs.add(new FSTTester.InputOutput<BytesRef>(terms2[idx], output));
+ }
+ final FST<BytesRef> fst = new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest(0, 0, false);
+ assertNotNull(fst);
+ assertEquals(24, fst.getNodeCount());
+ assertEquals(30, fst.getArcCount());
+ }
+ }
+ }
+
+ private static String simpleRandomString(Random r) {
+ final int end = r.nextInt(10);
+ if (end == 0) {
+ // allow 0 length
+ return "";
+ }
+ final char[] buffer = new char[end];
+ for (int i = 0; i < end; i++) {
+ buffer[i] = (char) _TestUtil.nextInt(r, 97, 102);
+ }
+ return new String(buffer, 0, end);
+ }
+
+ // given set of terms, test the different outputs for them
+ private void doTest(int inputMode, IntsRef[] terms) throws IOException {
+ Arrays.sort(terms);
+
+ // NoOutputs (simple FSA)
+ {
+ final Outputs<Object> outputs = NoOutputs.getSingleton();
+ final Object NO_OUTPUT = outputs.getNoOutput();
+ final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
+ for(IntsRef term : terms) {
+ pairs.add(new FSTTester.InputOutput<Object>(term, NO_OUTPUT));
+ }
+ new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
+ }
+
+ // PositiveIntOutput (ord)
+ {
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+ final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
+ for(int idx=0;idx<terms.length;idx++) {
+ pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(idx)));
+ }
+ new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+ }
+
+ // PositiveIntOutput (random monotonically increasing positive number)
+ {
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
+ final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
+ long lastOutput = 0;
+ for(int idx=0;idx<terms.length;idx++) {
+ final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
+ lastOutput = value;
+ pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(value)));
+ }
+ new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+ }
+
+ // PositiveIntOutput (random positive number)
+ {
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
+ final List<FSTTester.InputOutput<Long>> pairs = new ArrayList<FSTTester.InputOutput<Long>>(terms.length);
+ for(int idx=0;idx<terms.length;idx++) {
+ pairs.add(new FSTTester.InputOutput<Long>(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE));
+ }
+ new FSTTester<Long>(random, dir, inputMode, pairs, outputs).doTest();
+ }
+
+ // Pair<ord, (random monotonically increasing positive number>
+ {
+ final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(random.nextBoolean());
+ final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(random.nextBoolean());
+ final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
+ final List<FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>> pairs = new ArrayList<FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>>(terms.length);
+ long lastOutput = 0;
+ for(int idx=0;idx<terms.length;idx++) {
+ final long value = lastOutput + _TestUtil.nextInt(random, 1, 1000);
+ lastOutput = value;
+ pairs.add(new FSTTester.InputOutput<PairOutputs.Pair<Long,Long>>(terms[idx],
+ outputs.get(o1.get(idx),
+ o2.get(value))));
+ }
+ new FSTTester<PairOutputs.Pair<Long,Long>>(random, dir, inputMode, pairs, outputs).doTest();
+ }
+
+ // Sequence-of-bytes
+ {
+ final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
+ final BytesRef NO_OUTPUT = outputs.getNoOutput();
+ final List<FSTTester.InputOutput<BytesRef>> pairs = new ArrayList<FSTTester.InputOutput<BytesRef>>(terms.length);
+ for(int idx=0;idx<terms.length;idx++) {
+ final BytesRef output = random.nextInt(30) == 17 ? NO_OUTPUT : new BytesRef(Integer.toString(idx));
+ pairs.add(new FSTTester.InputOutput<BytesRef>(terms[idx], output));
+ }
+ new FSTTester<BytesRef>(random, dir, inputMode, pairs, outputs).doTest();
+ }
+
+ // Sequence-of-ints
+ {
+ final IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton();
+ final List<FSTTester.InputOutput<IntsRef>> pairs = new ArrayList<FSTTester.InputOutput<IntsRef>>(terms.length);
+ for(int idx=0;idx<terms.length;idx++) {
+ final String s = Integer.toString(idx);
+ final IntsRef output = new IntsRef(s.length());
+ output.length = s.length();
+ for(int idx2=0;idx2<output.length;idx2++) {
+ output.ints[idx2] = s.charAt(idx2);
+ }
+ pairs.add(new FSTTester.InputOutput<IntsRef>(terms[idx], output));
+ }
+ new FSTTester<IntsRef>(random, dir, inputMode, pairs, outputs).doTest();
+ }
+
+ // Up to two positive ints, shared, generally but not
+ // monotonically increasing
+ {
+ if (VERBOSE) {
+ System.out.println("TEST: now test UpToTwoPositiveIntOutputs");
+ }
+ final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true);
+ final List<FSTTester.InputOutput<Object>> pairs = new ArrayList<FSTTester.InputOutput<Object>>(terms.length);
+ long lastOutput = 0;
+ for(int idx=0;idx<terms.length;idx++) {
+ // Sometimes go backwards
+ long value = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+ while(value < 0) {
+ value = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+ }
+ final Object output;
+ if (random.nextInt(5) == 3) {
+ long value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+ while(value2 < 0) {
+ value2 = lastOutput + _TestUtil.nextInt(random, -100, 1000);
+ }
+ output = outputs.get(value, value2);
+ } else {
+ output = outputs.get(value);
+ }
+ pairs.add(new FSTTester.InputOutput<Object>(terms[idx], output));
+ }
+ new FSTTester<Object>(random, dir, inputMode, pairs, outputs).doTest();
+ }
+ }
+
+ private static class FSTTester<T> {
+
+ final Random random;
+ final List<InputOutput<T>> pairs;
+ final int inputMode;
+ final Outputs<T> outputs;
+ final Directory dir;
+
+ public FSTTester(Random random, Directory dir, int inputMode, List<InputOutput<T>> pairs, Outputs<T> outputs) {
+ this.random = random;
+ this.dir = dir;
+ this.inputMode = inputMode;
+ this.pairs = pairs;
+ this.outputs = outputs;
+ }
+
+ private static class InputOutput<T> implements Comparable<InputOutput<T>> {
+ public final IntsRef input;
+ public final T output;
+
+ public InputOutput(IntsRef input, T output) {
+ this.input = input;
+ this.output = output;
+ }
+
+ public int compareTo(InputOutput<T> other) {
+ if (other instanceof InputOutput) {
+ return input.compareTo((other).input);
+ } else {
+ throw new IllegalArgumentException();
+ }
+ }
+ }
+
+ public void doTest() throws IOException {
+ // no pruning
+ doTest(0, 0, true);
+
+ if (!(outputs instanceof UpToTwoPositiveIntOutputs)) {
+ // simple pruning
+ doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true);
+
+ // leafy pruning
+ doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true);
+ }
+ }
+
+ // runs the term, returning the output, or null if term
+ // isn't accepted. if prefixLength is non-null it must be
+ // length 1 int array; prefixLength[0] is set to the length
+ // of the term prefix that matches
+ private T run(FST<T> fst, IntsRef term, int[] prefixLength) throws IOException {
+ assert prefixLength == null || prefixLength.length == 1;
+ final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
+ final T NO_OUTPUT = fst.outputs.getNoOutput();
+ T output = NO_OUTPUT;
+
+ for(int i=0;i<=term.length;i++) {
+ final int label;
+ if (i == term.length) {
+ label = FST.END_LABEL;
+ } else {
+ label = term.ints[term.offset+i];
+ }
+ //System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal());
+ if (fst.findTargetArc(label, arc, arc) == null) {
+ if (prefixLength != null) {
+ prefixLength[0] = i;
+ return output;
+ } else {
+ return null;
+ }
+ }
+ output = fst.outputs.add(output, arc.output);
+ }
+
+ if (prefixLength != null) {
+ prefixLength[0] = term.length;
+ }
+
+ return output;
+ }
+
+ private T randomAcceptedWord(FST<T> fst, IntsRef in) throws IOException {
+ FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
+
+ final List<FST.Arc<T>> arcs = new ArrayList<FST.Arc<T>>();
+ in.length = 0;
+ in.offset = 0;
+ final T NO_OUTPUT = fst.outputs.getNoOutput();
+ T output = NO_OUTPUT;
+
+ while(true) {
+ // read all arcs:
+ fst.readFirstTargetArc(arc, arc);
+ arcs.add(new FST.Arc<T>().copyFrom(arc));
+ while(!arc.isLast()) {
+ fst.readNextArc(arc);
+ arcs.add(new FST.Arc<T>().copyFrom(arc));
+ }
+
+ // pick one
+ arc = arcs.get(random.nextInt(arcs.size()));
+ arcs.clear();
+
+ // accumulate output
+ output = fst.outputs.add(output, arc.output);
+
+ // append label
+ if (arc.label == FST.END_LABEL) {
+ break;
+ }
+
+ if (in.ints.length == in.length) {
+ in.grow(1+in.length);
+ }
+ in.ints[in.length++] = arc.label;
+ }
+
+ return output;
+ }
+
+
+ FST<T> doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException {
+ if (VERBOSE) {
+ System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2);
+ }
+
+ final Builder<T> builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4,
+ prune1, prune2,
+ prune1==0 && prune2==0,
+ allowRandomSuffixSharing ? random.nextBoolean() : true,
+ allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE,
+ outputs);
+
+ for(InputOutput<T> pair : pairs) {
+ if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) {
+ final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs;
+ final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output;
+ @SuppressWarnings("unchecked") final Builder<Object> builderObject = (Builder<Object>) builder;
+ builderObject.add(pair.input, _outputs.get(twoLongs.first));
+ builderObject.add(pair.input, _outputs.get(twoLongs.second));
+ } else {
+ builder.add(pair.input, pair.output);
+ }
+ }
+ FST<T> fst = builder.finish();
+
+ if (random.nextBoolean() && fst != null) {
+ IndexOutput out = dir.createOutput("fst.bin");
+ fst.save(out);
+ out.close();
+ IndexInput in = dir.openInput("fst.bin");
+ try {
+ fst = new FST<T>(in, outputs);
+ } finally {
+ in.close();
+ dir.deleteFile("fst.bin");
+ }
+ }
+
+ if (VERBOSE && pairs.size() <= 20 && fst != null) {
+ Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+ Util.toDot(fst, w, false, false);
+ w.close();
+ System.out.println("SAVED out.dot");
+ }
+
+ if (VERBOSE) {
+ if (fst == null) {
+ System.out.println(" fst has 0 nodes (fully pruned)");
+ } else {
+ System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs");
+ }
+ }
+
+ if (prune1 == 0 && prune2 == 0) {
+ verifyUnPruned(inputMode, fst);
+ } else {
+ verifyPruned(inputMode, fst, prune1, prune2);
+ }
+
+ return fst;
+ }
+
+ // FST is complete
+ private void verifyUnPruned(int inputMode, FST<T> fst) throws IOException {
+
+ if (pairs.size() == 0) {
+ assertNull(fst);
+ return;
+ }
+
+ if (VERBOSE) {
+ System.out.println("TEST: now verify " + pairs.size() + " terms");
+ for(InputOutput<T> pair : pairs) {
+ assertNotNull(pair);
+ assertNotNull(pair.input);
+ assertNotNull(pair.output);
+ System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
+ }
+ }
+
+ assertNotNull(fst);
+
+ // visit valid paris in order -- make sure all words
+ // are accepted, and FSTEnum's next() steps through
+ // them correctly
+ if (VERBOSE) {
+ System.out.println("TEST: check valid terms/next()");
+ }
+ {
+ IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
+ for(InputOutput<T> pair : pairs) {
+ IntsRef term = pair.input;
+ if (VERBOSE) {
+ System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output));
+ }
+ Object output = run(fst, term, null);
+
+ assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output);
+ assertEquals(pair.output, output);
+
+ // verify enum's next
+ IntsRefFSTEnum.InputOutput<T> t = fstEnum.next();
+ assertNotNull(t);
+ assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input);
+ assertEquals(pair.output, t.output);
+ }
+ assertNull(fstEnum.next());
+ }
+
+ final Map<IntsRef,T> termsMap = new HashMap<IntsRef,T>();
+ for(InputOutput<T> pair : pairs) {
+ termsMap.put(pair.input, pair.output);
+ }
+
+ // find random matching word and make sure it's valid
+ if (VERBOSE) {
+ System.out.println("TEST: verify random accepted terms");
+ }
+ final IntsRef scratch = new IntsRef(10);
+ int num = atLeast(500);
+ for(int iter=0;iter<num;iter++) {
+ T output = randomAcceptedWord(fst, scratch);
+ assertTrue("accepted word " + inputToString(inputMode, scratch) + " is not valid", termsMap.containsKey(scratch));
+ assertEquals(termsMap.get(scratch), output);
+ }
+
+ // test IntsRefFSTEnum.seek:
+ if (VERBOSE) {
+ System.out.println("TEST: verify seek");
+ }
+ IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
+ num = atLeast(100);
+ for(int iter=0;iter<num;iter++) {
+ if (VERBOSE) {
+ System.out.println("TEST: iter=" + iter);
+ }
+ if (random.nextBoolean()) {
+ // seek to term that doesn't exist:
+ while(true) {
+ final IntsRef term = toIntsRef(getRandomString(), inputMode);
+ int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
+ if (pos < 0) {
+ pos = -(pos+1);
+ // ok doesn't exist
+ //System.out.println(" seek " + inputToString(inputMode, term));
+ final IntsRefFSTEnum.InputOutput<T> seekResult;
+ if (random.nextBoolean()) {
+ if (VERBOSE) {
+ System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term));
+ }
+ seekResult = fstEnum.seekFloor(term);
+ pos--;
+ } else {
+ if (VERBOSE) {
+ System.out.println(" do non-exist seekCeil term=" + inputToString(inputMode, term));
+ }
+ seekResult = fstEnum.seekCeil(term);
+ }
+
+ if (pos != -1 && pos < pairs.size()) {
+ //System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output));
+ assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult);
+ if (VERBOSE) {
+ System.out.println(" got " + inputToString(inputMode, seekResult.input));
+ }
+ assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input);
+ assertEquals(pairs.get(pos).output, seekResult.output);
+ } else {
+ // seeked before start or beyond end
+ //System.out.println("seek=" + seekTerm);
+ assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult);
+ if (VERBOSE) {
+ System.out.println(" got null");
+ }
+ }
+
+ break;
+ }
+ }
+ } else {
+ // seek to term that does exist:
+ InputOutput<T> pair = pairs.get(random.nextInt(pairs.size()));
+ final IntsRefFSTEnum.InputOutput<T> seekResult;
+ if (random.nextBoolean()) {
+ if (VERBOSE) {
+ System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input));
+ }
+ seekResult = fstEnum.seekFloor(pair.input);
+ } else {
+ if (VERBOSE) {
+ System.out.println(" do exists seekCeil " + inputToString(inputMode, pair.input));
+ }
+ seekResult = fstEnum.seekCeil(pair.input);
+ }
+ assertNotNull(seekResult);
+ assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input);
+ assertEquals(pair.output, seekResult.output);
+ }
+ }
+
+ if (VERBOSE) {
+ System.out.println("TEST: mixed next/seek");
+ }
+
+ // test mixed next/seek
+ num = atLeast(100);
+ for(int iter=0;iter<num;iter++) {
+ if (VERBOSE) {
+ System.out.println("TEST: iter " + iter);
+ }
+ // reset:
+ fstEnum = new IntsRefFSTEnum<T>(fst);
+ int upto = -1;
+ while(true) {
+ boolean isDone = false;
+ if (upto == pairs.size()-1 || random.nextBoolean()) {
+ // next
+ upto++;
+ if (VERBOSE) {
+ System.out.println(" do next");
+ }
+ isDone = fstEnum.next() == null;
+ } else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) {
+ int attempt = 0;
+ for(;attempt<10;attempt++) {
+ IntsRef term = toIntsRef(getRandomString(), inputMode);
+ if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) {
+ int pos = Collections.binarySearch(pairs, new InputOutput<T>(term, null));
+ assert pos < 0;
+ upto = -(pos+1);
+
+ if (random.nextBoolean()) {
+ upto--;
+ assertTrue(upto != -1);
+ if (VERBOSE) {
+ System.out.println(" do non-exist seekFloor(" + inputToString(inputMode, term) + ")");
+ }
+ isDone = fstEnum.seekFloor(term) == null;
+ } else {
+ if (VERBOSE) {
+ System.out.println(" do non-exist seekCeil(" + inputToString(inputMode, term) + ")");
+ }
+ isDone = fstEnum.seekCeil(term) == null;
+ }
+
+ break;
+ }
+ }
+ if (attempt == 10) {
+ continue;
+ }
+
+ } else {
+ final int inc = random.nextInt(pairs.size() - upto - 1);
+ upto += inc;
+ if (upto == -1) {
+ upto = 0;
+ }
+
+ if (random.nextBoolean()) {
+ if (VERBOSE) {
+ System.out.println(" do advanceCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")");
+ }
+ isDone = fstEnum.seekCeil(pairs.get(upto).input) == null;
+ } else {
+ if (VERBOSE) {
+ System.out.println(" do advanceFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")");
+ }
+ isDone = fstEnum.seekFloor(pairs.get(upto).input) == null;
+ }
+ }
+ if (VERBOSE) {
+ if (!isDone) {
+ System.out.println(" got " + inputToString(inputMode, fstEnum.current().input));
+ } else {
+ System.out.println(" got null");
+ }
+ }
+
+ if (upto == pairs.size()) {
+ assertTrue(isDone);
+ break;
+ } else {
+ assertFalse(isDone);
+ assertEquals(pairs.get(upto).input, fstEnum.current().input);
+ assertEquals(pairs.get(upto).output, fstEnum.current().output);
+
+ /*
+ if (upto < pairs.size()-1) {
+ int tryCount = 0;
+ while(tryCount < 10) {
+ final IntsRef t = toIntsRef(getRandomString(), inputMode);
+ if (pairs.get(upto).input.compareTo(t) < 0) {
+ final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0;
+ if (VERBOSE) {
+ System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected);
+ }
+ assertEquals(expected, fstEnum.beforeNext(t));
+ break;
+ }
+ tryCount++;
+ }
+ }
+ */
+ }
+ }
+ }
+ }
+
+ private static class CountMinOutput<T> {
+ int count;
+ T output;
+ T finalOutput;
+ boolean isLeaf = true;
+ boolean isFinal;
+ }
+
+ // FST is pruned
+ private void verifyPruned(int inputMode, FST<T> fst, int prune1, int prune2) throws IOException {
+
+ if (VERBOSE) {
+ System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs);
+ for(InputOutput<T> pair : pairs) {
+ System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output));
+ }
+ }
+
+ // To validate the FST, we brute-force compute all prefixes
+ // in the terms, matched to their "common" outputs, prune that
+ // set according to the prune thresholds, then assert the FST
+ // matches that same set.
+
+ // NOTE: Crazy RAM intensive!!
+
+ //System.out.println("TEST: tally prefixes");
+
+ // build all prefixes
+ final Map<IntsRef,CountMinOutput<T>> prefixes = new HashMap<IntsRef,CountMinOutput<T>>();
+ final IntsRef scratch = new IntsRef(10);
+ for(InputOutput<T> pair: pairs) {
+ scratch.copy(pair.input);
+ for(int idx=0;idx<=pair.input.length;idx++) {
+ scratch.length = idx;
+ CountMinOutput<T> cmo = prefixes.get(scratch);
+ if (cmo == null) {
+ cmo = new CountMinOutput<T>();
+ cmo.count = 1;
+ cmo.output = pair.output;
+ prefixes.put(new IntsRef(scratch), cmo);
+ } else {
+ cmo.count++;
+ cmo.output = outputs.common(cmo.output, pair.output);
+ }
+ if (idx == pair.input.length) {
+ cmo.isFinal = true;
+ cmo.finalOutput = cmo.output;
+ }
+ }
+ }
+
+ if (VERBOSE) {
+ System.out.println("TEST: now prune");
+ }
+
+ // prune 'em
+ final Iterator<Map.Entry<IntsRef,CountMinOutput<T>>> it = prefixes.entrySet().iterator();
+ while(it.hasNext()) {
+ Map.Entry<IntsRef,CountMinOutput<T>> ent = it.next();
+ final IntsRef prefix = ent.getKey();
+ final CountMinOutput<T> cmo = ent.getValue();
+ if (VERBOSE) {
+ System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal);
+ }
+ final boolean keep;
+ if (prune1 > 0) {
+ keep = cmo.count >= prune1;
+ } else {
+ assert prune2 > 0;
+ if (prune2 > 1 && cmo.count >= prune2) {
+ keep = true;
+ } else if (prefix.length > 0) {
+ // consult our parent
+ scratch.length = prefix.length-1;
+ System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length);
+ final CountMinOutput<T> cmo2 = prefixes.get(scratch);
+ //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count));
+ keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1)));
+ } else if (cmo.count >= prune2) {
+ keep = true;
+ } else {
+ keep = false;
+ }
+ }
+
+ if (!keep) {
+ it.remove();
+ //System.out.println(" remove");
+ } else {
+ // clear isLeaf for all ancestors
+ //System.out.println(" keep");
+ scratch.copy(prefix);
+ scratch.length--;
+ while(scratch.length >= 0) {
+ final CountMinOutput<T> cmo2 = prefixes.get(scratch);
+ if (cmo2 != null) {
+ //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch));
+ cmo2.isLeaf = false;
+ }
+ scratch.length--;
+ }
+ }
+ }
+
+ //System.out.println("TEST: after prune");
+ /*
+ for(Map.Entry<BytesRef,CountMinOutput> ent : prefixes.entrySet()) {
+ System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal);
+ if (ent.getValue().isFinal) {
+ System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput));
+ }
+ }
+ */
+
+ if (prefixes.size() <= 1) {
+ assertNull(fst);
+ return;
+ }
+
+ assertNotNull(fst);
+
+ // make sure FST only enums valid prefixes
+ if (VERBOSE) {
+ System.out.println("TEST: check pruned enum");
+ }
+ IntsRefFSTEnum<T> fstEnum = new IntsRefFSTEnum<T>(fst);
+ IntsRefFSTEnum.InputOutput<T> current;
+ while((current = fstEnum.next()) != null) {
+ if (VERBOSE) {
+ System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output));
+ }
+ final CountMinOutput cmo = prefixes.get(current.input);
+ assertNotNull(cmo);
+ assertTrue(cmo.isLeaf || cmo.isFinal);
+ //if (cmo.isFinal && !cmo.isLeaf) {
+ if (cmo.isFinal) {
+ assertEquals(cmo.finalOutput, current.output);
+ } else {
+ assertEquals(cmo.output, current.output);
+ }
+ }
+
+ // make sure all non-pruned prefixes are present in the FST
+ if (VERBOSE) {
+ System.out.println("TEST: verify all prefixes");
+ }
+ final int[] stopNode = new int[1];
+ for(Map.Entry<IntsRef,CountMinOutput<T>> ent : prefixes.entrySet()) {
+ if (ent.getKey().length > 0) {
+ final CountMinOutput<T> cmo = ent.getValue();
+ final T output = run(fst, ent.getKey(), stopNode);
+ if (VERBOSE) {
+ System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output));
+ }
+ // if (cmo.isFinal && !cmo.isLeaf) {
+ if (cmo.isFinal) {
+ assertEquals(cmo.finalOutput, output);
+ } else {
+ assertEquals(cmo.output, output);
+ }
+ assertEquals(ent.getKey().length, stopNode[0]);
+ }
+ }
+ }
+ }
+
+ public void testRandomWords() throws IOException {
+ testRandomWords(1000, atLeast(2));
+ //testRandomWords(20, 100);
+ }
+
+ private String inputModeToString(int mode) {
+ if (mode == 0) {
+ return "utf8";
+ } else {
+ return "utf32";
+ }
+ }
+
+ private void testRandomWords(int maxNumWords, int numIter) throws IOException {
+ for(int iter=0;iter<numIter;iter++) {
+ if (VERBOSE) {
+ System.out.println("\nTEST: iter " + iter);
+ }
+ for(int inputMode=0;inputMode<2;inputMode++) {
+ final int numWords = random.nextInt(maxNumWords+1);
+ Set<IntsRef> termsSet = new HashSet<IntsRef>();
+ IntsRef[] terms = new IntsRef[numWords];
+ while(termsSet.size() < numWords) {
+ final String term = getRandomString();
+ termsSet.add(toIntsRef(term, inputMode));
+ }
+ doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()]));
+ }
+ }
+ }
+
+ static String getRandomString() {
+ final String term;
+ if (random.nextBoolean()) {
+ term = _TestUtil.randomRealisticUnicodeString(random);
+ } else {
+ // we want to mix in limited-alphabet symbols so
+ // we get more sharing of the nodes given how few
+ // terms we are testing...
+ term = simpleRandomString(random);
+ }
+ return term;
+ }
+
+ @Nightly
+ public void testBigSet() throws IOException {
+ testRandomWords(_TestUtil.nextInt(random, 50000, 60000), 1);
+ }
+
+ private static String inputToString(int inputMode, IntsRef term) {
+ return inputToString(inputMode, term, true);
+ }
+
+ private static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) {
+ if (!isValidUnicode) {
+ return term.toString();
+ } else if (inputMode == 0) {
+ // utf8
+ return toBytesRef(term).utf8ToString() + " " + term;
+ } else {
+ // utf32
+ return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term;
+ }
+ }
+
+ private static IntsRef toIntsRef(String s) {
+ final int charCount = s.length();
+ IntsRef ir = new IntsRef(charCount);
+ for(int charIDX=0;charIDX<charCount;charIDX++) {
+ ir.ints[charIDX] = s.charAt(charIDX);
+ }
+ ir.length = charCount;
+ return ir;
+ }
+
+ private static String toString(IntsRef ints) {
+ char[] chars = new char[ints.length];
+ for(int charIDX=0;charIDX<ints.length;charIDX++) {
+ final int ch = ints.ints[ints.offset+charIDX];
+ assertTrue(ch >= 0 && ch < 65536);
+ chars[charIDX] = (char) ch;
+ }
+ return new String(chars);
+ }
+
+ // Build FST for all unique terms in the test line docs
+ // file, up until a time limit
+ public void testRealTerms() throws Exception {
+
+ /*
+ if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) {
+ // no
+ CodecProvider.getDefault().setDefaultFieldCodec("Standard");
+ }
+ */
+
+ final LineFileDocs docs = new LineFileDocs(random);
+ final int RUN_TIME_MSEC = atLeast(500);
+ final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64);
+ final File tempDir = _TestUtil.getTempDir("fstlines");
+ final MockDirectoryWrapper dir = newFSDirectory(tempDir);
+ final IndexWriter writer = new IndexWriter(dir, conf);
+ writer.setInfoStream(VERBOSE ? System.out : null);
+ final long stopTime = System.currentTimeMillis() + RUN_TIME_MSEC;
+ Document doc;
+ int docCount = 0;
+ while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) {
+ writer.addDocument(doc);
+ docCount++;
+ }
+ IndexReader r = IndexReader.open(writer, true);
+ writer.close();
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean());
+ Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE2, outputs);
+
+ boolean storeOrd = false;
+ if (VERBOSE) {
+ if (storeOrd) {
+ System.out.println("FST stores ord");
+ } else {
+ System.out.println("FST stores docFreq");
+ }
+ }
+ TermEnum termEnum = r.terms(new Term("body", ""));
+ if (VERBOSE) {
+ System.out.println("TEST: got termEnum=" + termEnum);
+ }
+ int ord = 0;
+ while(true) {
+ final Term term = termEnum.term();
+ if (term == null || !"body".equals(term.field())) {
+ break;
+ }
+
+ // No ord in 3.x:
+ /*
+ if (ord == 0) {
+ try {
+ termsEnum.ord();
+ } catch (UnsupportedOperationException uoe) {
+ if (VERBOSE) {
+ System.out.println("TEST: codec doesn't support ord; FST stores docFreq");
+ }
+ storeOrd = false;
+ }
+ }
+ */
+ final int output;
+ if (storeOrd) {
+ output = ord;
+ } else {
+ output = termEnum.docFreq();
+ }
+ //System.out.println("ADD: " + term.text() + " ch[0]=" + (term.text().length() == 0 ? -1 : term.text().charAt(0)));
+ builder.add(toIntsRef(term.text()), outputs.get(output));
+ ord++;
+ if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) {
+ System.out.println(ord + " terms...");
+ }
+ termEnum.next();
+ }
+ final FST<Long> fst = builder.finish();
+ if (VERBOSE) {
+ System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes");
+ }
+
+ if (ord > 0) {
+ // Now confirm BytesRefFSTEnum and TermEnum act the
+ // same:
+ final IntsRefFSTEnum<Long> fstEnum = new IntsRefFSTEnum<Long>(fst);
+ int num = atLeast(1000);
+ for(int iter=0;iter<num;iter++) {
+ final String randomTerm = getRandomString();
+
+ if (VERBOSE) {
+ System.out.println("TEST: seek " + randomTerm + " ch[0]=" + (randomTerm.length() == 0 ? -1 : randomTerm.charAt(0)));
+ }
+
+ termEnum = r.terms(new Term("body", randomTerm));
+ final IntsRefFSTEnum.InputOutput fstSeekResult = fstEnum.seekCeil(toIntsRef(randomTerm));
+
+ if (termEnum.term() == null || !"body".equals(termEnum.term().field())) {
+ assertNull("got " + (fstSeekResult == null ? "null" : toString(fstSeekResult.input) + " but expected null"), fstSeekResult);
+ } else {
+ assertSame(termEnum, fstEnum, storeOrd);
+ for(int nextIter=0;nextIter<10;nextIter++) {
+ if (VERBOSE) {
+ System.out.println("TEST: next");
+ //if (storeOrd) {
+ //System.out.println(" ord=" + termEnum.ord());
+ //}
+ }
+ termEnum.next();
+ if (termEnum.term() != null && "body".equals(termEnum.term().field())) {
+ if (VERBOSE) {
+ System.out.println(" term=" + termEnum.term());
+ }
+ assertNotNull(fstEnum.next());
+ assertSame(termEnum, fstEnum, storeOrd);
+ } else {
+ if (VERBOSE) {
+ System.out.println(" end!");
+ }
+ IntsRefFSTEnum.InputOutput<Long> nextResult = fstEnum.next();
+ if (nextResult != null) {
+ System.out.println("expected null but got: input=" + toString(nextResult.input) + " output=" + outputs.outputToString(nextResult.output));
+ fail();
+ }
+ break;
+ }
+ }
+ }
+ }
+ }
+
+ r.close();
+ dir.close();
+ }
+
+ private void assertSame(TermEnum termEnum, IntsRefFSTEnum fstEnum, boolean storeOrd) throws Exception {
+ if (termEnum.term() == null || !"body".equals(termEnum.term().field())) {
+ if (fstEnum.current() != null) {
+ fail("fstEnum.current().input=" + toString(fstEnum.current().input));
+ }
+ } else {
+ assertNotNull(fstEnum.current());
+ assertEquals(termEnum.term() + " != " + toString(fstEnum.current().input), termEnum.term().text(), toString(fstEnum.current().input));
+ if (storeOrd) {
+ // fst stored the ord
+ // No ord in 3.x
+ // assertEquals(termEnum.ord(), ((Long) fstEnum.current().output).longValue());
+ } else {
+ // fst stored the docFreq
+ assertEquals(termEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue()));
+ }
+ }
+ }
+
+ private static abstract class VisitTerms<T> {
+ private final String dirOut;
+ private final String wordsFileIn;
+ private int inputMode;
+ private final Outputs<T> outputs;
+ private final Builder<T> builder;
+
+ public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs<T> outputs) {
+ this.dirOut = dirOut;
+ this.wordsFileIn = wordsFileIn;
+ this.inputMode = inputMode;
+ this.outputs = outputs;
+
+ builder = new Builder<T>(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs);
+ }
+
+ protected abstract T getOutput(IntsRef input, int ord) throws IOException;
+
+ public void run(int limit, boolean verify) throws IOException {
+ BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
+ try {
+ final IntsRef intsRef = new IntsRef(10);
+ long tStart = System.currentTimeMillis();
+ int ord = 0;
+ while(true) {
+ String w = is.readLine();
+ if (w == null) {
+ break;
+ }
+ toIntsRef(w, inputMode, intsRef);
+ builder.add(intsRef,
+ getOutput(intsRef, ord));
+
+ ord++;
+ if (ord % 500000 == 0) {
+ System.out.println(
+ String.format(Locale.ENGLISH,
+ "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord));
+ }
+ if (ord >= limit) {
+ break;
+ }
+ }
+
+ assert builder.getTermCount() == ord;
+ final FST<T> fst = builder.finish();
+ if (fst == null) {
+ System.out.println("FST was fully pruned!");
+ System.exit(0);
+ }
+
+ if (dirOut == null)
+ return;
+
+ System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes());
+ if (fst.getNodeCount() < 100) {
+ Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8");
+ Util.toDot(fst, w, false, false);
+ w.close();
+ System.out.println("Wrote FST to out.dot");
+ }
+
+ Directory dir = FSDirectory.open(new File(dirOut));
+ IndexOutput out = dir.createOutput("fst.bin");
+ fst.save(out);
+ out.close();
+
+ System.out.println("Saved FST to fst.bin.");
+
+ if (!verify) {
+ return;
+ }
+
+ System.out.println("\nNow verify...");
+
+ is.close();
+ is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536);
+
+ ord = 0;
+ tStart = System.currentTimeMillis();
+ while(true) {
+ String w = is.readLine();
+ if (w == null) {
+ break;
+ }
+ toIntsRef(w, inputMode, intsRef);
+ T expected = getOutput(intsRef, ord);
+ T actual = Util.get(fst, intsRef);
+ if (actual == null) {
+ throw new RuntimeException("unexpected null output on input=" + w);
+ }
+ if (!actual.equals(expected)) {
+ throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w);
+ }
+
+ ord++;
+ if (ord % 500000 == 0) {
+ System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "...");
+ }
+ if (ord >= limit) {
+ break;
+ }
+ }
+
+ double totSec = ((System.currentTimeMillis() - tStart)/1000.0);
+ System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)");
+
+ } finally {
+ is.close();
+ }
+ }
+ }
+
+ // java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out
+ public static void main(String[] args) throws IOException {
+ int prune = 0;
+ int limit = Integer.MAX_VALUE;
+ int inputMode = 0; // utf8
+ boolean storeOrds = false;
+ boolean storeDocFreqs = false;
+ boolean verify = true;
+
+ String wordsFileIn = null;
+ String dirOut = null;
+
+ int idx = 0;
+ while (idx < args.length) {
+ if (args[idx].equals("-prune")) {
+ prune = Integer.valueOf(args[1 + idx]);
+ idx++;
+ } else if (args[idx].equals("-limit")) {
+ limit = Integer.valueOf(args[1 + idx]);
+ idx++;
+ } else if (args[idx].equals("-utf8")) {
+ inputMode = 0;
+ } else if (args[idx].equals("-utf32")) {
+ inputMode = 1;
+ } else if (args[idx].equals("-docFreq")) {
+ storeDocFreqs = true;
+ } else if (args[idx].equals("-ords")) {
+ storeOrds = true;
+ } else if (args[idx].equals("-noverify")) {
+ verify = false;
+ } else if (args[idx].startsWith("-")) {
+ System.err.println("Unrecognized option: " + args[idx]);
+ System.exit(-1);
+ } else {
+ if (wordsFileIn == null) {
+ wordsFileIn = args[idx];
+ } else if (dirOut == null) {
+ dirOut = args[idx];
+ } else {
+ System.err.println("Too many arguments, expected: input [output]");
+ System.exit(-1);
+ }
+ }
+ idx++;
+ }
+
+ if (wordsFileIn == null) {
+ System.err.println("No input file.");
+ System.exit(-1);
+ }
+
+ // ord benefits from share, docFreqs don't:
+
+ if (storeOrds && storeDocFreqs) {
+ // Store both ord & docFreq:
+ final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true);
+ final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false);
+ final PairOutputs<Long,Long> outputs = new PairOutputs<Long,Long>(o1, o2);
+ new VisitTerms<PairOutputs.Pair<Long,Long>>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+ Random rand;
+ @Override
+ public PairOutputs.Pair<Long,Long> getOutput(IntsRef input, int ord) {
+ if (ord == 0) {
+ rand = new Random(17);
+ }
+ return new PairOutputs.Pair<Long,Long>(o1.get(ord),
+ o2.get(_TestUtil.nextInt(rand, 1, 5000)));
+ }
+ }.run(limit, verify);
+ } else if (storeOrds) {
+ // Store only ords
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+ new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+ @Override
+ public Long getOutput(IntsRef input, int ord) {
+ return outputs.get(ord);
+ }
+ }.run(limit, verify);
+ } else if (storeDocFreqs) {
+ // Store only docFreq
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false);
+ new VisitTerms<Long>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+ Random rand;
+ @Override
+ public Long getOutput(IntsRef input, int ord) {
+ if (ord == 0) {
+ rand = new Random(17);
+ }
+ return outputs.get(_TestUtil.nextInt(rand, 1, 5000));
+ }
+ }.run(limit, verify);
+ } else {
+ // Store nothing
+ final NoOutputs outputs = NoOutputs.getSingleton();
+ final Object NO_OUTPUT = outputs.getNoOutput();
+ new VisitTerms<Object>(dirOut, wordsFileIn, inputMode, prune, outputs) {
+ @Override
+ public Object getOutput(IntsRef input, int ord) {
+ return NO_OUTPUT;
+ }
+ }.run(limit, verify);
+ }
+ }
+
+ public void testSingleString() throws Exception {
+ final Outputs<Object> outputs = NoOutputs.getSingleton();
+ final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
+ b.add(new BytesRef("foobar"), outputs.getNoOutput());
+ final BytesRefFSTEnum<Object> fstEnum = new BytesRefFSTEnum<Object>(b.finish());
+ assertNull(fstEnum.seekFloor(new BytesRef("foo")));
+ assertNull(fstEnum.seekCeil(new BytesRef("foobaz")));
+ }
+
+ public void testSimple() throws Exception {
+
+ // Get outputs -- passing true means FST will share
+ // (delta code) the outputs. This should result in
+ // smaller FST if the outputs grow monotonically. But
+ // if numbers are "random", false should give smaller
+ // final size:
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+
+ // Build an FST mapping BytesRef -> Long
+ final Builder<Long> builder = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
+
+ final BytesRef a = new BytesRef("a");
+ final BytesRef b = new BytesRef("b");
+ final BytesRef c = new BytesRef("c");
+
+ builder.add(a, outputs.get(17));
+ builder.add(b, outputs.get(42));
+ builder.add(c, outputs.get(13824324872317238L));
+
+ final FST<Long> fst = builder.finish();
+
+ assertEquals(13824324872317238L, (long) Util.get(fst, c));
+ assertEquals(42, (long) Util.get(fst, b));
+ assertEquals(17, (long) Util.get(fst, a));
+
+ BytesRefFSTEnum<Long> fstEnum = new BytesRefFSTEnum<Long>(fst);
+ BytesRefFSTEnum.InputOutput<Long> seekResult;
+ seekResult = fstEnum.seekFloor(a);
+ assertNotNull(seekResult);
+ assertEquals(17, (long) seekResult.output);
+
+ // goes to a
+ seekResult = fstEnum.seekFloor(new BytesRef("aa"));
+ assertNotNull(seekResult);
+ assertEquals(17, (long) seekResult.output);
+
+ // goes to b
+ seekResult = fstEnum.seekCeil(new BytesRef("aa"));
+ assertNotNull(seekResult);
+ assertEquals(b, seekResult.input);
+ assertEquals(42, (long) seekResult.output);
+ }
+
+ public void testPrimaryKeys() throws Exception {
+ Directory dir = newDirectory();
+
+ for(int cycle=0;cycle<2;cycle++) {
+ if (VERBOSE) {
+ System.out.println("TEST: cycle=" + cycle);
+ }
+ RandomIndexWriter w = new RandomIndexWriter(random, dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
+ Document doc = new Document();
+ Field idField = newField("id", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
+ doc.add(idField);
+
+ final int NUM_IDS = (int) (1000*RANDOM_MULTIPLIER*(1.0+random.nextDouble()));
+ //final int NUM_IDS = (int) (377 * (1.0+random.nextDouble()));
+ if (VERBOSE) {
+ System.out.println("TEST: NUM_IDS=" + NUM_IDS);
+ }
+ final Set<String> allIDs = new HashSet<String>();
+ for(int id=0;id<NUM_IDS;id++) {
+ String idString;
+ if (cycle == 0) {
+ // PKs are assigned sequentially
+ idString = String.format("%07d", id);
+ } else {
+ while(true) {
+ final String s = Long.toString(random.nextLong());
+ if (!allIDs.contains(s)) {
+ idString = s;
+ break;
+ }
+ }
+ }
+ allIDs.add(idString);
+ idField.setValue(idString);
+ w.addDocument(doc);
+ }
+
+ //w.forceMerge(1);
+
+ // turn writer into reader:
+ final IndexReader r = w.getReader();
+ final IndexSearcher s = new IndexSearcher(r);
+ w.close();
+
+ final List<String> allIDsList = new ArrayList<String>(allIDs);
+ final List<String> sortedAllIDsList = new ArrayList<String>(allIDsList);
+ Collections.sort(sortedAllIDsList);
+
+ // Sprinkle in some non-existent PKs:
+ Set<String> outOfBounds = new HashSet<String>();
+ for(int idx=0;idx<NUM_IDS/10;idx++) {
+ String idString;
+ if (cycle == 0) {
+ idString = String.format("%07d", (NUM_IDS + idx));
+ } else {
+ while(true) {
+ idString = Long.toString(random.nextLong());
+ if (!allIDs.contains(idString)) {
+ break;
+ }
+ }
+ }
+ outOfBounds.add(idString);
+ allIDsList.add(idString);
+ }
+
+ // Verify w/ TermQuery
+ for(int iter=0;iter<2*NUM_IDS;iter++) {
+ final String id = allIDsList.get(random.nextInt(allIDsList.size()));
+ final boolean exists = !outOfBounds.contains(id);
+ if (VERBOSE) {
+ System.out.println("TEST: TermQuery " + (exists ? "" : "non-exist ") + " id=" + id);
+ }
+ assertEquals((exists ? "" : "non-exist ") + "id=" + id, exists ? 1 : 0, s.search(new TermQuery(new Term("id", id)), 1).totalHits);
+ }
+
+ // Verify w/ MultiTermsEnum
+ for(int iter=0;iter<2*NUM_IDS;iter++) {
+ final String id;
+ final String nextID;
+ final boolean exists;
+
+ if (random.nextBoolean()) {
+ id = allIDsList.get(random.nextInt(allIDsList.size()));
+ exists = !outOfBounds.contains(id);
+ nextID = null;
+ if (VERBOSE) {
+ System.out.println("TEST: exactOnly " + (exists ? "" : "non-exist ") + "id=" + id);
+ }
+ } else {
+ // Pick ID between two IDs:
+ exists = false;
+ final int idv = random.nextInt(NUM_IDS-1);
+ if (cycle == 0) {
+ id = String.format("%07da", idv);
+ nextID = String.format("%07d", idv+1);
+ } else {
+ id = sortedAllIDsList.get(idv) + "a";
+ nextID = sortedAllIDsList.get(idv+1);
+ }
+ if (VERBOSE) {
+ System.out.println("TEST: not exactOnly id=" + id + " nextID=" + nextID);
+ }
+ }
+
+ final boolean useCache = random.nextBoolean();
+ if (VERBOSE) {
+ System.out.println(" useCache=" + useCache);
+ }
+
+ final Term idTerm = new Term("id", id);
+ final TermEnum termEnum = r.terms(idTerm);
+ final Term actual = termEnum.term();
+
+ if (nextID != null) {
+ assertNotNull(actual);
+ assertTrue(!actual.equals(idTerm));
+ assertEquals("expected=" + nextID + " actual=" + actual.text(), nextID, actual.text());
+ } else if (!exists) {
+ assertTrue(actual == null || !actual.equals(idTerm));
+ } else {
+ assertEquals(actual, idTerm);
+ }
+ }
+
+ r.close();
+ }
+ dir.close();
+ }
+
+ public void testRandomTermLookup() throws Exception {
+ Directory dir = newDirectory();
+
+ RandomIndexWriter w = new RandomIndexWriter(random, dir,
+ newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setOpenMode(IndexWriterConfig.OpenMode.CREATE));
+ w.w.setInfoStream(VERBOSE ? System.out : null);
+
+ Document doc = new Document();
+ Field f = newField("field", "", Field.Store.NO, Field.Index.NOT_ANALYZED);
+ doc.add(f);
+
+ final int NUM_TERMS = (int) (1000*RANDOM_MULTIPLIER * (1+random.nextDouble()));
+ if (VERBOSE) {
+ System.out.println("TEST: NUM_TERMS=" + NUM_TERMS);
+ }
+
+ final Set<String> allTerms = new HashSet<String>();
+ while(allTerms.size() < NUM_TERMS) {
+ allTerms.add(simpleRandomString(random));
+ }
+
+ for(String term : allTerms) {
+ f.setValue(term);
+ w.addDocument(doc);
+ }
+
+ // turn writer into reader:
+ if (VERBOSE) {
+ System.out.println("TEST: get reader");
+ }
+ IndexReader r = w.getReader();
+ if (VERBOSE) {
+ System.out.println("TEST: got reader=" + r);
+ }
+ IndexSearcher s = new IndexSearcher(r);
+ w.close();
+
+ final List<String> allTermsList = new ArrayList<String>(allTerms);
+ Collections.shuffle(allTermsList, random);
+
+ // verify exact lookup
+ for(String term : allTermsList) {
+ if (VERBOSE) {
+ System.out.println("TEST: term=" + term);
+ }
+ assertEquals("term=" + term, 1, s.search(new TermQuery(new Term("field", term)), 1).totalHits);
+ }
+
+ r.close();
+ dir.close();
+ }
+
+ /**
+ * Test state expansion (array format) on close-to-root states. Creates
+ * synthetic input that has one expanded state on each level.
+ *
+ * @see "https://issues.apache.org/jira/browse/LUCENE-2933"
+ */
+ public void testExpandedCloseToRoot() throws Exception {
+ class SyntheticData {
+ FST<Object> compile(String[] lines) throws IOException {
+ final NoOutputs outputs = NoOutputs.getSingleton();
+ final Object nothing = outputs.getNoOutput();
+ final Builder<Object> b = new Builder<Object>(FST.INPUT_TYPE.BYTE1, outputs);
+
+ int line = 0;
+ final BytesRef term = new BytesRef();
+ while (line < lines.length) {
+ String w = lines[line++];
+ if (w == null) {
+ break;
+ }
+ term.copy(w);
+ b.add(term, nothing);
+ }
+
+ return b.finish();
+ }
+
+ void generate(ArrayList<String> out, StringBuilder b, char from, char to,
+ int depth) {
+ if (depth == 0 || from == to) {
+ String seq = b.toString() + "_" + out.size() + "_end";
+ out.add(seq);
+ } else {
+ for (char c = from; c <= to; c++) {
+ b.append(c);
+ generate(out, b, from, c == to ? to : from, depth - 1);
+ b.deleteCharAt(b.length() - 1);
+ }
+ }
+ }
+
+ public int verifyStateAndBelow(FST<Object> fst, Arc<Object> arc, int depth)
+ throws IOException {
+ if (fst.targetHasArcs(arc)) {
+ int childCount = 0;
+ for (arc = fst.readFirstTargetArc(arc, arc);;
+ arc = fst.readNextArc(arc), childCount++)
+ {
+ boolean expanded = fst.isExpandedTarget(arc);
+ int children = verifyStateAndBelow(fst, new FST.Arc<Object>().copyFrom(arc), depth + 1);
+
+ assertEquals(
+ expanded,
+ (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE &&
+ children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
+ children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+ if (arc.isLast()) break;
+ }
+
+ return childCount;
+ }
+ return 0;
+ }
+ }
+
+ // Sanity check.
+ assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP);
+ assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0);
+
+ SyntheticData s = new SyntheticData();
+
+ ArrayList<String> out = new ArrayList<String>();
+ StringBuilder b = new StringBuilder();
+ s.generate(out, b, 'a', 'i', 10);
+ String[] input = out.toArray(new String[out.size()]);
+ Arrays.sort(input);
+ FST<Object> fst = s.compile(input);
+ FST.Arc<Object> arc = fst.getFirstArc(new FST.Arc<Object>());
+ s.verifyStateAndBelow(fst, arc, 1);
+ }
+
+ // Make sure raw FST can differentiate between final vs
+ // non-final end nodes
+ public void testNonFinalStopNodes() throws Exception {
+ final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true);
+ final Long nothing = outputs.getNoOutput();
+ final Builder<Long> b = new Builder<Long>(FST.INPUT_TYPE.BYTE1, outputs);
+
+ final FST<Long> fst = new FST<Long>(FST.INPUT_TYPE.BYTE1, outputs);
+
+ final Builder.UnCompiledNode<Long> rootNode = new Builder.UnCompiledNode<Long>(b, 0);
+
+ // Add final stop node
+ {
+ final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
+ node.isFinal = true;
+ rootNode.addArc('a', node);
+ final Builder.CompiledNode frozen = new Builder.CompiledNode();
+ frozen.address = fst.addNode(node);
+ rootNode.arcs[0].nextFinalOutput = outputs.get(17);
+ rootNode.arcs[0].isFinal = true;
+ rootNode.arcs[0].output = nothing;
+ rootNode.arcs[0].target = frozen;
+ }
+
+ // Add non-final stop node
+ {
+ final Builder.UnCompiledNode<Long> node = new Builder.UnCompiledNode<Long>(b, 0);
+ rootNode.addArc('b', node);
+ final Builder.CompiledNode frozen = new Builder.CompiledNode();
+ frozen.address = fst.addNode(node);
+ rootNode.arcs[1].nextFinalOutput = nothing;
+ rootNode.arcs[1].output = outputs.get(42);
+ rootNode.arcs[1].target = frozen;
+ }
+
+ fst.finish(fst.addNode(rootNode));
+
+ checkStopNodes(fst, outputs);
+
+ // Make sure it still works after save/load:
+ Directory dir = newDirectory();
+ IndexOutput out = dir.createOutput("fst");
+ fst.save(out);
+ out.close();
+
+ IndexInput in = dir.openInput("fst");
+ final FST<Long> fst2 = new FST<Long>(in, outputs);
+ checkStopNodes(fst2, outputs);
+ in.close();
+ dir.close();
+ }
+
+ private void checkStopNodes(FST<Long> fst, PositiveIntOutputs outputs) throws Exception {
+ final Long nothing = outputs.getNoOutput();
+ FST.Arc<Long> startArc = fst.getFirstArc(new FST.Arc<Long>());
+ assertEquals(nothing, startArc.output);
+ assertEquals(nothing, startArc.nextFinalOutput);
+
+ FST.Arc<Long> arc = fst.readFirstTargetArc(startArc, new FST.Arc<Long>());
+ assertEquals('a', arc.label);
+ assertEquals(17, arc.nextFinalOutput.longValue());
+ assertTrue(arc.isFinal());
+
+ arc = fst.readNextArc(arc);
+ assertEquals('b', arc.label);
+ assertFalse(arc.isFinal());
+ assertEquals(42, arc.output.longValue());
+ }
+}