X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/util/fst/TestFSTs.java diff --git a/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/util/fst/TestFSTs.java new file mode 100644 index 0000000..2695ad9 --- /dev/null +++ b/lucene-java-3.5.0/lucene/backwards/src/test/org/apache/lucene/util/fst/TestFSTs.java @@ -0,0 +1,1578 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.BufferedReader; +import java.io.File; +import java.io.FileInputStream; +import java.io.FileOutputStream; +import java.io.IOException; +import java.io.InputStreamReader; +import java.io.OutputStreamWriter; +import java.io.Writer; +import java.util.*; + +import org.apache.lucene.analysis.MockAnalyzer; +import org.apache.lucene.document.Document; +import org.apache.lucene.index.IndexReader; +import org.apache.lucene.index.IndexWriter; +import org.apache.lucene.index.IndexWriterConfig; +import org.apache.lucene.index.Term; +import org.apache.lucene.index.TermEnum; +import org.apache.lucene.store.Directory; +import org.apache.lucene.store.FSDirectory; +import org.apache.lucene.store.IndexInput; +import org.apache.lucene.store.IndexOutput; +import org.apache.lucene.store.MockDirectoryWrapper; +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; +import org.apache.lucene.util.LineFileDocs; +import org.apache.lucene.util.LuceneTestCase; +import org.apache.lucene.util.UnicodeUtil; +import org.apache.lucene.util._TestUtil; +import org.apache.lucene.util.fst.FST.Arc; + +public class TestFSTs extends LuceneTestCase { + + private MockDirectoryWrapper dir; + + @Override + public void setUp() throws Exception { + super.setUp(); + dir = newDirectory(); + dir.setPreventDoubleWrite(false); + } + + @Override + public void tearDown() throws Exception { + dir.close(); + super.tearDown(); + } + + private static BytesRef toBytesRef(IntsRef ir) { + BytesRef br = new BytesRef(ir.length); + for(int i=0;i= 0 && x <= 255; + br.bytes[i] = (byte) x; + } + br.length = ir.length; + return br; + } + + private static IntsRef toIntsRef(String s, int inputMode) { + return toIntsRef(s, inputMode, new IntsRef(10)); + } + + private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) { + if (inputMode == 0) { + // utf8 + return toIntsRef(new BytesRef(s), ir); + } else { + // utf32 + return toIntsRefUTF32(s, ir); + } + } + + private static IntsRef toIntsRefUTF32(String s, IntsRef ir) { + final int charLength = s.length(); + int charIdx = 0; + int intIdx = 0; + while(charIdx < charLength) { + if (intIdx == ir.ints.length) { + ir.grow(intIdx+1); + } + final int utf32 = s.codePointAt(charIdx); + ir.ints[intIdx] = utf32; + charIdx += Character.charCount(utf32); + intIdx++; + } + ir.length = intIdx; + return ir; + } + + private static IntsRef toIntsRef(BytesRef br, IntsRef ir) { + if (br.length > ir.ints.length) { + ir.grow(br.length); + } + for(int i=0;i outputs = NoOutputs.getSingleton(); + final Object NO_OUTPUT = outputs.getNoOutput(); + final List> pairs = new ArrayList>(terms.length); + for(IntsRef term : terms) { + pairs.add(new FSTTester.InputOutput(term, NO_OUTPUT)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // PositiveIntOutput (ord) + { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + final List> pairs = new ArrayList>(terms.length); + for(int idx=0;idx(terms[idx], outputs.get(idx))); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // PositiveIntOutput (random monotonically increasing positive number) + { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final List> pairs = new ArrayList>(terms.length); + long lastOutput = 0; + for(int idx=0;idx(terms[idx], outputs.get(value))); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // PositiveIntOutput (random positive number) + { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final List> pairs = new ArrayList>(terms.length); + for(int idx=0;idx(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // Pair + { + final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(random.nextBoolean()); + final PairOutputs outputs = new PairOutputs(o1, o2); + final List>> pairs = new ArrayList>>(terms.length); + long lastOutput = 0; + for(int idx=0;idx>(terms[idx], + outputs.get(o1.get(idx), + o2.get(value)))); + } + new FSTTester>(random, dir, inputMode, pairs, outputs).doTest(); + } + + // Sequence-of-bytes + { + final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); + final BytesRef NO_OUTPUT = outputs.getNoOutput(); + final List> pairs = new ArrayList>(terms.length); + for(int idx=0;idx(terms[idx], output)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // Sequence-of-ints + { + final IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); + final List> pairs = new ArrayList>(terms.length); + for(int idx=0;idx(terms[idx], output)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + + // Up to two positive ints, shared, generally but not + // monotonically increasing + { + if (VERBOSE) { + System.out.println("TEST: now test UpToTwoPositiveIntOutputs"); + } + final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true); + final List> pairs = new ArrayList>(terms.length); + long lastOutput = 0; + for(int idx=0;idx(terms[idx], output)); + } + new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); + } + } + + private static class FSTTester { + + final Random random; + final List> pairs; + final int inputMode; + final Outputs outputs; + final Directory dir; + + public FSTTester(Random random, Directory dir, int inputMode, List> pairs, Outputs outputs) { + this.random = random; + this.dir = dir; + this.inputMode = inputMode; + this.pairs = pairs; + this.outputs = outputs; + } + + private static class InputOutput implements Comparable> { + public final IntsRef input; + public final T output; + + public InputOutput(IntsRef input, T output) { + this.input = input; + this.output = output; + } + + public int compareTo(InputOutput other) { + if (other instanceof InputOutput) { + return input.compareTo((other).input); + } else { + throw new IllegalArgumentException(); + } + } + } + + public void doTest() throws IOException { + // no pruning + doTest(0, 0, true); + + if (!(outputs instanceof UpToTwoPositiveIntOutputs)) { + // simple pruning + doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true); + + // leafy pruning + doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true); + } + } + + // runs the term, returning the output, or null if term + // isn't accepted. if prefixLength is non-null it must be + // length 1 int array; prefixLength[0] is set to the length + // of the term prefix that matches + private T run(FST fst, IntsRef term, int[] prefixLength) throws IOException { + assert prefixLength == null || prefixLength.length == 1; + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + + for(int i=0;i<=term.length;i++) { + final int label; + if (i == term.length) { + label = FST.END_LABEL; + } else { + label = term.ints[term.offset+i]; + } + //System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal()); + if (fst.findTargetArc(label, arc, arc) == null) { + if (prefixLength != null) { + prefixLength[0] = i; + return output; + } else { + return null; + } + } + output = fst.outputs.add(output, arc.output); + } + + if (prefixLength != null) { + prefixLength[0] = term.length; + } + + return output; + } + + private T randomAcceptedWord(FST fst, IntsRef in) throws IOException { + FST.Arc arc = fst.getFirstArc(new FST.Arc()); + + final List> arcs = new ArrayList>(); + in.length = 0; + in.offset = 0; + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + + while(true) { + // read all arcs: + fst.readFirstTargetArc(arc, arc); + arcs.add(new FST.Arc().copyFrom(arc)); + while(!arc.isLast()) { + fst.readNextArc(arc); + arcs.add(new FST.Arc().copyFrom(arc)); + } + + // pick one + arc = arcs.get(random.nextInt(arcs.size())); + arcs.clear(); + + // accumulate output + output = fst.outputs.add(output, arc.output); + + // append label + if (arc.label == FST.END_LABEL) { + break; + } + + if (in.ints.length == in.length) { + in.grow(1+in.length); + } + in.ints[in.length++] = arc.label; + } + + return output; + } + + + FST doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException { + if (VERBOSE) { + System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2); + } + + final Builder builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, + prune1, prune2, + prune1==0 && prune2==0, + allowRandomSuffixSharing ? random.nextBoolean() : true, + allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, + outputs); + + for(InputOutput pair : pairs) { + if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) { + final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs; + final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output; + @SuppressWarnings("unchecked") final Builder builderObject = (Builder) builder; + builderObject.add(pair.input, _outputs.get(twoLongs.first)); + builderObject.add(pair.input, _outputs.get(twoLongs.second)); + } else { + builder.add(pair.input, pair.output); + } + } + FST fst = builder.finish(); + + if (random.nextBoolean() && fst != null) { + IndexOutput out = dir.createOutput("fst.bin"); + fst.save(out); + out.close(); + IndexInput in = dir.openInput("fst.bin"); + try { + fst = new FST(in, outputs); + } finally { + in.close(); + dir.deleteFile("fst.bin"); + } + } + + if (VERBOSE && pairs.size() <= 20 && fst != null) { + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); + Util.toDot(fst, w, false, false); + w.close(); + System.out.println("SAVED out.dot"); + } + + if (VERBOSE) { + if (fst == null) { + System.out.println(" fst has 0 nodes (fully pruned)"); + } else { + System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs"); + } + } + + if (prune1 == 0 && prune2 == 0) { + verifyUnPruned(inputMode, fst); + } else { + verifyPruned(inputMode, fst, prune1, prune2); + } + + return fst; + } + + // FST is complete + private void verifyUnPruned(int inputMode, FST fst) throws IOException { + + if (pairs.size() == 0) { + assertNull(fst); + return; + } + + if (VERBOSE) { + System.out.println("TEST: now verify " + pairs.size() + " terms"); + for(InputOutput pair : pairs) { + assertNotNull(pair); + assertNotNull(pair.input); + assertNotNull(pair.output); + System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output)); + } + } + + assertNotNull(fst); + + // visit valid paris in order -- make sure all words + // are accepted, and FSTEnum's next() steps through + // them correctly + if (VERBOSE) { + System.out.println("TEST: check valid terms/next()"); + } + { + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + for(InputOutput pair : pairs) { + IntsRef term = pair.input; + if (VERBOSE) { + System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output)); + } + Object output = run(fst, term, null); + + assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output); + assertEquals(pair.output, output); + + // verify enum's next + IntsRefFSTEnum.InputOutput t = fstEnum.next(); + assertNotNull(t); + assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input); + assertEquals(pair.output, t.output); + } + assertNull(fstEnum.next()); + } + + final Map termsMap = new HashMap(); + for(InputOutput pair : pairs) { + termsMap.put(pair.input, pair.output); + } + + // find random matching word and make sure it's valid + if (VERBOSE) { + System.out.println("TEST: verify random accepted terms"); + } + final IntsRef scratch = new IntsRef(10); + int num = atLeast(500); + for(int iter=0;iter fstEnum = new IntsRefFSTEnum(fst); + num = atLeast(100); + for(int iter=0;iter seekResult; + if (random.nextBoolean()) { + if (VERBOSE) { + System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term)); + } + seekResult = fstEnum.seekFloor(term); + pos--; + } else { + if (VERBOSE) { + System.out.println(" do non-exist seekCeil term=" + inputToString(inputMode, term)); + } + seekResult = fstEnum.seekCeil(term); + } + + if (pos != -1 && pos < pairs.size()) { + //System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output)); + assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult); + if (VERBOSE) { + System.out.println(" got " + inputToString(inputMode, seekResult.input)); + } + assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input); + assertEquals(pairs.get(pos).output, seekResult.output); + } else { + // seeked before start or beyond end + //System.out.println("seek=" + seekTerm); + assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult); + if (VERBOSE) { + System.out.println(" got null"); + } + } + + break; + } + } + } else { + // seek to term that does exist: + InputOutput pair = pairs.get(random.nextInt(pairs.size())); + final IntsRefFSTEnum.InputOutput seekResult; + if (random.nextBoolean()) { + if (VERBOSE) { + System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input)); + } + seekResult = fstEnum.seekFloor(pair.input); + } else { + if (VERBOSE) { + System.out.println(" do exists seekCeil " + inputToString(inputMode, pair.input)); + } + seekResult = fstEnum.seekCeil(pair.input); + } + assertNotNull(seekResult); + assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input); + assertEquals(pair.output, seekResult.output); + } + } + + if (VERBOSE) { + System.out.println("TEST: mixed next/seek"); + } + + // test mixed next/seek + num = atLeast(100); + for(int iter=0;iter(fst); + int upto = -1; + while(true) { + boolean isDone = false; + if (upto == pairs.size()-1 || random.nextBoolean()) { + // next + upto++; + if (VERBOSE) { + System.out.println(" do next"); + } + isDone = fstEnum.next() == null; + } else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) { + int attempt = 0; + for(;attempt<10;attempt++) { + IntsRef term = toIntsRef(getRandomString(), inputMode); + if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) { + int pos = Collections.binarySearch(pairs, new InputOutput(term, null)); + assert pos < 0; + upto = -(pos+1); + + if (random.nextBoolean()) { + upto--; + assertTrue(upto != -1); + if (VERBOSE) { + System.out.println(" do non-exist seekFloor(" + inputToString(inputMode, term) + ")"); + } + isDone = fstEnum.seekFloor(term) == null; + } else { + if (VERBOSE) { + System.out.println(" do non-exist seekCeil(" + inputToString(inputMode, term) + ")"); + } + isDone = fstEnum.seekCeil(term) == null; + } + + break; + } + } + if (attempt == 10) { + continue; + } + + } else { + final int inc = random.nextInt(pairs.size() - upto - 1); + upto += inc; + if (upto == -1) { + upto = 0; + } + + if (random.nextBoolean()) { + if (VERBOSE) { + System.out.println(" do advanceCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")"); + } + isDone = fstEnum.seekCeil(pairs.get(upto).input) == null; + } else { + if (VERBOSE) { + System.out.println(" do advanceFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")"); + } + isDone = fstEnum.seekFloor(pairs.get(upto).input) == null; + } + } + if (VERBOSE) { + if (!isDone) { + System.out.println(" got " + inputToString(inputMode, fstEnum.current().input)); + } else { + System.out.println(" got null"); + } + } + + if (upto == pairs.size()) { + assertTrue(isDone); + break; + } else { + assertFalse(isDone); + assertEquals(pairs.get(upto).input, fstEnum.current().input); + assertEquals(pairs.get(upto).output, fstEnum.current().output); + + /* + if (upto < pairs.size()-1) { + int tryCount = 0; + while(tryCount < 10) { + final IntsRef t = toIntsRef(getRandomString(), inputMode); + if (pairs.get(upto).input.compareTo(t) < 0) { + final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0; + if (VERBOSE) { + System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected); + } + assertEquals(expected, fstEnum.beforeNext(t)); + break; + } + tryCount++; + } + } + */ + } + } + } + } + + private static class CountMinOutput { + int count; + T output; + T finalOutput; + boolean isLeaf = true; + boolean isFinal; + } + + // FST is pruned + private void verifyPruned(int inputMode, FST fst, int prune1, int prune2) throws IOException { + + if (VERBOSE) { + System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs); + for(InputOutput pair : pairs) { + System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output)); + } + } + + // To validate the FST, we brute-force compute all prefixes + // in the terms, matched to their "common" outputs, prune that + // set according to the prune thresholds, then assert the FST + // matches that same set. + + // NOTE: Crazy RAM intensive!! + + //System.out.println("TEST: tally prefixes"); + + // build all prefixes + final Map> prefixes = new HashMap>(); + final IntsRef scratch = new IntsRef(10); + for(InputOutput pair: pairs) { + scratch.copy(pair.input); + for(int idx=0;idx<=pair.input.length;idx++) { + scratch.length = idx; + CountMinOutput cmo = prefixes.get(scratch); + if (cmo == null) { + cmo = new CountMinOutput(); + cmo.count = 1; + cmo.output = pair.output; + prefixes.put(new IntsRef(scratch), cmo); + } else { + cmo.count++; + cmo.output = outputs.common(cmo.output, pair.output); + } + if (idx == pair.input.length) { + cmo.isFinal = true; + cmo.finalOutput = cmo.output; + } + } + } + + if (VERBOSE) { + System.out.println("TEST: now prune"); + } + + // prune 'em + final Iterator>> it = prefixes.entrySet().iterator(); + while(it.hasNext()) { + Map.Entry> ent = it.next(); + final IntsRef prefix = ent.getKey(); + final CountMinOutput cmo = ent.getValue(); + if (VERBOSE) { + System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal); + } + final boolean keep; + if (prune1 > 0) { + keep = cmo.count >= prune1; + } else { + assert prune2 > 0; + if (prune2 > 1 && cmo.count >= prune2) { + keep = true; + } else if (prefix.length > 0) { + // consult our parent + scratch.length = prefix.length-1; + System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length); + final CountMinOutput cmo2 = prefixes.get(scratch); + //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count)); + keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1))); + } else if (cmo.count >= prune2) { + keep = true; + } else { + keep = false; + } + } + + if (!keep) { + it.remove(); + //System.out.println(" remove"); + } else { + // clear isLeaf for all ancestors + //System.out.println(" keep"); + scratch.copy(prefix); + scratch.length--; + while(scratch.length >= 0) { + final CountMinOutput cmo2 = prefixes.get(scratch); + if (cmo2 != null) { + //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch)); + cmo2.isLeaf = false; + } + scratch.length--; + } + } + } + + //System.out.println("TEST: after prune"); + /* + for(Map.Entry ent : prefixes.entrySet()) { + System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal); + if (ent.getValue().isFinal) { + System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput)); + } + } + */ + + if (prefixes.size() <= 1) { + assertNull(fst); + return; + } + + assertNotNull(fst); + + // make sure FST only enums valid prefixes + if (VERBOSE) { + System.out.println("TEST: check pruned enum"); + } + IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + IntsRefFSTEnum.InputOutput current; + while((current = fstEnum.next()) != null) { + if (VERBOSE) { + System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output)); + } + final CountMinOutput cmo = prefixes.get(current.input); + assertNotNull(cmo); + assertTrue(cmo.isLeaf || cmo.isFinal); + //if (cmo.isFinal && !cmo.isLeaf) { + if (cmo.isFinal) { + assertEquals(cmo.finalOutput, current.output); + } else { + assertEquals(cmo.output, current.output); + } + } + + // make sure all non-pruned prefixes are present in the FST + if (VERBOSE) { + System.out.println("TEST: verify all prefixes"); + } + final int[] stopNode = new int[1]; + for(Map.Entry> ent : prefixes.entrySet()) { + if (ent.getKey().length > 0) { + final CountMinOutput cmo = ent.getValue(); + final T output = run(fst, ent.getKey(), stopNode); + if (VERBOSE) { + System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output)); + } + // if (cmo.isFinal && !cmo.isLeaf) { + if (cmo.isFinal) { + assertEquals(cmo.finalOutput, output); + } else { + assertEquals(cmo.output, output); + } + assertEquals(ent.getKey().length, stopNode[0]); + } + } + } + } + + public void testRandomWords() throws IOException { + testRandomWords(1000, atLeast(2)); + //testRandomWords(20, 100); + } + + private String inputModeToString(int mode) { + if (mode == 0) { + return "utf8"; + } else { + return "utf32"; + } + } + + private void testRandomWords(int maxNumWords, int numIter) throws IOException { + for(int iter=0;iter termsSet = new HashSet(); + IntsRef[] terms = new IntsRef[numWords]; + while(termsSet.size() < numWords) { + final String term = getRandomString(); + termsSet.add(toIntsRef(term, inputMode)); + } + doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()])); + } + } + } + + static String getRandomString() { + final String term; + if (random.nextBoolean()) { + term = _TestUtil.randomRealisticUnicodeString(random); + } else { + // we want to mix in limited-alphabet symbols so + // we get more sharing of the nodes given how few + // terms we are testing... + term = simpleRandomString(random); + } + return term; + } + + @Nightly + public void testBigSet() throws IOException { + testRandomWords(_TestUtil.nextInt(random, 50000, 60000), 1); + } + + private static String inputToString(int inputMode, IntsRef term) { + return inputToString(inputMode, term, true); + } + + private static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) { + if (!isValidUnicode) { + return term.toString(); + } else if (inputMode == 0) { + // utf8 + return toBytesRef(term).utf8ToString() + " " + term; + } else { + // utf32 + return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term; + } + } + + private static IntsRef toIntsRef(String s) { + final int charCount = s.length(); + IntsRef ir = new IntsRef(charCount); + for(int charIDX=0;charIDX= 0 && ch < 65536); + chars[charIDX] = (char) ch; + } + return new String(chars); + } + + // Build FST for all unique terms in the test line docs + // file, up until a time limit + public void testRealTerms() throws Exception { + + /* + if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) { + // no + CodecProvider.getDefault().setDefaultFieldCodec("Standard"); + } + */ + + final LineFileDocs docs = new LineFileDocs(random); + final int RUN_TIME_MSEC = atLeast(500); + final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64); + final File tempDir = _TestUtil.getTempDir("fstlines"); + final MockDirectoryWrapper dir = newFSDirectory(tempDir); + final IndexWriter writer = new IndexWriter(dir, conf); + writer.setInfoStream(VERBOSE ? System.out : null); + final long stopTime = System.currentTimeMillis() + RUN_TIME_MSEC; + Document doc; + int docCount = 0; + while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) { + writer.addDocument(doc); + docCount++; + } + IndexReader r = IndexReader.open(writer, true); + writer.close(); + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); + Builder builder = new Builder(FST.INPUT_TYPE.BYTE2, outputs); + + boolean storeOrd = false; + if (VERBOSE) { + if (storeOrd) { + System.out.println("FST stores ord"); + } else { + System.out.println("FST stores docFreq"); + } + } + TermEnum termEnum = r.terms(new Term("body", "")); + if (VERBOSE) { + System.out.println("TEST: got termEnum=" + termEnum); + } + int ord = 0; + while(true) { + final Term term = termEnum.term(); + if (term == null || !"body".equals(term.field())) { + break; + } + + // No ord in 3.x: + /* + if (ord == 0) { + try { + termsEnum.ord(); + } catch (UnsupportedOperationException uoe) { + if (VERBOSE) { + System.out.println("TEST: codec doesn't support ord; FST stores docFreq"); + } + storeOrd = false; + } + } + */ + final int output; + if (storeOrd) { + output = ord; + } else { + output = termEnum.docFreq(); + } + //System.out.println("ADD: " + term.text() + " ch[0]=" + (term.text().length() == 0 ? -1 : term.text().charAt(0))); + builder.add(toIntsRef(term.text()), outputs.get(output)); + ord++; + if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { + System.out.println(ord + " terms..."); + } + termEnum.next(); + } + final FST fst = builder.finish(); + if (VERBOSE) { + System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes"); + } + + if (ord > 0) { + // Now confirm BytesRefFSTEnum and TermEnum act the + // same: + final IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); + int num = atLeast(1000); + for(int iter=0;iter nextResult = fstEnum.next(); + if (nextResult != null) { + System.out.println("expected null but got: input=" + toString(nextResult.input) + " output=" + outputs.outputToString(nextResult.output)); + fail(); + } + break; + } + } + } + } + } + + r.close(); + dir.close(); + } + + private void assertSame(TermEnum termEnum, IntsRefFSTEnum fstEnum, boolean storeOrd) throws Exception { + if (termEnum.term() == null || !"body".equals(termEnum.term().field())) { + if (fstEnum.current() != null) { + fail("fstEnum.current().input=" + toString(fstEnum.current().input)); + } + } else { + assertNotNull(fstEnum.current()); + assertEquals(termEnum.term() + " != " + toString(fstEnum.current().input), termEnum.term().text(), toString(fstEnum.current().input)); + if (storeOrd) { + // fst stored the ord + // No ord in 3.x + // assertEquals(termEnum.ord(), ((Long) fstEnum.current().output).longValue()); + } else { + // fst stored the docFreq + assertEquals(termEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue())); + } + } + } + + private static abstract class VisitTerms { + private final String dirOut; + private final String wordsFileIn; + private int inputMode; + private final Outputs outputs; + private final Builder builder; + + public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs outputs) { + this.dirOut = dirOut; + this.wordsFileIn = wordsFileIn; + this.inputMode = inputMode; + this.outputs = outputs; + + builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs); + } + + protected abstract T getOutput(IntsRef input, int ord) throws IOException; + + public void run(int limit, boolean verify) throws IOException { + BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); + try { + final IntsRef intsRef = new IntsRef(10); + long tStart = System.currentTimeMillis(); + int ord = 0; + while(true) { + String w = is.readLine(); + if (w == null) { + break; + } + toIntsRef(w, inputMode, intsRef); + builder.add(intsRef, + getOutput(intsRef, ord)); + + ord++; + if (ord % 500000 == 0) { + System.out.println( + String.format(Locale.ENGLISH, + "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord)); + } + if (ord >= limit) { + break; + } + } + + assert builder.getTermCount() == ord; + final FST fst = builder.finish(); + if (fst == null) { + System.out.println("FST was fully pruned!"); + System.exit(0); + } + + if (dirOut == null) + return; + + System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes()); + if (fst.getNodeCount() < 100) { + Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); + Util.toDot(fst, w, false, false); + w.close(); + System.out.println("Wrote FST to out.dot"); + } + + Directory dir = FSDirectory.open(new File(dirOut)); + IndexOutput out = dir.createOutput("fst.bin"); + fst.save(out); + out.close(); + + System.out.println("Saved FST to fst.bin."); + + if (!verify) { + return; + } + + System.out.println("\nNow verify..."); + + is.close(); + is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); + + ord = 0; + tStart = System.currentTimeMillis(); + while(true) { + String w = is.readLine(); + if (w == null) { + break; + } + toIntsRef(w, inputMode, intsRef); + T expected = getOutput(intsRef, ord); + T actual = Util.get(fst, intsRef); + if (actual == null) { + throw new RuntimeException("unexpected null output on input=" + w); + } + if (!actual.equals(expected)) { + throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w); + } + + ord++; + if (ord % 500000 == 0) { + System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "..."); + } + if (ord >= limit) { + break; + } + } + + double totSec = ((System.currentTimeMillis() - tStart)/1000.0); + System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)"); + + } finally { + is.close(); + } + } + } + + // java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out + public static void main(String[] args) throws IOException { + int prune = 0; + int limit = Integer.MAX_VALUE; + int inputMode = 0; // utf8 + boolean storeOrds = false; + boolean storeDocFreqs = false; + boolean verify = true; + + String wordsFileIn = null; + String dirOut = null; + + int idx = 0; + while (idx < args.length) { + if (args[idx].equals("-prune")) { + prune = Integer.valueOf(args[1 + idx]); + idx++; + } else if (args[idx].equals("-limit")) { + limit = Integer.valueOf(args[1 + idx]); + idx++; + } else if (args[idx].equals("-utf8")) { + inputMode = 0; + } else if (args[idx].equals("-utf32")) { + inputMode = 1; + } else if (args[idx].equals("-docFreq")) { + storeDocFreqs = true; + } else if (args[idx].equals("-ords")) { + storeOrds = true; + } else if (args[idx].equals("-noverify")) { + verify = false; + } else if (args[idx].startsWith("-")) { + System.err.println("Unrecognized option: " + args[idx]); + System.exit(-1); + } else { + if (wordsFileIn == null) { + wordsFileIn = args[idx]; + } else if (dirOut == null) { + dirOut = args[idx]; + } else { + System.err.println("Too many arguments, expected: input [output]"); + System.exit(-1); + } + } + idx++; + } + + if (wordsFileIn == null) { + System.err.println("No input file."); + System.exit(-1); + } + + // ord benefits from share, docFreqs don't: + + if (storeOrds && storeDocFreqs) { + // Store both ord & docFreq: + final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true); + final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false); + final PairOutputs outputs = new PairOutputs(o1, o2); + new VisitTerms>(dirOut, wordsFileIn, inputMode, prune, outputs) { + Random rand; + @Override + public PairOutputs.Pair getOutput(IntsRef input, int ord) { + if (ord == 0) { + rand = new Random(17); + } + return new PairOutputs.Pair(o1.get(ord), + o2.get(_TestUtil.nextInt(rand, 1, 5000))); + } + }.run(limit, verify); + } else if (storeOrds) { + // Store only ords + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + @Override + public Long getOutput(IntsRef input, int ord) { + return outputs.get(ord); + } + }.run(limit, verify); + } else if (storeDocFreqs) { + // Store only docFreq + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false); + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + Random rand; + @Override + public Long getOutput(IntsRef input, int ord) { + if (ord == 0) { + rand = new Random(17); + } + return outputs.get(_TestUtil.nextInt(rand, 1, 5000)); + } + }.run(limit, verify); + } else { + // Store nothing + final NoOutputs outputs = NoOutputs.getSingleton(); + final Object NO_OUTPUT = outputs.getNoOutput(); + new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { + @Override + public Object getOutput(IntsRef input, int ord) { + return NO_OUTPUT; + } + }.run(limit, verify); + } + } + + public void testSingleString() throws Exception { + final Outputs outputs = NoOutputs.getSingleton(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); + b.add(new BytesRef("foobar"), outputs.getNoOutput()); + final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(b.finish()); + assertNull(fstEnum.seekFloor(new BytesRef("foo"))); + assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); + } + + public void testSimple() throws Exception { + + // Get outputs -- passing true means FST will share + // (delta code) the outputs. This should result in + // smaller FST if the outputs grow monotonically. But + // if numbers are "random", false should give smaller + // final size: + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + + // Build an FST mapping BytesRef -> Long + final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); + + final BytesRef a = new BytesRef("a"); + final BytesRef b = new BytesRef("b"); + final BytesRef c = new BytesRef("c"); + + builder.add(a, outputs.get(17)); + builder.add(b, outputs.get(42)); + builder.add(c, outputs.get(13824324872317238L)); + + final FST fst = builder.finish(); + + assertEquals(13824324872317238L, (long) Util.get(fst, c)); + assertEquals(42, (long) Util.get(fst, b)); + assertEquals(17, (long) Util.get(fst, a)); + + BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); + BytesRefFSTEnum.InputOutput seekResult; + seekResult = fstEnum.seekFloor(a); + assertNotNull(seekResult); + assertEquals(17, (long) seekResult.output); + + // goes to a + seekResult = fstEnum.seekFloor(new BytesRef("aa")); + assertNotNull(seekResult); + assertEquals(17, (long) seekResult.output); + + // goes to b + seekResult = fstEnum.seekCeil(new BytesRef("aa")); + assertNotNull(seekResult); + assertEquals(b, seekResult.input); + assertEquals(42, (long) seekResult.output); + } + + /** + * Test state expansion (array format) on close-to-root states. Creates + * synthetic input that has one expanded state on each level. + * + * @see "https://issues.apache.org/jira/browse/LUCENE-2933" + */ + public void testExpandedCloseToRoot() throws Exception { + class SyntheticData { + FST compile(String[] lines) throws IOException { + final NoOutputs outputs = NoOutputs.getSingleton(); + final Object nothing = outputs.getNoOutput(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); + + int line = 0; + final BytesRef term = new BytesRef(); + while (line < lines.length) { + String w = lines[line++]; + if (w == null) { + break; + } + term.copy(w); + b.add(term, nothing); + } + + return b.finish(); + } + + void generate(ArrayList out, StringBuilder b, char from, char to, + int depth) { + if (depth == 0 || from == to) { + String seq = b.toString() + "_" + out.size() + "_end"; + out.add(seq); + } else { + for (char c = from; c <= to; c++) { + b.append(c); + generate(out, b, from, c == to ? to : from, depth - 1); + b.deleteCharAt(b.length() - 1); + } + } + } + + public int verifyStateAndBelow(FST fst, Arc arc, int depth) + throws IOException { + if (fst.targetHasArcs(arc)) { + int childCount = 0; + for (arc = fst.readFirstTargetArc(arc, arc);; + arc = fst.readNextArc(arc), childCount++) + { + boolean expanded = fst.isExpandedTarget(arc); + int children = verifyStateAndBelow(fst, new FST.Arc().copyFrom(arc), depth + 1); + + assertEquals( + expanded, + (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && + children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) || + children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP); + if (arc.isLast()) break; + } + + return childCount; + } + return 0; + } + } + + // Sanity check. + assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP); + assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0); + + SyntheticData s = new SyntheticData(); + + ArrayList out = new ArrayList(); + StringBuilder b = new StringBuilder(); + s.generate(out, b, 'a', 'i', 10); + String[] input = out.toArray(new String[out.size()]); + Arrays.sort(input); + FST fst = s.compile(input); + FST.Arc arc = fst.getFirstArc(new FST.Arc()); + s.verifyStateAndBelow(fst, arc, 1); + } + + // Make sure raw FST can differentiate between final vs + // non-final end nodes + public void testNonFinalStopNodes() throws Exception { + final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); + final Long nothing = outputs.getNoOutput(); + final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); + + final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs); + + final Builder.UnCompiledNode rootNode = new Builder.UnCompiledNode(b, 0); + + // Add final stop node + { + final Builder.UnCompiledNode node = new Builder.UnCompiledNode(b, 0); + node.isFinal = true; + rootNode.addArc('a', node); + final Builder.CompiledNode frozen = new Builder.CompiledNode(); + frozen.address = fst.addNode(node); + rootNode.arcs[0].nextFinalOutput = outputs.get(17); + rootNode.arcs[0].isFinal = true; + rootNode.arcs[0].output = nothing; + rootNode.arcs[0].target = frozen; + } + + // Add non-final stop node + { + final Builder.UnCompiledNode node = new Builder.UnCompiledNode(b, 0); + rootNode.addArc('b', node); + final Builder.CompiledNode frozen = new Builder.CompiledNode(); + frozen.address = fst.addNode(node); + rootNode.arcs[1].nextFinalOutput = nothing; + rootNode.arcs[1].output = outputs.get(42); + rootNode.arcs[1].target = frozen; + } + + fst.finish(fst.addNode(rootNode)); + + checkStopNodes(fst, outputs); + + // Make sure it still works after save/load: + Directory dir = newDirectory(); + IndexOutput out = dir.createOutput("fst"); + fst.save(out); + out.close(); + + IndexInput in = dir.openInput("fst"); + final FST fst2 = new FST(in, outputs); + checkStopNodes(fst2, outputs); + in.close(); + dir.close(); + } + + private void checkStopNodes(FST fst, PositiveIntOutputs outputs) throws Exception { + final Long nothing = outputs.getNoOutput(); + FST.Arc startArc = fst.getFirstArc(new FST.Arc()); + assertEquals(nothing, startArc.output); + assertEquals(nothing, startArc.nextFinalOutput); + + FST.Arc arc = fst.readFirstTargetArc(startArc, new FST.Arc()); + assertEquals('a', arc.label); + assertEquals(17, arc.nextFinalOutput.longValue()); + assertTrue(arc.isFinal()); + + arc = fst.readNextArc(arc); + assertEquals('b', arc.label); + assertFalse(arc.isFinal()); + assertEquals(42, arc.output.longValue()); + } +}