X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java diff --git a/lucene-java-3.4.0/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java b/lucene-java-3.4.0/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java deleted file mode 100644 index 2695ad9..0000000 --- a/lucene-java-3.4.0/lucene/src/test/org/apache/lucene/util/fst/TestFSTs.java +++ /dev/null @@ -1,1578 +0,0 @@ -package org.apache.lucene.util.fst; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.BufferedReader; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStreamReader; -import java.io.OutputStreamWriter; -import java.io.Writer; -import java.util.*; - -import org.apache.lucene.analysis.MockAnalyzer; -import org.apache.lucene.document.Document; -import org.apache.lucene.index.IndexReader; -import org.apache.lucene.index.IndexWriter; -import org.apache.lucene.index.IndexWriterConfig; -import org.apache.lucene.index.Term; -import org.apache.lucene.index.TermEnum; -import org.apache.lucene.store.Directory; -import org.apache.lucene.store.FSDirectory; -import org.apache.lucene.store.IndexInput; -import org.apache.lucene.store.IndexOutput; -import org.apache.lucene.store.MockDirectoryWrapper; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.LineFileDocs; -import org.apache.lucene.util.LuceneTestCase; -import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util._TestUtil; -import org.apache.lucene.util.fst.FST.Arc; - -public class TestFSTs extends LuceneTestCase { - - private MockDirectoryWrapper dir; - - @Override - public void setUp() throws Exception { - super.setUp(); - dir = newDirectory(); - dir.setPreventDoubleWrite(false); - } - - @Override - public void tearDown() throws Exception { - dir.close(); - super.tearDown(); - } - - private static BytesRef toBytesRef(IntsRef ir) { - BytesRef br = new BytesRef(ir.length); - for(int i=0;i= 0 && x <= 255; - br.bytes[i] = (byte) x; - } - br.length = ir.length; - return br; - } - - private static IntsRef toIntsRef(String s, int inputMode) { - return toIntsRef(s, inputMode, new IntsRef(10)); - } - - private static IntsRef toIntsRef(String s, int inputMode, IntsRef ir) { - if (inputMode == 0) { - // utf8 - return toIntsRef(new BytesRef(s), ir); - } else { - // utf32 - return toIntsRefUTF32(s, ir); - } - } - - private static IntsRef toIntsRefUTF32(String s, IntsRef ir) { - final int charLength = s.length(); - int charIdx = 0; - int intIdx = 0; - while(charIdx < charLength) { - if (intIdx == ir.ints.length) { - ir.grow(intIdx+1); - } - final int utf32 = s.codePointAt(charIdx); - ir.ints[intIdx] = utf32; - charIdx += Character.charCount(utf32); - intIdx++; - } - ir.length = intIdx; - return ir; - } - - private static IntsRef toIntsRef(BytesRef br, IntsRef ir) { - if (br.length > ir.ints.length) { - ir.grow(br.length); - } - for(int i=0;i outputs = NoOutputs.getSingleton(); - final Object NO_OUTPUT = outputs.getNoOutput(); - final List> pairs = new ArrayList>(terms.length); - for(IntsRef term : terms) { - pairs.add(new FSTTester.InputOutput(term, NO_OUTPUT)); - } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); - } - - // PositiveIntOutput (ord) - { - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - final List> pairs = new ArrayList>(terms.length); - for(int idx=0;idx(terms[idx], outputs.get(idx))); - } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); - } - - // PositiveIntOutput (random monotonically increasing positive number) - { - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); - final List> pairs = new ArrayList>(terms.length); - long lastOutput = 0; - for(int idx=0;idx(terms[idx], outputs.get(value))); - } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); - } - - // PositiveIntOutput (random positive number) - { - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); - final List> pairs = new ArrayList>(terms.length); - for(int idx=0;idx(terms[idx], outputs.get(random.nextLong()) & Long.MAX_VALUE)); - } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); - } - - // Pair - { - final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(random.nextBoolean()); - final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(random.nextBoolean()); - final PairOutputs outputs = new PairOutputs(o1, o2); - final List>> pairs = new ArrayList>>(terms.length); - long lastOutput = 0; - for(int idx=0;idx>(terms[idx], - outputs.get(o1.get(idx), - o2.get(value)))); - } - new FSTTester>(random, dir, inputMode, pairs, outputs).doTest(); - } - - // Sequence-of-bytes - { - final ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); - final BytesRef NO_OUTPUT = outputs.getNoOutput(); - final List> pairs = new ArrayList>(terms.length); - for(int idx=0;idx(terms[idx], output)); - } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); - } - - // Sequence-of-ints - { - final IntSequenceOutputs outputs = IntSequenceOutputs.getSingleton(); - final List> pairs = new ArrayList>(terms.length); - for(int idx=0;idx(terms[idx], output)); - } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); - } - - // Up to two positive ints, shared, generally but not - // monotonically increasing - { - if (VERBOSE) { - System.out.println("TEST: now test UpToTwoPositiveIntOutputs"); - } - final UpToTwoPositiveIntOutputs outputs = UpToTwoPositiveIntOutputs.getSingleton(true); - final List> pairs = new ArrayList>(terms.length); - long lastOutput = 0; - for(int idx=0;idx(terms[idx], output)); - } - new FSTTester(random, dir, inputMode, pairs, outputs).doTest(); - } - } - - private static class FSTTester { - - final Random random; - final List> pairs; - final int inputMode; - final Outputs outputs; - final Directory dir; - - public FSTTester(Random random, Directory dir, int inputMode, List> pairs, Outputs outputs) { - this.random = random; - this.dir = dir; - this.inputMode = inputMode; - this.pairs = pairs; - this.outputs = outputs; - } - - private static class InputOutput implements Comparable> { - public final IntsRef input; - public final T output; - - public InputOutput(IntsRef input, T output) { - this.input = input; - this.output = output; - } - - public int compareTo(InputOutput other) { - if (other instanceof InputOutput) { - return input.compareTo((other).input); - } else { - throw new IllegalArgumentException(); - } - } - } - - public void doTest() throws IOException { - // no pruning - doTest(0, 0, true); - - if (!(outputs instanceof UpToTwoPositiveIntOutputs)) { - // simple pruning - doTest(_TestUtil.nextInt(random, 1, 1+pairs.size()), 0, true); - - // leafy pruning - doTest(0, _TestUtil.nextInt(random, 1, 1+pairs.size()), true); - } - } - - // runs the term, returning the output, or null if term - // isn't accepted. if prefixLength is non-null it must be - // length 1 int array; prefixLength[0] is set to the length - // of the term prefix that matches - private T run(FST fst, IntsRef term, int[] prefixLength) throws IOException { - assert prefixLength == null || prefixLength.length == 1; - final FST.Arc arc = fst.getFirstArc(new FST.Arc()); - final T NO_OUTPUT = fst.outputs.getNoOutput(); - T output = NO_OUTPUT; - - for(int i=0;i<=term.length;i++) { - final int label; - if (i == term.length) { - label = FST.END_LABEL; - } else { - label = term.ints[term.offset+i]; - } - //System.out.println(" loop i=" + i + " label=" + label + " output=" + fst.outputs.outputToString(output) + " curArc: target=" + arc.target + " isFinal?=" + arc.isFinal()); - if (fst.findTargetArc(label, arc, arc) == null) { - if (prefixLength != null) { - prefixLength[0] = i; - return output; - } else { - return null; - } - } - output = fst.outputs.add(output, arc.output); - } - - if (prefixLength != null) { - prefixLength[0] = term.length; - } - - return output; - } - - private T randomAcceptedWord(FST fst, IntsRef in) throws IOException { - FST.Arc arc = fst.getFirstArc(new FST.Arc()); - - final List> arcs = new ArrayList>(); - in.length = 0; - in.offset = 0; - final T NO_OUTPUT = fst.outputs.getNoOutput(); - T output = NO_OUTPUT; - - while(true) { - // read all arcs: - fst.readFirstTargetArc(arc, arc); - arcs.add(new FST.Arc().copyFrom(arc)); - while(!arc.isLast()) { - fst.readNextArc(arc); - arcs.add(new FST.Arc().copyFrom(arc)); - } - - // pick one - arc = arcs.get(random.nextInt(arcs.size())); - arcs.clear(); - - // accumulate output - output = fst.outputs.add(output, arc.output); - - // append label - if (arc.label == FST.END_LABEL) { - break; - } - - if (in.ints.length == in.length) { - in.grow(1+in.length); - } - in.ints[in.length++] = arc.label; - } - - return output; - } - - - FST doTest(int prune1, int prune2, boolean allowRandomSuffixSharing) throws IOException { - if (VERBOSE) { - System.out.println("TEST: prune1=" + prune1 + " prune2=" + prune2); - } - - final Builder builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, - prune1, prune2, - prune1==0 && prune2==0, - allowRandomSuffixSharing ? random.nextBoolean() : true, - allowRandomSuffixSharing ? _TestUtil.nextInt(random, 1, 10) : Integer.MAX_VALUE, - outputs); - - for(InputOutput pair : pairs) { - if (pair.output instanceof UpToTwoPositiveIntOutputs.TwoLongs) { - final UpToTwoPositiveIntOutputs _outputs = (UpToTwoPositiveIntOutputs) outputs; - final UpToTwoPositiveIntOutputs.TwoLongs twoLongs = (UpToTwoPositiveIntOutputs.TwoLongs) pair.output; - @SuppressWarnings("unchecked") final Builder builderObject = (Builder) builder; - builderObject.add(pair.input, _outputs.get(twoLongs.first)); - builderObject.add(pair.input, _outputs.get(twoLongs.second)); - } else { - builder.add(pair.input, pair.output); - } - } - FST fst = builder.finish(); - - if (random.nextBoolean() && fst != null) { - IndexOutput out = dir.createOutput("fst.bin"); - fst.save(out); - out.close(); - IndexInput in = dir.openInput("fst.bin"); - try { - fst = new FST(in, outputs); - } finally { - in.close(); - dir.deleteFile("fst.bin"); - } - } - - if (VERBOSE && pairs.size() <= 20 && fst != null) { - Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); - Util.toDot(fst, w, false, false); - w.close(); - System.out.println("SAVED out.dot"); - } - - if (VERBOSE) { - if (fst == null) { - System.out.println(" fst has 0 nodes (fully pruned)"); - } else { - System.out.println(" fst has " + fst.getNodeCount() + " nodes and " + fst.getArcCount() + " arcs"); - } - } - - if (prune1 == 0 && prune2 == 0) { - verifyUnPruned(inputMode, fst); - } else { - verifyPruned(inputMode, fst, prune1, prune2); - } - - return fst; - } - - // FST is complete - private void verifyUnPruned(int inputMode, FST fst) throws IOException { - - if (pairs.size() == 0) { - assertNull(fst); - return; - } - - if (VERBOSE) { - System.out.println("TEST: now verify " + pairs.size() + " terms"); - for(InputOutput pair : pairs) { - assertNotNull(pair); - assertNotNull(pair.input); - assertNotNull(pair.output); - System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output)); - } - } - - assertNotNull(fst); - - // visit valid paris in order -- make sure all words - // are accepted, and FSTEnum's next() steps through - // them correctly - if (VERBOSE) { - System.out.println("TEST: check valid terms/next()"); - } - { - IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); - for(InputOutput pair : pairs) { - IntsRef term = pair.input; - if (VERBOSE) { - System.out.println("TEST: check term=" + inputToString(inputMode, term) + " output=" + fst.outputs.outputToString(pair.output)); - } - Object output = run(fst, term, null); - - assertNotNull("term " + inputToString(inputMode, term) + " is not accepted", output); - assertEquals(pair.output, output); - - // verify enum's next - IntsRefFSTEnum.InputOutput t = fstEnum.next(); - assertNotNull(t); - assertEquals("expected input=" + inputToString(inputMode, term) + " but fstEnum returned " + inputToString(inputMode, t.input), term, t.input); - assertEquals(pair.output, t.output); - } - assertNull(fstEnum.next()); - } - - final Map termsMap = new HashMap(); - for(InputOutput pair : pairs) { - termsMap.put(pair.input, pair.output); - } - - // find random matching word and make sure it's valid - if (VERBOSE) { - System.out.println("TEST: verify random accepted terms"); - } - final IntsRef scratch = new IntsRef(10); - int num = atLeast(500); - for(int iter=0;iter fstEnum = new IntsRefFSTEnum(fst); - num = atLeast(100); - for(int iter=0;iter seekResult; - if (random.nextBoolean()) { - if (VERBOSE) { - System.out.println(" do non-exist seekFloor term=" + inputToString(inputMode, term)); - } - seekResult = fstEnum.seekFloor(term); - pos--; - } else { - if (VERBOSE) { - System.out.println(" do non-exist seekCeil term=" + inputToString(inputMode, term)); - } - seekResult = fstEnum.seekCeil(term); - } - - if (pos != -1 && pos < pairs.size()) { - //System.out.println(" got " + inputToString(inputMode,seekResult.input) + " output=" + fst.outputs.outputToString(seekResult.output)); - assertNotNull("got null but expected term=" + inputToString(inputMode, pairs.get(pos).input), seekResult); - if (VERBOSE) { - System.out.println(" got " + inputToString(inputMode, seekResult.input)); - } - assertEquals("expected " + inputToString(inputMode, pairs.get(pos).input) + " but got " + inputToString(inputMode, seekResult.input), pairs.get(pos).input, seekResult.input); - assertEquals(pairs.get(pos).output, seekResult.output); - } else { - // seeked before start or beyond end - //System.out.println("seek=" + seekTerm); - assertNull("expected null but got " + (seekResult==null ? "null" : inputToString(inputMode, seekResult.input)), seekResult); - if (VERBOSE) { - System.out.println(" got null"); - } - } - - break; - } - } - } else { - // seek to term that does exist: - InputOutput pair = pairs.get(random.nextInt(pairs.size())); - final IntsRefFSTEnum.InputOutput seekResult; - if (random.nextBoolean()) { - if (VERBOSE) { - System.out.println(" do exists seekFloor " + inputToString(inputMode, pair.input)); - } - seekResult = fstEnum.seekFloor(pair.input); - } else { - if (VERBOSE) { - System.out.println(" do exists seekCeil " + inputToString(inputMode, pair.input)); - } - seekResult = fstEnum.seekCeil(pair.input); - } - assertNotNull(seekResult); - assertEquals("got " + inputToString(inputMode, seekResult.input) + " but expected " + inputToString(inputMode, pair.input), pair.input, seekResult.input); - assertEquals(pair.output, seekResult.output); - } - } - - if (VERBOSE) { - System.out.println("TEST: mixed next/seek"); - } - - // test mixed next/seek - num = atLeast(100); - for(int iter=0;iter(fst); - int upto = -1; - while(true) { - boolean isDone = false; - if (upto == pairs.size()-1 || random.nextBoolean()) { - // next - upto++; - if (VERBOSE) { - System.out.println(" do next"); - } - isDone = fstEnum.next() == null; - } else if (upto != -1 && upto < 0.75 * pairs.size() && random.nextBoolean()) { - int attempt = 0; - for(;attempt<10;attempt++) { - IntsRef term = toIntsRef(getRandomString(), inputMode); - if (!termsMap.containsKey(term) && term.compareTo(pairs.get(upto).input) > 0) { - int pos = Collections.binarySearch(pairs, new InputOutput(term, null)); - assert pos < 0; - upto = -(pos+1); - - if (random.nextBoolean()) { - upto--; - assertTrue(upto != -1); - if (VERBOSE) { - System.out.println(" do non-exist seekFloor(" + inputToString(inputMode, term) + ")"); - } - isDone = fstEnum.seekFloor(term) == null; - } else { - if (VERBOSE) { - System.out.println(" do non-exist seekCeil(" + inputToString(inputMode, term) + ")"); - } - isDone = fstEnum.seekCeil(term) == null; - } - - break; - } - } - if (attempt == 10) { - continue; - } - - } else { - final int inc = random.nextInt(pairs.size() - upto - 1); - upto += inc; - if (upto == -1) { - upto = 0; - } - - if (random.nextBoolean()) { - if (VERBOSE) { - System.out.println(" do advanceCeil(" + inputToString(inputMode, pairs.get(upto).input) + ")"); - } - isDone = fstEnum.seekCeil(pairs.get(upto).input) == null; - } else { - if (VERBOSE) { - System.out.println(" do advanceFloor(" + inputToString(inputMode, pairs.get(upto).input) + ")"); - } - isDone = fstEnum.seekFloor(pairs.get(upto).input) == null; - } - } - if (VERBOSE) { - if (!isDone) { - System.out.println(" got " + inputToString(inputMode, fstEnum.current().input)); - } else { - System.out.println(" got null"); - } - } - - if (upto == pairs.size()) { - assertTrue(isDone); - break; - } else { - assertFalse(isDone); - assertEquals(pairs.get(upto).input, fstEnum.current().input); - assertEquals(pairs.get(upto).output, fstEnum.current().output); - - /* - if (upto < pairs.size()-1) { - int tryCount = 0; - while(tryCount < 10) { - final IntsRef t = toIntsRef(getRandomString(), inputMode); - if (pairs.get(upto).input.compareTo(t) < 0) { - final boolean expected = t.compareTo(pairs.get(upto+1).input) < 0; - if (VERBOSE) { - System.out.println("TEST: call beforeNext(" + inputToString(inputMode, t) + "); current=" + inputToString(inputMode, pairs.get(upto).input) + " next=" + inputToString(inputMode, pairs.get(upto+1).input) + " expected=" + expected); - } - assertEquals(expected, fstEnum.beforeNext(t)); - break; - } - tryCount++; - } - } - */ - } - } - } - } - - private static class CountMinOutput { - int count; - T output; - T finalOutput; - boolean isLeaf = true; - boolean isFinal; - } - - // FST is pruned - private void verifyPruned(int inputMode, FST fst, int prune1, int prune2) throws IOException { - - if (VERBOSE) { - System.out.println("TEST: now verify pruned " + pairs.size() + " terms; outputs=" + outputs); - for(InputOutput pair : pairs) { - System.out.println(" " + inputToString(inputMode, pair.input) + ": " + outputs.outputToString(pair.output)); - } - } - - // To validate the FST, we brute-force compute all prefixes - // in the terms, matched to their "common" outputs, prune that - // set according to the prune thresholds, then assert the FST - // matches that same set. - - // NOTE: Crazy RAM intensive!! - - //System.out.println("TEST: tally prefixes"); - - // build all prefixes - final Map> prefixes = new HashMap>(); - final IntsRef scratch = new IntsRef(10); - for(InputOutput pair: pairs) { - scratch.copy(pair.input); - for(int idx=0;idx<=pair.input.length;idx++) { - scratch.length = idx; - CountMinOutput cmo = prefixes.get(scratch); - if (cmo == null) { - cmo = new CountMinOutput(); - cmo.count = 1; - cmo.output = pair.output; - prefixes.put(new IntsRef(scratch), cmo); - } else { - cmo.count++; - cmo.output = outputs.common(cmo.output, pair.output); - } - if (idx == pair.input.length) { - cmo.isFinal = true; - cmo.finalOutput = cmo.output; - } - } - } - - if (VERBOSE) { - System.out.println("TEST: now prune"); - } - - // prune 'em - final Iterator>> it = prefixes.entrySet().iterator(); - while(it.hasNext()) { - Map.Entry> ent = it.next(); - final IntsRef prefix = ent.getKey(); - final CountMinOutput cmo = ent.getValue(); - if (VERBOSE) { - System.out.println(" term prefix=" + inputToString(inputMode, prefix, false) + " count=" + cmo.count + " isLeaf=" + cmo.isLeaf + " output=" + outputs.outputToString(cmo.output) + " isFinal=" + cmo.isFinal); - } - final boolean keep; - if (prune1 > 0) { - keep = cmo.count >= prune1; - } else { - assert prune2 > 0; - if (prune2 > 1 && cmo.count >= prune2) { - keep = true; - } else if (prefix.length > 0) { - // consult our parent - scratch.length = prefix.length-1; - System.arraycopy(prefix.ints, prefix.offset, scratch.ints, 0, scratch.length); - final CountMinOutput cmo2 = prefixes.get(scratch); - //System.out.println(" parent count = " + (cmo2 == null ? -1 : cmo2.count)); - keep = cmo2 != null && ((prune2 > 1 && cmo2.count >= prune2) || (prune2 == 1 && (cmo2.count >= 2 || prefix.length <= 1))); - } else if (cmo.count >= prune2) { - keep = true; - } else { - keep = false; - } - } - - if (!keep) { - it.remove(); - //System.out.println(" remove"); - } else { - // clear isLeaf for all ancestors - //System.out.println(" keep"); - scratch.copy(prefix); - scratch.length--; - while(scratch.length >= 0) { - final CountMinOutput cmo2 = prefixes.get(scratch); - if (cmo2 != null) { - //System.out.println(" clear isLeaf " + inputToString(inputMode, scratch)); - cmo2.isLeaf = false; - } - scratch.length--; - } - } - } - - //System.out.println("TEST: after prune"); - /* - for(Map.Entry ent : prefixes.entrySet()) { - System.out.println(" " + inputToString(inputMode, ent.getKey()) + ": isLeaf=" + ent.getValue().isLeaf + " isFinal=" + ent.getValue().isFinal); - if (ent.getValue().isFinal) { - System.out.println(" finalOutput=" + outputs.outputToString(ent.getValue().finalOutput)); - } - } - */ - - if (prefixes.size() <= 1) { - assertNull(fst); - return; - } - - assertNotNull(fst); - - // make sure FST only enums valid prefixes - if (VERBOSE) { - System.out.println("TEST: check pruned enum"); - } - IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); - IntsRefFSTEnum.InputOutput current; - while((current = fstEnum.next()) != null) { - if (VERBOSE) { - System.out.println(" fstEnum.next prefix=" + inputToString(inputMode, current.input, false) + " output=" + outputs.outputToString(current.output)); - } - final CountMinOutput cmo = prefixes.get(current.input); - assertNotNull(cmo); - assertTrue(cmo.isLeaf || cmo.isFinal); - //if (cmo.isFinal && !cmo.isLeaf) { - if (cmo.isFinal) { - assertEquals(cmo.finalOutput, current.output); - } else { - assertEquals(cmo.output, current.output); - } - } - - // make sure all non-pruned prefixes are present in the FST - if (VERBOSE) { - System.out.println("TEST: verify all prefixes"); - } - final int[] stopNode = new int[1]; - for(Map.Entry> ent : prefixes.entrySet()) { - if (ent.getKey().length > 0) { - final CountMinOutput cmo = ent.getValue(); - final T output = run(fst, ent.getKey(), stopNode); - if (VERBOSE) { - System.out.println("TEST: verify prefix=" + inputToString(inputMode, ent.getKey(), false) + " output=" + outputs.outputToString(cmo.output)); - } - // if (cmo.isFinal && !cmo.isLeaf) { - if (cmo.isFinal) { - assertEquals(cmo.finalOutput, output); - } else { - assertEquals(cmo.output, output); - } - assertEquals(ent.getKey().length, stopNode[0]); - } - } - } - } - - public void testRandomWords() throws IOException { - testRandomWords(1000, atLeast(2)); - //testRandomWords(20, 100); - } - - private String inputModeToString(int mode) { - if (mode == 0) { - return "utf8"; - } else { - return "utf32"; - } - } - - private void testRandomWords(int maxNumWords, int numIter) throws IOException { - for(int iter=0;iter termsSet = new HashSet(); - IntsRef[] terms = new IntsRef[numWords]; - while(termsSet.size() < numWords) { - final String term = getRandomString(); - termsSet.add(toIntsRef(term, inputMode)); - } - doTest(inputMode, termsSet.toArray(new IntsRef[termsSet.size()])); - } - } - } - - static String getRandomString() { - final String term; - if (random.nextBoolean()) { - term = _TestUtil.randomRealisticUnicodeString(random); - } else { - // we want to mix in limited-alphabet symbols so - // we get more sharing of the nodes given how few - // terms we are testing... - term = simpleRandomString(random); - } - return term; - } - - @Nightly - public void testBigSet() throws IOException { - testRandomWords(_TestUtil.nextInt(random, 50000, 60000), 1); - } - - private static String inputToString(int inputMode, IntsRef term) { - return inputToString(inputMode, term, true); - } - - private static String inputToString(int inputMode, IntsRef term, boolean isValidUnicode) { - if (!isValidUnicode) { - return term.toString(); - } else if (inputMode == 0) { - // utf8 - return toBytesRef(term).utf8ToString() + " " + term; - } else { - // utf32 - return UnicodeUtil.newString(term.ints, term.offset, term.length) + " " + term; - } - } - - private static IntsRef toIntsRef(String s) { - final int charCount = s.length(); - IntsRef ir = new IntsRef(charCount); - for(int charIDX=0;charIDX= 0 && ch < 65536); - chars[charIDX] = (char) ch; - } - return new String(chars); - } - - // Build FST for all unique terms in the test line docs - // file, up until a time limit - public void testRealTerms() throws Exception { - - /* - if (CodecProvider.getDefault().getDefaultFieldCodec().equals("SimpleText")) { - // no - CodecProvider.getDefault().setDefaultFieldCodec("Standard"); - } - */ - - final LineFileDocs docs = new LineFileDocs(random); - final int RUN_TIME_MSEC = atLeast(500); - final IndexWriterConfig conf = newIndexWriterConfig(TEST_VERSION_CURRENT, new MockAnalyzer(random)).setMaxBufferedDocs(-1).setRAMBufferSizeMB(64); - final File tempDir = _TestUtil.getTempDir("fstlines"); - final MockDirectoryWrapper dir = newFSDirectory(tempDir); - final IndexWriter writer = new IndexWriter(dir, conf); - writer.setInfoStream(VERBOSE ? System.out : null); - final long stopTime = System.currentTimeMillis() + RUN_TIME_MSEC; - Document doc; - int docCount = 0; - while((doc = docs.nextDoc()) != null && System.currentTimeMillis() < stopTime) { - writer.addDocument(doc); - docCount++; - } - IndexReader r = IndexReader.open(writer, true); - writer.close(); - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(random.nextBoolean()); - Builder builder = new Builder(FST.INPUT_TYPE.BYTE2, outputs); - - boolean storeOrd = false; - if (VERBOSE) { - if (storeOrd) { - System.out.println("FST stores ord"); - } else { - System.out.println("FST stores docFreq"); - } - } - TermEnum termEnum = r.terms(new Term("body", "")); - if (VERBOSE) { - System.out.println("TEST: got termEnum=" + termEnum); - } - int ord = 0; - while(true) { - final Term term = termEnum.term(); - if (term == null || !"body".equals(term.field())) { - break; - } - - // No ord in 3.x: - /* - if (ord == 0) { - try { - termsEnum.ord(); - } catch (UnsupportedOperationException uoe) { - if (VERBOSE) { - System.out.println("TEST: codec doesn't support ord; FST stores docFreq"); - } - storeOrd = false; - } - } - */ - final int output; - if (storeOrd) { - output = ord; - } else { - output = termEnum.docFreq(); - } - //System.out.println("ADD: " + term.text() + " ch[0]=" + (term.text().length() == 0 ? -1 : term.text().charAt(0))); - builder.add(toIntsRef(term.text()), outputs.get(output)); - ord++; - if (VERBOSE && ord % 100000 == 0 && LuceneTestCase.TEST_NIGHTLY) { - System.out.println(ord + " terms..."); - } - termEnum.next(); - } - final FST fst = builder.finish(); - if (VERBOSE) { - System.out.println("FST: " + docCount + " docs; " + ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs;" + " " + fst.sizeInBytes() + " bytes"); - } - - if (ord > 0) { - // Now confirm BytesRefFSTEnum and TermEnum act the - // same: - final IntsRefFSTEnum fstEnum = new IntsRefFSTEnum(fst); - int num = atLeast(1000); - for(int iter=0;iter nextResult = fstEnum.next(); - if (nextResult != null) { - System.out.println("expected null but got: input=" + toString(nextResult.input) + " output=" + outputs.outputToString(nextResult.output)); - fail(); - } - break; - } - } - } - } - } - - r.close(); - dir.close(); - } - - private void assertSame(TermEnum termEnum, IntsRefFSTEnum fstEnum, boolean storeOrd) throws Exception { - if (termEnum.term() == null || !"body".equals(termEnum.term().field())) { - if (fstEnum.current() != null) { - fail("fstEnum.current().input=" + toString(fstEnum.current().input)); - } - } else { - assertNotNull(fstEnum.current()); - assertEquals(termEnum.term() + " != " + toString(fstEnum.current().input), termEnum.term().text(), toString(fstEnum.current().input)); - if (storeOrd) { - // fst stored the ord - // No ord in 3.x - // assertEquals(termEnum.ord(), ((Long) fstEnum.current().output).longValue()); - } else { - // fst stored the docFreq - assertEquals(termEnum.docFreq(), (int) (((Long) fstEnum.current().output).longValue())); - } - } - } - - private static abstract class VisitTerms { - private final String dirOut; - private final String wordsFileIn; - private int inputMode; - private final Outputs outputs; - private final Builder builder; - - public VisitTerms(String dirOut, String wordsFileIn, int inputMode, int prune, Outputs outputs) { - this.dirOut = dirOut; - this.wordsFileIn = wordsFileIn; - this.inputMode = inputMode; - this.outputs = outputs; - - builder = new Builder(inputMode == 0 ? FST.INPUT_TYPE.BYTE1 : FST.INPUT_TYPE.BYTE4, 0, prune, prune == 0, true, Integer.MAX_VALUE, outputs); - } - - protected abstract T getOutput(IntsRef input, int ord) throws IOException; - - public void run(int limit, boolean verify) throws IOException { - BufferedReader is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); - try { - final IntsRef intsRef = new IntsRef(10); - long tStart = System.currentTimeMillis(); - int ord = 0; - while(true) { - String w = is.readLine(); - if (w == null) { - break; - } - toIntsRef(w, inputMode, intsRef); - builder.add(intsRef, - getOutput(intsRef, ord)); - - ord++; - if (ord % 500000 == 0) { - System.out.println( - String.format(Locale.ENGLISH, - "%6.2fs: %9d...", ((System.currentTimeMillis() - tStart) / 1000.0), ord)); - } - if (ord >= limit) { - break; - } - } - - assert builder.getTermCount() == ord; - final FST fst = builder.finish(); - if (fst == null) { - System.out.println("FST was fully pruned!"); - System.exit(0); - } - - if (dirOut == null) - return; - - System.out.println(ord + " terms; " + fst.getNodeCount() + " nodes; " + fst.getArcCount() + " arcs; " + fst.getArcWithOutputCount() + " arcs w/ output; tot size " + fst.sizeInBytes()); - if (fst.getNodeCount() < 100) { - Writer w = new OutputStreamWriter(new FileOutputStream("out.dot"), "UTF-8"); - Util.toDot(fst, w, false, false); - w.close(); - System.out.println("Wrote FST to out.dot"); - } - - Directory dir = FSDirectory.open(new File(dirOut)); - IndexOutput out = dir.createOutput("fst.bin"); - fst.save(out); - out.close(); - - System.out.println("Saved FST to fst.bin."); - - if (!verify) { - return; - } - - System.out.println("\nNow verify..."); - - is.close(); - is = new BufferedReader(new InputStreamReader(new FileInputStream(wordsFileIn), "UTF-8"), 65536); - - ord = 0; - tStart = System.currentTimeMillis(); - while(true) { - String w = is.readLine(); - if (w == null) { - break; - } - toIntsRef(w, inputMode, intsRef); - T expected = getOutput(intsRef, ord); - T actual = Util.get(fst, intsRef); - if (actual == null) { - throw new RuntimeException("unexpected null output on input=" + w); - } - if (!actual.equals(expected)) { - throw new RuntimeException("wrong output (got " + outputs.outputToString(actual) + " but expected " + outputs.outputToString(expected) + ") on input=" + w); - } - - ord++; - if (ord % 500000 == 0) { - System.out.println(((System.currentTimeMillis()-tStart)/1000.0) + "s: " + ord + "..."); - } - if (ord >= limit) { - break; - } - } - - double totSec = ((System.currentTimeMillis() - tStart)/1000.0); - System.out.println("Verify took " + totSec + " sec + (" + (int) ((totSec*1000000000/ord)) + " nsec per lookup)"); - - } finally { - is.close(); - } - } - } - - // java -cp build/classes/test:build/classes/java:build/classes/test-framework:lib/junit-4.7.jar org.apache.lucene.util.fst.TestFSTs /x/tmp/allTerms3.txt out - public static void main(String[] args) throws IOException { - int prune = 0; - int limit = Integer.MAX_VALUE; - int inputMode = 0; // utf8 - boolean storeOrds = false; - boolean storeDocFreqs = false; - boolean verify = true; - - String wordsFileIn = null; - String dirOut = null; - - int idx = 0; - while (idx < args.length) { - if (args[idx].equals("-prune")) { - prune = Integer.valueOf(args[1 + idx]); - idx++; - } else if (args[idx].equals("-limit")) { - limit = Integer.valueOf(args[1 + idx]); - idx++; - } else if (args[idx].equals("-utf8")) { - inputMode = 0; - } else if (args[idx].equals("-utf32")) { - inputMode = 1; - } else if (args[idx].equals("-docFreq")) { - storeDocFreqs = true; - } else if (args[idx].equals("-ords")) { - storeOrds = true; - } else if (args[idx].equals("-noverify")) { - verify = false; - } else if (args[idx].startsWith("-")) { - System.err.println("Unrecognized option: " + args[idx]); - System.exit(-1); - } else { - if (wordsFileIn == null) { - wordsFileIn = args[idx]; - } else if (dirOut == null) { - dirOut = args[idx]; - } else { - System.err.println("Too many arguments, expected: input [output]"); - System.exit(-1); - } - } - idx++; - } - - if (wordsFileIn == null) { - System.err.println("No input file."); - System.exit(-1); - } - - // ord benefits from share, docFreqs don't: - - if (storeOrds && storeDocFreqs) { - // Store both ord & docFreq: - final PositiveIntOutputs o1 = PositiveIntOutputs.getSingleton(true); - final PositiveIntOutputs o2 = PositiveIntOutputs.getSingleton(false); - final PairOutputs outputs = new PairOutputs(o1, o2); - new VisitTerms>(dirOut, wordsFileIn, inputMode, prune, outputs) { - Random rand; - @Override - public PairOutputs.Pair getOutput(IntsRef input, int ord) { - if (ord == 0) { - rand = new Random(17); - } - return new PairOutputs.Pair(o1.get(ord), - o2.get(_TestUtil.nextInt(rand, 1, 5000))); - } - }.run(limit, verify); - } else if (storeOrds) { - // Store only ords - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { - @Override - public Long getOutput(IntsRef input, int ord) { - return outputs.get(ord); - } - }.run(limit, verify); - } else if (storeDocFreqs) { - // Store only docFreq - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(false); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { - Random rand; - @Override - public Long getOutput(IntsRef input, int ord) { - if (ord == 0) { - rand = new Random(17); - } - return outputs.get(_TestUtil.nextInt(rand, 1, 5000)); - } - }.run(limit, verify); - } else { - // Store nothing - final NoOutputs outputs = NoOutputs.getSingleton(); - final Object NO_OUTPUT = outputs.getNoOutput(); - new VisitTerms(dirOut, wordsFileIn, inputMode, prune, outputs) { - @Override - public Object getOutput(IntsRef input, int ord) { - return NO_OUTPUT; - } - }.run(limit, verify); - } - } - - public void testSingleString() throws Exception { - final Outputs outputs = NoOutputs.getSingleton(); - final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - b.add(new BytesRef("foobar"), outputs.getNoOutput()); - final BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(b.finish()); - assertNull(fstEnum.seekFloor(new BytesRef("foo"))); - assertNull(fstEnum.seekCeil(new BytesRef("foobaz"))); - } - - public void testSimple() throws Exception { - - // Get outputs -- passing true means FST will share - // (delta code) the outputs. This should result in - // smaller FST if the outputs grow monotonically. But - // if numbers are "random", false should give smaller - // final size: - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - - // Build an FST mapping BytesRef -> Long - final Builder builder = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - - final BytesRef a = new BytesRef("a"); - final BytesRef b = new BytesRef("b"); - final BytesRef c = new BytesRef("c"); - - builder.add(a, outputs.get(17)); - builder.add(b, outputs.get(42)); - builder.add(c, outputs.get(13824324872317238L)); - - final FST fst = builder.finish(); - - assertEquals(13824324872317238L, (long) Util.get(fst, c)); - assertEquals(42, (long) Util.get(fst, b)); - assertEquals(17, (long) Util.get(fst, a)); - - BytesRefFSTEnum fstEnum = new BytesRefFSTEnum(fst); - BytesRefFSTEnum.InputOutput seekResult; - seekResult = fstEnum.seekFloor(a); - assertNotNull(seekResult); - assertEquals(17, (long) seekResult.output); - - // goes to a - seekResult = fstEnum.seekFloor(new BytesRef("aa")); - assertNotNull(seekResult); - assertEquals(17, (long) seekResult.output); - - // goes to b - seekResult = fstEnum.seekCeil(new BytesRef("aa")); - assertNotNull(seekResult); - assertEquals(b, seekResult.input); - assertEquals(42, (long) seekResult.output); - } - - /** - * Test state expansion (array format) on close-to-root states. Creates - * synthetic input that has one expanded state on each level. - * - * @see "https://issues.apache.org/jira/browse/LUCENE-2933" - */ - public void testExpandedCloseToRoot() throws Exception { - class SyntheticData { - FST compile(String[] lines) throws IOException { - final NoOutputs outputs = NoOutputs.getSingleton(); - final Object nothing = outputs.getNoOutput(); - final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - - int line = 0; - final BytesRef term = new BytesRef(); - while (line < lines.length) { - String w = lines[line++]; - if (w == null) { - break; - } - term.copy(w); - b.add(term, nothing); - } - - return b.finish(); - } - - void generate(ArrayList out, StringBuilder b, char from, char to, - int depth) { - if (depth == 0 || from == to) { - String seq = b.toString() + "_" + out.size() + "_end"; - out.add(seq); - } else { - for (char c = from; c <= to; c++) { - b.append(c); - generate(out, b, from, c == to ? to : from, depth - 1); - b.deleteCharAt(b.length() - 1); - } - } - } - - public int verifyStateAndBelow(FST fst, Arc arc, int depth) - throws IOException { - if (fst.targetHasArcs(arc)) { - int childCount = 0; - for (arc = fst.readFirstTargetArc(arc, arc);; - arc = fst.readNextArc(arc), childCount++) - { - boolean expanded = fst.isExpandedTarget(arc); - int children = verifyStateAndBelow(fst, new FST.Arc().copyFrom(arc), depth + 1); - - assertEquals( - expanded, - (depth <= FST.FIXED_ARRAY_SHALLOW_DISTANCE && - children >= FST.FIXED_ARRAY_NUM_ARCS_SHALLOW) || - children >= FST.FIXED_ARRAY_NUM_ARCS_DEEP); - if (arc.isLast()) break; - } - - return childCount; - } - return 0; - } - } - - // Sanity check. - assertTrue(FST.FIXED_ARRAY_NUM_ARCS_SHALLOW < FST.FIXED_ARRAY_NUM_ARCS_DEEP); - assertTrue(FST.FIXED_ARRAY_SHALLOW_DISTANCE >= 0); - - SyntheticData s = new SyntheticData(); - - ArrayList out = new ArrayList(); - StringBuilder b = new StringBuilder(); - s.generate(out, b, 'a', 'i', 10); - String[] input = out.toArray(new String[out.size()]); - Arrays.sort(input); - FST fst = s.compile(input); - FST.Arc arc = fst.getFirstArc(new FST.Arc()); - s.verifyStateAndBelow(fst, arc, 1); - } - - // Make sure raw FST can differentiate between final vs - // non-final end nodes - public void testNonFinalStopNodes() throws Exception { - final PositiveIntOutputs outputs = PositiveIntOutputs.getSingleton(true); - final Long nothing = outputs.getNoOutput(); - final Builder b = new Builder(FST.INPUT_TYPE.BYTE1, outputs); - - final FST fst = new FST(FST.INPUT_TYPE.BYTE1, outputs); - - final Builder.UnCompiledNode rootNode = new Builder.UnCompiledNode(b, 0); - - // Add final stop node - { - final Builder.UnCompiledNode node = new Builder.UnCompiledNode(b, 0); - node.isFinal = true; - rootNode.addArc('a', node); - final Builder.CompiledNode frozen = new Builder.CompiledNode(); - frozen.address = fst.addNode(node); - rootNode.arcs[0].nextFinalOutput = outputs.get(17); - rootNode.arcs[0].isFinal = true; - rootNode.arcs[0].output = nothing; - rootNode.arcs[0].target = frozen; - } - - // Add non-final stop node - { - final Builder.UnCompiledNode node = new Builder.UnCompiledNode(b, 0); - rootNode.addArc('b', node); - final Builder.CompiledNode frozen = new Builder.CompiledNode(); - frozen.address = fst.addNode(node); - rootNode.arcs[1].nextFinalOutput = nothing; - rootNode.arcs[1].output = outputs.get(42); - rootNode.arcs[1].target = frozen; - } - - fst.finish(fst.addNode(rootNode)); - - checkStopNodes(fst, outputs); - - // Make sure it still works after save/load: - Directory dir = newDirectory(); - IndexOutput out = dir.createOutput("fst"); - fst.save(out); - out.close(); - - IndexInput in = dir.openInput("fst"); - final FST fst2 = new FST(in, outputs); - checkStopNodes(fst2, outputs); - in.close(); - dir.close(); - } - - private void checkStopNodes(FST fst, PositiveIntOutputs outputs) throws Exception { - final Long nothing = outputs.getNoOutput(); - FST.Arc startArc = fst.getFirstArc(new FST.Arc()); - assertEquals(nothing, startArc.output); - assertEquals(nothing, startArc.nextFinalOutput); - - FST.Arc arc = fst.readFirstTargetArc(startArc, new FST.Arc()); - assertEquals('a', arc.label); - assertEquals(17, arc.nextFinalOutput.longValue()); - assertTrue(arc.isFinal()); - - arc = fst.readNextArc(arc); - assertEquals('b', arc.label); - assertFalse(arc.isFinal()); - assertEquals(42, arc.output.longValue()); - } -}