X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/util/fst/FST.java diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/util/fst/FST.java b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/util/fst/FST.java new file mode 100644 index 0000000..f312e84 --- /dev/null +++ b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/util/fst/FST.java @@ -0,0 +1,940 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.IOException; + +import org.apache.lucene.store.DataInput; +import org.apache.lucene.store.DataOutput; +import org.apache.lucene.util.ArrayUtil; +import org.apache.lucene.util.CodecUtil; +import org.apache.lucene.util.fst.Builder.UnCompiledNode; + +// TODO: if FST is pure prefix trie we can do a more compact +// job, ie, once we are at a 'suffix only', just store the +// completion labels as a string not as a series of arcs. + +// NOTE: while the FST is able to represent a non-final +// dead-end state (NON_FINAL_END_NODE=0), the layers above +// (FSTEnum, Util) have problems with this!! + +/** Represents an FST using a compact byte[] format. + *

The format is similar to what's used by Morfologik + * (http://sourceforge.net/projects/morfologik). + * + *

NOTE: the FST cannot be larger than ~2.1 GB + * because it uses int to address the byte[]. + * + * @lucene.experimental + */ +public class FST { + public static enum INPUT_TYPE {BYTE1, BYTE2, BYTE4}; + public final INPUT_TYPE inputType; + + private final static int BIT_FINAL_ARC = 1 << 0; + private final static int BIT_LAST_ARC = 1 << 1; + private final static int BIT_TARGET_NEXT = 1 << 2; + private final static int BIT_STOP_NODE = 1 << 3; + private final static int BIT_ARC_HAS_OUTPUT = 1 << 4; + private final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5; + + // Arcs are stored as fixed-size (per entry) array, so + // that we can find an arc using binary search. We do + // this when number of arcs is > NUM_ARCS_ARRAY: + private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6; + + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node. + + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5; + + /** + * @see #shouldExpand(UnCompiledNode) + */ + final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10; + + private int[] bytesPerArc = new int[0]; + + // Increment version to change it + private final static String FILE_FORMAT_NAME = "FST"; + private final static int VERSION_START = 0; + + /** Changed numBytesPerArc for array'd case from byte to int. */ + private final static int VERSION_INT_NUM_BYTES_PER_ARC = 1; + + private final static int VERSION_CURRENT = VERSION_INT_NUM_BYTES_PER_ARC; + + // Never serialized; just used to represent the virtual + // final node w/ no arcs: + private final static int FINAL_END_NODE = -1; + + // Never serialized; just used to represent the virtual + // non-final node w/ no arcs: + private final static int NON_FINAL_END_NODE = 0; + + // if non-null, this FST accepts the empty string and + // produces this output + T emptyOutput; + private byte[] emptyOutputBytes; + + private byte[] bytes; + int byteUpto = 0; + + private int startNode = -1; + + public final Outputs outputs; + + private int lastFrozenNode; + + private final T NO_OUTPUT; + + public int nodeCount; + public int arcCount; + public int arcWithOutputCount; + + // If arc has this label then that arc is final/accepted + public static final int END_LABEL = -1; + + private Arc cachedRootArcs[]; + + public final static class Arc { + public int label; + public T output; + + int target; + + byte flags; + public T nextFinalOutput; + int nextArc; + + // This is non-zero if current arcs are fixed array: + int posArcsStart; + int bytesPerArc; + int arcIdx; + int numArcs; + + /** Returns this */ + public Arc copyFrom(Arc other) { + label = other.label; + target = other.target; + flags = other.flags; + output = other.output; + nextFinalOutput = other.nextFinalOutput; + nextArc = other.nextArc; + if (other.bytesPerArc != 0) { + bytesPerArc = other.bytesPerArc; + posArcsStart = other.posArcsStart; + arcIdx = other.arcIdx; + numArcs = other.numArcs; + } else { + bytesPerArc = 0; + } + return this; + } + + boolean flag(int flag) { + return FST.flag(flags, flag); + } + + public boolean isLast() { + return flag(BIT_LAST_ARC); + } + + public boolean isFinal() { + return flag(BIT_FINAL_ARC); + } + }; + + static boolean flag(int flags, int bit) { + return (flags & bit) != 0; + } + + private final BytesWriter writer; + + // make a new empty FST, for building + public FST(INPUT_TYPE inputType, Outputs outputs) { + this.inputType = inputType; + this.outputs = outputs; + bytes = new byte[128]; + NO_OUTPUT = outputs.getNoOutput(); + + writer = new BytesWriter(); + + emptyOutput = null; + } + + // create an existing FST + public FST(DataInput in, Outputs outputs) throws IOException { + this.outputs = outputs; + writer = null; + CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_INT_NUM_BYTES_PER_ARC, VERSION_INT_NUM_BYTES_PER_ARC); + if (in.readByte() == 1) { + // accepts empty string + int numBytes = in.readVInt(); + // messy + bytes = new byte[numBytes]; + in.readBytes(bytes, 0, numBytes); + emptyOutput = outputs.read(getBytesReader(numBytes-1)); + } else { + emptyOutput = null; + } + final byte t = in.readByte(); + switch(t) { + case 0: + inputType = INPUT_TYPE.BYTE1; + break; + case 1: + inputType = INPUT_TYPE.BYTE2; + break; + case 2: + inputType = INPUT_TYPE.BYTE4; + break; + default: + throw new IllegalStateException("invalid input type " + t); + } + startNode = in.readVInt(); + nodeCount = in.readVInt(); + arcCount = in.readVInt(); + arcWithOutputCount = in.readVInt(); + + bytes = new byte[in.readVInt()]; + in.readBytes(bytes, 0, bytes.length); + NO_OUTPUT = outputs.getNoOutput(); + + cacheRootArcs(); + } + + public INPUT_TYPE getInputType() { + return inputType; + } + + /** Returns bytes used to represent the FST */ + public int sizeInBytes() { + return bytes.length; + } + + void finish(int startNode) throws IOException { + if (startNode == FINAL_END_NODE && emptyOutput != null) { + startNode = 0; + } + if (this.startNode != -1) { + throw new IllegalStateException("already finished"); + } + byte[] finalBytes = new byte[writer.posWrite]; + System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite); + bytes = finalBytes; + this.startNode = startNode; + + cacheRootArcs(); + } + + // Caches first 128 labels + @SuppressWarnings("unchecked") + private void cacheRootArcs() throws IOException { + cachedRootArcs = (FST.Arc[]) new FST.Arc[0x80]; + final FST.Arc arc = new FST.Arc(); + getFirstArc(arc); + final BytesReader in = getBytesReader(0); + if (targetHasArcs(arc)) { + readFirstRealArc(arc.target, arc); + while(true) { + assert arc.label != END_LABEL; + if (arc.label < cachedRootArcs.length) { + cachedRootArcs[arc.label] = new Arc().copyFrom(arc); + } else { + break; + } + if (arc.isLast()) { + break; + } + readNextRealArc(arc, in); + } + } + } + + void setEmptyOutput(T v) throws IOException { + if (emptyOutput != null) { + emptyOutput = outputs.merge(emptyOutput, v); + } else { + emptyOutput = v; + } + + // TODO: this is messy -- replace with sillyBytesWriter; maybe make + // bytes private + final int posSave = writer.posWrite; + outputs.write(emptyOutput, writer); + emptyOutputBytes = new byte[writer.posWrite-posSave]; + + // reverse + final int stopAt = (writer.posWrite - posSave)/2; + int upto = 0; + while(upto < stopAt) { + final byte b = bytes[posSave + upto]; + bytes[posSave+upto] = bytes[writer.posWrite-upto-1]; + bytes[writer.posWrite-upto-1] = b; + upto++; + } + System.arraycopy(bytes, posSave, emptyOutputBytes, 0, writer.posWrite-posSave); + writer.posWrite = posSave; + } + + public void save(DataOutput out) throws IOException { + if (startNode == -1) { + throw new IllegalStateException("call finish first"); + } + CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT); + // TODO: really we should encode this as an arc, arriving + // to the root node, instead of special casing here: + if (emptyOutput != null) { + out.writeByte((byte) 1); + out.writeVInt(emptyOutputBytes.length); + out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length); + } else { + out.writeByte((byte) 0); + } + final byte t; + if (inputType == INPUT_TYPE.BYTE1) { + t = 0; + } else if (inputType == INPUT_TYPE.BYTE2) { + t = 1; + } else { + t = 2; + } + out.writeByte(t); + out.writeVInt(startNode); + out.writeVInt(nodeCount); + out.writeVInt(arcCount); + out.writeVInt(arcWithOutputCount); + out.writeVInt(bytes.length); + out.writeBytes(bytes, 0, bytes.length); + } + + private void writeLabel(int v) throws IOException { + assert v >= 0: "v=" + v; + if (inputType == INPUT_TYPE.BYTE1) { + assert v <= 255: "v=" + v; + writer.writeByte((byte) v); + } else if (inputType == INPUT_TYPE.BYTE2) { + assert v <= 65535: "v=" + v; + writer.writeVInt(v); + } else { + //writeInt(v); + writer.writeVInt(v); + } + } + + int readLabel(DataInput in) throws IOException { + final int v; + if (inputType == INPUT_TYPE.BYTE1) { + v = in.readByte()&0xFF; + } else { + v = in.readVInt(); + } + return v; + } + + // returns true if the node at this address has any + // outgoing arcs + public boolean targetHasArcs(Arc arc) { + return arc.target > 0; + } + + // serializes new node by appending its bytes to the end + // of the current byte[] + int addNode(Builder.UnCompiledNode node) throws IOException { + //System.out.println("FST.addNode pos=" + posWrite + " numArcs=" + node.numArcs); + if (node.numArcs == 0) { + if (node.isFinal) { + return FINAL_END_NODE; + } else { + return NON_FINAL_END_NODE; + } + } + + int startAddress = writer.posWrite; + //System.out.println(" startAddr=" + startAddress); + + final boolean doFixedArray = shouldExpand(node); + final int fixedArrayStart; + if (doFixedArray) { + if (bytesPerArc.length < node.numArcs) { + bytesPerArc = new int[ArrayUtil.oversize(node.numArcs, 1)]; + } + // write a "false" first arc: + writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY); + writer.writeVInt(node.numArcs); + // placeholder -- we'll come back and write the number + // of bytes per arc (int) here: + // TODO: we could make this a vInt instead + writer.writeInt(0); + fixedArrayStart = writer.posWrite; + //System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart); + } else { + fixedArrayStart = 0; + } + + nodeCount++; + arcCount += node.numArcs; + + final int lastArc = node.numArcs-1; + + int lastArcStart = writer.posWrite; + int maxBytesPerArc = 0; + for(int arcIdx=0;arcIdx arc = node.arcs[arcIdx]; + final Builder.CompiledNode target = (Builder.CompiledNode) arc.target; + int flags = 0; + + if (arcIdx == lastArc) { + flags += BIT_LAST_ARC; + } + + if (lastFrozenNode == target.address && !doFixedArray) { + flags += BIT_TARGET_NEXT; + } + + if (arc.isFinal) { + flags += BIT_FINAL_ARC; + if (arc.nextFinalOutput != NO_OUTPUT) { + flags += BIT_ARC_HAS_FINAL_OUTPUT; + } + } else { + assert arc.nextFinalOutput == NO_OUTPUT; + } + + boolean targetHasArcs = target.address > 0; + + if (!targetHasArcs) { + flags += BIT_STOP_NODE; + } + + if (arc.output != NO_OUTPUT) { + flags += BIT_ARC_HAS_OUTPUT; + } + + writer.writeByte((byte) flags); + writeLabel(arc.label); + + //System.out.println(" write arc: label=" + arc.label + " flags=" + flags); + + if (arc.output != NO_OUTPUT) { + outputs.write(arc.output, writer); + arcWithOutputCount++; + } + if (arc.nextFinalOutput != NO_OUTPUT) { + outputs.write(arc.nextFinalOutput, writer); + } + + if (targetHasArcs && (doFixedArray || lastFrozenNode != target.address)) { + assert target.address > 0; + writer.writeInt(target.address); + } + + // just write the arcs "like normal" on first pass, + // but record how many bytes each one took, and max + // byte size: + if (doFixedArray) { + bytesPerArc[arcIdx] = writer.posWrite - lastArcStart; + lastArcStart = writer.posWrite; + maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]); + //System.out.println(" bytes=" + bytesPerArc[arcIdx]); + } + } + + // TODO: if arc'd arrays will be "too wasteful" by some + // measure, eg if arcs have vastly different sized + // outputs, then we should selectively disable array for + // such cases + + if (doFixedArray) { + assert maxBytesPerArc > 0; + // 2nd pass just "expands" all arcs to take up a fixed + // byte size + final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc; + bytes = ArrayUtil.grow(bytes, sizeNeeded); + // TODO: we could make this a vInt instead + bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24); + bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16); + bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8); + bytes[fixedArrayStart-1] = (byte) maxBytesPerArc; + + // expand the arcs in place, backwards + int srcPos = writer.posWrite; + int destPos = fixedArrayStart + node.numArcs*maxBytesPerArc; + writer.posWrite = destPos; + for(int arcIdx=node.numArcs-1;arcIdx>=0;arcIdx--) { + //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos); + destPos -= maxBytesPerArc; + srcPos -= bytesPerArc[arcIdx]; + if (srcPos != destPos) { + assert destPos > srcPos; + System.arraycopy(bytes, srcPos, bytes, destPos, bytesPerArc[arcIdx]); + } + } + } + + // reverse bytes in-place; we do this so that the + // "BIT_TARGET_NEXT" opto can work, ie, it reads the + // node just before the current one + final int endAddress = lastFrozenNode = writer.posWrite - 1; + + int left = startAddress; + int right = endAddress; + while (left < right) { + final byte b = bytes[left]; + bytes[left++] = bytes[right]; + bytes[right--] = b; + } + + return endAddress; + } + + /** Fills virtual 'start' arc, ie, an empty incoming arc to + * the FST's start node */ + public Arc getFirstArc(Arc arc) { + if (emptyOutput != null) { + arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC; + arc.nextFinalOutput = emptyOutput; + } else { + arc.flags = BIT_LAST_ARC; + arc.nextFinalOutput = NO_OUTPUT; + } + arc.output = NO_OUTPUT; + + // If there are no nodes, ie, the FST only accepts the + // empty string, then startNode is 0, and then readFirstTargetArc + arc.target = startNode; + return arc; + } + + /** Follows the follow arc and reads the last + * arc of its target; this changes the provided + * arc (2nd arg) in-place and returns it. + * + * @return Returns the second argument + * (arc). */ + public Arc readLastTargetArc(Arc follow, Arc arc) throws IOException { + //System.out.println("readLast"); + if (!targetHasArcs(follow)) { + //System.out.println(" end node"); + assert follow.isFinal(); + arc.label = END_LABEL; + arc.output = follow.nextFinalOutput; + arc.flags = BIT_LAST_ARC; + return arc; + } else { + final BytesReader in = getBytesReader(follow.target); + arc.flags = in.readByte(); + if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { + // array: jump straight to end + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readInt(); + //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc); + arc.posArcsStart = in.pos; + arc.arcIdx = arc.numArcs - 2; + } else { + // non-array: linear scan + arc.bytesPerArc = 0; + //System.out.println(" scan"); + while(!arc.isLast()) { + // skip this arc: + readLabel(in); + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + outputs.read(in); + } + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.read(in); + } + if (arc.flag(BIT_STOP_NODE)) { + } else if (arc.flag(BIT_TARGET_NEXT)) { + } else { + in.pos -= 4; + } + arc.flags = in.readByte(); + } + arc.nextArc = in.pos+1; + } + readNextRealArc(arc, in); + assert arc.isLast(); + return arc; + } + } + + /** + * Follow the follow arc and read the first arc of its target; + * this changes the provided arc (2nd arg) in-place and returns + * it. + * + * @return Returns the second argument (arc). + */ + public Arc readFirstTargetArc(Arc follow, Arc arc) throws IOException { + //int pos = address; + //System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + follow.isFinal()); + if (follow.isFinal()) { + // Insert "fake" final first arc: + arc.label = END_LABEL; + arc.output = follow.nextFinalOutput; + if (follow.target <= 0) { + arc.flags = BIT_LAST_ARC; + } else { + arc.flags = 0; + arc.nextArc = follow.target; + } + //System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output)); + return arc; + } else { + return readFirstRealArc(follow.target, arc); + } + } + + // Not private because NodeHash needs access: + Arc readFirstRealArc(int address, Arc arc) throws IOException { + + final BytesReader in = getBytesReader(address); + + arc.flags = in.readByte(); + + if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) { + //System.out.println(" fixedArray"); + // this is first arc in a fixed-array + arc.numArcs = in.readVInt(); + arc.bytesPerArc = in.readInt(); + arc.arcIdx = -1; + arc.nextArc = arc.posArcsStart = in.pos; + //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos); + } else { + arc.nextArc = address; + arc.bytesPerArc = 0; + } + return readNextRealArc(arc, in); + } + + /** + * Checks if arc's target state is in expanded (or vector) format. + * + * @return Returns true if arc points to a state in an + * expanded array format. + */ + boolean isExpandedTarget(Arc follow) throws IOException { + if (!targetHasArcs(follow)) { + return false; + } else { + final BytesReader in = getBytesReader(follow.target); + final byte b = in.readByte(); + return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0; + } + } + + /** In-place read; returns the arc. */ + public Arc readNextArc(Arc arc) throws IOException { + if (arc.label == END_LABEL) { + // This was a fake inserted "final" arc + if (arc.nextArc <= 0) { + // This arc went to virtual final node, ie has no outgoing arcs + return null; + } + return readFirstRealArc(arc.nextArc, arc); + } else { + return readNextRealArc(arc, getBytesReader(0)); + } + } + + /** Peeks at next arc's label; does not alter arc. Do + * not call this if arc.isLast()! */ + public int readNextArcLabel(Arc arc) throws IOException { + assert !arc.isLast(); + + final BytesReader in; + if (arc.label == END_LABEL) { + //System.out.println(" nextArc fake " + arc.nextArc); + in = getBytesReader(arc.nextArc); + byte flags = bytes[in.pos]; + if (flag(flags, BIT_ARCS_AS_FIXED_ARRAY)) { + //System.out.println(" nextArc fake array"); + in.pos--; + in.readVInt(); + in.readInt(); + } + } else { + if (arc.bytesPerArc != 0) { + //System.out.println(" nextArc real array"); + // arcs are at fixed entries + in = getBytesReader(arc.posArcsStart - (1+arc.arcIdx)*arc.bytesPerArc); + } else { + // arcs are packed + //System.out.println(" nextArc real packed"); + in = getBytesReader(arc.nextArc); + } + } + // skip flags + in.readByte(); + return readLabel(in); + } + + Arc readNextRealArc(Arc arc, final BytesReader in) throws IOException { + // this is a continuing arc in a fixed array + if (arc.bytesPerArc != 0) { + // arcs are at fixed entries + arc.arcIdx++; + assert arc.arcIdx < arc.numArcs; + in.pos = arc.posArcsStart - arc.arcIdx*arc.bytesPerArc; + } else { + // arcs are packed + in.pos = arc.nextArc; + } + arc.flags = in.readByte(); + arc.label = readLabel(in); + + if (arc.flag(BIT_ARC_HAS_OUTPUT)) { + arc.output = outputs.read(in); + } else { + arc.output = outputs.getNoOutput(); + } + + if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) { + arc.nextFinalOutput = outputs.read(in); + } else { + arc.nextFinalOutput = outputs.getNoOutput(); + } + + if (arc.flag(BIT_STOP_NODE)) { + if (arc.flag(BIT_FINAL_ARC)) { + arc.target = FINAL_END_NODE; + } else { + arc.target = NON_FINAL_END_NODE; + } + arc.nextArc = in.pos; + } else if (arc.flag(BIT_TARGET_NEXT)) { + arc.nextArc = in.pos; + if (!arc.flag(BIT_LAST_ARC)) { + if (arc.bytesPerArc == 0) { + // must scan + seekToNextNode(in); + } else { + in.pos = arc.posArcsStart - arc.bytesPerArc * arc.numArcs; + } + } + arc.target = in.pos; + } else { + arc.target = in.readInt(); + arc.nextArc = in.pos; + } + + return arc; + } + + /** Finds an arc leaving the incoming arc, replacing the arc in place. + * This returns null if the arc was not found, else the incoming arc. */ + public Arc findTargetArc(int labelToMatch, Arc follow, Arc arc) throws IOException { + assert cachedRootArcs != null; + // Short-circuit if this arc is in the root arc cache: + if (follow.target == startNode && labelToMatch != END_LABEL && labelToMatch < cachedRootArcs.length) { + final Arc result = cachedRootArcs[labelToMatch]; + if (result == null) { + return result; + } else { + arc.copyFrom(result); + return arc; + } + } + + if (labelToMatch == END_LABEL) { + if (follow.isFinal()) { + arc.output = follow.nextFinalOutput; + arc.label = END_LABEL; + return arc; + } else { + return null; + } + } + + if (!targetHasArcs(follow)) { + return null; + } + + // TODO: maybe make an explicit thread state that holds + // reusable stuff eg BytesReader: + final BytesReader in = getBytesReader(follow.target); + + // System.out.println("fta label=" + (char) labelToMatch); + + if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) { + // Arcs are full array; do binary search: + arc.numArcs = in.readVInt(); + //System.out.println(" bs " + arc.numArcs); + arc.bytesPerArc = in.readInt(); + arc.posArcsStart = in.pos; + int low = 0; + int high = arc.numArcs-1; + while (low <= high) { + //System.out.println(" cycle"); + int mid = (low + high) >>> 1; + in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1; + int midLabel = readLabel(in); + final int cmp = midLabel - labelToMatch; + if (cmp < 0) + low = mid + 1; + else if (cmp > 0) + high = mid - 1; + else { + arc.arcIdx = mid-1; + //System.out.println(" found!"); + return readNextRealArc(arc, in); + } + } + + return null; + } + + // Linear scan + readFirstTargetArc(follow, arc); + while(true) { + //System.out.println(" non-bs cycle"); + // TODO: we should fix this code to not have to create + // object for the output of every arc we scan... only + // for the matching arc, if found + if (arc.label == labelToMatch) { + //System.out.println(" found!"); + return arc; + } else if (arc.label > labelToMatch) { + return null; + } else if (arc.isLast()) { + return null; + } else { + readNextArc(arc); + } + } + } + + private void seekToNextNode(BytesReader in) throws IOException { + + while(true) { + + final int flags = in.readByte(); + readLabel(in); + + if (flag(flags, BIT_ARC_HAS_OUTPUT)) { + outputs.read(in); + } + + if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) { + outputs.read(in); + } + + if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) { + in.readInt(); + } + + if (flag(flags, BIT_LAST_ARC)) { + return; + } + } + } + + public int getNodeCount() { + // 1+ in order to count the -1 implicit final node + return 1+nodeCount; + } + + public int getArcCount() { + return arcCount; + } + + public int getArcWithOutputCount() { + return arcWithOutputCount; + } + + /** + * Nodes will be expanded if their depth (distance from the root node) is + * <= this value and their number of arcs is >= + * {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}. + * + *

+ * Fixed array consumes more RAM but enables binary search on the arcs + * (instead of a linear scan) on lookup by arc label. + * + * @return true if node should be stored in an + * expanded (array) form. + * + * @see #FIXED_ARRAY_NUM_ARCS_DEEP + * @see Builder.UnCompiledNode#depth + */ + private boolean shouldExpand(UnCompiledNode node) { + return (node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) || + node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP; + } + + // Non-static: writes to FST's byte[] + class BytesWriter extends DataOutput { + int posWrite; + + public BytesWriter() { + // pad: ensure no node gets address 0 which is reserved to mean + // the stop state w/ no arcs + posWrite = 1; + } + + @Override + public void writeByte(byte b) { + if (bytes.length == posWrite) { + bytes = ArrayUtil.grow(bytes); + } + assert posWrite < bytes.length: "posWrite=" + posWrite + " bytes.length=" + bytes.length; + bytes[posWrite++] = b; + } + + @Override + public void writeBytes(byte[] b, int offset, int length) { + final int size = posWrite + length; + bytes = ArrayUtil.grow(bytes, size); + System.arraycopy(b, offset, bytes, posWrite, length); + posWrite += length; + } + } + + final BytesReader getBytesReader(int pos) { + // TODO: maybe re-use via ThreadLocal? + return new BytesReader(pos); + } + + // Non-static: reads byte[] from FST + final class BytesReader extends DataInput { + int pos; + + public BytesReader(int pos) { + this.pos = pos; + } + + @Override + public byte readByte() { + return bytes[pos--]; + } + + @Override + public void readBytes(byte[] b, int offset, int len) { + for(int i=0;i