1 package org.apache.lucene.util.fst;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
22 import org.apache.lucene.store.DataInput;
23 import org.apache.lucene.store.DataOutput;
24 import org.apache.lucene.util.ArrayUtil;
25 import org.apache.lucene.util.CodecUtil;
26 import org.apache.lucene.util.fst.Builder.UnCompiledNode;
28 // TODO: if FST is pure prefix trie we can do a more compact
29 // job, ie, once we are at a 'suffix only', just store the
30 // completion labels as a string not as a series of arcs.
32 // NOTE: while the FST is able to represent a non-final
33 // dead-end state (NON_FINAL_END_NODE=0), the layers above
34 // (FSTEnum, Util) have problems with this!!
36 /** Represents an FST using a compact byte[] format.
37 * <p> The format is similar to what's used by Morfologik
38 * (http://sourceforge.net/projects/morfologik).
40 * <p><b>NOTE</b>: the FST cannot be larger than ~2.1 GB
41 * because it uses int to address the byte[].
43 * @lucene.experimental
46 public static enum INPUT_TYPE {BYTE1, BYTE2, BYTE4};
47 public final INPUT_TYPE inputType;
49 private final static int BIT_FINAL_ARC = 1 << 0;
50 private final static int BIT_LAST_ARC = 1 << 1;
51 private final static int BIT_TARGET_NEXT = 1 << 2;
52 private final static int BIT_STOP_NODE = 1 << 3;
53 private final static int BIT_ARC_HAS_OUTPUT = 1 << 4;
54 private final static int BIT_ARC_HAS_FINAL_OUTPUT = 1 << 5;
56 // Arcs are stored as fixed-size (per entry) array, so
57 // that we can find an arc using binary search. We do
58 // this when number of arcs is > NUM_ARCS_ARRAY:
59 private final static int BIT_ARCS_AS_FIXED_ARRAY = 1 << 6;
62 * @see #shouldExpand(UnCompiledNode)
64 final static int FIXED_ARRAY_SHALLOW_DISTANCE = 3; // 0 => only root node.
67 * @see #shouldExpand(UnCompiledNode)
69 final static int FIXED_ARRAY_NUM_ARCS_SHALLOW = 5;
72 * @see #shouldExpand(UnCompiledNode)
74 final static int FIXED_ARRAY_NUM_ARCS_DEEP = 10;
76 private int[] bytesPerArc = new int[0];
78 // Increment version to change it
79 private final static String FILE_FORMAT_NAME = "FST";
80 private final static int VERSION_START = 0;
82 /** Changed numBytesPerArc for array'd case from byte to int. */
83 private final static int VERSION_INT_NUM_BYTES_PER_ARC = 1;
85 private final static int VERSION_CURRENT = VERSION_INT_NUM_BYTES_PER_ARC;
87 // Never serialized; just used to represent the virtual
88 // final node w/ no arcs:
89 private final static int FINAL_END_NODE = -1;
91 // Never serialized; just used to represent the virtual
92 // non-final node w/ no arcs:
93 private final static int NON_FINAL_END_NODE = 0;
95 // if non-null, this FST accepts the empty string and
96 // produces this output
98 private byte[] emptyOutputBytes;
100 private byte[] bytes;
103 private int startNode = -1;
105 public final Outputs<T> outputs;
107 private int lastFrozenNode;
109 private final T NO_OUTPUT;
111 public int nodeCount;
113 public int arcWithOutputCount;
115 // If arc has this label then that arc is final/accepted
116 public static final int END_LABEL = -1;
118 private Arc<T> cachedRootArcs[];
120 public final static class Arc<T> {
127 public T nextFinalOutput;
130 // This is non-zero if current arcs are fixed array:
137 public Arc<T> copyFrom(Arc<T> other) {
139 target = other.target;
141 output = other.output;
142 nextFinalOutput = other.nextFinalOutput;
143 nextArc = other.nextArc;
144 if (other.bytesPerArc != 0) {
145 bytesPerArc = other.bytesPerArc;
146 posArcsStart = other.posArcsStart;
147 arcIdx = other.arcIdx;
148 numArcs = other.numArcs;
155 boolean flag(int flag) {
156 return FST.flag(flags, flag);
159 public boolean isLast() {
160 return flag(BIT_LAST_ARC);
163 public boolean isFinal() {
164 return flag(BIT_FINAL_ARC);
168 static boolean flag(int flags, int bit) {
169 return (flags & bit) != 0;
172 private final BytesWriter writer;
174 // make a new empty FST, for building
175 public FST(INPUT_TYPE inputType, Outputs<T> outputs) {
176 this.inputType = inputType;
177 this.outputs = outputs;
178 bytes = new byte[128];
179 NO_OUTPUT = outputs.getNoOutput();
181 writer = new BytesWriter();
186 // create an existing FST
187 public FST(DataInput in, Outputs<T> outputs) throws IOException {
188 this.outputs = outputs;
190 CodecUtil.checkHeader(in, FILE_FORMAT_NAME, VERSION_INT_NUM_BYTES_PER_ARC, VERSION_INT_NUM_BYTES_PER_ARC);
191 if (in.readByte() == 1) {
192 // accepts empty string
193 int numBytes = in.readVInt();
195 bytes = new byte[numBytes];
196 in.readBytes(bytes, 0, numBytes);
197 emptyOutput = outputs.read(getBytesReader(numBytes-1));
201 final byte t = in.readByte();
204 inputType = INPUT_TYPE.BYTE1;
207 inputType = INPUT_TYPE.BYTE2;
210 inputType = INPUT_TYPE.BYTE4;
213 throw new IllegalStateException("invalid input type " + t);
215 startNode = in.readVInt();
216 nodeCount = in.readVInt();
217 arcCount = in.readVInt();
218 arcWithOutputCount = in.readVInt();
220 bytes = new byte[in.readVInt()];
221 in.readBytes(bytes, 0, bytes.length);
222 NO_OUTPUT = outputs.getNoOutput();
227 public INPUT_TYPE getInputType() {
231 /** Returns bytes used to represent the FST */
232 public int sizeInBytes() {
236 void finish(int startNode) throws IOException {
237 if (startNode == FINAL_END_NODE && emptyOutput != null) {
240 if (this.startNode != -1) {
241 throw new IllegalStateException("already finished");
243 byte[] finalBytes = new byte[writer.posWrite];
244 System.arraycopy(bytes, 0, finalBytes, 0, writer.posWrite);
246 this.startNode = startNode;
251 // Caches first 128 labels
252 @SuppressWarnings("unchecked")
253 private void cacheRootArcs() throws IOException {
254 cachedRootArcs = (FST.Arc<T>[]) new FST.Arc[0x80];
255 final FST.Arc<T> arc = new FST.Arc<T>();
257 final BytesReader in = getBytesReader(0);
258 if (targetHasArcs(arc)) {
259 readFirstRealArc(arc.target, arc);
261 assert arc.label != END_LABEL;
262 if (arc.label < cachedRootArcs.length) {
263 cachedRootArcs[arc.label] = new Arc<T>().copyFrom(arc);
270 readNextRealArc(arc, in);
275 void setEmptyOutput(T v) throws IOException {
276 if (emptyOutput != null) {
277 emptyOutput = outputs.merge(emptyOutput, v);
282 // TODO: this is messy -- replace with sillyBytesWriter; maybe make
284 final int posSave = writer.posWrite;
285 outputs.write(emptyOutput, writer);
286 emptyOutputBytes = new byte[writer.posWrite-posSave];
289 final int stopAt = (writer.posWrite - posSave)/2;
291 while(upto < stopAt) {
292 final byte b = bytes[posSave + upto];
293 bytes[posSave+upto] = bytes[writer.posWrite-upto-1];
294 bytes[writer.posWrite-upto-1] = b;
297 System.arraycopy(bytes, posSave, emptyOutputBytes, 0, writer.posWrite-posSave);
298 writer.posWrite = posSave;
301 public void save(DataOutput out) throws IOException {
302 if (startNode == -1) {
303 throw new IllegalStateException("call finish first");
305 CodecUtil.writeHeader(out, FILE_FORMAT_NAME, VERSION_CURRENT);
306 // TODO: really we should encode this as an arc, arriving
307 // to the root node, instead of special casing here:
308 if (emptyOutput != null) {
309 out.writeByte((byte) 1);
310 out.writeVInt(emptyOutputBytes.length);
311 out.writeBytes(emptyOutputBytes, 0, emptyOutputBytes.length);
313 out.writeByte((byte) 0);
316 if (inputType == INPUT_TYPE.BYTE1) {
318 } else if (inputType == INPUT_TYPE.BYTE2) {
324 out.writeVInt(startNode);
325 out.writeVInt(nodeCount);
326 out.writeVInt(arcCount);
327 out.writeVInt(arcWithOutputCount);
328 out.writeVInt(bytes.length);
329 out.writeBytes(bytes, 0, bytes.length);
332 private void writeLabel(int v) throws IOException {
333 assert v >= 0: "v=" + v;
334 if (inputType == INPUT_TYPE.BYTE1) {
335 assert v <= 255: "v=" + v;
336 writer.writeByte((byte) v);
337 } else if (inputType == INPUT_TYPE.BYTE2) {
338 assert v <= 65535: "v=" + v;
346 int readLabel(DataInput in) throws IOException {
348 if (inputType == INPUT_TYPE.BYTE1) {
349 v = in.readByte()&0xFF;
356 // returns true if the node at this address has any
358 public boolean targetHasArcs(Arc<T> arc) {
359 return arc.target > 0;
362 // serializes new node by appending its bytes to the end
363 // of the current byte[]
364 int addNode(Builder.UnCompiledNode<T> node) throws IOException {
365 //System.out.println("FST.addNode pos=" + posWrite + " numArcs=" + node.numArcs);
366 if (node.numArcs == 0) {
368 return FINAL_END_NODE;
370 return NON_FINAL_END_NODE;
374 int startAddress = writer.posWrite;
375 //System.out.println(" startAddr=" + startAddress);
377 final boolean doFixedArray = shouldExpand(node);
378 final int fixedArrayStart;
380 if (bytesPerArc.length < node.numArcs) {
381 bytesPerArc = new int[ArrayUtil.oversize(node.numArcs, 1)];
383 // write a "false" first arc:
384 writer.writeByte((byte) BIT_ARCS_AS_FIXED_ARRAY);
385 writer.writeVInt(node.numArcs);
386 // placeholder -- we'll come back and write the number
387 // of bytes per arc (int) here:
388 // TODO: we could make this a vInt instead
390 fixedArrayStart = writer.posWrite;
391 //System.out.println(" do fixed arcs array arcsStart=" + fixedArrayStart);
397 arcCount += node.numArcs;
399 final int lastArc = node.numArcs-1;
401 int lastArcStart = writer.posWrite;
402 int maxBytesPerArc = 0;
403 for(int arcIdx=0;arcIdx<node.numArcs;arcIdx++) {
404 final Builder.Arc<T> arc = node.arcs[arcIdx];
405 final Builder.CompiledNode target = (Builder.CompiledNode) arc.target;
408 if (arcIdx == lastArc) {
409 flags += BIT_LAST_ARC;
412 if (lastFrozenNode == target.address && !doFixedArray) {
413 flags += BIT_TARGET_NEXT;
417 flags += BIT_FINAL_ARC;
418 if (arc.nextFinalOutput != NO_OUTPUT) {
419 flags += BIT_ARC_HAS_FINAL_OUTPUT;
422 assert arc.nextFinalOutput == NO_OUTPUT;
425 boolean targetHasArcs = target.address > 0;
427 if (!targetHasArcs) {
428 flags += BIT_STOP_NODE;
431 if (arc.output != NO_OUTPUT) {
432 flags += BIT_ARC_HAS_OUTPUT;
435 writer.writeByte((byte) flags);
436 writeLabel(arc.label);
438 //System.out.println(" write arc: label=" + arc.label + " flags=" + flags);
440 if (arc.output != NO_OUTPUT) {
441 outputs.write(arc.output, writer);
442 arcWithOutputCount++;
444 if (arc.nextFinalOutput != NO_OUTPUT) {
445 outputs.write(arc.nextFinalOutput, writer);
448 if (targetHasArcs && (doFixedArray || lastFrozenNode != target.address)) {
449 assert target.address > 0;
450 writer.writeInt(target.address);
453 // just write the arcs "like normal" on first pass,
454 // but record how many bytes each one took, and max
457 bytesPerArc[arcIdx] = writer.posWrite - lastArcStart;
458 lastArcStart = writer.posWrite;
459 maxBytesPerArc = Math.max(maxBytesPerArc, bytesPerArc[arcIdx]);
460 //System.out.println(" bytes=" + bytesPerArc[arcIdx]);
464 // TODO: if arc'd arrays will be "too wasteful" by some
465 // measure, eg if arcs have vastly different sized
466 // outputs, then we should selectively disable array for
470 assert maxBytesPerArc > 0;
471 // 2nd pass just "expands" all arcs to take up a fixed
473 final int sizeNeeded = fixedArrayStart + node.numArcs * maxBytesPerArc;
474 bytes = ArrayUtil.grow(bytes, sizeNeeded);
475 // TODO: we could make this a vInt instead
476 bytes[fixedArrayStart-4] = (byte) (maxBytesPerArc >> 24);
477 bytes[fixedArrayStart-3] = (byte) (maxBytesPerArc >> 16);
478 bytes[fixedArrayStart-2] = (byte) (maxBytesPerArc >> 8);
479 bytes[fixedArrayStart-1] = (byte) maxBytesPerArc;
481 // expand the arcs in place, backwards
482 int srcPos = writer.posWrite;
483 int destPos = fixedArrayStart + node.numArcs*maxBytesPerArc;
484 writer.posWrite = destPos;
485 for(int arcIdx=node.numArcs-1;arcIdx>=0;arcIdx--) {
486 //System.out.println(" repack arcIdx=" + arcIdx + " srcPos=" + srcPos + " destPos=" + destPos);
487 destPos -= maxBytesPerArc;
488 srcPos -= bytesPerArc[arcIdx];
489 if (srcPos != destPos) {
490 assert destPos > srcPos;
491 System.arraycopy(bytes, srcPos, bytes, destPos, bytesPerArc[arcIdx]);
496 // reverse bytes in-place; we do this so that the
497 // "BIT_TARGET_NEXT" opto can work, ie, it reads the
498 // node just before the current one
499 final int endAddress = lastFrozenNode = writer.posWrite - 1;
501 int left = startAddress;
502 int right = endAddress;
503 while (left < right) {
504 final byte b = bytes[left];
505 bytes[left++] = bytes[right];
512 /** Fills virtual 'start' arc, ie, an empty incoming arc to
513 * the FST's start node */
514 public Arc<T> getFirstArc(Arc<T> arc) {
515 if (emptyOutput != null) {
516 arc.flags = BIT_FINAL_ARC | BIT_LAST_ARC;
517 arc.nextFinalOutput = emptyOutput;
519 arc.flags = BIT_LAST_ARC;
520 arc.nextFinalOutput = NO_OUTPUT;
522 arc.output = NO_OUTPUT;
524 // If there are no nodes, ie, the FST only accepts the
525 // empty string, then startNode is 0, and then readFirstTargetArc
526 arc.target = startNode;
530 /** Follows the <code>follow</code> arc and reads the last
531 * arc of its target; this changes the provided
532 * <code>arc</code> (2nd arg) in-place and returns it.
534 * @return Returns the second argument
535 * (<code>arc</code>). */
536 public Arc<T> readLastTargetArc(Arc<T> follow, Arc<T> arc) throws IOException {
537 //System.out.println("readLast");
538 if (!targetHasArcs(follow)) {
539 //System.out.println(" end node");
540 assert follow.isFinal();
541 arc.label = END_LABEL;
542 arc.output = follow.nextFinalOutput;
543 arc.flags = BIT_LAST_ARC;
546 final BytesReader in = getBytesReader(follow.target);
547 arc.flags = in.readByte();
548 if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) {
549 // array: jump straight to end
550 arc.numArcs = in.readVInt();
551 arc.bytesPerArc = in.readInt();
552 //System.out.println(" array numArcs=" + arc.numArcs + " bpa=" + arc.bytesPerArc);
553 arc.posArcsStart = in.pos;
554 arc.arcIdx = arc.numArcs - 2;
556 // non-array: linear scan
558 //System.out.println(" scan");
559 while(!arc.isLast()) {
562 if (arc.flag(BIT_ARC_HAS_OUTPUT)) {
565 if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
568 if (arc.flag(BIT_STOP_NODE)) {
569 } else if (arc.flag(BIT_TARGET_NEXT)) {
573 arc.flags = in.readByte();
575 arc.nextArc = in.pos+1;
577 readNextRealArc(arc, in);
584 * Follow the <code>follow</code> arc and read the first arc of its target;
585 * this changes the provided <code>arc</code> (2nd arg) in-place and returns
588 * @return Returns the second argument (<code>arc</code>).
590 public Arc<T> readFirstTargetArc(Arc<T> follow, Arc<T> arc) throws IOException {
592 //System.out.println(" readFirstTarget follow.target=" + follow.target + " isFinal=" + follow.isFinal());
593 if (follow.isFinal()) {
594 // Insert "fake" final first arc:
595 arc.label = END_LABEL;
596 arc.output = follow.nextFinalOutput;
597 if (follow.target <= 0) {
598 arc.flags = BIT_LAST_ARC;
601 arc.nextArc = follow.target;
603 //System.out.println(" insert isFinal; nextArc=" + follow.target + " isLast=" + arc.isLast() + " output=" + outputs.outputToString(arc.output));
606 return readFirstRealArc(follow.target, arc);
610 // Not private because NodeHash needs access:
611 Arc<T> readFirstRealArc(int address, Arc<T> arc) throws IOException {
613 final BytesReader in = getBytesReader(address);
615 arc.flags = in.readByte();
617 if (arc.flag(BIT_ARCS_AS_FIXED_ARRAY)) {
618 //System.out.println(" fixedArray");
619 // this is first arc in a fixed-array
620 arc.numArcs = in.readVInt();
621 arc.bytesPerArc = in.readInt();
623 arc.nextArc = arc.posArcsStart = in.pos;
624 //System.out.println(" bytesPer=" + arc.bytesPerArc + " numArcs=" + arc.numArcs + " arcsStart=" + pos);
626 arc.nextArc = address;
629 return readNextRealArc(arc, in);
633 * Checks if <code>arc</code>'s target state is in expanded (or vector) format.
635 * @return Returns <code>true</code> if <code>arc</code> points to a state in an
636 * expanded array format.
638 boolean isExpandedTarget(Arc<T> follow) throws IOException {
639 if (!targetHasArcs(follow)) {
642 final BytesReader in = getBytesReader(follow.target);
643 final byte b = in.readByte();
644 return (b & BIT_ARCS_AS_FIXED_ARRAY) != 0;
648 /** In-place read; returns the arc. */
649 public Arc<T> readNextArc(Arc<T> arc) throws IOException {
650 if (arc.label == END_LABEL) {
651 // This was a fake inserted "final" arc
652 if (arc.nextArc <= 0) {
653 // This arc went to virtual final node, ie has no outgoing arcs
656 return readFirstRealArc(arc.nextArc, arc);
658 return readNextRealArc(arc, getBytesReader(0));
662 /** Peeks at next arc's label; does not alter arc. Do
663 * not call this if arc.isLast()! */
664 public int readNextArcLabel(Arc<T> arc) throws IOException {
665 assert !arc.isLast();
667 final BytesReader in;
668 if (arc.label == END_LABEL) {
669 //System.out.println(" nextArc fake " + arc.nextArc);
670 in = getBytesReader(arc.nextArc);
671 byte flags = bytes[in.pos];
672 if (flag(flags, BIT_ARCS_AS_FIXED_ARRAY)) {
673 //System.out.println(" nextArc fake array");
679 if (arc.bytesPerArc != 0) {
680 //System.out.println(" nextArc real array");
681 // arcs are at fixed entries
682 in = getBytesReader(arc.posArcsStart - (1+arc.arcIdx)*arc.bytesPerArc);
685 //System.out.println(" nextArc real packed");
686 in = getBytesReader(arc.nextArc);
691 return readLabel(in);
694 Arc<T> readNextRealArc(Arc<T> arc, final BytesReader in) throws IOException {
695 // this is a continuing arc in a fixed array
696 if (arc.bytesPerArc != 0) {
697 // arcs are at fixed entries
699 assert arc.arcIdx < arc.numArcs;
700 in.pos = arc.posArcsStart - arc.arcIdx*arc.bytesPerArc;
703 in.pos = arc.nextArc;
705 arc.flags = in.readByte();
706 arc.label = readLabel(in);
708 if (arc.flag(BIT_ARC_HAS_OUTPUT)) {
709 arc.output = outputs.read(in);
711 arc.output = outputs.getNoOutput();
714 if (arc.flag(BIT_ARC_HAS_FINAL_OUTPUT)) {
715 arc.nextFinalOutput = outputs.read(in);
717 arc.nextFinalOutput = outputs.getNoOutput();
720 if (arc.flag(BIT_STOP_NODE)) {
721 if (arc.flag(BIT_FINAL_ARC)) {
722 arc.target = FINAL_END_NODE;
724 arc.target = NON_FINAL_END_NODE;
726 arc.nextArc = in.pos;
727 } else if (arc.flag(BIT_TARGET_NEXT)) {
728 arc.nextArc = in.pos;
729 if (!arc.flag(BIT_LAST_ARC)) {
730 if (arc.bytesPerArc == 0) {
734 in.pos = arc.posArcsStart - arc.bytesPerArc * arc.numArcs;
739 arc.target = in.readInt();
740 arc.nextArc = in.pos;
746 /** Finds an arc leaving the incoming arc, replacing the arc in place.
747 * This returns null if the arc was not found, else the incoming arc. */
748 public Arc<T> findTargetArc(int labelToMatch, Arc<T> follow, Arc<T> arc) throws IOException {
749 assert cachedRootArcs != null;
750 // Short-circuit if this arc is in the root arc cache:
751 if (follow.target == startNode && labelToMatch != END_LABEL && labelToMatch < cachedRootArcs.length) {
752 final Arc<T> result = cachedRootArcs[labelToMatch];
753 if (result == null) {
756 arc.copyFrom(result);
761 if (labelToMatch == END_LABEL) {
762 if (follow.isFinal()) {
763 arc.output = follow.nextFinalOutput;
764 arc.label = END_LABEL;
771 if (!targetHasArcs(follow)) {
775 // TODO: maybe make an explicit thread state that holds
776 // reusable stuff eg BytesReader:
777 final BytesReader in = getBytesReader(follow.target);
779 // System.out.println("fta label=" + (char) labelToMatch);
781 if ((in.readByte() & BIT_ARCS_AS_FIXED_ARRAY) != 0) {
782 // Arcs are full array; do binary search:
783 arc.numArcs = in.readVInt();
784 //System.out.println(" bs " + arc.numArcs);
785 arc.bytesPerArc = in.readInt();
786 arc.posArcsStart = in.pos;
788 int high = arc.numArcs-1;
789 while (low <= high) {
790 //System.out.println(" cycle");
791 int mid = (low + high) >>> 1;
792 in.pos = arc.posArcsStart - arc.bytesPerArc*mid - 1;
793 int midLabel = readLabel(in);
794 final int cmp = midLabel - labelToMatch;
801 //System.out.println(" found!");
802 return readNextRealArc(arc, in);
810 readFirstTargetArc(follow, arc);
812 //System.out.println(" non-bs cycle");
813 // TODO: we should fix this code to not have to create
814 // object for the output of every arc we scan... only
815 // for the matching arc, if found
816 if (arc.label == labelToMatch) {
817 //System.out.println(" found!");
819 } else if (arc.label > labelToMatch) {
821 } else if (arc.isLast()) {
829 private void seekToNextNode(BytesReader in) throws IOException {
833 final int flags = in.readByte();
836 if (flag(flags, BIT_ARC_HAS_OUTPUT)) {
840 if (flag(flags, BIT_ARC_HAS_FINAL_OUTPUT)) {
844 if (!flag(flags, BIT_STOP_NODE) && !flag(flags, BIT_TARGET_NEXT)) {
848 if (flag(flags, BIT_LAST_ARC)) {
854 public int getNodeCount() {
855 // 1+ in order to count the -1 implicit final node
859 public int getArcCount() {
863 public int getArcWithOutputCount() {
864 return arcWithOutputCount;
868 * Nodes will be expanded if their depth (distance from the root node) is
869 * <= this value and their number of arcs is >=
870 * {@link #FIXED_ARRAY_NUM_ARCS_SHALLOW}.
873 * Fixed array consumes more RAM but enables binary search on the arcs
874 * (instead of a linear scan) on lookup by arc label.
876 * @return <code>true</code> if <code>node</code> should be stored in an
877 * expanded (array) form.
879 * @see #FIXED_ARRAY_NUM_ARCS_DEEP
880 * @see Builder.UnCompiledNode#depth
882 private boolean shouldExpand(UnCompiledNode<T> node) {
883 return (node.depth <= FIXED_ARRAY_SHALLOW_DISTANCE && node.numArcs >= FIXED_ARRAY_NUM_ARCS_SHALLOW) ||
884 node.numArcs >= FIXED_ARRAY_NUM_ARCS_DEEP;
887 // Non-static: writes to FST's byte[]
888 class BytesWriter extends DataOutput {
891 public BytesWriter() {
892 // pad: ensure no node gets address 0 which is reserved to mean
893 // the stop state w/ no arcs
898 public void writeByte(byte b) {
899 if (bytes.length == posWrite) {
900 bytes = ArrayUtil.grow(bytes);
902 assert posWrite < bytes.length: "posWrite=" + posWrite + " bytes.length=" + bytes.length;
903 bytes[posWrite++] = b;
907 public void writeBytes(byte[] b, int offset, int length) {
908 final int size = posWrite + length;
909 bytes = ArrayUtil.grow(bytes, size);
910 System.arraycopy(b, offset, bytes, posWrite, length);
915 final BytesReader getBytesReader(int pos) {
916 // TODO: maybe re-use via ThreadLocal?
917 return new BytesReader(pos);
920 // Non-static: reads byte[] from FST
921 final class BytesReader extends DataInput {
924 public BytesReader(int pos) {
929 public byte readByte() {
934 public void readBytes(byte[] b, int offset, int len) {
935 for(int i=0;i<len;i++) {
936 b[offset+i] = bytes[pos--];