1 package org.apache.lucene.util.fst;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
23 import org.apache.lucene.util.BytesRef;
24 import org.apache.lucene.util.IntsRef;
26 /** Static helper methods
28 * @lucene.experimental */
29 public final class Util {
33 /** Looks up the output for this input, or null if the
34 * input is not accepted. FST must be
35 * INPUT_TYPE.BYTE4. */
36 public static<T> T get(FST<T> fst, IntsRef input) throws IOException {
37 assert fst.inputType == FST.INPUT_TYPE.BYTE4;
39 // TODO: would be nice not to alloc this on every lookup
40 final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
42 // Accumulate output as we go
43 final T NO_OUTPUT = fst.outputs.getNoOutput();
45 for(int i=0;i<input.length;i++) {
46 if (fst.findTargetArc(input.ints[input.offset + i], arc, arc) == null) {
48 } else if (arc.output != NO_OUTPUT) {
49 output = fst.outputs.add(output, arc.output);
53 if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
55 } else if (arc.output != NO_OUTPUT) {
56 return fst.outputs.add(output, arc.output);
62 /** Logically casts input to UTF32 ints then looks up the output
63 * or null if the input is not accepted. FST must be
64 * INPUT_TYPE.BYTE4. */
65 public static<T> T get(FST<T> fst, char[] input, int offset, int length) throws IOException {
66 assert fst.inputType == FST.INPUT_TYPE.BYTE4;
68 // TODO: would be nice not to alloc this on every lookup
69 final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
72 final int charLimit = offset + length;
74 // Accumulate output as we go
75 final T NO_OUTPUT = fst.outputs.getNoOutput();
77 while(charIdx < charLimit) {
78 final int utf32 = Character.codePointAt(input, charIdx);
79 charIdx += Character.charCount(utf32);
81 if (fst.findTargetArc(utf32, arc, arc) == null) {
83 } else if (arc.output != NO_OUTPUT) {
84 output = fst.outputs.add(output, arc.output);
88 if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
90 } else if (arc.output != NO_OUTPUT) {
91 return fst.outputs.add(output, arc.output);
98 /** Logically casts input to UTF32 ints then looks up the output
99 * or null if the input is not accepted. FST must be
100 * INPUT_TYPE.BYTE4. */
101 public static<T> T get(FST<T> fst, CharSequence input) throws IOException {
102 assert fst.inputType == FST.INPUT_TYPE.BYTE4;
104 // TODO: would be nice not to alloc this on every lookup
105 final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
108 final int charLimit = input.length();
110 // Accumulate output as we go
111 final T NO_OUTPUT = fst.outputs.getNoOutput();
112 T output = NO_OUTPUT;
114 while(charIdx < charLimit) {
115 final int utf32 = Character.codePointAt(input, charIdx);
116 charIdx += Character.charCount(utf32);
118 if (fst.findTargetArc(utf32, arc, arc) == null) {
120 } else if (arc.output != NO_OUTPUT) {
121 output = fst.outputs.add(output, arc.output);
125 if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
127 } else if (arc.output != NO_OUTPUT) {
128 return fst.outputs.add(output, arc.output);
134 /** Looks up the output for this input, or null if the
135 * input is not accepted */
136 public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
137 assert fst.inputType == FST.INPUT_TYPE.BYTE1;
139 // TODO: would be nice not to alloc this on every lookup
140 final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
142 // Accumulate output as we go
143 final T NO_OUTPUT = fst.outputs.getNoOutput();
144 T output = NO_OUTPUT;
145 for(int i=0;i<input.length;i++) {
146 if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc) == null) {
148 } else if (arc.output != NO_OUTPUT) {
149 output = fst.outputs.add(output, arc.output);
153 if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
155 } else if (arc.output != NO_OUTPUT) {
156 return fst.outputs.add(output, arc.output);
163 * Dumps an {@link FST} to a GraphViz's <code>dot</code> language description
164 * for visualization. Example of use:
167 * PrintStream ps = new PrintStream("out.dot");
172 * and then, from command line:
175 * dot -Tpng -o out.png out.dot
179 * Note: larger FSTs (a few thousand nodes) won't even render, don't bother.
182 * If <code>true</code>, the resulting <code>dot</code> file will try
183 * to order states in layers of breadth-first traversal. This may
184 * mess up arcs, but makes the output FST's structure a bit clearer.
187 * If <code>true</code> states will have labels equal to their offsets in their
188 * binary format. Expands the graph considerably.
190 * @see "http://www.graphviz.org/"
192 public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates)
194 final String expandedNodeColor = "blue";
196 // This is the start arc in the automaton (from the epsilon state to the first state
197 // with outgoing transitions.
198 final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
200 // A queue of transitions to consider for the next level.
201 final List<FST.Arc<T>> thisLevelQueue = new ArrayList<FST.Arc<T>>();
203 // A queue of transitions to consider when processing the next level.
204 final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
205 nextLevelQueue.add(startArc);
207 // A list of states on the same level (for ranking).
208 final List<Integer> sameLevelStates = new ArrayList<Integer>();
210 // A bitset of already seen states (target offset).
211 final BitSet seen = new BitSet();
212 seen.set(startArc.target);
215 final String stateShape = "circle";
217 // Emit DOT prologue.
218 out.write("digraph FST {\n");
219 out.write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
222 out.write(" node [shape=circle, width=.2, height=.2, style=filled]\n");
225 emitDotState(out, "initial", "point", "white", "");
226 emitDotState(out, Integer.toString(startArc.target), stateShape,
227 fst.isExpandedTarget(startArc) ? expandedNodeColor : null,
229 out.write(" initial -> " + startArc.target + "\n");
231 final T NO_OUTPUT = fst.outputs.getNoOutput();
234 while (!nextLevelQueue.isEmpty()) {
235 // we could double buffer here, but it doesn't matter probably.
236 thisLevelQueue.addAll(nextLevelQueue);
237 nextLevelQueue.clear();
240 out.write("\n // Transitions and states at level: " + level + "\n");
241 while (!thisLevelQueue.isEmpty()) {
242 final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
244 if (fst.targetHasArcs(arc)) {
246 final int node = arc.target;
247 fst.readFirstTargetArc(arc, arc);
250 // Emit the unseen state and add it to the queue for the next level.
251 if (arc.target >= 0 && !seen.get(arc.target)) {
252 final boolean isExpanded = fst.isExpandedTarget(arc);
253 emitDotState(out, Integer.toString(arc.target), stateShape,
254 isExpanded ? expandedNodeColor : null,
255 labelStates ? Integer.toString(arc.target) : "");
256 seen.set(arc.target);
257 nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
258 sameLevelStates.add(arc.target);
262 if (arc.output != NO_OUTPUT) {
263 outs = "/" + fst.outputs.outputToString(arc.output);
269 if (arc.label == FST.END_LABEL) {
272 cl = printableLabel(arc.label);
275 out.write(" " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n");
277 // Break the loop if we're on the last arc of this state.
281 fst.readNextArc(arc);
286 // Emit state ranking information.
287 if (sameRank && sameLevelStates.size() > 1) {
288 out.write(" {rank=same; ");
289 for (int state : sameLevelStates) {
290 out.write(state + "; ");
294 sameLevelStates.clear();
297 // Emit terminating state (always there anyway).
298 out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n");
299 out.write(" {rank=sink; -1 }\n");
306 * Emit a single state in the <code>dot</code> language.
308 private static void emitDotState(Writer out, String name, String shape,
309 String color, String label) throws IOException {
312 + (shape != null ? "shape=" + shape : "") + " "
313 + (color != null ? "color=" + color : "") + " "
314 + (label != null ? "label=\"" + label + "\"" : "label=\"\"") + " "
319 * Ensures an arc's label is indeed printable (dot uses US-ASCII).
321 private static String printableLabel(int label) {
322 if (label >= 0x20 && label <= 0x7d) {
323 return Character.toString((char) label);
325 return "0x" + Integer.toHexString(label);