+++ /dev/null
-package org.apache.lucene.util.fst;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.*;
-import java.util.*;
-
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.IntsRef;
-
-/** Static helper methods
- *
- * @lucene.experimental */
-public final class Util {
- private Util() {
- }
-
- /** Looks up the output for this input, or null if the
- * input is not accepted. FST must be
- * INPUT_TYPE.BYTE4. */
- public static<T> T get(FST<T> fst, IntsRef input) throws IOException {
- assert fst.inputType == FST.INPUT_TYPE.BYTE4;
-
- // TODO: would be nice not to alloc this on every lookup
- final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
-
- // Accumulate output as we go
- final T NO_OUTPUT = fst.outputs.getNoOutput();
- T output = NO_OUTPUT;
- for(int i=0;i<input.length;i++) {
- if (fst.findTargetArc(input.ints[input.offset + i], arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- output = fst.outputs.add(output, arc.output);
- }
- }
-
- if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- return fst.outputs.add(output, arc.output);
- } else {
- return output;
- }
- }
-
- /** Logically casts input to UTF32 ints then looks up the output
- * or null if the input is not accepted. FST must be
- * INPUT_TYPE.BYTE4. */
- public static<T> T get(FST<T> fst, char[] input, int offset, int length) throws IOException {
- assert fst.inputType == FST.INPUT_TYPE.BYTE4;
-
- // TODO: would be nice not to alloc this on every lookup
- final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
-
- int charIdx = offset;
- final int charLimit = offset + length;
-
- // Accumulate output as we go
- final T NO_OUTPUT = fst.outputs.getNoOutput();
- T output = NO_OUTPUT;
- while(charIdx < charLimit) {
- final int utf32 = Character.codePointAt(input, charIdx);
- charIdx += Character.charCount(utf32);
-
- if (fst.findTargetArc(utf32, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- output = fst.outputs.add(output, arc.output);
- }
- }
-
- if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- return fst.outputs.add(output, arc.output);
- } else {
- return output;
- }
- }
-
-
- /** Logically casts input to UTF32 ints then looks up the output
- * or null if the input is not accepted. FST must be
- * INPUT_TYPE.BYTE4. */
- public static<T> T get(FST<T> fst, CharSequence input) throws IOException {
- assert fst.inputType == FST.INPUT_TYPE.BYTE4;
-
- // TODO: would be nice not to alloc this on every lookup
- final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
-
- int charIdx = 0;
- final int charLimit = input.length();
-
- // Accumulate output as we go
- final T NO_OUTPUT = fst.outputs.getNoOutput();
- T output = NO_OUTPUT;
-
- while(charIdx < charLimit) {
- final int utf32 = Character.codePointAt(input, charIdx);
- charIdx += Character.charCount(utf32);
-
- if (fst.findTargetArc(utf32, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- output = fst.outputs.add(output, arc.output);
- }
- }
-
- if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- return fst.outputs.add(output, arc.output);
- } else {
- return output;
- }
- }
-
- /** Looks up the output for this input, or null if the
- * input is not accepted */
- public static<T> T get(FST<T> fst, BytesRef input) throws IOException {
- assert fst.inputType == FST.INPUT_TYPE.BYTE1;
-
- // TODO: would be nice not to alloc this on every lookup
- final FST.Arc<T> arc = fst.getFirstArc(new FST.Arc<T>());
-
- // Accumulate output as we go
- final T NO_OUTPUT = fst.outputs.getNoOutput();
- T output = NO_OUTPUT;
- for(int i=0;i<input.length;i++) {
- if (fst.findTargetArc(input.bytes[i+input.offset] & 0xFF, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- output = fst.outputs.add(output, arc.output);
- }
- }
-
- if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) {
- return null;
- } else if (arc.output != NO_OUTPUT) {
- return fst.outputs.add(output, arc.output);
- } else {
- return output;
- }
- }
-
- /**
- * Dumps an {@link FST} to a GraphViz's <code>dot</code> language description
- * for visualization. Example of use:
- *
- * <pre>
- * PrintStream ps = new PrintStream("out.dot");
- * fst.toDot(ps);
- * ps.close();
- * </pre>
- *
- * and then, from command line:
- *
- * <pre>
- * dot -Tpng -o out.png out.dot
- * </pre>
- *
- * <p>
- * Note: larger FSTs (a few thousand nodes) won't even render, don't bother.
- *
- * @param sameRank
- * If <code>true</code>, the resulting <code>dot</code> file will try
- * to order states in layers of breadth-first traversal. This may
- * mess up arcs, but makes the output FST's structure a bit clearer.
- *
- * @param labelStates
- * If <code>true</code> states will have labels equal to their offsets in their
- * binary format. Expands the graph considerably.
- *
- * @see "http://www.graphviz.org/"
- */
- public static <T> void toDot(FST<T> fst, Writer out, boolean sameRank, boolean labelStates)
- throws IOException {
- final String expandedNodeColor = "blue";
-
- // This is the start arc in the automaton (from the epsilon state to the first state
- // with outgoing transitions.
- final FST.Arc<T> startArc = fst.getFirstArc(new FST.Arc<T>());
-
- // A queue of transitions to consider for the next level.
- final List<FST.Arc<T>> thisLevelQueue = new ArrayList<FST.Arc<T>>();
-
- // A queue of transitions to consider when processing the next level.
- final List<FST.Arc<T>> nextLevelQueue = new ArrayList<FST.Arc<T>>();
- nextLevelQueue.add(startArc);
-
- // A list of states on the same level (for ranking).
- final List<Integer> sameLevelStates = new ArrayList<Integer>();
-
- // A bitset of already seen states (target offset).
- final BitSet seen = new BitSet();
- seen.set(startArc.target);
-
- // Shape for states.
- final String stateShape = "circle";
-
- // Emit DOT prologue.
- out.write("digraph FST {\n");
- out.write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n");
-
- if (!labelStates) {
- out.write(" node [shape=circle, width=.2, height=.2, style=filled]\n");
- }
-
- emitDotState(out, "initial", "point", "white", "");
- emitDotState(out, Integer.toString(startArc.target), stateShape,
- fst.isExpandedTarget(startArc) ? expandedNodeColor : null,
- "");
- out.write(" initial -> " + startArc.target + "\n");
-
- final T NO_OUTPUT = fst.outputs.getNoOutput();
- int level = 0;
-
- while (!nextLevelQueue.isEmpty()) {
- // we could double buffer here, but it doesn't matter probably.
- thisLevelQueue.addAll(nextLevelQueue);
- nextLevelQueue.clear();
-
- level++;
- out.write("\n // Transitions and states at level: " + level + "\n");
- while (!thisLevelQueue.isEmpty()) {
- final FST.Arc<T> arc = thisLevelQueue.remove(thisLevelQueue.size() - 1);
-
- if (fst.targetHasArcs(arc)) {
- // scan all arcs
- final int node = arc.target;
- fst.readFirstTargetArc(arc, arc);
-
- while (true) {
- // Emit the unseen state and add it to the queue for the next level.
- if (arc.target >= 0 && !seen.get(arc.target)) {
- final boolean isExpanded = fst.isExpandedTarget(arc);
- emitDotState(out, Integer.toString(arc.target), stateShape,
- isExpanded ? expandedNodeColor : null,
- labelStates ? Integer.toString(arc.target) : "");
- seen.set(arc.target);
- nextLevelQueue.add(new FST.Arc<T>().copyFrom(arc));
- sameLevelStates.add(arc.target);
- }
-
- String outs;
- if (arc.output != NO_OUTPUT) {
- outs = "/" + fst.outputs.outputToString(arc.output);
- } else {
- outs = "";
- }
-
- final String cl;
- if (arc.label == FST.END_LABEL) {
- cl = "~";
- } else {
- cl = printableLabel(arc.label);
- }
-
- out.write(" " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n");
-
- // Break the loop if we're on the last arc of this state.
- if (arc.isLast()) {
- break;
- }
- fst.readNextArc(arc);
- }
- }
- }
-
- // Emit state ranking information.
- if (sameRank && sameLevelStates.size() > 1) {
- out.write(" {rank=same; ");
- for (int state : sameLevelStates) {
- out.write(state + "; ");
- }
- out.write(" }\n");
- }
- sameLevelStates.clear();
- }
-
- // Emit terminating state (always there anyway).
- out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n");
- out.write(" {rank=sink; -1 }\n");
-
- out.write("}\n");
- out.flush();
- }
-
- /**
- * Emit a single state in the <code>dot</code> language.
- */
- private static void emitDotState(Writer out, String name, String shape,
- String color, String label) throws IOException {
- out.write(" " + name
- + " ["
- + (shape != null ? "shape=" + shape : "") + " "
- + (color != null ? "color=" + color : "") + " "
- + (label != null ? "label=\"" + label + "\"" : "label=\"\"") + " "
- + "]\n");
- }
-
- /**
- * Ensures an arc's label is indeed printable (dot uses US-ASCII).
- */
- private static String printableLabel(int label) {
- if (label >= 0x20 && label <= 0x7d) {
- return Character.toString((char) label);
- } else {
- return "0x" + Integer.toHexString(label);
- }
- }
-}