X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/util/fst/Util.java diff --git a/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/util/fst/Util.java b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/util/fst/Util.java new file mode 100644 index 0000000..2101f89 --- /dev/null +++ b/lucene-java-3.5.0/lucene/src/java/org/apache/lucene/util/fst/Util.java @@ -0,0 +1,328 @@ +package org.apache.lucene.util.fst; + +/** + * Licensed to the Apache Software Foundation (ASF) under one or more + * contributor license agreements. See the NOTICE file distributed with + * this work for additional information regarding copyright ownership. + * The ASF licenses this file to You under the Apache License, Version 2.0 + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +import java.io.*; +import java.util.*; + +import org.apache.lucene.util.BytesRef; +import org.apache.lucene.util.IntsRef; + +/** Static helper methods + * + * @lucene.experimental */ +public final class Util { + private Util() { + } + + /** Looks up the output for this input, or null if the + * input is not accepted. FST must be + * INPUT_TYPE.BYTE4. */ + public static T get(FST fst, IntsRef input) throws IOException { + assert fst.inputType == FST.INPUT_TYPE.BYTE4; + + // TODO: would be nice not to alloc this on every lookup + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + + // Accumulate output as we go + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + for(int i=0;i T get(FST fst, char[] input, int offset, int length) throws IOException { + assert fst.inputType == FST.INPUT_TYPE.BYTE4; + + // TODO: would be nice not to alloc this on every lookup + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + + int charIdx = offset; + final int charLimit = offset + length; + + // Accumulate output as we go + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + while(charIdx < charLimit) { + final int utf32 = Character.codePointAt(input, charIdx); + charIdx += Character.charCount(utf32); + + if (fst.findTargetArc(utf32, arc, arc) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + output = fst.outputs.add(output, arc.output); + } + } + + if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + return fst.outputs.add(output, arc.output); + } else { + return output; + } + } + + + /** Logically casts input to UTF32 ints then looks up the output + * or null if the input is not accepted. FST must be + * INPUT_TYPE.BYTE4. */ + public static T get(FST fst, CharSequence input) throws IOException { + assert fst.inputType == FST.INPUT_TYPE.BYTE4; + + // TODO: would be nice not to alloc this on every lookup + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + + int charIdx = 0; + final int charLimit = input.length(); + + // Accumulate output as we go + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + + while(charIdx < charLimit) { + final int utf32 = Character.codePointAt(input, charIdx); + charIdx += Character.charCount(utf32); + + if (fst.findTargetArc(utf32, arc, arc) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + output = fst.outputs.add(output, arc.output); + } + } + + if (fst.findTargetArc(FST.END_LABEL, arc, arc) == null) { + return null; + } else if (arc.output != NO_OUTPUT) { + return fst.outputs.add(output, arc.output); + } else { + return output; + } + } + + /** Looks up the output for this input, or null if the + * input is not accepted */ + public static T get(FST fst, BytesRef input) throws IOException { + assert fst.inputType == FST.INPUT_TYPE.BYTE1; + + // TODO: would be nice not to alloc this on every lookup + final FST.Arc arc = fst.getFirstArc(new FST.Arc()); + + // Accumulate output as we go + final T NO_OUTPUT = fst.outputs.getNoOutput(); + T output = NO_OUTPUT; + for(int i=0;idot language description + * for visualization. Example of use: + * + *
+   * PrintStream ps = new PrintStream("out.dot");
+   * fst.toDot(ps);
+   * ps.close();
+   * 
+ * + * and then, from command line: + * + *
+   * dot -Tpng -o out.png out.dot
+   * 
+ * + *

+ * Note: larger FSTs (a few thousand nodes) won't even render, don't bother. + * + * @param sameRank + * If true, the resulting dot file will try + * to order states in layers of breadth-first traversal. This may + * mess up arcs, but makes the output FST's structure a bit clearer. + * + * @param labelStates + * If true states will have labels equal to their offsets in their + * binary format. Expands the graph considerably. + * + * @see "http://www.graphviz.org/" + */ + public static void toDot(FST fst, Writer out, boolean sameRank, boolean labelStates) + throws IOException { + final String expandedNodeColor = "blue"; + + // This is the start arc in the automaton (from the epsilon state to the first state + // with outgoing transitions. + final FST.Arc startArc = fst.getFirstArc(new FST.Arc()); + + // A queue of transitions to consider for the next level. + final List> thisLevelQueue = new ArrayList>(); + + // A queue of transitions to consider when processing the next level. + final List> nextLevelQueue = new ArrayList>(); + nextLevelQueue.add(startArc); + + // A list of states on the same level (for ranking). + final List sameLevelStates = new ArrayList(); + + // A bitset of already seen states (target offset). + final BitSet seen = new BitSet(); + seen.set(startArc.target); + + // Shape for states. + final String stateShape = "circle"; + + // Emit DOT prologue. + out.write("digraph FST {\n"); + out.write(" rankdir = LR; splines=true; concentrate=true; ordering=out; ranksep=2.5; \n"); + + if (!labelStates) { + out.write(" node [shape=circle, width=.2, height=.2, style=filled]\n"); + } + + emitDotState(out, "initial", "point", "white", ""); + emitDotState(out, Integer.toString(startArc.target), stateShape, + fst.isExpandedTarget(startArc) ? expandedNodeColor : null, + ""); + out.write(" initial -> " + startArc.target + "\n"); + + final T NO_OUTPUT = fst.outputs.getNoOutput(); + int level = 0; + + while (!nextLevelQueue.isEmpty()) { + // we could double buffer here, but it doesn't matter probably. + thisLevelQueue.addAll(nextLevelQueue); + nextLevelQueue.clear(); + + level++; + out.write("\n // Transitions and states at level: " + level + "\n"); + while (!thisLevelQueue.isEmpty()) { + final FST.Arc arc = thisLevelQueue.remove(thisLevelQueue.size() - 1); + + if (fst.targetHasArcs(arc)) { + // scan all arcs + final int node = arc.target; + fst.readFirstTargetArc(arc, arc); + + while (true) { + // Emit the unseen state and add it to the queue for the next level. + if (arc.target >= 0 && !seen.get(arc.target)) { + final boolean isExpanded = fst.isExpandedTarget(arc); + emitDotState(out, Integer.toString(arc.target), stateShape, + isExpanded ? expandedNodeColor : null, + labelStates ? Integer.toString(arc.target) : ""); + seen.set(arc.target); + nextLevelQueue.add(new FST.Arc().copyFrom(arc)); + sameLevelStates.add(arc.target); + } + + String outs; + if (arc.output != NO_OUTPUT) { + outs = "/" + fst.outputs.outputToString(arc.output); + } else { + outs = ""; + } + + final String cl; + if (arc.label == FST.END_LABEL) { + cl = "~"; + } else { + cl = printableLabel(arc.label); + } + + out.write(" " + node + " -> " + arc.target + " [label=\"" + cl + outs + "\"]\n"); + + // Break the loop if we're on the last arc of this state. + if (arc.isLast()) { + break; + } + fst.readNextArc(arc); + } + } + } + + // Emit state ranking information. + if (sameRank && sameLevelStates.size() > 1) { + out.write(" {rank=same; "); + for (int state : sameLevelStates) { + out.write(state + "; "); + } + out.write(" }\n"); + } + sameLevelStates.clear(); + } + + // Emit terminating state (always there anyway). + out.write(" -1 [style=filled, color=black, shape=circle, label=\"\"]\n\n"); + out.write(" {rank=sink; -1 }\n"); + + out.write("}\n"); + out.flush(); + } + + /** + * Emit a single state in the dot language. + */ + private static void emitDotState(Writer out, String name, String shape, + String color, String label) throws IOException { + out.write(" " + name + + " [" + + (shape != null ? "shape=" + shape : "") + " " + + (color != null ? "color=" + color : "") + " " + + (label != null ? "label=\"" + label + "\"" : "label=\"\"") + " " + + "]\n"); + } + + /** + * Ensures an arc's label is indeed printable (dot uses US-ASCII). + */ + private static String printableLabel(int label) { + if (label >= 0x20 && label <= 0x7d) { + return Character.toString((char) label); + } else { + return "0x" + Integer.toHexString(label); + } + } +}