+++ /dev/null
-package org.apache.lucene.search.suggest.fst;
-
-import java.io.BufferedInputStream;
-import java.io.BufferedOutputStream;
-import java.io.File;
-import java.io.FileInputStream;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStream;
-import java.io.OutputStream;
-import java.util.ArrayList;
-import java.util.Collections;
-import java.util.Comparator;
-import java.util.List;
-
-import org.apache.lucene.util.IOUtils;
-import org.apache.lucene.util.IntsRef;
-import org.apache.lucene.util.fst.Builder;
-import org.apache.lucene.util.fst.FST;
-import org.apache.lucene.util.fst.FST.Arc;
-import org.apache.lucene.util.fst.NoOutputs;
-import org.apache.lucene.util.fst.Outputs;
-
-import org.apache.lucene.search.suggest.Lookup;
-import org.apache.lucene.search.suggest.tst.TSTLookup;
-import org.apache.lucene.search.spell.TermFreqIterator;
-import org.apache.lucene.store.InputStreamDataInput;
-import org.apache.lucene.store.OutputStreamDataOutput;
-
-/**
- * Finite state automata based implementation of {@link Lookup} query
- * suggestion/ autocomplete interface.
- *
- * <h2>Implementation details</h2>
- *
- * <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
- * <ul>
- * <li>A set of input terms (String) and weights (float) is given.</li>
- * <li>The range of weights is determined and then all weights are discretized into a fixed set
- * of values ({@link #buckets}).
- * Note that this means that minor changes in weights may be lost during automaton construction.
- * In general, this is not a big problem because the "priorities" of completions can be split
- * into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
- * If you need exact, fine-grained weights, use {@link TSTLookup} instead.<li>
- * <li>All terms in the input are preprended with a synthetic pseudo-character being the weight
- * of that term. For example a term <code>abc</code> with a discretized weight equal '1' would
- * become <code>1abc</code>.</li>
- * <li>The terms are sorted by their raw value of utf16 character values (including the synthetic
- * term in front).</li>
- * <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
- * arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li>
- * </ul>
- *
- * <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
- * <ul>
- * <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
- * starting with the highest one, we descend along the path of the input key. If the key is not
- * a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
- * No completions.
- * <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
- * subautomaton (all paths) starting from this node form the key's completions.</b> We start
- * the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
- * suggestion to the list of results (the weight of this suggestion is constant and equal to the
- * root path we started from). The tricky part is that because automaton edges are sorted and
- * we scan depth-first, we can terminate the entire procedure as soon as we collect enough
- * suggestions the user requested.
- * <li>In case the number of suggestions collected in the step above is still insufficient,
- * we proceed to the next (smaller) weight leaving the root node and repeat the same
- * algorithm again.
- * </li>
- * </ul>
- *
- * <h2>Runtime behavior and performance characteristic</h2>
- *
- * <p>The algorithm described above is optimized for finding suggestions to short prefixes
- * in a top-weights-first order. This is probably the most common use case: it allows
- * presenting suggestions early and sorts them by the global frequency (and then alphabetically).
- *
- * <p>If there is an exact match in the automaton, it is returned first on the results
- * list (even with by-weight sorting).
- *
- * <p>Note that the maximum lookup time for <b>any prefix</b>
- * is the time of descending to the subtree, plus traversal of the subtree up to the number
- * of requested suggestions (because they are already presorted by weight on the root level
- * and alphabetically at any node level).
- *
- * <p>To order alphabetically only (no ordering by priorities), use identical term weights
- * for all terms. Alphabetical suggestions are returned even if non-constant weights are
- * used, but the algorithm for doing this is suboptimal.
- *
- * <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
- * nothing else.
- */
-public class FSTLookup extends Lookup {
-
- public FSTLookup() {
- this(10, true);
- }
-
- public FSTLookup(int buckets, boolean exactMatchFirst) {
- this.buckets = buckets;
- this.exactMatchFirst = exactMatchFirst;
- }
-
- /** A structure for a single entry (for sorting/ preprocessing). */
- private static class Entry {
- char [] term;
- float weight;
-
- public Entry(char [] term, float freq) {
- this.term = term;
- this.weight = freq;
- }
- }
-
- /** Serialized automaton file name (storage). */
- public static final String FILENAME = "fst.dat";
-
- /** An empty result. */
- private static final List<LookupResult> EMPTY_RESULT = Collections.emptyList();
-
- /**
- * The number of separate buckets for weights (discretization). The more buckets,
- * the more fine-grained term weights (priorities) can be assigned. The speed of lookup
- * will not decrease for prefixes which have highly-weighted completions (because these
- * are filled-in first), but will decrease significantly for low-weighted terms (but
- * these should be infrequent, so it is all right).
- *
- * <p>The number of buckets must be within [1, 255] range.
- */
- private final int buckets;
-
- /**
- * If <code>true</code>, exact suggestions are returned first, even if they are prefixes
- * of other strings in the automaton (possibly with larger weights).
- */
- private final boolean exactMatchFirst;
-
- /**
- * Finite state automaton encoding all the lookup terms. See class
- * notes for details.
- */
- private FST<Object> automaton;
-
- /**
- * An array of arcs leaving the root automaton state and encoding weights of all
- * completions in their sub-trees.
- */
- private Arc<Object> [] rootArcs;
-
- /* */
- @Override
- public void build(TermFreqIterator tfit) throws IOException {
- // Buffer the input because we will need it twice: for calculating
- // weights distribution and for the actual automata building.
- List<Entry> entries = new ArrayList<Entry>();
- while (tfit.hasNext()) {
- String term = tfit.next();
- char [] termChars = new char [term.length() + 1]; // add padding for weight.
- for (int i = 0; i < term.length(); i++)
- termChars[i + 1] = term.charAt(i);
- entries.add(new Entry(termChars, tfit.freq()));
- }
-
- // Distribute weights into at most N buckets. This is a form of discretization to
- // limit the number of possible weights so that they can be efficiently encoded in the
- // automaton.
- //
- // It is assumed the distribution of weights is _linear_ so proportional division
- // of [min, max] range will be enough here. Other approaches could be to sort
- // weights and divide into proportional ranges.
- if (entries.size() > 0) {
- redistributeWeightsProportionalMinMax(entries, buckets);
- encodeWeightPrefix(entries);
- }
-
- // Build the automaton (includes input sorting) and cache root arcs in order from the highest,
- // to the lowest weight.
- this.automaton = buildAutomaton(entries);
- cacheRootArcs();
- }
-
- /**
- * Cache the root node's output arcs starting with completions with the highest weights.
- */
- @SuppressWarnings("unchecked")
- private void cacheRootArcs() throws IOException {
- if (automaton != null) {
- List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
- Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
- automaton.readFirstTargetArc(arc, arc);
- while (true) {
- rootArcs.add(new Arc<Object>().copyFrom(arc));
- if (arc.isLast())
- break;
- automaton.readNextArc(arc);
- }
-
- Collections.reverse(rootArcs); // we want highest weights first.
- this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]);
- }
- }
-
- /**
- * Not implemented.
- */
- @Override
- public boolean add(String key, Object value) {
- // This implementation does not support ad-hoc additions (all input
- // must be sorted for the builder).
- return false;
- }
-
- /**
- * Get the (approximated) weight of a single key (if there is a perfect match
- * for it in the automaton).
- *
- * @return Returns the approximated weight of the input key or <code>null</code>
- * if not found.
- */
- @Override
- public Float get(String key) {
- return getExactMatchStartingFromRootArc(0, key);
- }
-
- /**
- * Returns the first exact match by traversing root arcs, starting from
- * the arc <code>i</code>.
- *
- * @param i The first root arc index in {@link #rootArcs} to consider when
- * matching.
- */
- private Float getExactMatchStartingFromRootArc(int i, String key) {
- // Get the UTF-8 bytes representation of the input key.
- try {
- final FST.Arc<Object> scratch = new FST.Arc<Object>();
- for (; i < rootArcs.length; i++) {
- final FST.Arc<Object> rootArc = rootArcs[i];
- final FST.Arc<Object> arc = scratch.copyFrom(rootArc);
-
- // Descend into the automaton using the key as prefix.
- if (descendWithPrefix(arc, key)) {
- automaton.readFirstTargetArc(arc, arc);
- if (arc.label == FST.END_LABEL) {
- // Prefix-encoded weight.
- return rootArc.label / (float) buckets;
- }
- }
- }
- } catch (IOException e) {
- // Should never happen, but anyway.
- throw new RuntimeException(e);
- }
-
- return null;
- }
-
- /**
- * Lookup autocomplete suggestions to <code>key</code>.
- *
- * @param key The prefix to which suggestions should be sought.
- * @param onlyMorePopular Return most popular suggestions first. This is the default
- * behavior for this implementation. Setting it to <code>false</code> has no effect (use
- * constant term weights to sort alphabetically only).
- * @param num At most this number of suggestions will be returned.
- * @return Returns the suggestions, sorted by their approximated weight first (decreasing)
- * and then alphabetically (utf16 codepoint order).
- */
- @Override
- public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {
- if (key.length() == 0 || automaton == null) {
- // Keep the result an ArrayList to keep calls monomorphic.
- return EMPTY_RESULT;
- }
-
- try {
- if (!onlyMorePopular && rootArcs.length > 1) {
- // We could emit a warning here (?). An optimal strategy for alphabetically sorted
- // suggestions would be to add them with a constant weight -- this saves unnecessary
- // traversals and sorting.
- return lookupSortedAlphabetically(key, num);
- } else {
- return lookupSortedByWeight(key, num, true);
- }
- } catch (IOException e) {
- // Should never happen, but anyway.
- throw new RuntimeException(e);
- }
- }
-
- /**
- * Lookup suggestions sorted alphabetically <b>if weights are not constant</b>. This
- * is a workaround: in general, use constant weights for alphabetically sorted result.
- */
- private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException {
- // Greedily get num results from each weight branch.
- List<LookupResult> res = lookupSortedByWeight(key, num, false);
-
- // Sort and trim.
- Collections.sort(res, new Comparator<LookupResult>() {
- // not till java6 @Override
- public int compare(LookupResult o1, LookupResult o2) {
- return o1.key.compareTo(o2.key);
- }
- });
- if (res.size() > num) {
- res = res.subList(0, num);
- }
- return res;
- }
-
- /**
- * Lookup suggestions sorted by weight (descending order).
- *
- * @param greedy If <code>true</code>, the routine terminates immediately when <code>num</code>
- * suggestions have been collected. If <code>false</code>, it will collect suggestions from
- * all weight arcs (needed for {@link #lookupSortedAlphabetically}.
- */
- private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean greedy) throws IOException {
- final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num));
- final StringBuilder output = new StringBuilder(key);
- final int matchLength = key.length() - 1;
-
- for (int i = 0; i < rootArcs.length; i++) {
- final FST.Arc<Object> rootArc = rootArcs[i];
- final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);
-
- // Descend into the automaton using the key as prefix.
- if (descendWithPrefix(arc, key)) {
- // Prefix-encoded weight.
- final float weight = rootArc.label / (float) buckets;
-
- // A subgraph starting from the current node has the completions
- // of the key prefix. The arc we're at is the last key's byte,
- // so we will collect it too.
- output.setLength(matchLength);
- if (collect(res, num, weight, output, arc) && greedy) {
- // We have enough suggestion to return immediately. Keep on looking for an
- // exact match, if requested.
- if (exactMatchFirst) {
- Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key);
- if (exactMatchWeight != null) {
- res.add(0, new LookupResult(key, exactMatchWeight));
- while (res.size() > num) {
- res.remove(res.size() - 1);
- }
- }
- }
- break;
- }
- }
- }
- return res;
- }
-
- /**
- * Descend along the path starting at <code>arc</code> and going through
- * bytes in <code>utf8</code> argument.
- *
- * @param arc The starting arc. This argument is modified in-place.
- * @param term The term to descend with.
- * @return If <code>true</code>, <code>arc</code> will be set to the arc matching
- * last byte of <code>utf8</code>. <code>false</code> is returned if no such
- * prefix <code>utf8</code> exists.
- */
- private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException {
- final int max = term.length();
-
- for (int i = 0; i < max; i++) {
- if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) {
- // No matching prefixes, return an empty result.
- return false;
- }
- }
-
- return true;
- }
-
- /**
- * Recursive collect lookup results from the automaton subgraph starting at <code>arc</code>.
- *
- * @param num Maximum number of results needed (early termination).
- * @param weight Weight of all results found during this collection.
- */
- private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException {
- output.append((char) arc.label);
-
- automaton.readFirstTargetArc(arc, arc);
- while (true) {
- if (arc.label == FST.END_LABEL) {
- res.add(new LookupResult(output.toString(), weight));
- if (res.size() >= num)
- return true;
- } else {
- int save = output.length();
- if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) {
- return true;
- }
- output.setLength(save);
- }
-
- if (arc.isLast()) {
- break;
- }
- automaton.readNextArc(arc);
- }
- return false;
- }
-
- /**
- * Builds the final automaton from a list of entries.
- */
- private FST<Object> buildAutomaton(List<Entry> entries) throws IOException {
- if (entries.size() == 0)
- return null;
-
- // Sort by utf16 (raw char value)
- final Comparator<Entry> comp = new Comparator<Entry>() {
- public int compare(Entry o1, Entry o2) {
- char [] ch1 = o1.term;
- char [] ch2 = o2.term;
- int len1 = ch1.length;
- int len2 = ch2.length;
-
- int max = Math.min(len1, len2);
- for (int i = 0; i < max; i++) {
- int v = ch1[i] - ch2[i];
- if (v != 0) return v;
- }
- return len1 - len2;
- }
- };
- Collections.sort(entries, comp);
-
- // Avoid duplicated identical entries, if possible. This is required because
- // it breaks automaton construction otherwise.
- int len = entries.size();
- int j = 0;
- for (int i = 1; i < len; i++) {
- if (comp.compare(entries.get(j), entries.get(i)) != 0) {
- entries.set(++j, entries.get(i));
- }
- }
- entries = entries.subList(0, j + 1);
-
- // Build the automaton.
- final Outputs<Object> outputs = NoOutputs.getSingleton();
- final Object empty = outputs.getNoOutput();
- final Builder<Object> builder =
- new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs);
- final IntsRef scratchIntsRef = new IntsRef(10);
- for (Entry e : entries) {
- final int termLength = scratchIntsRef.length = e.term.length;
-
- scratchIntsRef.grow(termLength);
- final int [] ints = scratchIntsRef.ints;
- final char [] chars = e.term;
- for (int i = termLength; --i >= 0;) {
- ints[i] = chars[i];
- }
- builder.add(scratchIntsRef, empty);
- }
- return builder.finish();
- }
-
- /**
- * Prepends the entry's weight to each entry, encoded as a single byte, so that the
- * root automaton node fans out to all possible priorities, starting with the arc that has
- * the highest weights.
- */
- private void encodeWeightPrefix(List<Entry> entries) {
- for (Entry e : entries) {
- int weight = (int) e.weight;
- assert (weight >= 0 && weight <= buckets) :
- "Weight out of range: " + weight + " [" + buckets + "]";
-
- // There should be a single empty char reserved in front for the weight.
- e.term[0] = (char) weight;
- }
- }
-
- /**
- * Split [min, max] range into buckets, reassigning weights. Entries' weights are
- * remapped to [0, buckets] range (so, buckets + 1 buckets, actually).
- */
- private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) {
- float min = entries.get(0).weight;
- float max = min;
- for (Entry e : entries) {
- min = Math.min(e.weight, min);
- max = Math.max(e.weight, max);
- }
-
- final float range = max - min;
- for (Entry e : entries) {
- e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor()
- }
- }
-
- /**
- * Deserialization from disk.
- */
- @Override
- public synchronized boolean load(File storeDir) throws IOException {
- File data = new File(storeDir, FILENAME);
- if (!data.exists() || !data.canRead()) {
- return false;
- }
-
- InputStream is = new BufferedInputStream(new FileInputStream(data));
- try {
- this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
- cacheRootArcs();
- } finally {
- IOUtils.close(is);
- }
- return true;
- }
-
- /**
- * Serialization to disk.
- */
- @Override
- public synchronized boolean store(File storeDir) throws IOException {
- if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
- return false;
- }
-
- if (this.automaton == null)
- return false;
-
- File data = new File(storeDir, FILENAME);
- OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
- try {
- this.automaton.save(new OutputStreamDataOutput(os));
- } finally {
- IOUtils.close(os);
- }
-
- return true;
- }
-}