1 package org.apache.lucene.search.suggest.fst;
import java.io.BufferedInputStream;
import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileInputStream;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStream;
import java.io.OutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.Comparator;
import java.util.List;

import org.apache.lucene.search.spell.TermFreqIterator;
import org.apache.lucene.search.suggest.Lookup;
import org.apache.lucene.search.suggest.tst.TSTLookup;
import org.apache.lucene.store.InputStreamDataInput;
import org.apache.lucene.store.OutputStreamDataOutput;
import org.apache.lucene.util.IOUtils;
import org.apache.lucene.util.IntsRef;
import org.apache.lucene.util.fst.Builder;
import org.apache.lucene.util.fst.FST;
import org.apache.lucene.util.fst.FST.Arc;
import org.apache.lucene.util.fst.NoOutputs;
import org.apache.lucene.util.fst.Outputs;
31 * Finite state automata based implementation of {@link Lookup} query
32 * suggestion/ autocomplete interface.
34 * <h2>Implementation details</h2>
36 * <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
38 * <li>A set of input terms (String) and weights (float) is given.</li>
39 * <li>The range of weights is determined and then all weights are discretized into a fixed set
40 * of values ({@link #buckets}).
41 * Note that this means that minor changes in weights may be lost during automaton construction.
42 * In general, this is not a big problem because the "priorities" of completions can be split
43 * into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
 * If you need exact, fine-grained weights, use {@link TSTLookup} instead.</li>
 * <li>All terms in the input are prepended with a synthetic pseudo-character being the weight
46 * of that term. For example a term <code>abc</code> with a discretized weight equal '1' would
47 * become <code>1abc</code>.</li>
48 * <li>The terms are sorted by their raw value of utf16 character values (including the synthetic
49 * term in front).</li>
50 * <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
51 * arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li>
54 * <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
56 * <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
57 * starting with the highest one, we descend along the path of the input key. If the key is not
58 * a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
60 * <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
61 * subautomaton (all paths) starting from this node form the key's completions.</b> We start
62 * the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
63 * suggestion to the list of results (the weight of this suggestion is constant and equal to the
64 * root path we started from). The tricky part is that because automaton edges are sorted and
65 * we scan depth-first, we can terminate the entire procedure as soon as we collect enough
66 * suggestions the user requested.
67 * <li>In case the number of suggestions collected in the step above is still insufficient,
68 * we proceed to the next (smaller) weight leaving the root node and repeat the same
 * <h2>Runtime behavior and performance characteristics</h2>
75 * <p>The algorithm described above is optimized for finding suggestions to short prefixes
76 * in a top-weights-first order. This is probably the most common use case: it allows
77 * presenting suggestions early and sorts them by the global frequency (and then alphabetically).
79 * <p>If there is an exact match in the automaton, it is returned first on the results
80 * list (even with by-weight sorting).
82 * <p>Note that the maximum lookup time for <b>any prefix</b>
83 * is the time of descending to the subtree, plus traversal of the subtree up to the number
84 * of requested suggestions (because they are already presorted by weight on the root level
85 * and alphabetically at any node level).
87 * <p>To order alphabetically only (no ordering by priorities), use identical term weights
88 * for all terms. Alphabetical suggestions are returned even if non-constant weights are
89 * used, but the algorithm for doing this is suboptimal.
91 * <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
94 public class FSTLookup extends Lookup {
100 public FSTLookup(int buckets, boolean exactMatchFirst) {
101 this.buckets = buckets;
102 this.exactMatchFirst = exactMatchFirst;
105 /** A structure for a single entry (for sorting/ preprocessing). */
106 private static class Entry {
110 public Entry(char [] term, float freq) {
116 /** Serialized automaton file name (storage). */
117 public static final String FILENAME = "fst.dat";
119 /** An empty result. */
120 private static final List<LookupResult> EMPTY_RESULT = Collections.emptyList();
123 * The number of separate buckets for weights (discretization). The more buckets,
124 * the more fine-grained term weights (priorities) can be assigned. The speed of lookup
125 * will not decrease for prefixes which have highly-weighted completions (because these
126 * are filled-in first), but will decrease significantly for low-weighted terms (but
127 * these should be infrequent, so it is all right).
129 * <p>The number of buckets must be within [1, 255] range.
131 private final int buckets;
134 * If <code>true</code>, exact suggestions are returned first, even if they are prefixes
135 * of other strings in the automaton (possibly with larger weights).
137 private final boolean exactMatchFirst;
140 * Finite state automaton encoding all the lookup terms. See class
143 private FST<Object> automaton;
146 * An array of arcs leaving the root automaton state and encoding weights of all
147 * completions in their sub-trees.
149 private Arc<Object> [] rootArcs;
153 public void build(TermFreqIterator tfit) throws IOException {
154 // Buffer the input because we will need it twice: for calculating
155 // weights distribution and for the actual automata building.
156 List<Entry> entries = new ArrayList<Entry>();
157 while (tfit.hasNext()) {
158 String term = tfit.next();
159 char [] termChars = new char [term.length() + 1]; // add padding for weight.
160 for (int i = 0; i < term.length(); i++)
161 termChars[i + 1] = term.charAt(i);
162 entries.add(new Entry(termChars, tfit.freq()));
165 // Distribute weights into at most N buckets. This is a form of discretization to
166 // limit the number of possible weights so that they can be efficiently encoded in the
169 // It is assumed the distribution of weights is _linear_ so proportional division
170 // of [min, max] range will be enough here. Other approaches could be to sort
171 // weights and divide into proportional ranges.
172 if (entries.size() > 0) {
173 redistributeWeightsProportionalMinMax(entries, buckets);
174 encodeWeightPrefix(entries);
177 // Build the automaton (includes input sorting) and cache root arcs in order from the highest,
178 // to the lowest weight.
179 this.automaton = buildAutomaton(entries);
184 * Cache the root node's output arcs starting with completions with the highest weights.
186 @SuppressWarnings("unchecked")
187 private void cacheRootArcs() throws IOException {
188 if (automaton != null) {
189 List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
190 Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
191 automaton.readFirstTargetArc(arc, arc);
193 rootArcs.add(new Arc<Object>().copyFrom(arc));
196 automaton.readNextArc(arc);
199 Collections.reverse(rootArcs); // we want highest weights first.
200 this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]);
208 public boolean add(String key, Object value) {
209 // This implementation does not support ad-hoc additions (all input
210 // must be sorted for the builder).
215 * Get the (approximated) weight of a single key (if there is a perfect match
216 * for it in the automaton).
218 * @return Returns the approximated weight of the input key or <code>null</code>
222 public Float get(String key) {
223 return getExactMatchStartingFromRootArc(0, key);
227 * Returns the first exact match by traversing root arcs, starting from
228 * the arc <code>i</code>.
230 * @param i The first root arc index in {@link #rootArcs} to consider when
233 private Float getExactMatchStartingFromRootArc(int i, String key) {
234 // Get the UTF-8 bytes representation of the input key.
236 final FST.Arc<Object> scratch = new FST.Arc<Object>();
237 for (; i < rootArcs.length; i++) {
238 final FST.Arc<Object> rootArc = rootArcs[i];
239 final FST.Arc<Object> arc = scratch.copyFrom(rootArc);
241 // Descend into the automaton using the key as prefix.
242 if (descendWithPrefix(arc, key)) {
243 automaton.readFirstTargetArc(arc, arc);
244 if (arc.label == FST.END_LABEL) {
245 // Prefix-encoded weight.
246 return rootArc.label / (float) buckets;
250 } catch (IOException e) {
251 // Should never happen, but anyway.
252 throw new RuntimeException(e);
259 * Lookup autocomplete suggestions to <code>key</code>.
261 * @param key The prefix to which suggestions should be sought.
262 * @param onlyMorePopular Return most popular suggestions first. This is the default
263 * behavior for this implementation. Setting it to <code>false</code> has no effect (use
264 * constant term weights to sort alphabetically only).
265 * @param num At most this number of suggestions will be returned.
266 * @return Returns the suggestions, sorted by their approximated weight first (decreasing)
267 * and then alphabetically (utf16 codepoint order).
270 public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {
271 if (key.length() == 0 || automaton == null) {
272 // Keep the result an ArrayList to keep calls monomorphic.
277 if (!onlyMorePopular && rootArcs.length > 1) {
278 // We could emit a warning here (?). An optimal strategy for alphabetically sorted
279 // suggestions would be to add them with a constant weight -- this saves unnecessary
280 // traversals and sorting.
281 return lookupSortedAlphabetically(key, num);
283 return lookupSortedByWeight(key, num, false);
285 } catch (IOException e) {
286 // Should never happen, but anyway.
287 throw new RuntimeException(e);
292 * Lookup suggestions sorted alphabetically <b>if weights are not constant</b>. This
293 * is a workaround: in general, use constant weights for alphabetically sorted result.
295 private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException {
296 // Greedily get num results from each weight branch.
297 List<LookupResult> res = lookupSortedByWeight(key, num, true);
300 Collections.sort(res, new Comparator<LookupResult>() {
301 // not till java6 @Override
302 public int compare(LookupResult o1, LookupResult o2) {
303 return o1.key.compareTo(o2.key);
306 if (res.size() > num) {
307 res = res.subList(0, num);
313 * Lookup suggestions sorted by weight (descending order).
315 * @param collectAll If <code>true</code>, the routine terminates immediately when <code>num</code>
316 * suggestions have been collected. If <code>false</code>, it will collect suggestions from
317 * all weight arcs (needed for {@link #lookupSortedAlphabetically}.
319 private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean collectAll) throws IOException {
320 // Don't overallocate the results buffers. This also serves the purpose of allowing
321 // the user of this class to request all matches using Integer.MAX_VALUE as the number
323 final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num));
324 final StringBuilder output = new StringBuilder(key);
325 final int matchLength = key.length() - 1;
327 for (int i = 0; i < rootArcs.length; i++) {
328 final FST.Arc<Object> rootArc = rootArcs[i];
329 final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);
331 // Descend into the automaton using the key as prefix.
332 if (descendWithPrefix(arc, key)) {
333 // Prefix-encoded weight.
334 final float weight = rootArc.label / (float) buckets;
336 // A subgraph starting from the current node has the completions
337 // of the key prefix. The arc we're at is the last key's byte,
338 // so we will collect it too.
339 output.setLength(matchLength);
340 if (collect(res, num, weight, output, arc) && !collectAll) {
341 // We have enough suggestions to return immediately. Keep on looking for an
342 // exact match, if requested.
343 if (exactMatchFirst) {
344 if (!checkExistingAndReorder(res, key)) {
345 Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key);
346 if (exactMatchWeight != null) {
347 // Insert as the first result and truncate at num.
348 while (res.size() >= num) {
349 res.remove(res.size() - 1);
351 res.add(0, new LookupResult(key, exactMatchWeight));
363 * Checks if the list of {@link LookupResult}s already has a <code>key</code>. If so,
364 * reorders that {@link LookupResult} to the first position.
366 * @return Returns <code>true<code> if and only if <code>list</code> contained <code>key</code>.
368 private boolean checkExistingAndReorder(ArrayList<LookupResult> list, String key) {
369 // We assume list does not have duplicates (because of how the FST is created).
370 for (int i = list.size(); --i >= 0;) {
371 if (key.equals(list.get(i).key)) {
372 // Key found. Unless already at i==0, remove it and push up front so that the ordering
373 // remains identical with the exception of the exact match.
374 list.add(0, list.remove(i));
382 * Descend along the path starting at <code>arc</code> and going through
383 * bytes in <code>utf8</code> argument.
385 * @param arc The starting arc. This argument is modified in-place.
386 * @param term The term to descend with.
387 * @return If <code>true</code>, <code>arc</code> will be set to the arc matching
388 * last byte of <code>utf8</code>. <code>false</code> is returned if no such
389 * prefix <code>utf8</code> exists.
391 private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException {
392 final int max = term.length();
394 for (int i = 0; i < max; i++) {
395 if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) {
396 // No matching prefixes, return an empty result.
405 * Recursive collect lookup results from the automaton subgraph starting at <code>arc</code>.
407 * @param num Maximum number of results needed (early termination).
408 * @param weight Weight of all results found during this collection.
410 private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException {
411 output.append((char) arc.label);
413 automaton.readFirstTargetArc(arc, arc);
415 if (arc.label == FST.END_LABEL) {
416 res.add(new LookupResult(output.toString(), weight));
417 if (res.size() >= num)
420 int save = output.length();
421 if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) {
424 output.setLength(save);
430 automaton.readNextArc(arc);
436 * Builds the final automaton from a list of entries.
438 private FST<Object> buildAutomaton(List<Entry> entries) throws IOException {
439 if (entries.size() == 0)
442 // Sort by utf16 (raw char value)
443 final Comparator<Entry> comp = new Comparator<Entry>() {
444 public int compare(Entry o1, Entry o2) {
445 char [] ch1 = o1.term;
446 char [] ch2 = o2.term;
447 int len1 = ch1.length;
448 int len2 = ch2.length;
450 int max = Math.min(len1, len2);
451 for (int i = 0; i < max; i++) {
452 int v = ch1[i] - ch2[i];
453 if (v != 0) return v;
458 Collections.sort(entries, comp);
460 // Avoid duplicated identical entries, if possible. This is required because
461 // it breaks automaton construction otherwise.
462 int len = entries.size();
464 for (int i = 1; i < len; i++) {
465 if (comp.compare(entries.get(j), entries.get(i)) != 0) {
466 entries.set(++j, entries.get(i));
469 entries = entries.subList(0, j + 1);
471 // Build the automaton.
472 final Outputs<Object> outputs = NoOutputs.getSingleton();
473 final Object empty = outputs.getNoOutput();
474 final Builder<Object> builder =
475 new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs);
476 final IntsRef scratchIntsRef = new IntsRef(10);
477 for (Entry e : entries) {
478 final int termLength = scratchIntsRef.length = e.term.length;
480 scratchIntsRef.grow(termLength);
481 final int [] ints = scratchIntsRef.ints;
482 final char [] chars = e.term;
483 for (int i = termLength; --i >= 0;) {
486 builder.add(scratchIntsRef, empty);
488 return builder.finish();
492 * Prepends the entry's weight to each entry, encoded as a single byte, so that the
493 * root automaton node fans out to all possible priorities, starting with the arc that has
494 * the highest weights.
496 private void encodeWeightPrefix(List<Entry> entries) {
497 for (Entry e : entries) {
498 int weight = (int) e.weight;
499 assert (weight >= 0 && weight <= buckets) :
500 "Weight out of range: " + weight + " [" + buckets + "]";
502 // There should be a single empty char reserved in front for the weight.
503 e.term[0] = (char) weight;
508 * Split [min, max] range into buckets, reassigning weights. Entries' weights are
509 * remapped to [0, buckets] range (so, buckets + 1 buckets, actually).
511 private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) {
512 float min = entries.get(0).weight;
514 for (Entry e : entries) {
515 min = Math.min(e.weight, min);
516 max = Math.max(e.weight, max);
519 final float range = max - min;
520 for (Entry e : entries) {
521 e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor()
526 * Deserialization from disk.
529 public synchronized boolean load(File storeDir) throws IOException {
530 File data = new File(storeDir, FILENAME);
531 if (!data.exists() || !data.canRead()) {
535 InputStream is = new BufferedInputStream(new FileInputStream(data));
537 this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
546 * Serialization to disk.
549 public synchronized boolean store(File storeDir) throws IOException {
550 if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
554 if (this.automaton == null)
557 File data = new File(storeDir, FILENAME);
558 OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
560 this.automaton.save(new OutputStreamDataOutput(os));