lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java

   1 package org.apache.lucene.search.suggest.fst;
   2
   3 import java.io.BufferedInputStream;
   4 import java.io.BufferedOutputStream;
   5 import java.io.File;
   6 import java.io.FileInputStream;
   7 import java.io.FileOutputStream;
   8 import java.io.IOException;
   9 import java.io.InputStream;
  10 import java.io.OutputStream;
  11 import java.util.ArrayList;
  12 import java.util.Collections;
  13 import java.util.Comparator;
  14 import java.util.List;
  15
  16 import org.apache.lucene.util.IOUtils;
  17 import org.apache.lucene.util.IntsRef;
  18 import org.apache.lucene.util.fst.Builder;
  19 import org.apache.lucene.util.fst.FST;
  20 import org.apache.lucene.util.fst.FST.Arc;
  21 import org.apache.lucene.util.fst.NoOutputs;
  22 import org.apache.lucene.util.fst.Outputs;
  23
  24 import org.apache.lucene.search.suggest.Lookup;
  25 import org.apache.lucene.search.suggest.tst.TSTLookup;
  26 import org.apache.lucene.search.spell.TermFreqIterator;
  27 import org.apache.lucene.store.InputStreamDataInput;
  28 import org.apache.lucene.store.OutputStreamDataOutput;
  29
  30 /**
  31  * Finite state automata based implementation of {@link Lookup} query
  32  * suggestion/ autocomplete interface.
  33  *
  34  * <h2>Implementation details</h2>
  35  *
  36  * <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
  37  * <ul>
  38  * <li>A set of input terms (String) and weights (float) is given.</li>
  39  * <li>The range of weights is determined and then all weights are discretized into a fixed set
  40  * of values ({@link #buckets}).
  41  * Note that this means that minor changes in weights may be lost during automaton construction.
  42  * In general, this is not a big problem because the "priorities" of completions can be split
  43  * into a fixed set of classes (even as rough as: very frequent, frequent, baseline, marginal).
  44  * If you need exact, fine-grained weights, use {@link TSTLookup} instead.<li>
  45  * <li>All terms in the input are preprended with a synthetic pseudo-character being the weight
  46  * of that term. For example a term <code>abc</code> with a discretized weight equal '1' would
  47  * become <code>1abc</code>.</li>
  48  * <li>The terms are sorted by their raw value of utf16 character values (including the synthetic
  49  * term in front).</li>
  50  * <li>A finite state automaton ({@link FST}) is constructed from the input. The root node has
  51  * arcs labeled with all possible weights. We cache all these arcs, highest-weight first.</li>
  52  * </ul>
  53  *
  54  * <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
  55  * <ul>
  56  * <li>For each possible term weight encoded in the automaton (cached arcs from the root above),
  57  * starting with the highest one, we descend along the path of the input key. If the key is not
  58  * a prefix of a sequence in the automaton (path ends prematurely), we exit immediately.
  59  * No completions.
  60  * <li>Otherwise, we have found an internal automaton node that ends the key. <b>The entire
  61  * subautomaton (all paths) starting from this node form the key's completions.</b> We start
  62  * the traversal of this subautomaton. Every time we reach a final state (arc), we add a single
  63  * suggestion to the list of results (the weight of this suggestion is constant and equal to the
  64  * root path we started from). The tricky part is that because automaton edges are sorted and
  65  * we scan depth-first, we can terminate the entire procedure as soon as we collect enough
  66  * suggestions the user requested.
  67  * <li>In case the number of suggestions collected in the step above is still insufficient,
  68  * we proceed to the next (smaller) weight leaving the root node and repeat the same
  69  * algorithm again.
  70  * </li>
  71  * </ul>
  72  *
  73  * <h2>Runtime behavior and performance characteristic</h2>
  74  *
  75  * <p>The algorithm described above is optimized for finding suggestions to short prefixes
  76  * in a top-weights-first order. This is probably the most common use case: it allows
  77  * presenting suggestions early and sorts them by the global frequency (and then alphabetically).
  78  *
  79  * <p>If there is an exact match in the automaton, it is returned first on the results
  80  * list (even with by-weight sorting).
  81  *
  82  * <p>Note that the maximum lookup time for <b>any prefix</b>
  83  * is the time of descending to the subtree, plus traversal of the subtree up to the number
  84  * of requested suggestions (because they are already presorted by weight on the root level
  85  * and alphabetically at any node level).
  86  *
  87  * <p>To order alphabetically only (no ordering by priorities), use identical term weights
  88  * for all terms. Alphabetical suggestions are returned even if non-constant weights are
  89  * used, but the algorithm for doing this is suboptimal.
  90  *
  91  * <p>"alphabetically" in any of the documentation above indicates utf16 codepoint order,
  92  * nothing else.
  93  */
  94 public class FSTLookup extends Lookup {
  95
  96   public FSTLookup() {
  97     this(10, true);
  98   }
  99
 100   public FSTLookup(int buckets, boolean exactMatchFirst) {
 101     this.buckets = buckets;
 102     this.exactMatchFirst = exactMatchFirst;
 103   }
 104
 105   /** A structure for a single entry (for sorting/ preprocessing). */
 106   private static class Entry {
 107     char [] term;
 108     float weight;
 109
 110     public Entry(char [] term, float freq) {
 111       this.term = term;
 112       this.weight = freq;
 113     }
 114   }
 115
 116   /** Serialized automaton file name (storage). */
 117   public static final String FILENAME = "fst.dat";
 118
 119   /** An empty result. */
 120   private static final List<LookupResult> EMPTY_RESULT = Collections.emptyList();
 121
 122   /**
 123    * The number of separate buckets for weights (discretization). The more buckets,
 124    * the more fine-grained term weights (priorities) can be assigned. The speed of lookup
 125    * will not decrease for prefixes which have highly-weighted completions (because these
 126    * are filled-in first), but will decrease significantly for low-weighted terms (but
 127    * these should be infrequent, so it is all right).
 128    *
 129    * <p>The number of buckets must be within [1, 255] range.
 130    */
 131   private final int buckets;
 132
 133   /**
 134    * If <code>true</code>, exact suggestions are returned first, even if they are prefixes
 135    * of other strings in the automaton (possibly with larger weights).
 136    */
 137   private final boolean exactMatchFirst;
 138
 139   /**
 140    * Finite state automaton encoding all the lookup terms. See class
 141    * notes for details.
 142    */
 143   private FST<Object> automaton;
 144
 145   /**
 146    * An array of arcs leaving the root automaton state and encoding weights of all
 147    * completions in their sub-trees.
 148    */
 149   private Arc<Object> [] rootArcs;
 150
 151   /* */
 152   @Override
 153   public void build(TermFreqIterator tfit) throws IOException {
 154     // Buffer the input because we will need it twice: for calculating
 155     // weights distribution and for the actual automata building.
 156     List<Entry> entries = new ArrayList<Entry>();
 157     while (tfit.hasNext()) {
 158       String term = tfit.next();
 159       char [] termChars = new char [term.length() + 1]; // add padding for weight.
 160       for (int i = 0; i < term.length(); i++)
 161         termChars[i + 1] = term.charAt(i);
 162       entries.add(new Entry(termChars, tfit.freq()));
 163     }
 164
 165     // Distribute weights into at most N buckets. This is a form of discretization to
 166     // limit the number of possible weights so that they can be efficiently encoded in the
 167     // automaton.
 168     //
 169     // It is assumed the distribution of weights is _linear_ so proportional division
 170     // of [min, max] range will be enough here. Other approaches could be to sort
 171     // weights and divide into proportional ranges.
 172     if (entries.size() > 0) {
 173       redistributeWeightsProportionalMinMax(entries, buckets);
 174       encodeWeightPrefix(entries);
 175     }
 176
 177     // Build the automaton (includes input sorting) and cache root arcs in order from the highest,
 178     // to the lowest weight.
 179     this.automaton = buildAutomaton(entries);
 180     cacheRootArcs();
 181   }
 182
 183   /**
 184    * Cache the root node's output arcs starting with completions with the highest weights.
 185    */
 186   @SuppressWarnings("unchecked")
 187   private void cacheRootArcs() throws IOException {
 188     if (automaton != null) {
 189       List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
 190       Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
 191       automaton.readFirstTargetArc(arc, arc);
 192       while (true) {
 193         rootArcs.add(new Arc<Object>().copyFrom(arc));
 194         if (arc.isLast())
 195           break;
 196         automaton.readNextArc(arc);
 197       }
 198
 199       Collections.reverse(rootArcs); // we want highest weights first.
 200       this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]);
 201     }
 202   }
 203
 204   /**
 205    * Not implemented.
 206    */
 207   @Override
 208   public boolean add(String key, Object value) {
 209     // This implementation does not support ad-hoc additions (all input
 210     // must be sorted for the builder).
 211     return false;
 212   }
 213
 214   /**
 215    * Get the (approximated) weight of a single key (if there is a perfect match
 216    * for it in the automaton).
 217    *
 218    * @return Returns the approximated weight of the input key or <code>null</code>
 219    * if not found.
 220    */
 221   @Override
 222   public Float get(String key) {
 223     return getExactMatchStartingFromRootArc(0, key);
 224   }
 225
 226   /**
 227    * Returns the first exact match by traversing root arcs, starting from
 228    * the arc <code>i</code>.
 229    *
 230    * @param i The first root arc index in {@link #rootArcs} to consider when
 231    * matching.
 232    */
 233   private Float getExactMatchStartingFromRootArc(int i, String key) {
 234     // Get the UTF-8 bytes representation of the input key.
 235     try {
 236       final FST.Arc<Object> scratch = new FST.Arc<Object>();
 237       for (; i < rootArcs.length; i++) {
 238         final FST.Arc<Object> rootArc = rootArcs[i];
 239         final FST.Arc<Object> arc = scratch.copyFrom(rootArc);
 240
 241         // Descend into the automaton using the key as prefix.
 242         if (descendWithPrefix(arc, key)) {
 243           automaton.readFirstTargetArc(arc, arc);
 244           if (arc.label == FST.END_LABEL) {
 245             // Prefix-encoded weight.
 246             return rootArc.label / (float) buckets;
 247           }
 248         }
 249       }
 250     } catch (IOException e) {
 251       // Should never happen, but anyway.
 252       throw new RuntimeException(e);
 253     }
 254
 255     return null;
 256   }
 257
 258   /**
 259    * Lookup autocomplete suggestions to <code>key</code>.
 260    *
 261    * @param key The prefix to which suggestions should be sought.
 262    * @param onlyMorePopular Return most popular suggestions first. This is the default
 263    * behavior for this implementation. Setting it to <code>false</code> has no effect (use
 264    * constant term weights to sort alphabetically only).
 265    * @param num At most this number of suggestions will be returned.
 266    * @return Returns the suggestions, sorted by their approximated weight first (decreasing)
 267    * and then alphabetically (utf16 codepoint order).
 268    */
 269   @Override
 270   public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {
 271     if (key.length() == 0 || automaton == null) {
 272       // Keep the result an ArrayList to keep calls monomorphic.
 273       return EMPTY_RESULT;
 274     }
 275
 276     try {
 277       if (!onlyMorePopular && rootArcs.length > 1) {
 278         // We could emit a warning here (?). An optimal strategy for alphabetically sorted
 279         // suggestions would be to add them with a constant weight -- this saves unnecessary
 280         // traversals and sorting.
 281         return lookupSortedAlphabetically(key, num);
 282       } else {
 283         return lookupSortedByWeight(key, num, false);
 284       }
 285     } catch (IOException e) {
 286       // Should never happen, but anyway.
 287       throw new RuntimeException(e);
 288     }
 289   }
 290
 291   /**
 292    * Lookup suggestions sorted alphabetically <b>if weights are not constant</b>. This
 293    * is a workaround: in general, use constant weights for alphabetically sorted result.
 294    */
 295   private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException {
 296     // Greedily get num results from each weight branch.
 297     List<LookupResult> res = lookupSortedByWeight(key, num, true);
 298
 299     // Sort and trim.
 300     Collections.sort(res, new Comparator<LookupResult>() {
 301       // not till java6 @Override
 302       public int compare(LookupResult o1, LookupResult o2) {
 303         return o1.key.compareTo(o2.key);
 304       }
 305     });
 306     if (res.size() > num) {
 307       res = res.subList(0, num);
 308     }
 309     return res;
 310   }
 311
 312   /**
 313    * Lookup suggestions sorted by weight (descending order).
 314    *
 315    * @param collectAll If <code>true</code>, the routine terminates immediately when <code>num</code>
 316    * suggestions have been collected. If <code>false</code>, it will collect suggestions from
 317    * all weight arcs (needed for {@link #lookupSortedAlphabetically}.
 318    */
 319   private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean collectAll) throws IOException {
 320     // Don't overallocate the results buffers. This also serves the purpose of allowing
 321     // the user of this class to request all matches using Integer.MAX_VALUE as the number
 322     // of results.
 323     final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num));
 324     final StringBuilder output = new StringBuilder(key);
 325     final int matchLength = key.length() - 1;
 326
 327     for (int i = 0; i < rootArcs.length; i++) {
 328       final FST.Arc<Object> rootArc = rootArcs[i];
 329       final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);
 330
 331       // Descend into the automaton using the key as prefix.
 332       if (descendWithPrefix(arc, key)) {
 333         // Prefix-encoded weight.
 334         final float weight = rootArc.label / (float) buckets;
 335
 336         // A subgraph starting from the current node has the completions
 337         // of the key prefix. The arc we're at is the last key's byte,
 338         // so we will collect it too.
 339         output.setLength(matchLength);
 340         if (collect(res, num, weight, output, arc) && !collectAll) {
 341           // We have enough suggestions to return immediately. Keep on looking for an
 342           // exact match, if requested.
 343           if (exactMatchFirst) {
 344             if (!checkExistingAndReorder(res, key)) {
 345               Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key);
 346               if (exactMatchWeight != null) {
 347                 // Insert as the first result and truncate at num.
 348                 while (res.size() >= num) {
 349                   res.remove(res.size() - 1);
 350                 }
 351                 res.add(0, new LookupResult(key, exactMatchWeight));
 352               }
 353             }
 354           }
 355           break;
 356         }
 357       }
 358     }
 359     return res;
 360   }
 361
 362   /**
 363    * Checks if the list of {@link LookupResult}s already has a <code>key</code>. If so,
 364    * reorders that {@link LookupResult} to the first position.
 365    *
 366    * @return Returns <code>true<code> if and only if <code>list</code> contained <code>key</code>.
 367    */
 368   private boolean checkExistingAndReorder(ArrayList<LookupResult> list, String key) {
 369     // We assume list does not have duplicates (because of how the FST is created).
 370     for (int i = list.size(); --i >= 0;) {
 371       if (key.equals(list.get(i).key)) {
 372         // Key found. Unless already at i==0, remove it and push up front so that the ordering
 373         // remains identical with the exception of the exact match.
 374         list.add(0,  list.remove(i));
 375         return true;
 376       }
 377     }
 378     return false;
 379   }
 380
 381   /**
 382    * Descend along the path starting at <code>arc</code> and going through
 383    * bytes in <code>utf8</code> argument.
 384    *
 385    * @param arc The starting arc. This argument is modified in-place.
 386    * @param term The term to descend with.
 387    * @return If <code>true</code>, <code>arc</code> will be set to the arc matching
 388    * last byte of <code>utf8</code>. <code>false</code> is returned if no such
 389    * prefix <code>utf8</code> exists.
 390    */
 391   private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException {
 392     final int max = term.length();
 393
 394     for (int i = 0; i < max; i++) {
 395       if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) {
 396         // No matching prefixes, return an empty result.
 397         return false;
 398       }
 399     }
 400
 401     return true;
 402   }
 403
 404   /**
 405    * Recursive collect lookup results from the automaton subgraph starting at <code>arc</code>.
 406    *
 407    * @param num Maximum number of results needed (early termination).
 408    * @param weight Weight of all results found during this collection.
 409    */
 410   private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException {
 411     output.append((char) arc.label);
 412
 413     automaton.readFirstTargetArc(arc, arc);
 414     while (true) {
 415       if (arc.label == FST.END_LABEL) {
 416         res.add(new LookupResult(output.toString(), weight));
 417         if (res.size() >= num)
 418           return true;
 419       } else {
 420         int save = output.length();
 421         if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) {
 422           return true;
 423         }
 424         output.setLength(save);
 425       }
 426
 427       if (arc.isLast()) {
 428         break;
 429       }
 430       automaton.readNextArc(arc);
 431     }
 432     return false;
 433   }
 434
 435   /**
 436    * Builds the final automaton from a list of entries.
 437    */
 438   private FST<Object> buildAutomaton(List<Entry> entries) throws IOException {
 439     if (entries.size() == 0)
 440       return null;
 441
 442     // Sort by utf16 (raw char value)
 443     final Comparator<Entry> comp = new Comparator<Entry>() {
 444       public int compare(Entry o1, Entry o2) {
 445         char [] ch1 = o1.term;
 446         char [] ch2 = o2.term;
 447         int len1 = ch1.length;
 448         int len2 = ch2.length;
 449
 450         int max = Math.min(len1, len2);
 451         for (int i = 0; i < max; i++) {
 452           int v = ch1[i] - ch2[i];
 453           if (v != 0) return v;
 454         }
 455         return len1 - len2;
 456       }
 457     };
 458     Collections.sort(entries, comp);
 459
 460     // Avoid duplicated identical entries, if possible. This is required because
 461     // it breaks automaton construction otherwise.
 462     int len = entries.size();
 463     int j = 0;
 464     for (int i = 1; i < len; i++) {
 465       if (comp.compare(entries.get(j), entries.get(i)) != 0) {
 466         entries.set(++j, entries.get(i));
 467       }
 468     }
 469     entries = entries.subList(0, j + 1);
 470
 471     // Build the automaton.
 472     final Outputs<Object> outputs = NoOutputs.getSingleton();
 473     final Object empty = outputs.getNoOutput();
 474     final Builder<Object> builder =
 475       new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs);
 476     final IntsRef scratchIntsRef = new IntsRef(10);
 477     for (Entry e : entries) {
 478       final int termLength = scratchIntsRef.length = e.term.length;
 479
 480       scratchIntsRef.grow(termLength);
 481       final int [] ints = scratchIntsRef.ints;
 482       final char [] chars = e.term;
 483       for (int i = termLength; --i >= 0;) {
 484         ints[i] = chars[i];
 485       }
 486       builder.add(scratchIntsRef, empty);
 487     }
 488     return builder.finish();
 489   }
 490
 491   /**
 492    * Prepends the entry's weight to each entry, encoded as a single byte, so that the
 493    * root automaton node fans out to all possible priorities, starting with the arc that has
 494    * the highest weights.
 495    */
 496   private void encodeWeightPrefix(List<Entry> entries) {
 497     for (Entry e : entries) {
 498       int weight = (int) e.weight;
 499       assert (weight >= 0 && weight <= buckets) :
 500         "Weight out of range: " + weight + " [" + buckets + "]";
 501
 502       // There should be a single empty char reserved in front for the weight.
 503       e.term[0] = (char) weight;
 504     }
 505   }
 506
 507   /**
 508    *  Split [min, max] range into buckets, reassigning weights. Entries' weights are
 509    *  remapped to [0, buckets] range (so, buckets + 1 buckets, actually).
 510    */
 511   private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) {
 512     float min = entries.get(0).weight;
 513     float max = min;
 514     for (Entry e : entries) {
 515       min = Math.min(e.weight, min);
 516       max = Math.max(e.weight, max);
 517     }
 518
 519     final float range = max - min;
 520     for (Entry e : entries) {
 521       e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor()
 522     }
 523   }
 524
 525   /**
 526    * Deserialization from disk.
 527    */
 528   @Override
 529   public synchronized boolean load(File storeDir) throws IOException {
 530     File data = new File(storeDir, FILENAME);
 531     if (!data.exists() || !data.canRead()) {
 532       return false;
 533     }
 534
 535     InputStream is = new BufferedInputStream(new FileInputStream(data));
 536     try {
 537       this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
 538       cacheRootArcs();
 539     } finally {
 540       IOUtils.close(is);
 541     }
 542     return true;
 543   }
 544
 545   /**
 546    * Serialization to disk.
 547    */
 548   @Override
 549   public synchronized boolean store(File storeDir) throws IOException {
 550     if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
 551       return false;
 552     }
 553
 554     if (this.automaton == null)
 555       return false;
 556
 557     File data = new File(storeDir, FILENAME);
 558     OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
 559     try {
 560       this.automaton.save(new OutputStreamDataOutput(os));
 561     } finally {
 562       IOUtils.close(os);
 563     }
 564
 565     return true;
 566   }
 567 }