X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java

diff --git a/lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java b/lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java
new file mode 100644
index 0000000..565db46
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java
@@ -0,0 +1,567 @@
+package org.apache.lucene.search.suggest.fst;
+
+import java.io.BufferedInputStream;
+import java.io.BufferedOutputStream;
+import java.io.File;
+import java.io.FileInputStream;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStream;
+import java.io.OutputStream;
+import java.util.ArrayList;
+import java.util.Collections;
+import java.util.Comparator;
+import java.util.List;
+
+import org.apache.lucene.util.IOUtils;
+import org.apache.lucene.util.IntsRef;
+import org.apache.lucene.util.fst.Builder;
+import org.apache.lucene.util.fst.FST;
+import org.apache.lucene.util.fst.FST.Arc;
+import org.apache.lucene.util.fst.NoOutputs;
+import org.apache.lucene.util.fst.Outputs;
+
+import org.apache.lucene.search.suggest.Lookup;
+import org.apache.lucene.search.suggest.tst.TSTLookup;
+import org.apache.lucene.search.spell.TermFreqIterator;
+import org.apache.lucene.store.InputStreamDataInput;
+import org.apache.lucene.store.OutputStreamDataOutput;
+
+/**
+ * Finite state automaton based implementation of the {@link Lookup} query
+ * suggestion/autocomplete interface.
+ *
+ * <h2>Implementation details</h2>
+ *
+ * <p>The construction step in {@link #build(TermFreqIterator)} works as follows:
+ * <ul>
+ * <li>The input terms and their weights are first buffered.</li>
+ * <li>The weights are discretized into a fixed number of buckets (see {@link #buckets}),
+ * dividing the [min, max] weight range proportionally (see the example below).</li>
+ * <li>Each term is prepended with a synthetic character carrying its bucket value; a term
+ * <code>abc</code> whose discretized weight is 1 is stored as the character value 1
+ * followed by <code>abc</code>.</li>
+ * <li>The entries are sorted, deduplicated and compiled into a finite state automaton
+ * ({@link FST}). The arcs leaving the root correspond to the weight buckets and are
+ * cached highest-weight first.</li>
+ * </ul>
+ *
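+ * <p>For instance (an illustrative calculation, not part of the API): with
+ * <code>buckets = 10</code> and input weights ranging from 1 to 100, a weight of 37 is
+ * discretized to bucket <code>(int) (10 * (37 - 1) / (100 - 1)) = 3</code>, and that bucket
+ * value becomes the term's synthetic first character.
+ *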

+ * <p>At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows:
+ * <ul>
+ * <li>The cached root arcs are visited from the highest weight to the lowest.</li>
+ * <li>For each root arc, the automaton is descended along the characters of the key. If the
+ * key is not a prefix of any term under that arc, the arc is skipped.</li>
+ * <li>Otherwise, every path through the subgraph below the node reached this way is a
+ * completion of the key. Completions are collected depth-first, and the traversal terminates
+ * as soon as the requested number of suggestions has been gathered.</li>
+ * <li>If too few suggestions have been collected, the procedure is repeated for the next
+ * (lower-weight) root arc.</li>
+ * </ul>
+ *
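+ * <p>A minimal usage sketch (illustrative only; it assumes a {@link TermFreqIterator}
+ * named <code>tfit</code> over the input terms and weights is already available):
+ * <pre>
+ * FSTLookup lookup = new FSTLookup(10, true); // 10 weight buckets, exact matches first
+ * lookup.build(tfit);
+ * List&lt;LookupResult&gt; top5 = lookup.lookup("ab", true, 5); // best 5 completions of "ab"
+ * </pre>
+ *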

+ * <h2>Runtime behavior and performance characteristics</h2>
+ *
+ * <p>The algorithm described above is optimized for finding suggestions to short prefixes
+ * in a top-weights-first order. This is probably the most common use case: it allows
+ * presenting suggestions early and sorts them by the global frequency (and then alphabetically).
+ *
+ * <p>If there is an exact match in the automaton, it is returned first on the results
+ * list (even with by-weight sorting).
+ *
+ * <p>Note that the maximum lookup time for any prefix is the time of descending to the
+ * subtree, plus traversal of the subtree up to the number of requested suggestions (because
+ * they are already presorted by weight on the root level and alphabetically at any node level).
+ *
+ * <p>To order alphabetically only (no ordering by priorities), use identical term weights
+ * for all terms. Alphabetical suggestions are returned even if non-constant weights are
+ * used, but the algorithm for doing this is suboptimal.
+ *
+ * <p>"Alphabetically" in any of the documentation above indicates UTF-16 codepoint order,
+ * nothing else.
+ */
+public class FSTLookup extends Lookup {
+
+  public FSTLookup() {
+    this(10, true);
+  }
+
+  public FSTLookup(int buckets, boolean exactMatchFirst) {
+    this.buckets = buckets;
+    this.exactMatchFirst = exactMatchFirst;
+  }
+
+  /** A structure for a single entry (for sorting/preprocessing). */
+  private static class Entry {
+    char [] term;
+    float weight;
+
+    public Entry(char [] term, float freq) {
+      this.term = term;
+      this.weight = freq;
+    }
+  }
+
+  /** Serialized automaton file name (storage). */
+  public static final String FILENAME = "fst.dat";
+
+  /** An empty result. */
+  private static final List<LookupResult> EMPTY_RESULT = Collections.emptyList();
+
+  /**
+   * The number of separate buckets for weights (discretization). The more buckets,
+   * the more fine-grained term weights (priorities) can be assigned. The speed of lookup
+   * will not decrease for prefixes which have highly-weighted completions (because these
+   * are filled-in first), but will decrease significantly for low-weighted terms (but
+   * these should be infrequent, so it is all right).
+   *
+   * <p>The number of buckets must be within the [1, 255] range.
+   */
+  private final int buckets;
+
+  /**
+   * If <code>true</code>, exact suggestions are returned first, even if they are prefixes
+   * of other strings in the automaton (possibly with larger weights).
+   */
+  private final boolean exactMatchFirst;
+
+  /**
+   * Finite state automaton encoding all the lookup terms. See class
+   * notes for details.
+   */
+  private FST<Object> automaton;
+
+  /**
+   * An array of arcs leaving the root automaton state and encoding weights of all
+   * completions in their sub-trees.
+   */
+  private Arc<Object> [] rootArcs;
+
+  /* */
+  @Override
+  public void build(TermFreqIterator tfit) throws IOException {
+    // Buffer the input because we will need it twice: for calculating
+    // the weights distribution and for the actual automaton building.
+    List<Entry> entries = new ArrayList<Entry>();
+    while (tfit.hasNext()) {
+      String term = tfit.next();
+      char [] termChars = new char [term.length() + 1]; // add padding for weight.
+      for (int i = 0; i < term.length(); i++)
+        termChars[i + 1] = term.charAt(i);
+      entries.add(new Entry(termChars, tfit.freq()));
+    }
+
+    // Distribute weights into at most N buckets. This is a form of discretization to
+    // limit the number of possible weights so that they can be efficiently encoded in the
+    // automaton.
+    //
+    // It is assumed the distribution of weights is _linear_, so a proportional division
+    // of the [min, max] range is enough here. Another approach could be to sort the
+    // weights and divide them into proportional ranges.
+    if (entries.size() > 0) {
+      redistributeWeightsProportionalMinMax(entries, buckets);
+      encodeWeightPrefix(entries);
+    }
+
+    // Build the automaton (includes input sorting) and cache the root arcs, ordered from
+    // the highest to the lowest weight.
+    this.automaton = buildAutomaton(entries);
+    cacheRootArcs();
+  }
+
+  /**
+   * Cache the root node's output arcs, starting with the completions that have the
+   * highest weights.
+   */
+  @SuppressWarnings("unchecked")
+  private void cacheRootArcs() throws IOException {
+    if (automaton != null) {
+      List<Arc<Object>> rootArcs = new ArrayList<Arc<Object>>();
+      Arc<Object> arc = automaton.getFirstArc(new Arc<Object>());
+      automaton.readFirstTargetArc(arc, arc);
+      while (true) {
+        rootArcs.add(new Arc<Object>().copyFrom(arc));
+        if (arc.isLast())
+          break;
+        automaton.readNextArc(arc);
+      }
+
+      Collections.reverse(rootArcs); // we want the highest weights first.
+      this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]);
+    }
+  }
+
+  /**
+   * Not implemented.
+   */
+  @Override
+  public boolean add(String key, Object value) {
+    // This implementation does not support ad-hoc additions (all input
+    // must be sorted for the builder).
+    return false;
+  }
+
+  /**
+   * Get the (approximated) weight of a single key (if there is a perfect match
+   * for it in the automaton).
+   *
+   * @return Returns the approximated weight of the input key or <code>null</code>
+   *         if the key is not found.
+   */
+  @Override
+  public Float get(String key) {
+    return getExactMatchStartingFromRootArc(0, key);
+  }
+
+  /**
+   * Returns the first exact match by traversing root arcs, starting from
+   * the arc at index <code>i</code>.
+   *
+   * @param i The first root arc index in {@link #rootArcs} to consider when
+   *          matching.
+   */
+  private Float getExactMatchStartingFromRootArc(int i, String key) {
+    // Look for an exact match of the key under each root arc, starting from index i.
+    try {
+      final FST.Arc<Object> scratch = new FST.Arc<Object>();
+      for (; i < rootArcs.length; i++) {
+        final FST.Arc<Object> rootArc = rootArcs[i];
+        final FST.Arc<Object> arc = scratch.copyFrom(rootArc);
+
+        // Descend into the automaton using the key as prefix.
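+        // descendWithPrefix() advances 'arc' in place along the characters of the key and
+        // returns false if the key is not a prefix of any term under this root arc.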
+        if (descendWithPrefix(arc, key)) {
+          automaton.readFirstTargetArc(arc, arc);
+          if (arc.label == FST.END_LABEL) {
+            // Prefix-encoded weight.
+            return rootArc.label / (float) buckets;
+          }
+        }
+      }
+    } catch (IOException e) {
+      // Should never happen, but anyway.
+      throw new RuntimeException(e);
+    }
+
+    return null;
+  }
+
+  /**
+   * Lookup autocomplete suggestions for <code>key</code>.
+   *
+   * @param key The prefix for which suggestions are sought.
+   * @param onlyMorePopular Return the most popular suggestions first; this is the default
+   *        behavior for this implementation. Setting it to <code>false</code> falls back to a
+   *        suboptimal alphabetically-sorted path (prefer constant term weights if only
+   *        alphabetical order is needed).
+   * @param num At most this number of suggestions will be returned.
+   * @return Returns the suggestions, sorted by their approximated weight first (decreasing)
+   *         and then alphabetically (UTF-16 codepoint order).
+   */
+  @Override
+  public List<LookupResult> lookup(String key, boolean onlyMorePopular, int num) {
+    if (key.length() == 0 || automaton == null) {
+      // Keep the result an ArrayList to keep calls monomorphic.
+      return EMPTY_RESULT;
+    }
+
+    try {
+      if (!onlyMorePopular && rootArcs.length > 1) {
+        // We could emit a warning here (?). An optimal strategy for alphabetically sorted
+        // suggestions would be to add them with a constant weight -- this saves unnecessary
+        // traversals and sorting.
+        return lookupSortedAlphabetically(key, num);
+      } else {
+        return lookupSortedByWeight(key, num, false);
+      }
+    } catch (IOException e) {
+      // Should never happen, but anyway.
+      throw new RuntimeException(e);
+    }
+  }
+
+  /**
+   * Lookup suggestions sorted alphabetically even if the weights are not constant. This
+   * is a workaround: in general, use constant weights for an alphabetically sorted result.
+   */
+  private List<LookupResult> lookupSortedAlphabetically(String key, int num) throws IOException {
+    // Greedily get num results from each weight branch.
+    List<LookupResult> res = lookupSortedByWeight(key, num, true);
+
+    // Sort and trim.
+    Collections.sort(res, new Comparator<LookupResult>() {
+      // not till java6 @Override
+      public int compare(LookupResult o1, LookupResult o2) {
+        return o1.key.compareTo(o2.key);
+      }
+    });
+    if (res.size() > num) {
+      res = res.subList(0, num);
+    }
+    return res;
+  }
+
+  /**
+   * Lookup suggestions sorted by weight (descending order).
+   *
+   * @param collectAll If <code>false</code>, the routine terminates as soon as <code>num</code>
+   *        suggestions have been collected. If <code>true</code>, it will collect suggestions
+   *        from all weight arcs (needed for {@link #lookupSortedAlphabetically}).
+   */
+  private ArrayList<LookupResult> lookupSortedByWeight(String key, int num, boolean collectAll) throws IOException {
+    // Don't overallocate the results buffers. This also serves the purpose of allowing
+    // the user of this class to request all matches using Integer.MAX_VALUE as the number
+    // of results.
+    final ArrayList<LookupResult> res = new ArrayList<LookupResult>(Math.min(10, num));
+    final StringBuilder output = new StringBuilder(key);
+    final int matchLength = key.length() - 1;
+
+    for (int i = 0; i < rootArcs.length; i++) {
+      final FST.Arc<Object> rootArc = rootArcs[i];
+      final FST.Arc<Object> arc = new FST.Arc<Object>().copyFrom(rootArc);
+
+      // Descend into the automaton using the key as prefix.
+      if (descendWithPrefix(arc, key)) {
+        // Prefix-encoded weight.
+        final float weight = rootArc.label / (float) buckets;
+
+        // A subgraph starting from the current node holds all completions of the key
+        // prefix. The arc we are standing on corresponds to the last character of the key,
+        // so we will collect it too.
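+        // collect() re-appends the label of the arc we are standing on, so truncate the
+        // shared output buffer to key.length() - 1 before collecting completions.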
+        output.setLength(matchLength);
+        if (collect(res, num, weight, output, arc) && !collectAll) {
+          // We have enough suggestions to return immediately. Keep on looking for an
+          // exact match, if requested.
+          if (exactMatchFirst) {
+            if (!checkExistingAndReorder(res, key)) {
+              Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key);
+              if (exactMatchWeight != null) {
+                // Insert as the first result and truncate at num.
+                while (res.size() >= num) {
+                  res.remove(res.size() - 1);
+                }
+                res.add(0, new LookupResult(key, exactMatchWeight));
+              }
+            }
+          }
+          break;
+        }
+      }
+    }
+    return res;
+  }
+
+  /**
+   * Checks if the list of {@link LookupResult}s already contains <code>key</code>. If so,
+   * reorders that {@link LookupResult} to the first position.
+   *
+   * @return Returns <code>true</code> if and only if <code>list</code> contained <code>key</code>.
+   */
+  private boolean checkExistingAndReorder(ArrayList<LookupResult> list, String key) {
+    // We assume list does not have duplicates (because of how the FST is created).
+    for (int i = list.size(); --i >= 0;) {
+      if (key.equals(list.get(i).key)) {
+        // Key found. Unless already at i==0, remove it and push up front so that the ordering
+        // remains identical with the exception of the exact match.
+        list.add(0, list.remove(i));
+        return true;
+      }
+    }
+    return false;
+  }
+
+  /**
+   * Descend along the path starting at <code>arc</code> and going through the characters
+   * of <code>term</code>.
+   *
+   * @param arc The starting arc. This argument is modified in-place.
+   * @param term The term to descend along.
+   * @return Returns <code>true</code> if <code>term</code> is a prefix of at least one term
+   *         in the automaton; <code>arc</code> is then set to the arc matching the last
+   *         character of <code>term</code>. Returns <code>false</code> if no such prefix exists.
+   */
+  private boolean descendWithPrefix(Arc<Object> arc, String term) throws IOException {
+    final int max = term.length();
+
+    for (int i = 0; i < max; i++) {
+      if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) {
+        // No matching prefixes, return an empty result.
+        return false;
+      }
+    }
+
+    return true;
+  }
+
+  /**
+   * Recursively collects lookup results from the automaton subgraph starting at <code>arc</code>.
+   *
+   * @param num Maximum number of results needed (early termination).
+   * @param weight Weight of all results found during this collection.
+   */
+  private boolean collect(List<LookupResult> res, int num, float weight, StringBuilder output, Arc<Object> arc) throws IOException {
+    output.append((char) arc.label);
+
+    automaton.readFirstTargetArc(arc, arc);
+    while (true) {
+      if (arc.label == FST.END_LABEL) {
+        res.add(new LookupResult(output.toString(), weight));
+        if (res.size() >= num)
+          return true;
+      } else {
+        int save = output.length();
+        if (collect(res, num, weight, output, new Arc<Object>().copyFrom(arc))) {
+          return true;
+        }
+        output.setLength(save);
+      }
+
+      if (arc.isLast()) {
+        break;
+      }
+      automaton.readNextArc(arc);
+    }
+    return false;
+  }
+
+  /**
+   * Builds the final automaton from a list of entries.
+   */
+  private FST<Object> buildAutomaton(List<Entry> entries) throws IOException {
+    if (entries.size() == 0)
+      return null;
+
+    // Sort by utf16 (raw char value).
+    final Comparator<Entry> comp = new Comparator<Entry>() {
+      public int compare(Entry o1, Entry o2) {
+        char [] ch1 = o1.term;
+        char [] ch2 = o2.term;
+        int len1 = ch1.length;
+        int len2 = ch2.length;
+
+        int max = Math.min(len1, len2);
+        for (int i = 0; i < max; i++) {
+          int v = ch1[i] - ch2[i];
+          if (v != 0) return v;
+        }
+        return len1 - len2;
+      }
+    };
+    Collections.sort(entries, comp);
+
+    // Remove duplicated entries; duplicates would otherwise break automaton construction.
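+    // In-place deduplication over the sorted list: keep the first element of each run of
+    // equal entries and trim the list to the surviving prefix.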
+    int len = entries.size();
+    int j = 0;
+    for (int i = 1; i < len; i++) {
+      if (comp.compare(entries.get(j), entries.get(i)) != 0) {
+        entries.set(++j, entries.get(i));
+      }
+    }
+    entries = entries.subList(0, j + 1);
+
+    // Build the automaton.
+    final Outputs<Object> outputs = NoOutputs.getSingleton();
+    final Object empty = outputs.getNoOutput();
+    final Builder<Object> builder =
+      new Builder<Object>(FST.INPUT_TYPE.BYTE4, outputs);
+    final IntsRef scratchIntsRef = new IntsRef(10);
+    for (Entry e : entries) {
+      final int termLength = scratchIntsRef.length = e.term.length;
+
+      scratchIntsRef.grow(termLength);
+      final int [] ints = scratchIntsRef.ints;
+      final char [] chars = e.term;
+      for (int i = termLength; --i >= 0;) {
+        ints[i] = chars[i];
+      }
+      builder.add(scratchIntsRef, empty);
+    }
+    return builder.finish();
+  }
+
+  /**
+   * Prepends the entry's weight to each entry, encoded as a single character, so that the
+   * root automaton node fans out to all possible priorities, starting with the arc that has
+   * the highest weight.
+   */
+  private void encodeWeightPrefix(List<Entry> entries) {
+    for (Entry e : entries) {
+      int weight = (int) e.weight;
+      assert (weight >= 0 && weight <= buckets) :
+        "Weight out of range: " + weight + " [" + buckets + "]";
+
+      // There should be a single empty char reserved in front for the weight.
+      e.term[0] = (char) weight;
+    }
+  }
+
+  /**
+   * Splits the [min, max] weight range into buckets, reassigning weights. Entries' weights are
+   * remapped to the [0, buckets] range (so, buckets + 1 buckets, actually).
+   */
+  private void redistributeWeightsProportionalMinMax(List<Entry> entries, int buckets) {
+    float min = entries.get(0).weight;
+    float max = min;
+    for (Entry e : entries) {
+      min = Math.min(e.weight, min);
+      max = Math.max(e.weight, max);
+    }
+
+    final float range = max - min;
+    for (Entry e : entries) {
+      e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor()
+    }
+  }
+
+  /**
+   * Deserialization from disk.
+   */
+  @Override
+  public synchronized boolean load(File storeDir) throws IOException {
+    File data = new File(storeDir, FILENAME);
+    if (!data.exists() || !data.canRead()) {
+      return false;
+    }
+
+    InputStream is = new BufferedInputStream(new FileInputStream(data));
+    try {
+      this.automaton = new FST<Object>(new InputStreamDataInput(is), NoOutputs.getSingleton());
+      cacheRootArcs();
+    } finally {
+      IOUtils.close(is);
+    }
+    return true;
+  }
+
+  /**
+   * Serialization to disk.
+   */
+  @Override
+  public synchronized boolean store(File storeDir) throws IOException {
+    if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) {
+      return false;
+    }
+
+    if (this.automaton == null)
+      return false;
+
+    File data = new File(storeDir, FILENAME);
+    OutputStream os = new BufferedOutputStream(new FileOutputStream(data));
+    try {
+      this.automaton.save(new OutputStreamDataOutput(os));
+    } finally {
+      IOUtils.close(os);
+    }
+
+    return true;
+  }
+}