X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java diff --git a/lucene-java-3.4.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java b/lucene-java-3.4.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java deleted file mode 100644 index 1b7ea77..0000000 --- a/lucene-java-3.4.0/lucene/contrib/spellchecker/src/java/org/apache/lucene/search/suggest/fst/FSTLookup.java +++ /dev/null @@ -1,542 +0,0 @@ -package org.apache.lucene.search.suggest.fst; - -import java.io.BufferedInputStream; -import java.io.BufferedOutputStream; -import java.io.File; -import java.io.FileInputStream; -import java.io.FileOutputStream; -import java.io.IOException; -import java.io.InputStream; -import java.io.OutputStream; -import java.util.ArrayList; -import java.util.Collections; -import java.util.Comparator; -import java.util.List; - -import org.apache.lucene.util.IOUtils; -import org.apache.lucene.util.IntsRef; -import org.apache.lucene.util.fst.Builder; -import org.apache.lucene.util.fst.FST; -import org.apache.lucene.util.fst.FST.Arc; -import org.apache.lucene.util.fst.NoOutputs; -import org.apache.lucene.util.fst.Outputs; - -import org.apache.lucene.search.suggest.Lookup; -import org.apache.lucene.search.suggest.tst.TSTLookup; -import org.apache.lucene.search.spell.TermFreqIterator; -import org.apache.lucene.store.InputStreamDataInput; -import org.apache.lucene.store.OutputStreamDataOutput; - -/** - * Finite state automata based implementation of {@link Lookup} query - * suggestion/ autocomplete interface. - * - *

Implementation details

- * - *

The construction step in {@link #build(TermFreqIterator)} works as follows: - *

- * - *

At runtime, in {@link #lookup(String, boolean, int)}, the automaton is utilized as follows: - *

- * - *

Runtime behavior and performance characteristic

- * - *

The algorithm described above is optimized for finding suggestions to short prefixes - * in a top-weights-first order. This is probably the most common use case: it allows - * presenting suggestions early and sorts them by the global frequency (and then alphabetically). - * - *

If there is an exact match in the automaton, it is returned first on the results - * list (even with by-weight sorting). - * - *

Note that the maximum lookup time for any prefix - * is the time of descending to the subtree, plus traversal of the subtree up to the number - * of requested suggestions (because they are already presorted by weight on the root level - * and alphabetically at any node level). - * - *

To order alphabetically only (no ordering by priorities), use identical term weights - * for all terms. Alphabetical suggestions are returned even if non-constant weights are - * used, but the algorithm for doing this is suboptimal. - * - *

"alphabetically" in any of the documentation above indicates utf16 codepoint order, - * nothing else. - */ -public class FSTLookup extends Lookup { - - public FSTLookup() { - this(10, true); - } - - public FSTLookup(int buckets, boolean exactMatchFirst) { - this.buckets = buckets; - this.exactMatchFirst = exactMatchFirst; - } - - /** A structure for a single entry (for sorting/ preprocessing). */ - private static class Entry { - char [] term; - float weight; - - public Entry(char [] term, float freq) { - this.term = term; - this.weight = freq; - } - } - - /** Serialized automaton file name (storage). */ - public static final String FILENAME = "fst.dat"; - - /** An empty result. */ - private static final List EMPTY_RESULT = Collections.emptyList(); - - /** - * The number of separate buckets for weights (discretization). The more buckets, - * the more fine-grained term weights (priorities) can be assigned. The speed of lookup - * will not decrease for prefixes which have highly-weighted completions (because these - * are filled-in first), but will decrease significantly for low-weighted terms (but - * these should be infrequent, so it is all right). - * - *

The number of buckets must be within [1, 255] range. - */ - private final int buckets; - - /** - * If true, exact suggestions are returned first, even if they are prefixes - * of other strings in the automaton (possibly with larger weights). - */ - private final boolean exactMatchFirst; - - /** - * Finite state automaton encoding all the lookup terms. See class - * notes for details. - */ - private FST automaton; - - /** - * An array of arcs leaving the root automaton state and encoding weights of all - * completions in their sub-trees. - */ - private Arc [] rootArcs; - - /* */ - @Override - public void build(TermFreqIterator tfit) throws IOException { - // Buffer the input because we will need it twice: for calculating - // weights distribution and for the actual automata building. - List entries = new ArrayList(); - while (tfit.hasNext()) { - String term = tfit.next(); - char [] termChars = new char [term.length() + 1]; // add padding for weight. - for (int i = 0; i < term.length(); i++) - termChars[i + 1] = term.charAt(i); - entries.add(new Entry(termChars, tfit.freq())); - } - - // Distribute weights into at most N buckets. This is a form of discretization to - // limit the number of possible weights so that they can be efficiently encoded in the - // automaton. - // - // It is assumed the distribution of weights is _linear_ so proportional division - // of [min, max] range will be enough here. Other approaches could be to sort - // weights and divide into proportional ranges. - if (entries.size() > 0) { - redistributeWeightsProportionalMinMax(entries, buckets); - encodeWeightPrefix(entries); - } - - // Build the automaton (includes input sorting) and cache root arcs in order from the highest, - // to the lowest weight. - this.automaton = buildAutomaton(entries); - cacheRootArcs(); - } - - /** - * Cache the root node's output arcs starting with completions with the highest weights. - */ - @SuppressWarnings("unchecked") - private void cacheRootArcs() throws IOException { - if (automaton != null) { - List> rootArcs = new ArrayList>(); - Arc arc = automaton.getFirstArc(new Arc()); - automaton.readFirstTargetArc(arc, arc); - while (true) { - rootArcs.add(new Arc().copyFrom(arc)); - if (arc.isLast()) - break; - automaton.readNextArc(arc); - } - - Collections.reverse(rootArcs); // we want highest weights first. - this.rootArcs = rootArcs.toArray(new Arc[rootArcs.size()]); - } - } - - /** - * Not implemented. - */ - @Override - public boolean add(String key, Object value) { - // This implementation does not support ad-hoc additions (all input - // must be sorted for the builder). - return false; - } - - /** - * Get the (approximated) weight of a single key (if there is a perfect match - * for it in the automaton). - * - * @return Returns the approximated weight of the input key or null - * if not found. - */ - @Override - public Float get(String key) { - return getExactMatchStartingFromRootArc(0, key); - } - - /** - * Returns the first exact match by traversing root arcs, starting from - * the arc i. - * - * @param i The first root arc index in {@link #rootArcs} to consider when - * matching. - */ - private Float getExactMatchStartingFromRootArc(int i, String key) { - // Get the UTF-8 bytes representation of the input key. - try { - final FST.Arc scratch = new FST.Arc(); - for (; i < rootArcs.length; i++) { - final FST.Arc rootArc = rootArcs[i]; - final FST.Arc arc = scratch.copyFrom(rootArc); - - // Descend into the automaton using the key as prefix. - if (descendWithPrefix(arc, key)) { - automaton.readFirstTargetArc(arc, arc); - if (arc.label == FST.END_LABEL) { - // Prefix-encoded weight. - return rootArc.label / (float) buckets; - } - } - } - } catch (IOException e) { - // Should never happen, but anyway. - throw new RuntimeException(e); - } - - return null; - } - - /** - * Lookup autocomplete suggestions to key. - * - * @param key The prefix to which suggestions should be sought. - * @param onlyMorePopular Return most popular suggestions first. This is the default - * behavior for this implementation. Setting it to false has no effect (use - * constant term weights to sort alphabetically only). - * @param num At most this number of suggestions will be returned. - * @return Returns the suggestions, sorted by their approximated weight first (decreasing) - * and then alphabetically (utf16 codepoint order). - */ - @Override - public List lookup(String key, boolean onlyMorePopular, int num) { - if (key.length() == 0 || automaton == null) { - // Keep the result an ArrayList to keep calls monomorphic. - return EMPTY_RESULT; - } - - try { - if (!onlyMorePopular && rootArcs.length > 1) { - // We could emit a warning here (?). An optimal strategy for alphabetically sorted - // suggestions would be to add them with a constant weight -- this saves unnecessary - // traversals and sorting. - return lookupSortedAlphabetically(key, num); - } else { - return lookupSortedByWeight(key, num, true); - } - } catch (IOException e) { - // Should never happen, but anyway. - throw new RuntimeException(e); - } - } - - /** - * Lookup suggestions sorted alphabetically if weights are not constant. This - * is a workaround: in general, use constant weights for alphabetically sorted result. - */ - private List lookupSortedAlphabetically(String key, int num) throws IOException { - // Greedily get num results from each weight branch. - List res = lookupSortedByWeight(key, num, false); - - // Sort and trim. - Collections.sort(res, new Comparator() { - // not till java6 @Override - public int compare(LookupResult o1, LookupResult o2) { - return o1.key.compareTo(o2.key); - } - }); - if (res.size() > num) { - res = res.subList(0, num); - } - return res; - } - - /** - * Lookup suggestions sorted by weight (descending order). - * - * @param greedy If true, the routine terminates immediately when num - * suggestions have been collected. If false, it will collect suggestions from - * all weight arcs (needed for {@link #lookupSortedAlphabetically}. - */ - private ArrayList lookupSortedByWeight(String key, int num, boolean greedy) throws IOException { - final ArrayList res = new ArrayList(Math.min(10, num)); - final StringBuilder output = new StringBuilder(key); - final int matchLength = key.length() - 1; - - for (int i = 0; i < rootArcs.length; i++) { - final FST.Arc rootArc = rootArcs[i]; - final FST.Arc arc = new FST.Arc().copyFrom(rootArc); - - // Descend into the automaton using the key as prefix. - if (descendWithPrefix(arc, key)) { - // Prefix-encoded weight. - final float weight = rootArc.label / (float) buckets; - - // A subgraph starting from the current node has the completions - // of the key prefix. The arc we're at is the last key's byte, - // so we will collect it too. - output.setLength(matchLength); - if (collect(res, num, weight, output, arc) && greedy) { - // We have enough suggestion to return immediately. Keep on looking for an - // exact match, if requested. - if (exactMatchFirst) { - Float exactMatchWeight = getExactMatchStartingFromRootArc(i, key); - if (exactMatchWeight != null) { - res.add(0, new LookupResult(key, exactMatchWeight)); - while (res.size() > num) { - res.remove(res.size() - 1); - } - } - } - break; - } - } - } - return res; - } - - /** - * Descend along the path starting at arc and going through - * bytes in utf8 argument. - * - * @param arc The starting arc. This argument is modified in-place. - * @param term The term to descend with. - * @return If true, arc will be set to the arc matching - * last byte of utf8. false is returned if no such - * prefix utf8 exists. - */ - private boolean descendWithPrefix(Arc arc, String term) throws IOException { - final int max = term.length(); - - for (int i = 0; i < max; i++) { - if (automaton.findTargetArc(term.charAt(i) & 0xffff, arc, arc) == null) { - // No matching prefixes, return an empty result. - return false; - } - } - - return true; - } - - /** - * Recursive collect lookup results from the automaton subgraph starting at arc. - * - * @param num Maximum number of results needed (early termination). - * @param weight Weight of all results found during this collection. - */ - private boolean collect(List res, int num, float weight, StringBuilder output, Arc arc) throws IOException { - output.append((char) arc.label); - - automaton.readFirstTargetArc(arc, arc); - while (true) { - if (arc.label == FST.END_LABEL) { - res.add(new LookupResult(output.toString(), weight)); - if (res.size() >= num) - return true; - } else { - int save = output.length(); - if (collect(res, num, weight, output, new Arc().copyFrom(arc))) { - return true; - } - output.setLength(save); - } - - if (arc.isLast()) { - break; - } - automaton.readNextArc(arc); - } - return false; - } - - /** - * Builds the final automaton from a list of entries. - */ - private FST buildAutomaton(List entries) throws IOException { - if (entries.size() == 0) - return null; - - // Sort by utf16 (raw char value) - final Comparator comp = new Comparator() { - public int compare(Entry o1, Entry o2) { - char [] ch1 = o1.term; - char [] ch2 = o2.term; - int len1 = ch1.length; - int len2 = ch2.length; - - int max = Math.min(len1, len2); - for (int i = 0; i < max; i++) { - int v = ch1[i] - ch2[i]; - if (v != 0) return v; - } - return len1 - len2; - } - }; - Collections.sort(entries, comp); - - // Avoid duplicated identical entries, if possible. This is required because - // it breaks automaton construction otherwise. - int len = entries.size(); - int j = 0; - for (int i = 1; i < len; i++) { - if (comp.compare(entries.get(j), entries.get(i)) != 0) { - entries.set(++j, entries.get(i)); - } - } - entries = entries.subList(0, j + 1); - - // Build the automaton. - final Outputs outputs = NoOutputs.getSingleton(); - final Object empty = outputs.getNoOutput(); - final Builder builder = - new Builder(FST.INPUT_TYPE.BYTE4, outputs); - final IntsRef scratchIntsRef = new IntsRef(10); - for (Entry e : entries) { - final int termLength = scratchIntsRef.length = e.term.length; - - scratchIntsRef.grow(termLength); - final int [] ints = scratchIntsRef.ints; - final char [] chars = e.term; - for (int i = termLength; --i >= 0;) { - ints[i] = chars[i]; - } - builder.add(scratchIntsRef, empty); - } - return builder.finish(); - } - - /** - * Prepends the entry's weight to each entry, encoded as a single byte, so that the - * root automaton node fans out to all possible priorities, starting with the arc that has - * the highest weights. - */ - private void encodeWeightPrefix(List entries) { - for (Entry e : entries) { - int weight = (int) e.weight; - assert (weight >= 0 && weight <= buckets) : - "Weight out of range: " + weight + " [" + buckets + "]"; - - // There should be a single empty char reserved in front for the weight. - e.term[0] = (char) weight; - } - } - - /** - * Split [min, max] range into buckets, reassigning weights. Entries' weights are - * remapped to [0, buckets] range (so, buckets + 1 buckets, actually). - */ - private void redistributeWeightsProportionalMinMax(List entries, int buckets) { - float min = entries.get(0).weight; - float max = min; - for (Entry e : entries) { - min = Math.min(e.weight, min); - max = Math.max(e.weight, max); - } - - final float range = max - min; - for (Entry e : entries) { - e.weight = (int) (buckets * ((e.weight - min) / range)); // int cast equiv. to floor() - } - } - - /** - * Deserialization from disk. - */ - @Override - public synchronized boolean load(File storeDir) throws IOException { - File data = new File(storeDir, FILENAME); - if (!data.exists() || !data.canRead()) { - return false; - } - - InputStream is = new BufferedInputStream(new FileInputStream(data)); - try { - this.automaton = new FST(new InputStreamDataInput(is), NoOutputs.getSingleton()); - cacheRootArcs(); - } finally { - IOUtils.close(is); - } - return true; - } - - /** - * Serialization to disk. - */ - @Override - public synchronized boolean store(File storeDir) throws IOException { - if (!storeDir.exists() || !storeDir.isDirectory() || !storeDir.canWrite()) { - return false; - } - - if (this.automaton == null) - return false; - - File data = new File(storeDir, FILENAME); - OutputStream os = new BufferedOutputStream(new FileOutputStream(data)); - try { - this.automaton.save(new OutputStreamDataOutput(os)); - } finally { - IOUtils.close(os); - } - - return true; - } -}