X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java

diff --git a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
deleted file mode 100644
index 9842984..0000000
--- a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java
+++ /dev/null
@@ -1,574 +0,0 @@
-package org.apache.lucene.analysis.synonym;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.store.ByteArrayDataInput;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.fst.FST;
-
-/**
- * Matches single or multi word synonyms in a token stream.
- * This token stream cannot properly handle position
- * increments != 1, ie, you should place this filter before
- * filtering out stop words.
- *
- * <p>Note that with the current implementation, parsing is
- * greedy, so whenever multiple parses would apply, the rule
- * starting the earliest and parsing the most tokens wins.
- * For example if you have these rules:
- *
- * <pre>
- *   a -> x
- *   a b -> y
- *   b c d -> z
- * </pre>
- *
- * Then input a b c d e parses to y b c
- * d, ie the 2nd rule "wins" because it started
- * earliest and matched the most input tokens of other rules
- * starting at that point.</p>
- *
- * <p>A future improvement to this filter could allow
- * non-greedy parsing, such that the 3rd rule would win, and
- * also separately allow multiple parses, such that all 3
- * rules would match, perhaps even on a rule-by-rule
- * basis.</p>
- *
- * <p>NOTE: when a match occurs, the output tokens
- * associated with the matching rule are "stacked" on top of
- * the input stream (if the rule had
- * keepOrig=true) and also on top of another
- * matched rule's output tokens. This is not a correct
- * solution, as really the output should be an arbitrary
- * graph/lattice. For example, with the above match, you
- * would expect an exact PhraseQuery "y b
- * c" to match the parsed tokens, but it will fail to
- * do so. This limitation is necessary because Lucene's
- * TokenStream (and index) cannot yet represent an arbitrary
- * graph.</p>
- *
- * <p>NOTE: If multiple incoming tokens arrive on the
- * same position, only the first token at that position is
- * used for parsing. Subsequent tokens simply pass through
- * and are not parsed. A future improvement would be to
- * allow these tokens to also be matched.</p>
- */
-
-// TODO: maybe we should resolve token -> wordID then run
-// FST on wordIDs, for better perf?
-
-// TODO: a more efficient approach would be Aho/Corasick's
-// algorithm
-// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
-// It improves over the current approach here
-// because it does not fully re-start matching at every
-// token. For example, if one pattern is "a b c x"
-// and another is "b c d" and the input is "a b c d", on
-// trying to parse "a b c x" but failing when you got to x,
-// rather than starting over again you really should
-// immediately recognize that "b c d" matches at the next
-// input. I suspect this won't matter that much in
-// practice, but it's possible on some set of synonyms it
-// will. We'd have to modify Aho/Corasick to enforce our
-// conflict resolving (eg greedy matching) because that algo
-// finds all matches.
-
-public final class SynonymFilter extends TokenFilter {
-
-  public static final String TYPE_SYNONYM = "SYNONYM";
-
-  private final SynonymMap synonyms;
-
-  private final boolean ignoreCase;
-  private final int rollBufferSize;
-
-  private int captureCount;
-
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
-  // How many future input tokens have already been matched
-  // to a synonym; because the matching is "greedy" we don't
-  // try to do any more matching for such tokens:
-  private int inputSkipCount;
-
-  // Hold all buffered (read ahead) stacked input tokens for
-  // a future position.  When multiple tokens are at the
-  // same position, we only store (and match against) the
-  // term for the first token at the position, but capture
-  // state for (and enumerate) all other tokens at this
-  // position:
-  private static class PendingInput {
-    final CharsRef term = new CharsRef();
-    AttributeSource.State state;
-    boolean keepOrig;
-    boolean matched;
-    boolean consumed = true;
-    int startOffset;
-    int endOffset;
-
-    public void reset() {
-      state = null;
-      consumed = true;
-      keepOrig = false;
-      matched = false;
-    }
-  };
-
-  // Rolling buffer, holding pending input tokens we had to
-  // clone because we needed to look ahead, indexed by
-  // position:
-  private final PendingInput[] futureInputs;
-
-  // Holds pending output synonyms for one future position:
-  private static class PendingOutputs {
-    CharsRef[] outputs;
-    int upto;
-    int count;
-    int posIncr = 1;
-
-    public PendingOutputs() {
-      outputs = new CharsRef[1];
-    }
-
-    public void reset() {
-      upto = count = 0;
-      posIncr = 1;
-    }
-
-    public CharsRef pullNext() {
-      assert upto < count;
-      final CharsRef result = outputs[upto++];
-      posIncr = 0;
-      if (upto == count) {
-        reset();
-      }
-      return result;
-    }
-
-    public void add(char[] output, int offset, int len) {
-      if (count == outputs.length) {
-        final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
-        System.arraycopy(outputs, 0, next, 0, count);
-        outputs = next;
-      }
-      if (outputs[count] == null) {
-        outputs[count] = new CharsRef();
-      }
-      outputs[count].copy(output, offset, len);
-      count++;
-    }
-  };
-
-  private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
-
-  // Rolling buffer, holding stack of pending synonym
-  // outputs, indexed by position:
-  private final PendingOutputs[] futureOutputs;
-
-  // Where (in rolling buffers) to write next input saved state:
-  private int nextWrite;
-
-  // Where (in rolling buffers) to read next input saved state:
-  private int nextRead;
-
-  // True once we've read last token
-  private boolean finished;
-
-  private final FST.Arc<BytesRef> scratchArc;
-
-  private final FST<BytesRef> fst;
-
-  private final BytesRef scratchBytes = new BytesRef();
-  private final CharsRef scratchChars = new CharsRef();
-
-  /**
-   * @param input input tokenstream
-   * @param synonyms synonym map
-   * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
-   *                   Note, if you set this to true, it's your responsibility to lowercase
-   *                   the input entries when you create the {@link SynonymMap}
-   */
-  public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
-    super(input);
-    this.synonyms = synonyms;
-    this.ignoreCase = ignoreCase;
-    this.fst = synonyms.fst;
-
-    if (fst == null) {
-      throw new IllegalArgumentException("fst must be non-null");
-    }
-
-    // Must be 1+ so that when roll buffer is at full
-    // lookahead we can distinguish this full buffer from
-    // the empty buffer:
-    rollBufferSize = 1+synonyms.maxHorizontalContext;
-
-    futureInputs = new PendingInput[rollBufferSize];
-    futureOutputs = new PendingOutputs[rollBufferSize];
-    for(int pos=0;pos<rollBufferSize;pos++) {
-      futureInputs[pos] = new PendingInput();
-      futureOutputs[pos] = new PendingOutputs();
-    }
-
-    scratchArc = new FST.Arc<BytesRef>();
-  }
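
The class Javadoc above describes the greedy matching abstractly; a short driver makes it concrete. The following is a minimal sketch, not part of the deleted file: it assumes the SynonymMap.Builder API from this same contrib package in Lucene 3.4, and the class name SynonymFilterDemo is hypothetical.

import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.WhitespaceTokenizer;
import org.apache.lucene.analysis.synonym.SynonymFilter;
import org.apache.lucene.analysis.synonym.SynonymMap;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.Version;

public class SynonymFilterDemo {
  public static void main(String[] args) throws Exception {
    // Build the three rules from the Javadoc example (keepOrig=false):
    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("a"), new CharsRef("x"), false);
    builder.add(SynonymMap.Builder.join(new String[] {"a", "b"}, new CharsRef()),
                new CharsRef("y"), false);
    builder.add(SynonymMap.Builder.join(new String[] {"b", "c", "d"}, new CharsRef()),
                new CharsRef("z"), false);
    SynonymMap map = builder.build();

    // Run "a b c d e" through the filter; the greedy parse applies
    // "a b -> y" because it starts earliest and consumes the most tokens:
    TokenStream ts = new SynonymFilter(
        new WhitespaceTokenizer(Version.LUCENE_34, new StringReader("a b c d e")),
        map, false);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      // A position increment of 0 marks a token stacked on the
      // previous position (synonym output, or original kept via keepOrig):
      System.out.println(termAtt + " (posIncr=" + posIncrAtt.getPositionIncrement() + ")");
    }
    ts.end();
    ts.close();
  }
}

Printing the position increment shows the "stacking" behavior the first NOTE in the Javadoc warns about: stacked outputs arrive with increment 0, which is the flattened-graph representation rather than a true lattice.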
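The constructor comment "Must be 1+ so that when roll buffer is at full lookahead we can distinguish this full buffer from the empty buffer" is the classic one-spare-slot ring buffer argument. Below is a hypothetical, stripped-down sketch of just that invariant (not code from this file): with capacity N+1 for at most N buffered positions, nextRead == nextWrite can only ever mean "empty".

// Hypothetical ring buffer with one spare slot, illustrating why
// rollBufferSize is 1 + maxHorizontalContext in the constructor above.
final class OneSpareSlotRing<T> {
  private final Object[] slots;
  private int nextRead;   // next position to consume
  private int nextWrite;  // next position to fill

  OneSpareSlotRing(int maxLookahead) {
    slots = new Object[1 + maxLookahead];  // the extra slot
  }

  boolean isEmpty() {
    return nextRead == nextWrite;
  }

  boolean isFull() {
    // Without the spare slot, a completely full buffer would also
    // satisfy nextRead == nextWrite, making it indistinguishable
    // from an empty one:
    return (nextWrite + 1) % slots.length == nextRead;
  }

  void push(T item) {
    assert !isFull();
    slots[nextWrite] = item;
    nextWrite = (nextWrite + 1) % slots.length;
  }

  @SuppressWarnings("unchecked")
  T pull() {
    assert !isEmpty();
    T item = (T) slots[nextRead];
    nextRead = (nextRead + 1) % slots.length;
    return item;
  }
}

SynonymFilter sizes its futureInputs and futureOutputs arrays the same way, with maxHorizontalContext (the longest synonym rule, in tokens) playing the role of maxLookahead.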