X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java diff --git a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java b/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java deleted file mode 100644 index 9842984..0000000 --- a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java +++ /dev/null @@ -1,574 +0,0 @@ -package org.apache.lucene.analysis.synonym; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import java.io.IOException; - -import org.apache.lucene.analysis.TokenFilter; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.OffsetAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.analysis.tokenattributes.TypeAttribute; -import org.apache.lucene.store.ByteArrayDataInput; -import org.apache.lucene.util.ArrayUtil; -import org.apache.lucene.util.AttributeSource; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.CharsRef; -import org.apache.lucene.util.RamUsageEstimator; -import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.fst.FST; - -/** - * Matches single or multi word synonyms in a token stream. - * This token stream cannot properly handle position - * increments != 1, ie, you should place this filter before - * filtering out stop words. - * - *
Note that with the current implementation, parsing is - * greedy, so whenever multiple parses would apply, the rule - * starting the earliest and parsing the most tokens wins. - * For example if you have these rules: - * - *
- * a -> x - * a b -> y - * b c d -> z - *- * - * Then input
a b c d e
parses to y b c
- * d
, ie the 2nd rule "wins" because it started
- * earliest and matched the most input tokens of other rules
- * starting at that point.
- *
- * A future improvement to this filter could allow - * non-greedy parsing, such that the 3rd rule would win, and - * also separately allow multiple parses, such that all 3 - * rules would match, perhaps even on a rule by rule - * basis.
- * - *NOTE: when a match occurs, the output tokens
- * associated with the matching rule are "stacked" on top of
- * the input stream (if the rule had
- * keepOrig=true
) and also on top of another
- * matched rule's output tokens. This is not a correct
- * solution, as really the output should be an arbitrary
- * graph/lattice. For example, with the above match, you
- * would expect an exact PhraseQuery
"y b
- * c"
to match the parsed tokens, but it will fail to
- * do so. This limitation is necessary because Lucene's
- * TokenStream (and index) cannot yet represent an arbitrary
- * graph.
NOTE: If multiple incoming tokens arrive on the - * same position, only the first token at that position is - * used for parsing. Subsequent tokens simply pass through - * and are not parsed. A future improvement would be to - * allow these tokens to also be matched.
- */ - -// TODO: maybe we should resolve token -> wordID then run -// FST on wordIDs, for better perf? - -// TODO: a more efficient approach would be Aho/Corasick's -// algorithm -// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm -// It improves over the current approach here -// because it does not fully re-start matching at every -// token. For example, if one pattern is "a b c x" -// and another is "b c d" and the input is "a b c d", on -// trying to parse "a b c x" but failing when you got to x, -// rather than starting over again you really should -// immediately recognize that "b c d" matches at the next -// input. I suspect this won't matter that much in -// practice, but it's possible on some set of synonyms it -// will. We'd have to modify Aho/Corasick to enforce our -// conflict resolving (eg greedy matching) because that algo -// finds all matches. - -public final class SynonymFilter extends TokenFilter { - - public static final String TYPE_SYNONYM = "SYNONYM"; - - private final SynonymMap synonyms; - - private final boolean ignoreCase; - private final int rollBufferSize; - - private int captureCount; - - private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class); - private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class); - private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class); - private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class); - - // How many future input tokens have already been matched - // to a synonym; because the matching is "greedy" we don't - // try to do any more matching for such tokens: - private int inputSkipCount; - - // Hold all buffered (read ahead) stacked input tokens for - // a future position. 
When multiple tokens are at the - // same position, we only store (and match against) the - // term for the first token at the position, but capture - // state for (and enumerate) all other tokens at this - // position: - private static class PendingInput { - final CharsRef term = new CharsRef(); - AttributeSource.State state; - boolean keepOrig; - boolean matched; - boolean consumed = true; - int startOffset; - int endOffset; - - public void reset() { - state = null; - consumed = true; - keepOrig = false; - matched = false; - } - }; - - // Rolling buffer, holding pending input tokens we had to - // clone because we needed to look ahead, indexed by - // position: - private final PendingInput[] futureInputs; - - // Holds pending output synonyms for one future position: - private static class PendingOutputs { - CharsRef[] outputs; - int upto; - int count; - int posIncr = 1; - - public PendingOutputs() { - outputs = new CharsRef[1]; - } - - public void reset() { - upto = count = 0; - posIncr = 1; - } - - public CharsRef pullNext() { - assert upto < count; - final CharsRef result = outputs[upto++]; - posIncr = 0; - if (upto == count) { - reset(); - } - return result; - } - - public void add(char[] output, int offset, int len) { - if (count == outputs.length) { - final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)]; - System.arraycopy(outputs, 0, next, 0, count); - outputs = next; - } - if (outputs[count] == null) { - outputs[count] = new CharsRef(); - } - outputs[count].copy(output, offset, len); - count++; - } - }; - - private final ByteArrayDataInput bytesReader = new ByteArrayDataInput(); - - // Rolling buffer, holding stack of pending synonym - // outputs, indexed by position: - private final PendingOutputs[] futureOutputs; - - // Where (in rolling buffers) to write next input saved state: - private int nextWrite; - - // Where (in rolling buffers) to read next input saved state: - private int nextRead; - - // 
True once we've read last token - private boolean finished; - - private final FST.Arc