+++ /dev/null
-package org.apache.lucene.analysis.synonym;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.store.ByteArrayDataInput;
-import org.apache.lucene.util.ArrayUtil;
-import org.apache.lucene.util.AttributeSource;
-import org.apache.lucene.util.BytesRef;
-import org.apache.lucene.util.CharsRef;
-import org.apache.lucene.util.RamUsageEstimator;
-import org.apache.lucene.util.UnicodeUtil;
-import org.apache.lucene.util.fst.FST;
-
-/**
- * Matches single- or multi-word synonyms in a token stream.
- * This token stream cannot properly handle position
- * increments != 1, i.e., you should place this filter before
- * filtering out stop words.
- *
- * <p>Note that with the current implementation, parsing is
- * greedy, so whenever multiple parses would apply, the rule
- * starting the earliest and parsing the most tokens wins.
- * For example, if you have these rules:
- *
- * <pre>
- * a -> x
- * a b -> y
- * b c d -> z
- * </pre>
- *
- * Then input <code>a b c d e</code> parses to <code>y c d
- * e</code> (with <code>keepOrig=false</code>): the 2nd rule
- * "wins" because it started earliest and matched the most
- * input tokens among the rules starting at that point.</p>
- *
- * <p>A future improvement to this filter could allow
- * non-greedy parsing, such that the 3rd rule would win, and
- * also separately allow multiple parses, such that all 3
- * rules would match, perhaps even on a rule-by-rule
- * basis.</p>
- *
- * <p><b>NOTE</b>: when a match occurs, the output tokens
- * associated with the matching rule are "stacked" on top of
- * the input stream (if the rule had
- * <code>keepOrig=true</code>) and also on top of another
- * matched rule's output tokens. This is not a correct
- * solution, as really the output should be an arbitrary
- * graph/lattice. For example, with the above match and
- * <code>keepOrig=true</code>, you would expect an exact
- * <code>PhraseQuery</code> <code>"y c d"</code> to match the
- * parsed tokens, since <code>y</code> stands in for <code>a
- * b</code>, but it will fail to do so because <code>y</code>
- * is stacked only at the position of <code>a</code>. This
- * limitation is necessary because Lucene's TokenStream (and
- * index) cannot yet represent an arbitrary graph.</p>
- *
- * <p><b>NOTE</b>: If multiple incoming tokens arrive on the
- * same position, only the first token at that position is
- * used for parsing. Subsequent tokens simply pass through
- * and are not parsed. A future improvement would be to
- * allow these tokens to also be matched.</p>
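- *
- * <p>A rough usage sketch (the tokenizer is assumed to be
- * created elsewhere, and the single rule is illustrative;
- * see {@link SynonymMap.Builder} for the full rule API):
- *
- * <pre>
- *   // map "a" to "x", keeping the original token; entries
- *   // must be pre-lowercased when ignoreCase=true
- *   SynonymMap.Builder builder = new SynonymMap.Builder(true);
- *   builder.add(new CharsRef("a"), new CharsRef("x"), true);
- *   SynonymMap map = builder.build();
- *   TokenStream stream = new SynonymFilter(tokenizer, map, true);
- * </pre></p>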
- */
-
-// TODO: maybe we should resolve token -> wordID then run
-// FST on wordIDs, for better perf?
-
-// TODO: a more efficient approach would be Aho/Corasick's
-// algorithm
-// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
-// It improves over the current approach here
-// because it does not fully re-start matching at every
-// token. For example, if one pattern is "a b c x"
-// and another is "b c d" and the input is "a b c d", on
-// trying to parse "a b c x" but failing when you got to x,
-// rather than starting over again, you really should
-// immediately recognize that "b c d" matches at the next
-// input. I suspect this won't matter that much in
-// practice, but it's possible on some set of synonyms it
-// will. We'd have to modify Aho/Corasick to enforce our
-// conflict resolving (e.g., greedy matching) because that algo
-// finds all matches.
-
-public final class SynonymFilter extends TokenFilter {
-
- public static final String TYPE_SYNONYM = "SYNONYM";
-
- private final SynonymMap synonyms;
-
- private final boolean ignoreCase;
- private final int rollBufferSize;
-
- private int captureCount;
-
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
- private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-
- // How many future input tokens have already been matched
- // to a synonym; because the matching is "greedy" we don't
- // try to do any more matching for such tokens:
- private int inputSkipCount;
-
- // Hold all buffered (read ahead) stacked input tokens for
- // a future position. When multiple tokens are at the
- // same position, we only store (and match against) the
- // term for the first token at the position, but capture
- // state for (and enumerate) all other tokens at this
- // position:
- private static class PendingInput {
- final CharsRef term = new CharsRef();
- AttributeSource.State state;
- boolean keepOrig;
- boolean matched;
- boolean consumed = true;
- int startOffset;
- int endOffset;
-
- public void reset() {
- state = null;
- consumed = true;
- keepOrig = false;
- matched = false;
- }
- };
-
- // Rolling buffer, holding pending input tokens we had to
- // clone because we needed to look ahead, indexed by
- // position:
- private final PendingInput[] futureInputs;
-
- // Holds pending output synonyms for one future position:
- private static class PendingOutputs {
- CharsRef[] outputs;
- int upto;
- int count;
- int posIncr = 1;
-
- public PendingOutputs() {
- outputs = new CharsRef[1];
- }
-
- public void reset() {
- upto = count = 0;
- posIncr = 1;
- }
-
- public CharsRef pullNext() {
- assert upto < count;
- final CharsRef result = outputs[upto++];
- posIncr = 0;
- if (upto == count) {
- reset();
- }
- return result;
- }
-
- public void add(char[] output, int offset, int len) {
- if (count == outputs.length) {
- final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
- System.arraycopy(outputs, 0, next, 0, count);
- outputs = next;
- }
- if (outputs[count] == null) {
- outputs[count] = new CharsRef();
- }
- outputs[count].copy(output, offset, len);
- count++;
- }
- };
-
- private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
-
- // Rolling buffer, holding stack of pending synonym
- // outputs, indexed by position:
- private final PendingOutputs[] futureOutputs;
-
- // Where (in rolling buffers) to write next input saved state:
- private int nextWrite;
-
- // Where (in rolling buffers) to read next input saved state:
- private int nextRead;
-
- // True once we've read last token
- private boolean finished;
-
- private final FST.Arc<BytesRef> scratchArc;
-
- private final FST<BytesRef> fst;
-
- private final BytesRef scratchBytes = new BytesRef();
- private final CharsRef scratchChars = new CharsRef();
-
- /**
- * @param input input tokenstream
- * @param synonyms synonym map
- * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
- *                   Note that if you set this to true, it's your responsibility to lowercase
- * the input entries when you create the {@link SynonymMap}
- */
- public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
- super(input);
- this.synonyms = synonyms;
- this.ignoreCase = ignoreCase;
- this.fst = synonyms.fst;
-
- if (fst == null) {
- throw new IllegalArgumentException("fst must be non-null");
- }
-
-    // Must be 1+ so that when the roll buffer is at full
-    // lookahead we can distinguish this full buffer from
-    // the empty buffer -- the classic ring-buffer trick of
-    // keeping one slot spare, so that nextRead == nextWrite
-    // always means "empty":
-    rollBufferSize = 1+synonyms.maxHorizontalContext;
-
- futureInputs = new PendingInput[rollBufferSize];
- futureOutputs = new PendingOutputs[rollBufferSize];
- for(int pos=0;pos<rollBufferSize;pos++) {
- futureInputs[pos] = new PendingInput();
- futureOutputs[pos] = new PendingOutputs();
- }
-
- //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);
-
- scratchArc = new FST.Arc<BytesRef>();
- }
-
- private void capture() {
- captureCount++;
- //System.out.println(" capture slot=" + nextWrite);
- final PendingInput input = futureInputs[nextWrite];
-
- input.state = captureState();
- input.consumed = false;
- input.term.copy(termAtt.buffer(), 0, termAtt.length());
-
- nextWrite = rollIncr(nextWrite);
-
- // Buffer head should never catch up to tail:
- assert nextWrite != nextRead;
- }
-
- /*
- This is the core of this TokenFilter: it locates the
- synonym matches and buffers up the results into
- futureInputs/Outputs.
-
-   NOTE: this calls input.incrementToken and does not
-   capture the state if no further tokens were checked.  So
-   the caller must then either forward that state on to its
-   own caller, or capture it:
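-
-   Worked example (assuming the single rule "a b" -> "y",
-   keepOrig=false, and input tokens a b c): the first call
-   pulls a and then b, traversing a, WORD_SEPARATOR, b in
-   the FST; b lands on a final state, so matchInputLength=2
-   and the pending "y" output are recorded (c is not pulled
-   yet).  incrementToken then replays positions 0 and 1 from
-   the buffers, emitting just "y", before parse() runs again
-   starting at c.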
- */
-
- private void parse() throws IOException {
- //System.out.println("\nS: parse");
-
- assert inputSkipCount == 0;
-
- int curNextRead = nextRead;
-
- // Holds the longest match we've seen so far:
- BytesRef matchOutput = null;
- int matchInputLength = 0;
-
- BytesRef pendingOutput = fst.outputs.getNoOutput();
- fst.getFirstArc(scratchArc);
-
- assert scratchArc.output == fst.outputs.getNoOutput();
-
- int tokenCount = 0;
-
- byToken:
- while(true) {
-
- // Pull next token's chars:
- final char[] buffer;
- final int bufferLen;
- //System.out.println(" cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
-
- if (curNextRead == nextWrite) {
-
- // We used up our lookahead buffer of input tokens
- // -- pull next real input token:
-
- if (finished) {
- break;
- } else {
- //System.out.println(" input.incrToken");
- assert futureInputs[nextWrite].consumed;
- // Not correct: a syn match whose output is longer
- // than its input can set future inputs keepOrig
- // to true:
- //assert !futureInputs[nextWrite].keepOrig;
- if (input.incrementToken()) {
- buffer = termAtt.buffer();
- bufferLen = termAtt.length();
- final PendingInput input = futureInputs[nextWrite];
- input.startOffset = offsetAtt.startOffset();
- input.endOffset = offsetAtt.endOffset();
- //System.out.println(" new token=" + new String(buffer, 0, bufferLen));
- if (nextRead != nextWrite) {
- capture();
- } else {
- input.consumed = false;
- }
-
- } else {
- // No more input tokens
- //System.out.println(" set end");
- finished = true;
- break;
- }
- }
- } else {
- // Still in our lookahead
- buffer = futureInputs[curNextRead].term.chars;
- bufferLen = futureInputs[curNextRead].term.length;
- //System.out.println(" old token=" + new String(buffer, 0, bufferLen));
- }
-
- tokenCount++;
-
- // Run each char in this token through the FST:
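-      // (multi-word rules are stored in the FST with
-      // SynonymMap.WORD_SEPARATOR between words, so the rule
-      // "a b" is traversed as a, SEP, b; the SEP arc is
-      // followed below, after each whole token):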
- int bufUpto = 0;
- while(bufUpto < bufferLen) {
- final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
- if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) {
- //System.out.println(" stop");
- break byToken;
- }
-
- // Accum the output
- pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
- //System.out.println(" char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
- bufUpto += Character.charCount(codePoint);
- }
-
- // OK, entire token matched; now see if this is a final
- // state:
- if (scratchArc.isFinal()) {
- matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
- matchInputLength = tokenCount;
- //System.out.println(" found matchLength=" + matchInputLength + " output=" + matchOutput);
- }
-
- // See if the FST wants to continue matching (ie, needs to
- // see the next input token):
- if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
- // No further rules can match here; we're done
- // searching for matching rules starting at the
- // current input position.
- break;
- } else {
- // More matching is possible -- accum the output (if
- // any) of the WORD_SEP arc:
- pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
- if (nextRead == nextWrite) {
- capture();
- }
- }
-
- curNextRead = rollIncr(curNextRead);
- }
-
- if (nextRead == nextWrite && !finished) {
- //System.out.println(" skip write slot=" + nextWrite);
- nextWrite = rollIncr(nextWrite);
- }
-
- if (matchOutput != null) {
- //System.out.println(" add matchLength=" + matchInputLength + " output=" + matchOutput);
- inputSkipCount = matchInputLength;
- addOutput(matchOutput, matchInputLength);
- } else if (nextRead != nextWrite) {
- // Even though we had no match here, we set to 1
- // because we need to skip current input token before
- // trying to match again:
- inputSkipCount = 1;
- } else {
- assert finished;
- }
-
- //System.out.println(" parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
- }
-
- // Interleaves all output tokens onto the futureOutputs:
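-  // e.g. a two-word output "x y" from one rule lands with
-  // "x" in the slot at nextRead and "y" in the following
-  // rolling-buffer slot: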
- private void addOutput(BytesRef bytes, int matchInputLength) {
- bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
-
- final int code = bytesReader.readVInt();
- final boolean keepOrig = (code & 0x1) == 0;
- final int count = code >>> 1;
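-    // (the builder packs this as code = count << 1 | (keepOrig ? 0 : 1):
-    // low bit is the inverted keepOrig flag, remaining bits
-    // are the number of output words; see SynonymMap.Builder)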
- //System.out.println(" addOutput count=" + count + " keepOrig=" + keepOrig);
- for(int outputIDX=0;outputIDX<count;outputIDX++) {
- synonyms.words.get(bytesReader.readVInt(),
- scratchBytes);
- //System.out.println(" outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
- UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
- int lastStart = scratchChars.offset;
- final int chEnd = lastStart + scratchChars.length;
- int outputUpto = nextRead;
- for(int chIDX=lastStart;chIDX<=chEnd;chIDX++) {
- if (chIDX == chEnd || scratchChars.chars[chIDX] == SynonymMap.WORD_SEPARATOR) {
- final int outputLen = chIDX - lastStart;
- // Caller is not allowed to have empty string in
- // the output:
- assert outputLen > 0: "output contains empty string: " + scratchChars;
- futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
- //System.out.println(" " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
- lastStart = 1+chIDX;
- //System.out.println(" slot=" + outputUpto + " keepOrig=" + keepOrig);
- outputUpto = rollIncr(outputUpto);
- assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
- }
- }
- }
-
- int upto = nextRead;
- for(int idx=0;idx<matchInputLength;idx++) {
- futureInputs[upto].keepOrig |= keepOrig;
- futureInputs[upto].matched = true;
- upto = rollIncr(upto);
- }
- }
-
- // ++ mod rollBufferSize
- private int rollIncr(int count) {
- count++;
- if (count == rollBufferSize) {
- return 0;
- } else {
- return count;
- }
- }
-
- // for testing
- int getCaptureCount() {
- return captureCount;
- }
-
- @Override
- public boolean incrementToken() throws IOException {
-
- //System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
-
- while(true) {
-
- // First play back any buffered future inputs/outputs
- // w/o running parsing again:
- while (inputSkipCount != 0) {
-
- // At each position, we first output the original
- // token
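-        // (when keepOrig is set or the token went
-        // unmatched), then any stacked synonym outputs for
-        // the same position, each with posIncr=0: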
-
- // TODO: maybe just a PendingState class, holding
- // both input & outputs?
- final PendingInput input = futureInputs[nextRead];
- final PendingOutputs outputs = futureOutputs[nextRead];
-
- //System.out.println(" cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);
-
- if (!input.consumed && (input.keepOrig || !input.matched)) {
- if (input.state != null) {
- // Return a previously saved token (because we
- // had to lookahead):
- restoreState(input.state);
- } else {
- // Pass-through case: return token we just pulled
- // but didn't capture:
- assert inputSkipCount == 1: "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
- }
- input.reset();
- if (outputs.count > 0) {
- outputs.posIncr = 0;
- } else {
- nextRead = rollIncr(nextRead);
- inputSkipCount--;
- }
- //System.out.println(" return token=" + termAtt.toString());
- return true;
- } else if (outputs.upto < outputs.count) {
- // Still have pending outputs to replay at this
- // position
- input.reset();
- final int posIncr = outputs.posIncr;
- final CharsRef output = outputs.pullNext();
- clearAttributes();
- termAtt.copyBuffer(output.chars, output.offset, output.length);
- typeAtt.setType(TYPE_SYNONYM);
- offsetAtt.setOffset(input.startOffset, input.endOffset);
- posIncrAtt.setPositionIncrement(posIncr);
- if (outputs.count == 0) {
- // Done with the buffered input and all outputs at
- // this position
- nextRead = rollIncr(nextRead);
- inputSkipCount--;
- }
- //System.out.println(" return token=" + termAtt.toString());
- return true;
- } else {
- // Done with the buffered input and all outputs at
- // this position
- input.reset();
- nextRead = rollIncr(nextRead);
- inputSkipCount--;
- }
- }
-
- if (finished && nextRead == nextWrite) {
- // End case: if any output syns went beyond end of
- // input stream, enumerate them now:
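-      // (e.g. a rule "a -> x y z" matching the very last
-      // input token leaves "y" and "z" pending past the end)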
- final PendingOutputs outputs = futureOutputs[nextRead];
- if (outputs.upto < outputs.count) {
- final int posIncr = outputs.posIncr;
- final CharsRef output = outputs.pullNext();
- futureInputs[nextRead].reset();
- if (outputs.count == 0) {
- nextWrite = nextRead = rollIncr(nextRead);
- }
- clearAttributes();
- termAtt.copyBuffer(output.chars, output.offset, output.length);
- typeAtt.setType(TYPE_SYNONYM);
- //System.out.println(" set posIncr=" + outputs.posIncr + " outputs=" + outputs);
- posIncrAtt.setPositionIncrement(posIncr);
- //System.out.println(" return token=" + termAtt.toString());
- return true;
- } else {
- return false;
- }
- }
-
- // Find new synonym matches:
- parse();
- }
- }
-
- @Override
- public void reset() throws IOException {
- super.reset();
- captureCount = 0;
- finished = false;
-
- // In normal usage these resets would not be needed,
- // since they reset-as-they-are-consumed, but the app
- // may not consume all input tokens in which case we
- // have leftover state here:
- for (PendingInput input : futureInputs) {
- input.reset();
- }
- for (PendingOutputs output : futureOutputs) {
- output.reset();
- }
- }
-}