package org.apache.lucene.analysis.synonym;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import java.io.IOException;

import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.store.ByteArrayDataInput;
import org.apache.lucene.util.ArrayUtil;
import org.apache.lucene.util.AttributeSource;
import org.apache.lucene.util.BytesRef;
import org.apache.lucene.util.CharsRef;
import org.apache.lucene.util.RamUsageEstimator;
import org.apache.lucene.util.UnicodeUtil;
import org.apache.lucene.util.fst.FST;

/**
 * Matches single- or multi-word synonyms in a token stream.
 * This token stream cannot properly handle position
 * increments != 1, ie, you should place this filter before
 * filtering out stop words.
 *
 * <p>Note that with the current implementation, parsing is
 * greedy, so whenever multiple parses would apply, the rule
 * starting the earliest and parsing the most tokens wins.
 * For example if you have these rules:
 *
 * <pre>
 *   a -> x
 *   a b -> y
 *   b c d -> z
 * </pre>
 *
 * Then input <code>a b c d e</code> parses to <code>y b c
 * d</code>, ie the 2nd rule "wins" because it started
 * earliest and matched the most input tokens of the rules
 * starting at that point.</p>
 *
 * <p>A future improvement to this filter could allow
 * non-greedy parsing, such that the 3rd rule would win, and
 * also separately allow multiple parses, such that all 3
 * rules would match, perhaps even on a rule-by-rule
 * basis.</p>
 *
 * <p><b>NOTE</b>: when a match occurs, the output tokens
 * associated with the matching rule are "stacked" on top of
 * the input stream (if the rule had
 * <code>keepOrig=true</code>) and also on top of another
 * matched rule's output tokens. This is not a correct
 * solution, as really the output should be an arbitrary
 * graph/lattice. For example, with the above match, you
 * would expect an exact <code>PhraseQuery</code> <code>"y b
 * c"</code> to match the parsed tokens, but it will fail to
 * do so. This limitation is necessary because Lucene's
 * TokenStream (and index) cannot yet represent an arbitrary
 * graph/lattice.</p>
 *
 * <p><b>NOTE</b>: If multiple incoming tokens arrive at the
 * same position, only the first token at that position is
 * used for parsing. Subsequent tokens simply pass through
 * and are not parsed. A future improvement would be to
 * allow these tokens to also be matched.</p>
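 *
 * <p>A minimal usage sketch (illustrative only; it assumes the
 * {@link SynonymMap.Builder} API in this package, in particular its
 * <code>add</code> and <code>join</code> helpers, and
 * <code>tokenizer</code> stands for any upstream {@link TokenStream}):</p>
 *
 * <pre>
 *   SynonymMap.Builder builder = new SynonymMap.Builder(true); // true: dedup identical rules
 *   // Map "dns" to the multi-word synonym "domain name service", keeping the original token:
 *   builder.add(new CharsRef("dns"),
 *               SynonymMap.Builder.join(new String[] {"domain", "name", "service"}, new CharsRef()),
 *               true);
 *   TokenStream stream = new SynonymFilter(tokenizer, builder.build(), true);
 * </pre>
 */
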
// TODO: maybe we should resolve token -> wordID then run
// FST on wordIDs, for better perf?

// TODO: a more efficient approach would be Aho/Corasick's
// algorithm
// http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
// It improves over the current approach here
// because it does not fully re-start matching at every
// token.  For example, if one pattern is "a b c x"
// and another is "b c d" and the input is "a b c d", on
// trying to parse "a b c x" but failing when you get to x,
// rather than starting over again you really should
// immediately recognize that "b c d" matches at the next
// input.  I suspect this won't matter that much in
// practice, but it's possible on some set of synonyms it
// will.  We'd have to modify Aho/Corasick to enforce our
// conflict resolving (eg greedy matching) because that algo
// finds all matches.
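// (In Aho/Corasick terms: the failure link out of the partial match
// "a b c" would point at the state for "b c", so the mismatch on x lands
// you exactly where "b c d" can continue matching, with no re-scan.)
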
public final class SynonymFilter extends TokenFilter {

  public static final String TYPE_SYNONYM = "SYNONYM";

  private final SynonymMap synonyms;

  private final boolean ignoreCase;
  private final int rollBufferSize;

  private int captureCount;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);

  // How many future input tokens have already been matched
  // to a synonym; because the matching is "greedy" we don't
  // try to do any more matching for such tokens:
  private int inputSkipCount;

  // Hold all buffered (read ahead) stacked input tokens for
  // a future position.  When multiple tokens are at the
  // same position, we only store (and match against) the
  // term for the first token at the position, but capture
  // state for (and enumerate) all other tokens at this
  // position:
  private static class PendingInput {
    final CharsRef term = new CharsRef();
    AttributeSource.State state;
    boolean keepOrig;
    boolean matched;
    boolean consumed = true;
    int startOffset;
    int endOffset;

    public void reset() {
      state = null;
      consumed = true;
      keepOrig = false;
      matched = false;
    }
  }

  // Rolling buffer, holding pending input tokens we had to
  // clone because we needed to look ahead, indexed by
  // position:
  private final PendingInput[] futureInputs;

  // Holds pending output synonyms for one future position:
  private static class PendingOutputs {
    CharsRef[] outputs;
    int upto;
    int count;
    int posIncr = 1;

    public PendingOutputs() {
      outputs = new CharsRef[1];
    }

    public void reset() {
      upto = count = 0;
      posIncr = 1;
    }

    public CharsRef pullNext() {
      assert upto < count;
      final CharsRef result = outputs[upto++];
      posIncr = 0;
      if (upto == count) {
        reset();
      }
      return result;
    }

    public void add(char[] output, int offset, int len) {
      if (count == outputs.length) {
        final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
        System.arraycopy(outputs, 0, next, 0, count);
        outputs = next;
      }
      if (outputs[count] == null) {
        outputs[count] = new CharsRef();
      }
      outputs[count].copy(output, offset, len);
      count++;
    }
  }

  private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();

  // Rolling buffer, holding stack of pending synonym
  // outputs, indexed by position:
  private final PendingOutputs[] futureOutputs;

  // Where (in rolling buffers) to write next input saved state:
  private int nextWrite;

  // Where (in rolling buffers) to read next input saved state:
  private int nextRead;

  // True once we've read last token
  private boolean finished;

  private final FST.Arc<BytesRef> scratchArc;

  private final FST<BytesRef> fst;

  private final BytesRef scratchBytes = new BytesRef();
  private final CharsRef scratchChars = new CharsRef();

  /**
   * @param input input tokenstream
   * @param synonyms synonym map
   * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
   *                   Note, if you set this to true, it's your responsibility to lowercase
   *                   the input entries when you create the {@link SynonymMap}
   */
  public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
    super(input);
    this.synonyms = synonyms;
    this.ignoreCase = ignoreCase;
    this.fst = synonyms.fst;

    if (fst == null) {
      throw new IllegalArgumentException("fst must be non-null");
    }

    // Must be 1+ so that when roll buffer is at full
    // lookahead we can distinguish this full buffer from
    // the empty buffer:
    rollBufferSize = 1+synonyms.maxHorizontalContext;
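    // (A classic ring-buffer sizing choice: with one spare slot,
    //  nextRead == nextWrite can only mean "empty", never "full".)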

    futureInputs = new PendingInput[rollBufferSize];
    futureOutputs = new PendingOutputs[rollBufferSize];
    for(int pos=0;pos<rollBufferSize;pos++) {
      futureInputs[pos] = new PendingInput();
      futureOutputs[pos] = new PendingOutputs();
    }

    //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);

    scratchArc = new FST.Arc<BytesRef>();
  }

  private void capture() {
    captureCount++;
    //System.out.println("  capture slot=" + nextWrite);
    final PendingInput input = futureInputs[nextWrite];

    input.state = captureState();
    input.consumed = false;
    input.term.copy(termAtt.buffer(), 0, termAtt.length());

    nextWrite = rollIncr(nextWrite);

    // Buffer head should never catch up to tail:
    assert nextWrite != nextRead;
  }

  /*
   This is the core of this TokenFilter: it locates the
   synonym matches and buffers up the results into
   futureInputs/Outputs.

   NOTE: this calls input.incrementToken and does not
   capture the state if no further tokens were checked.  So
   caller must then forward state to our caller, or capture:
  */
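  // A sketch of one parse pass: with a rule "b c d -> z" and lookahead
  // tokens [b, c, d, e], parse() feeds the chars of b, then
  // WORD_SEPARATOR, then the chars of c, etc. through the FST, remembers
  // the last final state it reached (after d) as the longest match, and
  // records that 3 input tokens were consumed by that match.
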
  private void parse() throws IOException {
    //System.out.println("\nS: parse");

    assert inputSkipCount == 0;

    int curNextRead = nextRead;

    // Holds the longest match we've seen so far:
    BytesRef matchOutput = null;
    int matchInputLength = 0;

    BytesRef pendingOutput = fst.outputs.getNoOutput();
    fst.getFirstArc(scratchArc);

    assert scratchArc.output == fst.outputs.getNoOutput();

    int tokenCount = 0;

    byToken:
    while(true) {

      // Pull next token's chars:
      final char[] buffer;
      final int bufferLen;
      //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);

      if (curNextRead == nextWrite) {

        // We used up our lookahead buffer of input tokens
        // -- pull next real input token:

        if (finished) {
          break;
        } else {
          //System.out.println("  input.incrToken");
          assert futureInputs[nextWrite].consumed;
          // Not correct: a syn match whose output is longer
          // than its input can set future inputs keepOrig
          // to true:
          //assert !futureInputs[nextWrite].keepOrig;
          if (input.incrementToken()) {
            buffer = termAtt.buffer();
            bufferLen = termAtt.length();
            final PendingInput input = futureInputs[nextWrite];
            input.startOffset = offsetAtt.startOffset();
            input.endOffset = offsetAtt.endOffset();
            //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
            if (nextRead != nextWrite) {
              capture();
            } else {
              input.consumed = false;
            }
          } else {
            // No more input tokens
            //System.out.println("  set end");
            finished = true;
            break;
          }
        }
      } else {
        // Still in our lookahead
        buffer = futureInputs[curNextRead].term.chars;
        bufferLen = futureInputs[curNextRead].term.length;
        //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
      }

      tokenCount++;

      // Run each char in this token through the FST:
      int bufUpto = 0;
      while(bufUpto < bufferLen) {
        final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
        if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) {
          //System.out.println("    stop");
          break byToken;
        }

        // Accum the output
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
        bufUpto += Character.charCount(codePoint);
      }

      // OK, entire token matched; now see if this is a final
      // state:
      if (scratchArc.isFinal()) {
        matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
        matchInputLength = tokenCount;
        //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
      }

      // See if the FST wants to continue matching (ie, needs to
      // see the next input token):
      if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
        // No further rules can match here; we're done
        // searching for matching rules starting at the
        // current input position.
        break;
      } else {
        // More matching is possible -- accum the output (if
        // any) of the WORD_SEP arc:
        pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
        if (nextRead == nextWrite) {
          capture();
        }
      }

      curNextRead = rollIncr(curNextRead);
    }

    if (nextRead == nextWrite && !finished) {
      //System.out.println("  skip write slot=" + nextWrite);
      nextWrite = rollIncr(nextWrite);
    }

    if (matchOutput != null) {
      //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
      inputSkipCount = matchInputLength;
      addOutput(matchOutput, matchInputLength);
    } else if (nextRead != nextWrite) {
      // Even though we had no match here, we set to 1
      // because we need to skip current input token before
      // trying to match again:
      inputSkipCount = 1;
    } else {
      assert finished;
    }

    //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
  }

  // Interleaves all output tokens onto the futureOutputs:
  private void addOutput(BytesRef bytes, int matchInputLength) {
    bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);

    final int code = bytesReader.readVInt();
    final boolean keepOrig = (code & 0x1) == 0;
    final int count = code >>> 1;
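    // (The vInt packs keepOrig into its low bit -- 0 means keep the
    //  original token -- and the number of output words into the rest;
    //  e.g. code == 4 decodes to keepOrig=true, count=2.)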
    //System.out.println("  addOutput count=" + count + " keepOrig=" + keepOrig);
    for(int outputIDX=0;outputIDX<count;outputIDX++) {
      synonyms.words.get(bytesReader.readVInt(),
                         scratchBytes);
      //System.out.println("    outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
      UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
      int lastStart = scratchChars.offset;
      final int chEnd = lastStart + scratchChars.length;
      int outputUpto = nextRead;
      for(int chIDX=lastStart;chIDX<=chEnd;chIDX++) {
        if (chIDX == chEnd || scratchChars.chars[chIDX] == SynonymMap.WORD_SEPARATOR) {
          final int outputLen = chIDX - lastStart;
          // Caller is not allowed to have empty string in
          // the output:
          assert outputLen > 0: "output contains empty string: " + scratchChars;
          futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
          //System.out.println("      " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
          lastStart = 1+chIDX;
          //System.out.println("  slot=" + outputUpto + " keepOrig=" + keepOrig);
          outputUpto = rollIncr(outputUpto);
          assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
        }
      }
    }

    int upto = nextRead;
    for(int idx=0;idx<matchInputLength;idx++) {
      futureInputs[upto].keepOrig |= keepOrig;
      futureInputs[upto].matched = true;
      upto = rollIncr(upto);
    }
  }

  // ++ mod rollBufferSize
  private int rollIncr(int count) {
    count++;
    if (count == rollBufferSize) {
      return 0;
    } else {
      return count;
    }
  }
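  // (e.g. with rollBufferSize == 4, repeated rollIncr calls walk the ring:
  //  0 -> 1 -> 2 -> 3 -> 0 -> ...)
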
  // for testing
  int getCaptureCount() {
    return captureCount;
  }

  @Override
  public boolean incrementToken() throws IOException {

    //System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);

    while(true) {

      // First play back any buffered future inputs/outputs
      // w/o running parsing again:
      while (inputSkipCount != 0) {

        // At each position, we first output the original
        // token

        // TODO: maybe just a PendingState class, holding
        // both input & outputs?
        final PendingInput input = futureInputs[nextRead];
        final PendingOutputs outputs = futureOutputs[nextRead];

        //System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);

        if (!input.consumed && (input.keepOrig || !input.matched)) {
          if (input.state != null) {
            // Return a previously saved token (because we
            // had to lookahead):
            restoreState(input.state);
          } else {
            // Pass-through case: return token we just pulled
            // but didn't capture:
            assert inputSkipCount == 1: "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
          }
          input.reset();
          if (outputs.count > 0) {
            outputs.posIncr = 0;
          } else {
            nextRead = rollIncr(nextRead);
            inputSkipCount--;
          }
          //System.out.println("  return token=" + termAtt.toString());
          return true;
        } else if (outputs.upto < outputs.count) {
          // Still have pending outputs to replay at this
          // position
          input.reset();
          final int posIncr = outputs.posIncr;
          final CharsRef output = outputs.pullNext();
          clearAttributes();
          termAtt.copyBuffer(output.chars, output.offset, output.length);
          typeAtt.setType(TYPE_SYNONYM);
          offsetAtt.setOffset(input.startOffset, input.endOffset);
          posIncrAtt.setPositionIncrement(posIncr);
          if (outputs.count == 0) {
            // Done with the buffered input and all outputs at
            // this position
            nextRead = rollIncr(nextRead);
            inputSkipCount--;
          }
          //System.out.println("  return token=" + termAtt.toString());
          return true;
        } else {
          // Done with the buffered input and all outputs at
          // this position
          input.reset();
          nextRead = rollIncr(nextRead);
          inputSkipCount--;
        }
      }

      if (finished && nextRead == nextWrite) {
        // End case: if any output syns went beyond end of
        // input stream, enumerate them now:
        final PendingOutputs outputs = futureOutputs[nextRead];
        if (outputs.upto < outputs.count) {
          final int posIncr = outputs.posIncr;
          final CharsRef output = outputs.pullNext();
          futureInputs[nextRead].reset();
          if (outputs.count == 0) {
            nextWrite = nextRead = rollIncr(nextRead);
          }
          clearAttributes();
          // Keep offset from the last input token:
          termAtt.copyBuffer(output.chars, output.offset, output.length);
          typeAtt.setType(TYPE_SYNONYM);
          //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
          posIncrAtt.setPositionIncrement(posIncr);
          //System.out.println("  return token=" + termAtt.toString());
          return true;
        } else {
          return false;
        }
      }

      // Find new synonym matches:
      parse();
    }
  }

  @Override
  public void reset() throws IOException {
    super.reset();
    captureCount = 0;
    finished = false;
    inputSkipCount = 0;
    nextRead = nextWrite = 0;

    // In normal usage these resets would not be needed,
    // since they reset-as-they-are-consumed, but the app
    // may not consume all input tokens in which case we
    // have leftover state here:
    for (PendingInput input : futureInputs) {
      input.reset();
    }
    for (PendingOutputs output : futureOutputs) {
      output.reset();
    }
  }
}