lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymFilter.java

   1 package org.apache.lucene.analysis.synonym;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21
  22 import org.apache.lucene.analysis.TokenFilter;
  23 import org.apache.lucene.analysis.TokenStream;
  24 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  25 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  26 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  27 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  28 import org.apache.lucene.store.ByteArrayDataInput;
  29 import org.apache.lucene.util.ArrayUtil;
  30 import org.apache.lucene.util.AttributeSource;
  31 import org.apache.lucene.util.BytesRef;
  32 import org.apache.lucene.util.CharsRef;
  33 import org.apache.lucene.util.RamUsageEstimator;
  34 import org.apache.lucene.util.UnicodeUtil;
  35 import org.apache.lucene.util.fst.FST;
  36
  37 /**
  38  * Matches single or multi word synonyms in a token stream.
  39  * This token stream cannot properly handle position
  40  * increments != 1, ie, you should place this filter before
  41  * filtering out stop words.
  42  *
  43  * <p>Note that with the current implementation, parsing is
  44  * greedy, so whenever multiple parses would apply, the rule
  45  * starting the earliest and parsing the most tokens wins.
  46  * For example if you have these rules:
  47  *
  48  * <pre>
  49  *   a -> x
  50  *   a b -> y
  51  *   b c d -> z
  52  * </pre>
  53  *
  54  * Then input <code>a b c d e</code> parses to <code>y b c
  55  * d</code>, ie the 2nd rule "wins" because it started
  56  * earliest and matched the most input tokens of other rules
  57  * starting at that point.</p>
  58  *
  59  * <p>A future improvement to this filter could allow
  60  * non-greedy parsing, such that the 3rd rule would win, and
  61  * also separately allow multiple parses, such that all 3
  62  * rules would match, perhaps even on a rule by rule
  63  * basis.</p>
  64  *
  65  * <p><b>NOTE</b>: when a match occurs, the output tokens
  66  * associated with the matching rule are "stacked" on top of
  67  * the input stream (if the rule had
  68  * <code>keepOrig=true</code>) and also on top of aother
  69  * matched rule's output tokens.  This is not a correct
  70  * solution, as really the output should be an abitrary
  71  * graph/lattice.  For example, with the above match, you
  72  * would expect an exact <code>PhraseQuery</code> <code>"y b
  73  * c"</code> to match the parsed tokens, but it will fail to
  74  * do so.  This limitations is necessary because Lucene's
  75  * TokenStream (and index) cannot yet represent an arbitrary
  76  * graph.</p>
  77  *
  78  * <p><b>NOTE</b>: If multiple incoming tokens arrive on the
  79  * same position, only the first token at that position is
  80  * used for parsing.  Subsequent tokens simply pass through
  81  * and are not parsed.  A future improvement would be to
  82  * allow these tokens to also be matched.</p>
  83  */
  84
  85 // TODO: maybe we should resolve token -> wordID then run
  86 // FST on wordIDs, for better perf?
  87
  88 // TODO: a more efficient approach would be Aho/Corasick's
  89 // algorithm
  90 // http://en.wikipedia.org/wiki/Aho%E2%80%93Corasick_string_matching_algorithm
  91 // It improves over the current approach here
  92 // because it does not fully re-start matching at every
  93 // token.  For exampl,e if one pattern is "a b c x"
  94 // and another is "b c d" and the input is "a b c d", on
  95 // trying to parse "a b c x" but failing when you got to x,
  96 // rather than starting over again your really should
  97 // immediately recognize that "b c d" matches at the next
  98 // input.  I suspect this won't matter that much in
  99 // practice, but it's possible on some set of synonyms it
 100 // will.  We'd have to modify Aho/Corasick to enforce our
 101 // conflict resolving (eg greedy matching) because that algo
 102 // finds all matches.
 103
 104 public final class SynonymFilter extends TokenFilter {
 105
 106   public static final String TYPE_SYNONYM = "SYNONYM";
 107
 108   private final SynonymMap synonyms;
 109
 110   private final boolean ignoreCase;
 111   private final int rollBufferSize;
 112
 113   private int captureCount;
 114
 115   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 116   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 117   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 118   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 119
 120   // How many future input tokens have already been matched
 121   // to a synonym; because the matching is "greedy" we don't
 122   // try to do any more matching for such tokens:
 123   private int inputSkipCount;
 124
 125   // Hold all buffered (read ahead) stacked input tokens for
 126   // a future position.  When multiple tokens are at the
 127   // same position, we only store (and match against) the
 128   // term for the first token at the position, but capture
 129   // state for (and enumerate) all other tokens at this
 130   // position:
 131   private static class PendingInput {
 132     final CharsRef term = new CharsRef();
 133     AttributeSource.State state;
 134     boolean keepOrig;
 135     boolean matched;
 136     boolean consumed = true;
 137     int startOffset;
 138     int endOffset;
 139
 140     public void reset() {
 141       state = null;
 142       consumed = true;
 143       keepOrig = false;
 144       matched = false;
 145     }
 146   };
 147
 148   // Rolling buffer, holding pending input tokens we had to
 149   // clone because we needed to look ahead, indexed by
 150   // position:
 151   private final PendingInput[] futureInputs;
 152
 153   // Holds pending output synonyms for one future position:
 154   private static class PendingOutputs {
 155     CharsRef[] outputs;
 156     int upto;
 157     int count;
 158     int posIncr = 1;
 159
 160     public PendingOutputs() {
 161       outputs = new CharsRef[1];
 162     }
 163
 164     public void reset() {
 165       upto = count = 0;
 166       posIncr = 1;
 167     }
 168
 169     public CharsRef pullNext() {
 170       assert upto < count;
 171       final CharsRef result = outputs[upto++];
 172       posIncr = 0;
 173       if (upto == count) {
 174         reset();
 175       }
 176       return result;
 177     }
 178
 179     public void add(char[] output, int offset, int len) {
 180       if (count == outputs.length) {
 181         final CharsRef[] next = new CharsRef[ArrayUtil.oversize(1+count, RamUsageEstimator.NUM_BYTES_OBJECT_REF)];
 182         System.arraycopy(outputs, 0, next, 0, count);
 183         outputs = next;
 184       }
 185       if (outputs[count] == null) {
 186         outputs[count] = new CharsRef();
 187       }
 188       outputs[count].copy(output, offset, len);
 189       count++;
 190     }
 191   };
 192
 193   private final ByteArrayDataInput bytesReader = new ByteArrayDataInput();
 194
 195   // Rolling buffer, holding stack of pending synonym
 196   // outputs, indexed by position:
 197   private final PendingOutputs[] futureOutputs;
 198
 199   // Where (in rolling buffers) to write next input saved state:
 200   private int nextWrite;
 201
 202   // Where (in rolling buffers) to read next input saved state:
 203   private int nextRead;
 204
 205   // True once we've read last token
 206   private boolean finished;
 207
 208   private final FST.Arc<BytesRef> scratchArc;
 209
 210   private final FST<BytesRef> fst;
 211
 212   private final BytesRef scratchBytes = new BytesRef();
 213   private final CharsRef scratchChars = new CharsRef();
 214
 215   /**
 216    * @param input input tokenstream
 217    * @param synonyms synonym map
 218    * @param ignoreCase case-folds input for matching with {@link Character#toLowerCase(int)}.
 219    *                   Note, if you set this to true, its your responsibility to lowercase
 220    *                   the input entries when you create the {@link SynonymMap}
 221    */
 222   public SynonymFilter(TokenStream input, SynonymMap synonyms, boolean ignoreCase) {
 223     super(input);
 224     this.synonyms = synonyms;
 225     this.ignoreCase = ignoreCase;
 226     this.fst = synonyms.fst;
 227
 228     if (fst == null) {
 229       throw new IllegalArgumentException("fst must be non-null");
 230     }
 231
 232     // Must be 1+ so that when roll buffer is at full
 233     // lookahead we can distinguish this full buffer from
 234     // the empty buffer:
 235     rollBufferSize = 1+synonyms.maxHorizontalContext;
 236
 237     futureInputs = new PendingInput[rollBufferSize];
 238     futureOutputs = new PendingOutputs[rollBufferSize];
 239     for(int pos=0;pos<rollBufferSize;pos++) {
 240       futureInputs[pos] = new PendingInput();
 241       futureOutputs[pos] = new PendingOutputs();
 242     }
 243
 244     //System.out.println("FSTFilt maxH=" + synonyms.maxHorizontalContext);
 245
 246     scratchArc = new FST.Arc<BytesRef>();
 247   }
 248
 249   private void capture() {
 250     captureCount++;
 251     //System.out.println("  capture slot=" + nextWrite);
 252     final PendingInput input = futureInputs[nextWrite];
 253
 254     input.state = captureState();
 255     input.consumed = false;
 256     input.term.copy(termAtt.buffer(), 0, termAtt.length());
 257
 258     nextWrite = rollIncr(nextWrite);
 259
 260     // Buffer head should never catch up to tail:
 261     assert nextWrite != nextRead;
 262   }
 263
 264   /*
 265    This is the core of this TokenFilter: it locates the
 266    synonym matches and buffers up the results into
 267    futureInputs/Outputs.
 268
 269    NOTE: this calls input.incrementToken and does not
 270    capture the state if no further tokens were checked.  So
 271    caller must then forward state to our caller, or capture:
 272   */
 273
 274   private void parse() throws IOException {
 275     //System.out.println("\nS: parse");
 276
 277     assert inputSkipCount == 0;
 278
 279     int curNextRead = nextRead;
 280
 281     // Holds the longest match we've seen so far:
 282     BytesRef matchOutput = null;
 283     int matchInputLength = 0;
 284
 285     BytesRef pendingOutput = fst.outputs.getNoOutput();
 286     fst.getFirstArc(scratchArc);
 287
 288     assert scratchArc.output == fst.outputs.getNoOutput();
 289
 290     int tokenCount = 0;
 291
 292     byToken:
 293     while(true) {
 294
 295       // Pull next token's chars:
 296       final char[] buffer;
 297       final int bufferLen;
 298       //System.out.println("  cycle nextRead=" + curNextRead + " nextWrite=" + nextWrite);
 299
 300       if (curNextRead == nextWrite) {
 301
 302         // We used up our lookahead buffer of input tokens
 303         // -- pull next real input token:
 304
 305         if (finished) {
 306           break;
 307         } else  {
 308           //System.out.println("  input.incrToken");
 309           assert futureInputs[nextWrite].consumed;
 310           // Not correct: a syn match whose output is longer
 311           // than its input can set future inputs keepOrig
 312           // to true:
 313           //assert !futureInputs[nextWrite].keepOrig;
 314           if (input.incrementToken()) {
 315             buffer = termAtt.buffer();
 316             bufferLen = termAtt.length();
 317             final PendingInput input = futureInputs[nextWrite];
 318             input.startOffset = offsetAtt.startOffset();
 319             input.endOffset = offsetAtt.endOffset();
 320             //System.out.println("  new token=" + new String(buffer, 0, bufferLen));
 321             if (nextRead != nextWrite) {
 322               capture();
 323             } else {
 324               input.consumed = false;
 325             }
 326
 327           } else {
 328             // No more input tokens
 329             //System.out.println("      set end");
 330             finished = true;
 331             break;
 332           }
 333         }
 334       } else {
 335         // Still in our lookahead
 336         buffer = futureInputs[curNextRead].term.chars;
 337         bufferLen = futureInputs[curNextRead].term.length;
 338         //System.out.println("  old token=" + new String(buffer, 0, bufferLen));
 339       }
 340
 341       tokenCount++;
 342
 343       // Run each char in this token through the FST:
 344       int bufUpto = 0;
 345       while(bufUpto < bufferLen) {
 346         final int codePoint = Character.codePointAt(buffer, bufUpto, bufferLen);
 347         if (fst.findTargetArc(ignoreCase ? Character.toLowerCase(codePoint) : codePoint, scratchArc, scratchArc) == null) {
 348           //System.out.println("    stop");
 349           break byToken;
 350         }
 351
 352         // Accum the output
 353         pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
 354         //System.out.println("    char=" + buffer[bufUpto] + " output=" + pendingOutput + " arc.output=" + scratchArc.output);
 355         bufUpto += Character.charCount(codePoint);
 356       }
 357
 358       // OK, entire token matched; now see if this is a final
 359       // state:
 360       if (scratchArc.isFinal()) {
 361         matchOutput = fst.outputs.add(pendingOutput, scratchArc.nextFinalOutput);
 362         matchInputLength = tokenCount;
 363         //System.out.println("  found matchLength=" + matchInputLength + " output=" + matchOutput);
 364       }
 365
 366       // See if the FST wants to continue matching (ie, needs to
 367       // see the next input token):
 368       if (fst.findTargetArc(SynonymMap.WORD_SEPARATOR, scratchArc, scratchArc) == null) {
 369         // No further rules can match here; we're done
 370         // searching for matching rules starting at the
 371         // current input position.
 372         break;
 373       } else {
 374         // More matching is possible -- accum the output (if
 375         // any) of the WORD_SEP arc:
 376         pendingOutput = fst.outputs.add(pendingOutput, scratchArc.output);
 377         if (nextRead == nextWrite) {
 378           capture();
 379         }
 380       }
 381
 382       curNextRead = rollIncr(curNextRead);
 383     }
 384
 385     if (nextRead == nextWrite && !finished) {
 386       //System.out.println("  skip write slot=" + nextWrite);
 387       nextWrite = rollIncr(nextWrite);
 388     }
 389
 390     if (matchOutput != null) {
 391       //System.out.println("  add matchLength=" + matchInputLength + " output=" + matchOutput);
 392       inputSkipCount = matchInputLength;
 393       addOutput(matchOutput, matchInputLength);
 394     } else if (nextRead != nextWrite) {
 395       // Even though we had no match here, we set to 1
 396       // because we need to skip current input token before
 397       // trying to match again:
 398       inputSkipCount = 1;
 399     } else {
 400       assert finished;
 401     }
 402
 403     //System.out.println("  parse done inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
 404   }
 405
 406   // Interleaves all output tokens onto the futureOutputs:
 407   private void addOutput(BytesRef bytes, int matchInputLength) {
 408     bytesReader.reset(bytes.bytes, bytes.offset, bytes.length);
 409
 410     final int code = bytesReader.readVInt();
 411     final boolean keepOrig = (code & 0x1) == 0;
 412     final int count = code >>> 1;
 413     //System.out.println("  addOutput count=" + count + " keepOrig=" + keepOrig);
 414     for(int outputIDX=0;outputIDX<count;outputIDX++) {
 415       synonyms.words.get(bytesReader.readVInt(),
 416                          scratchBytes);
 417       //System.out.println("    outIDX=" + outputIDX + " bytes=" + scratchBytes.length);
 418       UnicodeUtil.UTF8toUTF16(scratchBytes, scratchChars);
 419       int lastStart = scratchChars.offset;
 420       final int chEnd = lastStart + scratchChars.length;
 421       int outputUpto = nextRead;
 422       for(int chIDX=lastStart;chIDX<=chEnd;chIDX++) {
 423         if (chIDX == chEnd || scratchChars.chars[chIDX] == SynonymMap.WORD_SEPARATOR) {
 424           final int outputLen = chIDX - lastStart;
 425           // Caller is not allowed to have empty string in
 426           // the output:
 427           assert outputLen > 0: "output contains empty string: " + scratchChars;
 428           futureOutputs[outputUpto].add(scratchChars.chars, lastStart, outputLen);
 429           //System.out.println("      " + new String(scratchChars.chars, lastStart, outputLen) + " outputUpto=" + outputUpto);
 430           lastStart = 1+chIDX;
 431           //System.out.println("  slot=" + outputUpto + " keepOrig=" + keepOrig);
 432           outputUpto = rollIncr(outputUpto);
 433           assert futureOutputs[outputUpto].posIncr == 1: "outputUpto=" + outputUpto + " vs nextWrite=" + nextWrite;
 434         }
 435       }
 436     }
 437
 438     int upto = nextRead;
 439     for(int idx=0;idx<matchInputLength;idx++) {
 440       futureInputs[upto].keepOrig |= keepOrig;
 441       futureInputs[upto].matched = true;
 442       upto = rollIncr(upto);
 443     }
 444   }
 445
 446   // ++ mod rollBufferSize
 447   private int rollIncr(int count) {
 448     count++;
 449     if (count == rollBufferSize) {
 450       return 0;
 451     } else {
 452       return count;
 453     }
 454   }
 455
 456   // for testing
 457   int getCaptureCount() {
 458     return captureCount;
 459   }
 460
 461   @Override
 462   public boolean incrementToken() throws IOException {
 463
 464     //System.out.println("\nS: incrToken inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead + " nextWrite=" + nextWrite);
 465
 466     while(true) {
 467
 468       // First play back any buffered future inputs/outputs
 469       // w/o running parsing again:
 470       while (inputSkipCount != 0) {
 471
 472         // At each position, we first output the original
 473         // token
 474
 475         // TODO: maybe just a PendingState class, holding
 476         // both input & outputs?
 477         final PendingInput input = futureInputs[nextRead];
 478         final PendingOutputs outputs = futureOutputs[nextRead];
 479
 480         //System.out.println("  cycle nextRead=" + nextRead + " nextWrite=" + nextWrite + " inputSkipCount="+ inputSkipCount + " input.keepOrig=" + input.keepOrig + " input.consumed=" + input.consumed + " input.state=" + input.state);
 481
 482         if (!input.consumed && (input.keepOrig || !input.matched)) {
 483           if (input.state != null) {
 484             // Return a previously saved token (because we
 485             // had to lookahead):
 486             restoreState(input.state);
 487           } else {
 488             // Pass-through case: return token we just pulled
 489             // but didn't capture:
 490             assert inputSkipCount == 1: "inputSkipCount=" + inputSkipCount + " nextRead=" + nextRead;
 491           }
 492           input.reset();
 493           if (outputs.count > 0) {
 494             outputs.posIncr = 0;
 495           } else {
 496             nextRead = rollIncr(nextRead);
 497             inputSkipCount--;
 498           }
 499           //System.out.println("  return token=" + termAtt.toString());
 500           return true;
 501         } else if (outputs.upto < outputs.count) {
 502           // Still have pending outputs to replay at this
 503           // position
 504           input.reset();
 505           final int posIncr = outputs.posIncr;
 506           final CharsRef output = outputs.pullNext();
 507           clearAttributes();
 508           termAtt.copyBuffer(output.chars, output.offset, output.length);
 509           typeAtt.setType(TYPE_SYNONYM);
 510           offsetAtt.setOffset(input.startOffset, input.endOffset);
 511           posIncrAtt.setPositionIncrement(posIncr);
 512           if (outputs.count == 0) {
 513             // Done with the buffered input and all outputs at
 514             // this position
 515             nextRead = rollIncr(nextRead);
 516             inputSkipCount--;
 517           }
 518           //System.out.println("  return token=" + termAtt.toString());
 519           return true;
 520         } else {
 521           // Done with the buffered input and all outputs at
 522           // this position
 523           input.reset();
 524           nextRead = rollIncr(nextRead);
 525           inputSkipCount--;
 526         }
 527       }
 528
 529       if (finished && nextRead == nextWrite) {
 530         // End case: if any output syns went beyond end of
 531         // input stream, enumerate them now:
 532         final PendingOutputs outputs = futureOutputs[nextRead];
 533         if (outputs.upto < outputs.count) {
 534           final int posIncr = outputs.posIncr;
 535           final CharsRef output = outputs.pullNext();
 536           futureInputs[nextRead].reset();
 537           if (outputs.count == 0) {
 538             nextWrite = nextRead = rollIncr(nextRead);
 539           }
 540           clearAttributes();
 541           termAtt.copyBuffer(output.chars, output.offset, output.length);
 542           typeAtt.setType(TYPE_SYNONYM);
 543           //System.out.println("  set posIncr=" + outputs.posIncr + " outputs=" + outputs);
 544           posIncrAtt.setPositionIncrement(posIncr);
 545           //System.out.println("  return token=" + termAtt.toString());
 546           return true;
 547         } else {
 548           return false;
 549         }
 550       }
 551
 552       // Find new synonym matches:
 553       parse();
 554     }
 555   }
 556
 557   @Override
 558   public void reset() throws IOException {
 559     super.reset();
 560     captureCount = 0;
 561     finished = false;
 562
 563     // In normal usage these resets would not be needed,
 564     // since they reset-as-they-are-consumed, but the app
 565     // may not consume all input tokens in which case we
 566     // have leftover state here:
 567     for (PendingInput input : futureInputs) {
 568       input.reset();
 569     }
 570     for (PendingOutputs output : futureOutputs) {
 571       output.reset();
 572     }
 573   }
 574 }