lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java

   1 package org.apache.lucene.analysis.compound;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.util.Arrays;
  22 import java.util.Collection;
  23 import java.util.LinkedList;
  24 import java.util.Locale;
  25 import java.util.Set;
  26
  27 import org.apache.lucene.analysis.CharArraySet;
  28 import org.apache.lucene.analysis.TokenFilter;
  29 import org.apache.lucene.analysis.TokenStream;
  30 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  31 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  32 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  33 import org.apache.lucene.util.AttributeSource;
  34 import org.apache.lucene.util.Version;
  35
  36 /**
  37  * Base class for decomposition token filters.
  38  * <p>
  39  * You must specify the required {@link Version} compatibility when creating
  40  * CompoundWordTokenFilterBase:
  41  * <ul>
  42  * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
  43  * supplementary characters in strings and char arrays provided as compound word
  44  * dictionaries.
  45  * </ul>
  46  * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
  47  * it should be case-insensitive unless it contains only lowercased entries and you
  48  * have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
   49  * For optimal performance (as this filter does lots of lookups to the dictionary),
   50  * you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
  51  * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
  52  * transformed to case-insensitive!
  53  */
  54 public abstract class CompoundWordTokenFilterBase extends TokenFilter {
  55   /**
  56    * The default for minimal word length that gets decomposed
  57    */
  58   public static final int DEFAULT_MIN_WORD_SIZE = 5;
  59
  60   /**
  61    * The default for minimal length of subwords that get propagated to the output of this filter
  62    */
  63   public static final int DEFAULT_MIN_SUBWORD_SIZE = 2;
  64
  65   /**
  66    * The default for maximal length of subwords that get propagated to the output of this filter
  67    */
  68   public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
  69
  70   protected final CharArraySet dictionary;
  71   protected final LinkedList<CompoundToken> tokens;
  72   protected final int minWordSize;
  73   protected final int minSubwordSize;
  74   protected final int maxSubwordSize;
  75   protected final boolean onlyLongestMatch;
  76
  77   protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  78   protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  79   private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  80
  81   private AttributeSource.State current;
  82
  83   /**
  84    * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], int, int, int, boolean)} instead
  85    */
  86   @Deprecated
  87   protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  88     this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
  89   }
  90
  91   /**
  92    * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], boolean)} instead
  93    */
  94   @Deprecated
  95   protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
  96     this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  97   }
  98
  99   /**
 100    * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, boolean)} instead
 101    */
 102   @Deprecated
 103   protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
 104     this(Version.LUCENE_30, input, dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
 105   }
 106
 107   /**
 108    * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[])} instead
 109    */
 110   @Deprecated
 111   protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
 112     this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
 113   }
 114
 115   /**
 116    * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set)} instead
 117    */
 118   @Deprecated
 119   protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary) {
 120     this(Version.LUCENE_30, input, dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
 121   }
 122
 123   /**
 124    * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, int, int, int, boolean)} instead
 125    */
 126   @Deprecated
 127   protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 128     this(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
 129   }
 130
 131   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 132     this(matchVersion, input,makeDictionary(matchVersion, dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
 133   }
 134
 135   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
 136     this(matchVersion, input,makeDictionary(matchVersion, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
 137   }
 138
 139   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
 140     this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
 141   }
 142
 143   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
 144     this(matchVersion, input,makeDictionary(matchVersion, dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
 145   }
 146
 147   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
 148     this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
 149   }
 150
 151   protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 152     super(input);
 153
 154     this.tokens=new LinkedList<CompoundToken>();
 155     this.minWordSize=minWordSize;
 156     this.minSubwordSize=minSubwordSize;
 157     this.maxSubwordSize=maxSubwordSize;
 158     this.onlyLongestMatch=onlyLongestMatch;
 159
 160     if (dictionary==null || dictionary instanceof CharArraySet) {
 161       this.dictionary = (CharArraySet) dictionary;
 162     } else {
 163       this.dictionary = new CharArraySet(matchVersion, dictionary, true);
 164     }
 165   }
 166
 167   /** @deprecated Only available for backwards compatibility. */
 168   @Deprecated
 169   public static CharArraySet makeDictionary(final Version matchVersion, final String[] dictionary) {
 170     if (dictionary == null) {
 171       return null;
 172     }
 173     return new CharArraySet(matchVersion, Arrays.asList(dictionary), true);
 174   }
 175
 176   @Override
 177   public final boolean incrementToken() throws IOException {
 178     if (!tokens.isEmpty()) {
 179       assert current != null;
 180       CompoundToken token = tokens.removeFirst();
 181       restoreState(current); // keep all other attributes untouched
 182       termAtt.setEmpty().append(token.txt);
 183       offsetAtt.setOffset(token.startOffset, token.endOffset);
 184       posIncAtt.setPositionIncrement(0);
 185       return true;
 186     }
 187
 188     current = null; // not really needed, but for safety
 189     if (input.incrementToken()) {
 190       // Only words longer than minWordSize get processed
 191       if (termAtt.length() >= this.minWordSize) {
 192         decompose();
 193         // only capture the state if we really need it for producing new tokens
 194         if (!tokens.isEmpty()) {
 195           current = captureState();
 196         }
 197       }
 198       // return original token:
 199       return true;
 200     } else {
 201       return false;
 202     }
 203   }
 204
 205   /** Decomposes the current {@link #termAtt} and places {@link CompoundToken} instances in the {@link #tokens} list.
 206    * The original token may not be placed in the list, as it is automatically passed through this filter.
 207    */
 208   protected abstract void decompose();
 209
 210   @Override
 211   public void reset() throws IOException {
 212     super.reset();
 213     tokens.clear();
 214     current = null;
 215   }
 216
 217   /**
 218    * Helper class to hold decompounded token information
 219    */
 220   protected class CompoundToken {
 221     public final CharSequence txt;
 222     public final int startOffset, endOffset;
 223
 224     /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
 225     public CompoundToken(int offset, int length) {
 226       final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
 227       this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
 228       // TODO: This ignores the original endOffset, if a CharFilter/Tokenizer/Filter removed
 229       // chars from the term, offsets may not match correctly (other filters producing tokens
 230       // may also have this problem):
 231       this.startOffset = newStart;
 232       this.endOffset = newStart + length;
 233     }
 234
 235   }
 236 }