package org.apache.lucene.analysis.compound;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;
/**
 * Base class for decomposition token filters. <a name="version"/>
 * <p>
 * You must specify the required {@link Version} compatibility when creating
 * CompoundWordTokenFilterBase:
 * <ul>
 *   <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
 *       supplementary characters in strings and char arrays provided as compound word
 *       dictionaries.</li>
 * </ul>
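 * <p>
 * A rough usage sketch (assumes the DictionaryCompoundWordTokenFilter subclass from this
 * package wrapped around a whitespace tokenizer; the dictionary entries are illustrative only):
 * <pre>
 *   TokenStream source = new WhitespaceTokenizer(matchVersion, reader);
 *   TokenStream stream = new DictionaryCompoundWordTokenFilter(matchVersion, source,
 *       new String[] { "Donau", "Dampf", "Schiff" });
 * </pre>
 */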
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
  /**
   * The default for minimal word length that gets decomposed
   */
  public static final int DEFAULT_MIN_WORD_SIZE = 5;

  /**
   * The default for minimal length of subwords that get propagated to the output of this filter
   */
  public static final int DEFAULT_MIN_SUBWORD_SIZE = 2;

  /**
   * The default for maximal length of subwords that get propagated to the output of this filter
   */
  public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;

  protected final CharArraySet dictionary;
  protected final LinkedList<Token> tokens;
  protected final int minWordSize;
  protected final int minSubwordSize;
  protected final int maxSubwordSize;
  protected final boolean onlyLongestMatch;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);

  private final Token wrapper = new Token();
  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], int, int, int, boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, makeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[])} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
    this(Version.LUCENE_30, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary) {
    this(Version.LUCENE_30, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, int, int, int, boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(matchVersion, input, makeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
    this(matchVersion, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
    this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
    this(matchVersion, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
    this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input);

    this.tokens = new LinkedList<Token>();
    this.minWordSize = minWordSize;
    this.minSubwordSize = minSubwordSize;
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;

    if (dictionary == null || dictionary instanceof CharArraySet) {
      // a CharArraySet (or null) is used as-is
      this.dictionary = (CharArraySet) dictionary;
    } else {
      // otherwise copy the entries into a lowercased CharArraySet
      this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
      addAllLowerCase(this.dictionary, dictionary);
    }
  }
  /**
   * Create a set of words from an array.
   * The resulting Set does case-insensitive matching.
   * TODO We should look for a faster dictionary lookup approach.
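   *
   * A short usage sketch (the dictionary entries are made up for illustration):
   * <pre>
   *   Set&lt;?&gt; dict = CompoundWordTokenFilterBase.makeDictionary(
   *       new String[] { "Donau", "Dampf", "Schiff" });
   * </pre>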
   * @return {@link Set} of lowercased terms
   */
  public static final Set<?> makeDictionary(final String[] dictionary) {
    return makeDictionary(Version.LUCENE_30, dictionary);
  }
  public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
    if (dictionary == null) {
      return null;
    }
    // is the below really case insensitive?
    CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
    addAllLowerCase(dict, Arrays.asList(dictionary));
    return dict;
  }
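  /** Copies the state of a buffered {@link Token} back into this stream's attributes. */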
  private final void setToken(final Token token) throws IOException {
    clearAttributes();
    termAtt.copyBuffer(token.buffer(), 0, token.length());
    flagsAtt.setFlags(token.getFlags());
    typeAtt.setType(token.type());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    posIncAtt.setPositionIncrement(token.getPositionIncrement());
    payloadAtt.setPayload(token.getPayload());
  }
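  /*
   * Token flow in incrementToken(): subword tokens queued by decompose() are returned
   * first; only when the queue is empty is the next token pulled from the input, copied
   * into the reusable wrapper, and decomposed, refilling the queue for subsequent calls.
   */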
  @Override
  public final boolean incrementToken() throws IOException {
    if (tokens.size() > 0) {
      setToken(tokens.removeFirst());
      return true;
    }

    if (!input.incrementToken())
      return false;

    wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    wrapper.setStartOffset(offsetAtt.startOffset());
    wrapper.setEndOffset(offsetAtt.endOffset());
    wrapper.setFlags(flagsAtt.getFlags());
    wrapper.setType(typeAtt.type());
    wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
    wrapper.setPayload(payloadAtt.getPayload());

    decompose(wrapper);

    if (tokens.size() > 0) {
      setToken(tokens.removeFirst());
      return true;
    } else {
      return false;
    }
  }
  protected static final void addAllLowerCase(CharArraySet target, Collection<?> col) {
    for (Object obj : col) {
      String string = (String) obj;
      target.add(string.toLowerCase(Locale.ENGLISH));
    }
  }
  protected static char[] makeLowerCaseCopy(final char[] buffer) {
    char[] result = new char[buffer.length];
    System.arraycopy(buffer, 0, result, 0, buffer.length);

    for (int i = 0; i < buffer.length; ++i) {
      result[i] = Character.toLowerCase(buffer[i]);
    }

    return result;
  }
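  /**
   * Creates a subword token covering {@code length} characters of the prototype's buffer,
   * starting at {@code offset}. Start/end offsets are shifted relative to the prototype and
   * the position increment is set to 0, so subwords stack at the original token's position.
   */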
  protected final Token createToken(final int offset, final int length,
      final Token prototype) {
    int newStart = prototype.startOffset() + offset;
    Token t = prototype.clone(prototype.buffer(), offset, length, newStart, newStart + length);
    t.setPositionIncrement(0);
    return t;
  }
  protected void decompose(final Token token) {
    // In any case we give the original token back
    tokens.add((Token) token.clone());

    // Only words longer than minWordSize get processed
    if (token.length() < this.minWordSize) {
      return;
    }

    decomposeInternal(token);
  }
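  /**
   * Decomposes the given token into subwords; implementations are expected to add the
   * subword {@link Token}s they find (typically built with {@link #createToken}) to
   * {@link #tokens}. Only called for tokens of at least {@code minWordSize} characters.
   */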
  protected abstract void decomposeInternal(final Token token);
  @Override
  public void reset() throws IOException {
    super.reset();
    tokens.clear();
  }
}