1 package org.apache.lucene.analysis.compound;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
23 import org.apache.lucene.analysis.Token;
24 import org.apache.lucene.analysis.TokenFilter; // for javadocs
25 import org.apache.lucene.analysis.TokenStream;
26 import org.apache.lucene.util.Version;
/**
 * A {@link TokenFilter} that decomposes compound words found in many Germanic
 * languages.
 * <p>
 * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
 * "Donaudampfschiff" even when you only enter "schiff". It uses a brute-force
 * algorithm to achieve this.
 */
36 public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
39 * Creates a new {@link DictionaryCompoundWordTokenFilter}
41 * @param input the {@link TokenStream} to process
42 * @param dictionary the word dictionary to match against
43 * @param minWordSize only words longer than this get processed
44 * @param minSubwordSize only subwords longer than this get to the output stream
45 * @param maxSubwordSize only subwords shorter than this get to the output stream
46 * @param onlyLongestMatch Add only the longest matching subword to the stream
47 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[], int, int, int, boolean)} instead
50 public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary,
51 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
52 super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
56 * Creates a new {@link DictionaryCompoundWordTokenFilter}
58 * @param input the {@link TokenStream} to process
59 * @param dictionary the word dictionary to match against
60 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[])} instead
63 public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) {
64 super(Version.LUCENE_30, input, dictionary);
68 * Creates a new {@link DictionaryCompoundWordTokenFilter}
70 * @param input the {@link TokenStream} to process
71 * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
73 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set)} instead
76 public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary) {
77 super(Version.LUCENE_30, input, dictionary);
81 * Creates a new {@link DictionaryCompoundWordTokenFilter}
83 * @param input the {@link TokenStream} to process
84 * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
86 * @param minWordSize only words longer than this get processed
87 * @param minSubwordSize only subwords longer than this get to the output stream
88 * @param maxSubwordSize only subwords shorter than this get to the output stream
89 * @param onlyLongestMatch Add only the longest matching subword to the stream
90 * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set, int, int, int, boolean)} instead
93 public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary,
94 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
95 super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
99 * Creates a new {@link DictionaryCompoundWordTokenFilter}
101 * @param matchVersion
102 * Lucene version to enable correct Unicode 4.0 behavior in the
103 * dictionaries if Version > 3.0. See <a
104 * href="CompoundWordTokenFilterBase#version"
105 * >CompoundWordTokenFilterBase</a> for details.
107 * the {@link TokenStream} to process
109 * the word dictionary to match against
111 * only words longer than this get processed
112 * @param minSubwordSize
113 * only subwords longer than this get to the output stream
114 * @param maxSubwordSize
115 * only subwords shorter than this get to the output stream
116 * @param onlyLongestMatch
117 * Add only the longest matching subword to the stream
119 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary,
120 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
121 super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
125 * Creates a new {@link DictionaryCompoundWordTokenFilter}
127 * @param matchVersion
128 * Lucene version to enable correct Unicode 4.0 behavior in the
129 * dictionaries if Version > 3.0. See <a
130 * href="CompoundWordTokenFilterBase#version"
131 * >CompoundWordTokenFilterBase</a> for details.
134 * the {@link TokenStream} to process
136 * the word dictionary to match against
138 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) {
139 super(matchVersion, input, dictionary);
143 * Creates a new {@link DictionaryCompoundWordTokenFilter}
145 * @param matchVersion
146 * Lucene version to enable correct Unicode 4.0 behavior in the
147 * dictionaries if Version > 3.0. See <a
148 * href="CompoundWordTokenFilterBase#version"
149 * >CompoundWordTokenFilterBase</a> for details.
151 * the {@link TokenStream} to process
153 * the word dictionary to match against. If this is a
154 * {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
155 * must have set ignoreCase=false and only contain lower case
158 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary) {
159 super(matchVersion, input, dictionary);
163 * Creates a new {@link DictionaryCompoundWordTokenFilter}
165 * @param matchVersion
166 * Lucene version to enable correct Unicode 4.0 behavior in the
167 * dictionaries if Version > 3.0. See <a
168 * href="CompoundWordTokenFilterBase#version"
169 * >CompoundWordTokenFilterBase</a> for details.
171 * the {@link TokenStream} to process
173 * the word dictionary to match against. If this is a
174 * {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
175 * must have set ignoreCase=false and only contain lower case
178 * only words longer than this get processed
179 * @param minSubwordSize
180 * only subwords longer than this get to the output stream
181 * @param maxSubwordSize
182 * only subwords shorter than this get to the output stream
183 * @param onlyLongestMatch
184 * Add only the longest matching subword to the stream
186 public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set dictionary,
187 int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
188 super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
192 protected void decomposeInternal(final Token token) {
193 // Only words longer than minWordSize get processed
194 if (token.length() < this.minWordSize) {
198 char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
200 for (int i=0;i<token.length()-this.minSubwordSize;++i) {
201 Token longestMatchToken=null;
202 for (int j=this.minSubwordSize-1;j<this.maxSubwordSize;++j) {
203 if(i+j>token.length()) {
206 if(dictionary.contains(lowerCaseTermBuffer, i, j)) {
207 if (this.onlyLongestMatch) {
208 if (longestMatchToken!=null) {
209 if (longestMatchToken.length()<j) {
210 longestMatchToken=createToken(i,j,token);
213 longestMatchToken=createToken(i,j,token);
216 tokens.add(createToken(i,j,token));
220 if (this.onlyLongestMatch && longestMatchToken!=null) {
221 tokens.add(longestMatchToken);