lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/DictionaryCompoundWordTokenFilter.java

   1 package org.apache.lucene.analysis.compound;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20
  21 import java.util.Set;
  22
  23 import org.apache.lucene.analysis.TokenFilter; // for javadocs
  24 import org.apache.lucene.analysis.TokenStream;
  25 import org.apache.lucene.util.Version;
  26
  27 /**
  28  * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
  29  * <p>
  30  * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
  31  * "Donaudampfschiff" even when you only enter "schiff".
  32  *  It uses a brute-force algorithm to achieve this.
  33  * <p>
  34  * You must specify the required {@link Version} compatibility when creating
  35  * CompoundWordTokenFilterBase:
  36  * <ul>
  37  * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
  38  * supplementary characters in strings and char arrays provided as compound word
  39  * dictionaries.
  40  * </ul>
  41  * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
  42  * it should be case-insensitive unless it contains only lowercased entries and you
  43  * have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
  44  * For optimal performance (as this filter does lots of lookups to the dictionary),
  45  * you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
  46  * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
  47  * transformed to case-insensitive!
  48  */
  49 public class DictionaryCompoundWordTokenFilter extends CompoundWordTokenFilterBase {
  50
  51   /**
  52    * Creates a new {@link DictionaryCompoundWordTokenFilter}.
  53    * @param input the {@link TokenStream} to process
  54    * @param dictionary the word dictionary to match against
  55    * @param minWordSize only words longer than this get processed
  56    * @param minSubwordSize only subwords longer than this get to the output stream
  57    * @param maxSubwordSize only subwords shorter than this get to the output stream
  58    * @param onlyLongestMatch Add only the longest matching subword to the stream
  59    * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[], int, int, int, boolean)} instead
  60    */
  61   @Deprecated
  62   public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary,
  63       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  64     super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  65   }
  66
  67   /**
  68    * Creates a new {@link DictionaryCompoundWordTokenFilter}
  69    *
  70    * @param input the {@link TokenStream} to process
  71    * @param dictionary the word dictionary to match against
  72    * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, String[])} instead
  73    */
  74   @Deprecated
  75   public DictionaryCompoundWordTokenFilter(TokenStream input, String[] dictionary) {
  76     super(Version.LUCENE_30, input, dictionary);
  77   }
  78
  79   /**
  80    * Creates a new {@link DictionaryCompoundWordTokenFilter}
  81    *
  82    * @param input the {@link TokenStream} to process
  83    * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
  84    *        lower case strings.
  85    * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set)} instead
  86    */
  87   @Deprecated
  88   public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary) {
  89     super(Version.LUCENE_30, input, dictionary);
  90   }
  91
  92   /**
  93    * Creates a new {@link DictionaryCompoundWordTokenFilter}
  94    *
  95    * @param input the {@link TokenStream} to process
  96    * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
  97    *        lower case strings.
  98    * @param minWordSize only words longer than this get processed
  99    * @param minSubwordSize only subwords longer than this get to the output stream
 100    * @param maxSubwordSize only subwords shorter than this get to the output stream
 101    * @param onlyLongestMatch Add only the longest matching subword to the stream
 102    * @deprecated use {@link #DictionaryCompoundWordTokenFilter(Version, TokenStream, Set, int, int, int, boolean)} instead
 103    */
 104   @Deprecated
 105   public DictionaryCompoundWordTokenFilter(TokenStream input, Set dictionary,
 106       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 107     super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
 108   }
 109
 110   /**
 111    * Creates a new {@link DictionaryCompoundWordTokenFilter}
 112    *
 113    * @param matchVersion
 114    *          Lucene version to enable correct Unicode 4.0 behavior in the
 115    *          dictionaries if Version > 3.0. See <a
 116    *          href="CompoundWordTokenFilterBase#version"
 117    *          >CompoundWordTokenFilterBase</a> for details.
 118    * @param input
 119    *          the {@link TokenStream} to process
 120    * @param dictionary
 121    *          the word dictionary to match against
 122    * @param minWordSize
 123    *          only words longer than this get processed
 124    * @param minSubwordSize
 125    *          only subwords longer than this get to the output stream
 126    * @param maxSubwordSize
 127    *          only subwords shorter than this get to the output stream
 128    * @param onlyLongestMatch
 129    *          Add only the longest matching subword to the stream
 130    * @deprecated Use the constructors taking {@link Set}
 131    */
 132   @Deprecated
 133   public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary,
 134       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 135     super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
 136   }
 137
 138   /**
 139    * Creates a new {@link DictionaryCompoundWordTokenFilter}
 140    *
 141    * @param matchVersion
 142    *          Lucene version to enable correct Unicode 4.0 behavior in the
 143    *          dictionaries if Version > 3.0. See <a
 144    *          href="CompoundWordTokenFilterBase#version"
 145    *          >CompoundWordTokenFilterBase</a> for details.
 146    *
 147    * @param input
 148    *          the {@link TokenStream} to process
 149    * @param dictionary
 150    *          the word dictionary to match against
 151    * @deprecated Use the constructors taking {@link Set}
 152    */
 153   @Deprecated
 154   public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, String[] dictionary) {
 155     super(matchVersion, input, dictionary);
 156   }
 157
 158   /**
 159    * Creates a new {@link DictionaryCompoundWordTokenFilter}
 160    *
 161    * @param matchVersion
 162    *          Lucene version to enable correct Unicode 4.0 behavior in the
 163    *          dictionaries if Version > 3.0. See <a
 164    *          href="CompoundWordTokenFilterBase#version"
 165    *          >CompoundWordTokenFilterBase</a> for details.
 166    * @param input
 167    *          the {@link TokenStream} to process
 168    * @param dictionary
 169    *          the word dictionary to match against.
 170    */
 171   public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary) {
 172     super(matchVersion, input, dictionary);
 173   }
 174
 175   /**
 176    * Creates a new {@link DictionaryCompoundWordTokenFilter}
 177    *
 178    * @param matchVersion
 179    *          Lucene version to enable correct Unicode 4.0 behavior in the
 180    *          dictionaries if Version > 3.0. See <a
 181    *          href="CompoundWordTokenFilterBase#version"
 182    *          >CompoundWordTokenFilterBase</a> for details.
 183    * @param input
 184    *          the {@link TokenStream} to process
 185    * @param dictionary
 186    *          the word dictionary to match against.
 187    * @param minWordSize
 188    *          only words longer than this get processed
 189    * @param minSubwordSize
 190    *          only subwords longer than this get to the output stream
 191    * @param maxSubwordSize
 192    *          only subwords shorter than this get to the output stream
 193    * @param onlyLongestMatch
 194    *          Add only the longest matching subword to the stream
 195    */
 196   public DictionaryCompoundWordTokenFilter(Version matchVersion, TokenStream input, Set<?> dictionary,
 197       int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 198     super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
 199   }
 200
 201   @Override
 202   protected void decompose() {
 203     final int len = termAtt.length();
 204     for (int i=0;i<=len-this.minSubwordSize;++i) {
 205         CompoundToken longestMatchToken=null;
 206         for (int j=this.minSubwordSize;j<=this.maxSubwordSize;++j) {
 207             if(i+j>len) {
 208                 break;
 209             }
 210             if(dictionary.contains(termAtt.buffer(), i, j)) {
 211                 if (this.onlyLongestMatch) {
 212                    if (longestMatchToken!=null) {
 213                      if (longestMatchToken.txt.length()<j) {
 214                        longestMatchToken=new CompoundToken(i,j);
 215                      }
 216                    } else {
 217                      longestMatchToken=new CompoundToken(i,j);
 218                    }
 219                 } else {
 220                    tokens.add(new CompoundToken(i,j));
 221                 }
 222             }
 223         }
 224         if (this.onlyLongestMatch && longestMatchToken!=null) {
 225           tokens.add(longestMatchToken);
 226         }
 227     }
 228   }
 229 }