lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java

   1 package org.apache.lucene.analysis.compound;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.File;
  21 import java.io.Reader;
  22 import java.util.Set;
  23
  24 import org.apache.lucene.analysis.TokenFilter; // for javadocs
  25 import org.apache.lucene.analysis.TokenStream;
  26 import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
  27 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
  28 import org.apache.lucene.util.Version;
  29 import org.xml.sax.InputSource;
  30
  31 /**
  32  * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
  33  * <p>
  34  * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
  35  * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
  36  * grammar and a word dictionary to achieve this.
  37  * <p>
  38  * You must specify the required {@link Version} compatibility when creating
  39  * CompoundWordTokenFilterBase:
  40  * <ul>
  41  * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
  42  * supplementary characters in strings and char arrays provided as compound word
  43  * dictionaries.
  44  * </ul>
  45  * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
  46  * it should be case-insensitive unless it contains only lowercased entries and you
  47  * have {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
  48  * For optional performance (as this filter does lots of lookups to the dictionary,
  49  * you should use the latter analysis chain/CharArraySet). Be aware: If you supply arbitrary
  50  * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
  51  * transformed to case-insensitive!
  52  */
  53 public class HyphenationCompoundWordTokenFilter extends
  54     CompoundWordTokenFilterBase {
  55   private HyphenationTree hyphenator;
  56
  57   /**
  58    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
  59    *
  60    * @param matchVersion
  61    *          Lucene version to enable correct Unicode 4.0 behavior in the
  62    *          dictionaries if Version > 3.0. See <a
  63    *          href="CompoundWordTokenFilterBase#version"
  64    *          >CompoundWordTokenFilterBase</a> for details.
  65    * @param input
  66    *          the {@link TokenStream} to process
  67    * @param hyphenator
  68    *          the hyphenation pattern tree to use for hyphenation
  69    * @param dictionary
  70    *          the word dictionary to match against
  71    * @param minWordSize
  72    *          only words longer than this get processed
  73    * @param minSubwordSize
  74    *          only subwords longer than this get to the output stream
  75    * @param maxSubwordSize
  76    *          only subwords shorter than this get to the output stream
  77    * @param onlyLongestMatch
  78    *          Add only the longest matching subword to the stream
  79    * @deprecated Use the constructors taking {@link Set}
  80    */
  81   @Deprecated
  82   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
  83       HyphenationTree hyphenator, String[] dictionary, int minWordSize,
  84       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  85     this(matchVersion, input, hyphenator, makeDictionary(matchVersion, dictionary), minWordSize,
  86         minSubwordSize, maxSubwordSize, onlyLongestMatch);
  87   }
  88
  89   /**
  90    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
  91    *
  92    * @param matchVersion
  93    *          Lucene version to enable correct Unicode 4.0 behavior in the
  94    *          dictionaries if Version > 3.0. See <a
  95    *          href="CompoundWordTokenFilterBase#version"
  96    *          >CompoundWordTokenFilterBase</a> for details.
  97    * @param input
  98    *          the {@link TokenStream} to process
  99    * @param hyphenator
 100    *          the hyphenation pattern tree to use for hyphenation
 101    * @param dictionary
 102    *          the word dictionary to match against
 103    * @deprecated Use the constructors taking {@link Set}
 104    */
 105   @Deprecated
 106   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
 107       HyphenationTree hyphenator, String[] dictionary) {
 108     this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30,dictionary), DEFAULT_MIN_WORD_SIZE,
 109         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
 110   }
 111
 112   /**
 113    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 114    *
 115    * @param matchVersion
 116    *          Lucene version to enable correct Unicode 4.0 behavior in the
 117    *          dictionaries if Version > 3.0. See <a
 118    *          href="CompoundWordTokenFilterBase#version"
 119    *          >CompoundWordTokenFilterBase</a> for details.
 120    * @param input
 121    *          the {@link TokenStream} to process
 122    * @param hyphenator
 123    *          the hyphenation pattern tree to use for hyphenation
 124    * @param dictionary
 125    *          the word dictionary to match against.
 126    */
 127   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
 128       HyphenationTree hyphenator, Set<?> dictionary) {
 129     this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
 130         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
 131   }
 132
 133   /**
 134    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 135    *
 136    * @param matchVersion
 137    *          Lucene version to enable correct Unicode 4.0 behavior in the
 138    *          dictionaries if Version > 3.0. See <a
 139    *          href="CompoundWordTokenFilterBase#version"
 140    *          >CompoundWordTokenFilterBase</a> for details.
 141    * @param input
 142    *          the {@link TokenStream} to process
 143    * @param hyphenator
 144    *          the hyphenation pattern tree to use for hyphenation
 145    * @param dictionary
 146    *          the word dictionary to match against.
 147    * @param minWordSize
 148    *          only words longer than this get processed
 149    * @param minSubwordSize
 150    *          only subwords longer than this get to the output stream
 151    * @param maxSubwordSize
 152    *          only subwords shorter than this get to the output stream
 153    * @param onlyLongestMatch
 154    *          Add only the longest matching subword to the stream
 155    */
 156   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
 157       HyphenationTree hyphenator, Set<?> dictionary, int minWordSize,
 158       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 159     super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
 160         onlyLongestMatch);
 161
 162     this.hyphenator = hyphenator;
 163   }
 164
 165   /**
 166    * Create a HyphenationCompoundWordTokenFilter with no dictionary.
 167    * <p>
 168    * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)
 169    * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
 170    * null, minWordSize, minSubwordSize, maxSubwordSize }
 171    */
 172   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
 173       HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
 174       int maxSubwordSize) {
 175     this(matchVersion, input, hyphenator, (Set<?>) null, minWordSize, minSubwordSize,
 176         maxSubwordSize, false);
 177   }
 178
 179   /**
 180    * Create a HyphenationCompoundWordTokenFilter with no dictionary.
 181    * <p>
 182    * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int)
 183    * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
 184    * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE }
 185    */
 186   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
 187       HyphenationTree hyphenator) {
 188     this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
 189         DEFAULT_MAX_SUBWORD_SIZE);
 190   }
 191
 192   /**
 193    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 194    *
 195    * @param input the {@link TokenStream} to process
 196    * @param hyphenator the hyphenation pattern tree to use for hyphenation
 197    * @param dictionary the word dictionary to match against
 198    * @param minWordSize only words longer than this get processed
 199    * @param minSubwordSize only subwords longer than this get to the output
 200    *        stream
 201    * @param maxSubwordSize only subwords shorter than this get to the output
 202    *        stream
 203    * @param onlyLongestMatch Add only the longest matching subword to the stream
 204    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[], int, int, int, boolean)} instead.
 205    */
 206   @Deprecated
 207   public HyphenationCompoundWordTokenFilter(TokenStream input,
 208       HyphenationTree hyphenator, String[] dictionary, int minWordSize,
 209       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 210     this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30, dictionary), minWordSize,
 211         minSubwordSize, maxSubwordSize, onlyLongestMatch);
 212   }
 213
 214   /**
 215    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 216    *
 217    * @param input the {@link TokenStream} to process
 218    * @param hyphenator the hyphenation pattern tree to use for hyphenation
 219    * @param dictionary the word dictionary to match against
 220    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[])} instead.
 221    */
 222   @Deprecated
 223   public HyphenationCompoundWordTokenFilter(TokenStream input,
 224       HyphenationTree hyphenator, String[] dictionary) {
 225     this(Version.LUCENE_30, input, hyphenator, makeDictionary(Version.LUCENE_30, dictionary), DEFAULT_MIN_WORD_SIZE,
 226         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
 227   }
 228
 229   /**
 230    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 231    *
 232    * @param input the {@link TokenStream} to process
 233    * @param hyphenator the hyphenation pattern tree to use for hyphenation
 234    * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
 235    *        lower case strings.
 236    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set)} instead.
 237    */
 238   @Deprecated
 239   public HyphenationCompoundWordTokenFilter(TokenStream input,
 240       HyphenationTree hyphenator, Set<?> dictionary) {
 241     this(Version.LUCENE_30, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
 242         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
 243   }
 244
 245   /**
 246    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 247    *
 248    * @param input the {@link TokenStream} to process
 249    * @param hyphenator the hyphenation pattern tree to use for hyphenation
 250    * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
 251    *        lower case strings.
 252    * @param minWordSize only words longer than this get processed
 253    * @param minSubwordSize only subwords longer than this get to the output
 254    *        stream
 255    * @param maxSubwordSize only subwords shorter than this get to the output
 256    *        stream
 257    * @param onlyLongestMatch Add only the longest matching subword to the stream
 258    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)} instead.
 259    */
 260   @Deprecated
 261   public HyphenationCompoundWordTokenFilter(TokenStream input,
 262       HyphenationTree hyphenator, Set<?> dictionary, int minWordSize,
 263       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 264     super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
 265         onlyLongestMatch);
 266
 267     this.hyphenator = hyphenator;
 268   }
 269
 270   /**
 271    * Create a hyphenator tree
 272    *
 273    * @param hyphenationFilename the filename of the XML grammar to load
 274    * @return An object representing the hyphenation patterns
 275    * @throws Exception
 276    */
 277   public static HyphenationTree getHyphenationTree(String hyphenationFilename)
 278       throws Exception {
 279     return getHyphenationTree(new InputSource(hyphenationFilename));
 280   }
 281
 282   /**
 283    * Create a hyphenator tree
 284    *
 285    * @param hyphenationFile the file of the XML grammar to load
 286    * @return An object representing the hyphenation patterns
 287    * @throws Exception
 288    */
 289   public static HyphenationTree getHyphenationTree(File hyphenationFile)
 290       throws Exception {
 291     return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm()));
 292   }
 293
 294   /**
 295    * Create a hyphenator tree
 296    *
 297    * @param hyphenationReader the reader of the XML grammar to load from
 298    * @return An object representing the hyphenation patterns
 299    * @throws Exception
 300    * @deprecated Don't use Readers with fixed charset to load XML files, unless programatically created.
 301    * Use {@link #getHyphenationTree(InputSource)} instead, where you can supply default charset and input
 302    * stream, if you like.
 303    */
 304   @Deprecated
 305   public static HyphenationTree getHyphenationTree(Reader hyphenationReader)
 306       throws Exception {
 307     final InputSource is = new InputSource(hyphenationReader);
 308     // we need this to load the DTD in very old parsers (like the one in JDK 1.4).
 309     // The DTD itsself is provided via EntityResolver, so it should always load, but
 310     // some parsers still want to have a base URL (Crimson).
 311     is.setSystemId("urn:java:" + HyphenationTree.class.getName());
 312     return getHyphenationTree(is);
 313   }
 314
 315   /**
 316    * Create a hyphenator tree
 317    *
 318    * @param hyphenationSource the InputSource pointing to the XML grammar
 319    * @return An object representing the hyphenation patterns
 320    * @throws Exception
 321    */
 322   public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
 323       throws Exception {
 324     HyphenationTree tree = new HyphenationTree();
 325     tree.loadPatterns(hyphenationSource);
 326     return tree;
 327   }
 328
 329   @Override
 330   protected void decompose() {
 331     // get the hyphenation points
 332     Hyphenation hyphens = hyphenator.hyphenate(termAtt.buffer(), 0, termAtt.length(), 1, 1);
 333     // No hyphen points found -> exit
 334     if (hyphens == null) {
 335       return;
 336     }
 337
 338     final int[] hyp = hyphens.getHyphenationPoints();
 339
 340     for (int i = 0; i < hyp.length; ++i) {
 341       int remaining = hyp.length - i;
 342       int start = hyp[i];
 343       CompoundToken longestMatchToken = null;
 344       for (int j = 1; j < remaining; j++) {
 345         int partLength = hyp[i + j] - start;
 346
 347         // if the part is longer than maxSubwordSize we
 348         // are done with this round
 349         if (partLength > this.maxSubwordSize) {
 350           break;
 351         }
 352
 353         // we only put subwords to the token stream
 354         // that are longer than minPartSize
 355         if (partLength < this.minSubwordSize) {
 356           continue;
 357         }
 358
 359         // check the dictionary
 360         if (dictionary == null || dictionary.contains(termAtt.buffer(), start, partLength)) {
 361           if (this.onlyLongestMatch) {
 362             if (longestMatchToken != null) {
 363               if (longestMatchToken.txt.length() < partLength) {
 364                 longestMatchToken = new CompoundToken(start, partLength);
 365               }
 366             } else {
 367               longestMatchToken = new CompoundToken(start, partLength);
 368             }
 369           } else {
 370             tokens.add(new CompoundToken(start, partLength));
 371           }
 372         } else if (dictionary.contains(termAtt.buffer(), start, partLength - 1)) {
 373           // check the dictionary again with a word that is one character
 374           // shorter
 375           // to avoid problems with genitive 's characters and other binding
 376           // characters
 377           if (this.onlyLongestMatch) {
 378             if (longestMatchToken != null) {
 379               if (longestMatchToken.txt.length() < partLength - 1) {
 380                 longestMatchToken = new CompoundToken(start, partLength - 1);
 381               }
 382             } else {
 383               longestMatchToken = new CompoundToken(start, partLength - 1);
 384             }
 385           } else {
 386             tokens.add(new CompoundToken(start, partLength - 1));
 387           }
 388         }
 389       }
 390       if (this.onlyLongestMatch && longestMatchToken!=null) {
 391         tokens.add(longestMatchToken);
 392       }
 393     }
 394   }
 395 }