lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/HyphenationCompoundWordTokenFilter.java

   1 package org.apache.lucene.analysis.compound;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.File;
  21 import java.io.Reader;
  22 import java.util.Set;
  23
  24 import org.apache.lucene.analysis.Token;
  25 import org.apache.lucene.analysis.TokenFilter; // for javadocs
  26 import org.apache.lucene.analysis.TokenStream;
  27 import org.apache.lucene.analysis.compound.hyphenation.Hyphenation;
  28 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
  29 import org.apache.lucene.util.Version;
  30 import org.xml.sax.InputSource;
  31
  32 /**
  33  * A {@link TokenFilter} that decomposes compound words found in many Germanic languages.
  34  * <p>
  35  * "Donaudampfschiff" becomes Donau, dampf, schiff so that you can find
  36  * "Donaudampfschiff" even when you only enter "schiff". It uses a hyphenation
  37  * grammar and a word dictionary to achieve this.
  38  * </p>
  39  */
  40 public class HyphenationCompoundWordTokenFilter extends
  41     CompoundWordTokenFilterBase {
  42   private HyphenationTree hyphenator;
  43
  44   /**
  45    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
  46    *
  47    * @param matchVersion
  48    *          Lucene version to enable correct Unicode 4.0 behavior in the
  49    *          dictionaries if Version > 3.0. See <a
  50    *          href="CompoundWordTokenFilterBase#version"
  51    *          >CompoundWordTokenFilterBase</a> for details.
  52    * @param input
  53    *          the {@link TokenStream} to process
  54    * @param hyphenator
  55    *          the hyphenation pattern tree to use for hyphenation
  56    * @param dictionary
  57    *          the word dictionary to match against
  58    * @param minWordSize
  59    *          only words longer than this get processed
  60    * @param minSubwordSize
  61    *          only subwords longer than this get to the output stream
  62    * @param maxSubwordSize
  63    *          only subwords shorter than this get to the output stream
  64    * @param onlyLongestMatch
  65    *          Add only the longest matching subword to the stream
  66    */
  67   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
  68       HyphenationTree hyphenator, String[] dictionary, int minWordSize,
  69       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  70     this(input, hyphenator, makeDictionary(dictionary), minWordSize,
  71         minSubwordSize, maxSubwordSize, onlyLongestMatch);
  72   }
  73
  74   /**
  75    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
  76    *
  77    * @param matchVersion
  78    *          Lucene version to enable correct Unicode 4.0 behavior in the
  79    *          dictionaries if Version > 3.0. See <a
  80    *          href="CompoundWordTokenFilterBase#version"
  81    *          >CompoundWordTokenFilterBase</a> for details.
  82    * @param input
  83    *          the {@link TokenStream} to process
  84    * @param hyphenator
  85    *          the hyphenation pattern tree to use for hyphenation
  86    * @param dictionary
  87    *          the word dictionary to match against
  88    */
  89   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
  90       HyphenationTree hyphenator, String[] dictionary) {
  91     this(input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
  92         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  93   }
  94
  95   /**
  96    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
  97    *
  98    * @param matchVersion
  99    *          Lucene version to enable correct Unicode 4.0 behavior in the
 100    *          dictionaries if Version > 3.0. See <a
 101    *          href="CompoundWordTokenFilterBase#version"
 102    *          >CompoundWordTokenFilterBase</a> for details.
 103    * @param input
 104    *          the {@link TokenStream} to process
 105    * @param hyphenator
 106    *          the hyphenation pattern tree to use for hyphenation
 107    * @param dictionary
 108    *          the word dictionary to match against. If this is a
 109    *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
 110    *          must have set ignoreCase=false and only contain lower case
 111    *          strings.
 112    */
 113   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
 114       HyphenationTree hyphenator, Set<?> dictionary) {
 115     this(input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
 116         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
 117   }
 118
 119   /**
 120    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 121    *
 122    * @param matchVersion
 123    *          Lucene version to enable correct Unicode 4.0 behavior in the
 124    *          dictionaries if Version > 3.0. See <a
 125    *          href="CompoundWordTokenFilterBase#version"
 126    *          >CompoundWordTokenFilterBase</a> for details.
 127    * @param input
 128    *          the {@link TokenStream} to process
 129    * @param hyphenator
 130    *          the hyphenation pattern tree to use for hyphenation
 131    * @param dictionary
 132    *          the word dictionary to match against. If this is a
 133    *          {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it
 134    *          must have set ignoreCase=false and only contain lower case
 135    *          strings.
 136    * @param minWordSize
 137    *          only words longer than this get processed
 138    * @param minSubwordSize
 139    *          only subwords longer than this get to the output stream
 140    * @param maxSubwordSize
 141    *          only subwords shorter than this get to the output stream
 142    * @param onlyLongestMatch
 143    *          Add only the longest matching subword to the stream
 144    */
 145   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
 146       HyphenationTree hyphenator, Set<?> dictionary, int minWordSize,
 147       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 148     super(matchVersion, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
 149         onlyLongestMatch);
 150
 151     this.hyphenator = hyphenator;
 152   }
 153
 154   /**
 155    * Create a HyphenationCompoundWordTokenFilter with no dictionary.
 156    * <p>
 157    * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)
 158    * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
 159    * null, minWordSize, minSubwordSize, maxSubwordSize }
 160    */
 161   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
 162       HyphenationTree hyphenator, int minWordSize, int minSubwordSize,
 163       int maxSubwordSize) {
 164     this(matchVersion, input, hyphenator, (Set<?>) null, minWordSize, minSubwordSize,
 165         maxSubwordSize, false);
 166   }
 167
 168   /**
 169    * Create a HyphenationCompoundWordTokenFilter with no dictionary.
 170    * <p>
 171    * Calls {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, int, int, int)
 172    * HyphenationCompoundWordTokenFilter(matchVersion, input, hyphenator,
 173    * DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE }
 174    */
 175   public HyphenationCompoundWordTokenFilter(Version matchVersion, TokenStream input,
 176       HyphenationTree hyphenator) {
 177     this(matchVersion, input, hyphenator, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE,
 178         DEFAULT_MAX_SUBWORD_SIZE);
 179   }
 180
 181   /**
 182    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 183    *
 184    * @param input the {@link TokenStream} to process
 185    * @param hyphenator the hyphenation pattern tree to use for hyphenation
 186    * @param dictionary the word dictionary to match against
 187    * @param minWordSize only words longer than this get processed
 188    * @param minSubwordSize only subwords longer than this get to the output
 189    *        stream
 190    * @param maxSubwordSize only subwords shorter than this get to the output
 191    *        stream
 192    * @param onlyLongestMatch Add only the longest matching subword to the stream
 193    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[], int, int, int, boolean)} instead.
 194    */
 195   @Deprecated
 196   public HyphenationCompoundWordTokenFilter(TokenStream input,
 197       HyphenationTree hyphenator, String[] dictionary, int minWordSize,
 198       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 199     this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), minWordSize,
 200         minSubwordSize, maxSubwordSize, onlyLongestMatch);
 201   }
 202
 203   /**
 204    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 205    *
 206    * @param input the {@link TokenStream} to process
 207    * @param hyphenator the hyphenation pattern tree to use for hyphenation
 208    * @param dictionary the word dictionary to match against
 209    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, String[])} instead.
 210    */
 211   @Deprecated
 212   public HyphenationCompoundWordTokenFilter(TokenStream input,
 213       HyphenationTree hyphenator, String[] dictionary) {
 214     this(Version.LUCENE_30, input, hyphenator, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE,
 215         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
 216   }
 217
 218   /**
 219    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 220    *
 221    * @param input the {@link TokenStream} to process
 222    * @param hyphenator the hyphenation pattern tree to use for hyphenation
 223    * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
 224    *        lower case strings.
 225    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set)} instead.
 226    */
 227   @Deprecated
 228   public HyphenationCompoundWordTokenFilter(TokenStream input,
 229       HyphenationTree hyphenator, Set<?> dictionary) {
 230     this(Version.LUCENE_30, input, hyphenator, dictionary, DEFAULT_MIN_WORD_SIZE,
 231         DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
 232   }
 233
 234   /**
 235    * Creates a new {@link HyphenationCompoundWordTokenFilter} instance.
 236    *
 237    * @param input the {@link TokenStream} to process
 238    * @param hyphenator the hyphenation pattern tree to use for hyphenation
 239    * @param dictionary the word dictionary to match against. If this is a {@link org.apache.lucene.analysis.CharArraySet CharArraySet} it must have set ignoreCase=false and only contain
 240    *        lower case strings.
 241    * @param minWordSize only words longer than this get processed
 242    * @param minSubwordSize only subwords longer than this get to the output
 243    *        stream
 244    * @param maxSubwordSize only subwords shorter than this get to the output
 245    *        stream
 246    * @param onlyLongestMatch Add only the longest matching subword to the stream
 247    * @deprecated use {@link #HyphenationCompoundWordTokenFilter(Version, TokenStream, HyphenationTree, Set, int, int, int, boolean)} instead.
 248    */
 249   @Deprecated
 250   public HyphenationCompoundWordTokenFilter(TokenStream input,
 251       HyphenationTree hyphenator, Set<?> dictionary, int minWordSize,
 252       int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
 253     super(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize,
 254         onlyLongestMatch);
 255
 256     this.hyphenator = hyphenator;
 257   }
 258
 259   /**
 260    * Create a hyphenator tree
 261    *
 262    * @param hyphenationFilename the filename of the XML grammar to load
 263    * @return An object representing the hyphenation patterns
 264    * @throws Exception
 265    */
 266   public static HyphenationTree getHyphenationTree(String hyphenationFilename)
 267       throws Exception {
 268     return getHyphenationTree(new InputSource(hyphenationFilename));
 269   }
 270
 271   /**
 272    * Create a hyphenator tree
 273    *
 274    * @param hyphenationFile the file of the XML grammar to load
 275    * @return An object representing the hyphenation patterns
 276    * @throws Exception
 277    */
 278   public static HyphenationTree getHyphenationTree(File hyphenationFile)
 279       throws Exception {
 280     return getHyphenationTree(new InputSource(hyphenationFile.toURL().toExternalForm()));
 281   }
 282
 283   /**
 284    * Create a hyphenator tree
 285    *
 286    * @param hyphenationReader the reader of the XML grammar to load from
 287    * @return An object representing the hyphenation patterns
 288    * @throws Exception
 289    * @deprecated Don't use Readers with fixed charset to load XML files, unless programatically created.
 290    * Use {@link #getHyphenationTree(InputSource)} instead, where you can supply default charset and input
 291    * stream, if you like.
 292    */
 293   @Deprecated
 294   public static HyphenationTree getHyphenationTree(Reader hyphenationReader)
 295       throws Exception {
 296     final InputSource is = new InputSource(hyphenationReader);
 297     // we need this to load the DTD in very old parsers (like the one in JDK 1.4).
 298     // The DTD itsself is provided via EntityResolver, so it should always load, but
 299     // some parsers still want to have a base URL (Crimson).
 300     is.setSystemId("urn:java:" + HyphenationTree.class.getName());
 301     return getHyphenationTree(is);
 302   }
 303
 304   /**
 305    * Create a hyphenator tree
 306    *
 307    * @param hyphenationSource the InputSource pointing to the XML grammar
 308    * @return An object representing the hyphenation patterns
 309    * @throws Exception
 310    */
 311   public static HyphenationTree getHyphenationTree(InputSource hyphenationSource)
 312       throws Exception {
 313     HyphenationTree tree = new HyphenationTree();
 314     tree.loadPatterns(hyphenationSource);
 315     return tree;
 316   }
 317
 318   @Override
 319   protected void decomposeInternal(final Token token) {
 320     // get the hyphenation points
 321     Hyphenation hyphens = hyphenator.hyphenate(token.buffer(), 0, token
 322         .length(), 1, 1);
 323     // No hyphen points found -> exit
 324     if (hyphens == null) {
 325       return;
 326     }
 327
 328     final int[] hyp = hyphens.getHyphenationPoints();
 329     char[] lowerCaseTermBuffer=makeLowerCaseCopy(token.buffer());
 330
 331     for (int i = 0; i < hyp.length; ++i) {
 332       int remaining = hyp.length - i;
 333       int start = hyp[i];
 334       Token longestMatchToken = null;
 335       for (int j = 1; j < remaining; j++) {
 336         int partLength = hyp[i + j] - start;
 337
 338         // if the part is longer than maxSubwordSize we
 339         // are done with this round
 340         if (partLength > this.maxSubwordSize) {
 341           break;
 342         }
 343
 344         // we only put subwords to the token stream
 345         // that are longer than minPartSize
 346         if (partLength < this.minSubwordSize) {
 347           continue;
 348         }
 349
 350         // check the dictionary
 351         if (dictionary == null || dictionary.contains(lowerCaseTermBuffer, start, partLength)) {
 352           if (this.onlyLongestMatch) {
 353             if (longestMatchToken != null) {
 354               if (longestMatchToken.length() < partLength) {
 355                 longestMatchToken = createToken(start, partLength, token);
 356               }
 357             } else {
 358               longestMatchToken = createToken(start, partLength, token);
 359             }
 360           } else {
 361             tokens.add(createToken(start, partLength, token));
 362           }
 363         } else if (dictionary.contains(lowerCaseTermBuffer, start,
 364             partLength - 1)) {
 365           // check the dictionary again with a word that is one character
 366           // shorter
 367           // to avoid problems with genitive 's characters and other binding
 368           // characters
 369           if (this.onlyLongestMatch) {
 370             if (longestMatchToken != null) {
 371               if (longestMatchToken.length() < partLength - 1) {
 372                 longestMatchToken = createToken(start, partLength - 1, token);
 373               }
 374             } else {
 375               longestMatchToken = createToken(start, partLength - 1, token);
 376             }
 377           } else {
 378             tokens.add(createToken(start, partLength - 1, token));
 379           }
 380         }
 381       }
 382       if (this.onlyLongestMatch && longestMatchToken!=null) {
 383         tokens.add(longestMatchToken);
 384       }
 385     }
 386   }
 387 }