package org.apache.lucene.analysis.compound;

/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.IOException;
21 import java.io.StringReader;
23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
24 import org.apache.lucene.analysis.MockTokenizer;
25 import org.apache.lucene.analysis.TokenFilter;
26 import org.apache.lucene.analysis.TokenStream;
27 import org.apache.lucene.analysis.Tokenizer;
28 import org.apache.lucene.analysis.WhitespaceTokenizer;
29 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
30 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
31 import org.apache.lucene.util.Attribute;
32 import org.apache.lucene.util.AttributeImpl;
33 import org.xml.sax.InputSource;
35 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
36 public void testHyphenationCompoundWordsDA() throws Exception {
37 String[] dict = { "læse", "hest" };
39 InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
40 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
41 .getHyphenationTree(is);
43 HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
44 new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false),
46 dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
47 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
48 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
49 assertTokenStreamContents(tf,
50 new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
51 new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
55 public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
56 String[] dict = { "basketball", "basket", "ball", "kurv" };
58 InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
59 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
60 .getHyphenationTree(is);
62 // the word basket will not be added due to the longest match option
63 HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
64 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
66 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
67 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
68 assertTokenStreamContents(tf,
69 new String[] { "basketballkurv", "basketball", "ball", "kurv" },
70 new int[] { 1, 0, 0, 0 }
76 * With hyphenation-only, you can get a lot of nonsense tokens.
77 * This can be controlled with the min/max subword size.
79 public void testHyphenationOnly() throws Exception {
80 InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
81 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
82 .getHyphenationTree(is);
84 HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
86 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
88 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
92 assertTokenStreamContents(tf,
93 new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
96 tf = new HyphenationCompoundWordTokenFilter(
98 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
100 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
104 assertTokenStreamContents(tf,
105 new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" }
108 tf = new HyphenationCompoundWordTokenFilter(
109 TEST_VERSION_CURRENT,
110 new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
112 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
116 assertTokenStreamContents(tf,
117 new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket",
118 "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" }
123 public void testDumbCompoundWordsSE() throws Exception {
124 String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
125 "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
126 "Sko", "Vind", "Rute", "Torkare", "Blad" };
128 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
131 "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
132 MockTokenizer.WHITESPACE, false),
135 assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
136 "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr",
137 "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr",
138 "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas",
139 "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol",
140 "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare",
141 "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad",
142 "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 3, 8, 8, 11, 17,
143 17, 20, 24, 24, 28, 33, 33, 39, 44, 44, 49, 54, 54, 58, 62, 69, 69, 72,
144 77, 84, 84, 87, 92, 98, 104, 111, 111, 114, 121, 121, 125, 129, 137,
145 137, 141, 151, 156 }, new int[] { 7, 3, 7, 16, 11, 16, 23, 20, 23, 32,
146 28, 32, 43, 39, 43, 53, 49, 53, 68, 58, 62, 68, 83, 72, 76, 83, 110,
147 87, 91, 98, 104, 110, 120, 114, 120, 136, 125, 129, 136, 155, 141, 145,
148 155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
149 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
153 public void testDumbCompoundWordsSELongestMatch() throws Exception {
154 String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
155 "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
156 "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };
158 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
159 new MockTokenizer(new StringReader("Basfiolsfodralmakaregesäll"), MockTokenizer.WHITESPACE, false),
160 dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
161 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
162 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
164 assertTokenStreamContents(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas",
165 "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8,
166 14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
170 public void testTokenEndingWithWordComponentOfMinimumLength() throws Exception {
171 String[] dict = {"ab", "cd", "ef"};
173 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
174 new WhitespaceTokenizer(TEST_VERSION_CURRENT,
179 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
180 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
181 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
183 assertTokenStreamContents(tf,
184 new String[] { "abcdef", "ab", "cd", "ef" },
185 new int[] { 0, 0, 2, 4},
186 new int[] { 6, 2, 4, 6},
187 new int[] { 1, 0, 0, 0}
191 public void testWordComponentWithLessThanMinimumLength() throws Exception {
192 String[] dict = {"abc", "d", "efg"};
194 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
195 new WhitespaceTokenizer(TEST_VERSION_CURRENT,
200 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
201 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
202 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
204 // since "d" is shorter than the minimum subword size, it should not be added to the token stream
205 assertTokenStreamContents(tf,
206 new String[] { "abcdefg", "abc", "efg" },
207 new int[] { 0, 0, 4},
208 new int[] { 7, 3, 7},
213 public void testReset() throws Exception {
214 String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
215 "Aufgabe", "Überwachung" };
217 Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
218 "Rindfleischüberwachungsgesetz"));
219 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
221 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
222 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
223 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
225 CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
226 assertTrue(tf.incrementToken());
227 assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
228 assertTrue(tf.incrementToken());
229 assertEquals("Rind", termAtt.toString());
230 wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
232 assertTrue(tf.incrementToken());
233 assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
236 public void testRetainMockAttribute() throws Exception {
237 String[] dict = { "abc", "d", "efg" };
238 Tokenizer tokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT,
239 new StringReader("abcdefg"));
240 TokenStream stream = new MockRetainAttributeFilter(tokenizer);
241 stream = new DictionaryCompoundWordTokenFilter(
242 TEST_VERSION_CURRENT, stream, dict,
243 CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
244 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
245 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
246 MockRetainAttribute retAtt = stream.addAttribute(MockRetainAttribute.class);
247 while (stream.incrementToken()) {
248 assertTrue("Custom attribute value was lost", retAtt.getRetain());
253 public static interface MockRetainAttribute extends Attribute {
254 void setRetain(boolean attr);
258 public static final class MockRetainAttributeImpl extends AttributeImpl implements MockRetainAttribute {
259 private boolean retain = false;
261 public void clear() {
264 public boolean getRetain() {
267 public void setRetain(boolean retain) {
268 this.retain = retain;
271 public void copyTo(AttributeImpl target) {
272 MockRetainAttribute t = (MockRetainAttribute) target;
277 private static class MockRetainAttributeFilter extends TokenFilter {
279 MockRetainAttribute retainAtt = addAttribute(MockRetainAttribute.class);
281 MockRetainAttributeFilter(TokenStream input) {
286 public boolean incrementToken() throws IOException {
287 if (input.incrementToken()){
288 retainAtt.setRetain(true);