lucene-java-3.4.0/lucene/contrib/analyzers/common/src/test/org/apache/lucene/analysis/compound/TestCompoundWordTokenFilter.java

   1 package org.apache.lucene.analysis.compound;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.StringReader;
  21 import org.xml.sax.InputSource;
  22
  23 import org.apache.lucene.analysis.BaseTokenStreamTestCase;
  24 import org.apache.lucene.analysis.MockTokenizer;
  25 import org.apache.lucene.analysis.Tokenizer;
  26 import org.apache.lucene.analysis.WhitespaceTokenizer;
  27 import org.apache.lucene.analysis.compound.hyphenation.HyphenationTree;
  28 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  29
  30 public class TestCompoundWordTokenFilter extends BaseTokenStreamTestCase {
  31   public void testHyphenationCompoundWordsDA() throws Exception {
  32     String[] dict = { "læse", "hest" };
  33
  34     InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  35     HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
  36         .getHyphenationTree(is);
  37
  38     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
  39         new MockTokenizer(new StringReader("min veninde som er lidt af en læsehest"), MockTokenizer.WHITESPACE, false),
  40         hyphenator,
  41         dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
  42         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
  43         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
  44     assertTokenStreamContents(tf,
  45         new String[] { "min", "veninde", "som", "er", "lidt", "af", "en", "læsehest", "læse", "hest" },
  46         new int[] { 1, 1, 1, 1, 1, 1, 1, 1, 0, 0 }
  47     );
  48   }
  49
  50   public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
  51     String[] dict = { "basketball", "basket", "ball", "kurv" };
  52
  53     InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  54     HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
  55         .getHyphenationTree(is);
  56
  57     // the word basket will not be added due to the longest match option
  58     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(TEST_VERSION_CURRENT,
  59         new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
  60         hyphenator, dict,
  61         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
  62         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);
  63     assertTokenStreamContents(tf,
  64         new String[] { "basketballkurv", "basketball", "ball", "kurv" },
  65         new int[] { 1, 0, 0, 0 }
  66     );
  67
  68   }
  69
  70   /**
  71    * With hyphenation-only, you can get a lot of nonsense tokens.
  72    * This can be controlled with the min/max subword size.
  73    */
  74   public void testHyphenationOnly() throws Exception {
  75     InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  76     HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
  77         .getHyphenationTree(is);
  78
  79     HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
  80         TEST_VERSION_CURRENT,
  81         new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
  82         hyphenator,
  83         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
  84         2, 4);
  85
  86     // min=2, max=4
  87     assertTokenStreamContents(tf,
  88         new String[] { "basketballkurv", "ba", "sket", "bal", "ball", "kurv" }
  89     );
  90
  91     tf = new HyphenationCompoundWordTokenFilter(
  92         TEST_VERSION_CURRENT,
  93         new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
  94         hyphenator,
  95         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
  96         4, 6);
  97
  98     // min=4, max=6
  99     assertTokenStreamContents(tf,
 100         new String[] { "basketballkurv", "basket", "sket", "ball", "lkurv", "kurv" }
 101     );
 102
 103     tf = new HyphenationCompoundWordTokenFilter(
 104         TEST_VERSION_CURRENT,
 105         new MockTokenizer(new StringReader("basketballkurv"), MockTokenizer.WHITESPACE, false),
 106         hyphenator,
 107         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
 108         4, 10);
 109
 110     // min=4, max=10
 111     assertTokenStreamContents(tf,
 112         new String[] { "basketballkurv", "basket", "basketbal", "basketball", "sket",
 113                        "sketbal", "sketball", "ball", "ballkurv", "lkurv", "kurv" }
 114     );
 115
 116   }
 117
 118   public void testDumbCompoundWordsSE() throws Exception {
 119     String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
 120         "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
 121         "Sko", "Vind", "Rute", "Torkare", "Blad" };
 122
 123     DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
 124         new MockTokenizer(
 125             new StringReader(
 126                 "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
 127             MockTokenizer.WHITESPACE, false),
 128         dict);
 129
 130     assertTokenStreamContents(tf, new String[] { "Bildörr", "Bil", "dörr", "Bilmotor",
 131         "Bil", "motor", "Biltak", "Bil", "tak", "Slagborr", "Slag", "borr",
 132         "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr",
 133         "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas",
 134         "fiol", "fodral", "Basfiolsfodralmakaregesäll", "Bas", "fiol",
 135         "fodral", "makare", "gesäll", "Skomakare", "Sko", "makare",
 136         "Vindrutetorkare", "Vind", "rute", "torkare", "Vindrutetorkarblad",
 137         "Vind", "rute", "blad", "abba" }, new int[] { 0, 0, 3, 8, 8, 11, 17,
 138         17, 20, 24, 24, 28, 33, 33, 39, 44, 44, 49, 54, 54, 58, 62, 69, 69, 72,
 139         77, 84, 84, 87, 92, 98, 104, 111, 111, 114, 121, 121, 125, 129, 137,
 140         137, 141, 151, 156 }, new int[] { 7, 3, 7, 16, 11, 16, 23, 20, 23, 32,
 141         28, 32, 43, 39, 43, 53, 49, 53, 68, 58, 62, 68, 83, 72, 76, 83, 110,
 142         87, 91, 98, 104, 110, 120, 114, 120, 136, 125, 129, 136, 155, 141, 145,
 143         155, 160 }, new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1,
 144         0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0, 0, 1,
 145         0, 0, 0, 1 });
 146   }
 147
 148   public void testDumbCompoundWordsSELongestMatch() throws Exception {
 149     String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
 150         "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiols", "Makare", "Gesäll",
 151         "Sko", "Vind", "Rute", "Torkare", "Blad", "Fiolsfodral" };
 152
 153     DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
 154         new MockTokenizer(new StringReader("Basfiolsfodralmakaregesäll"), MockTokenizer.WHITESPACE, false),
 155         dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
 156         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
 157         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, true);
 158
 159     assertTokenStreamContents(tf, new String[] { "Basfiolsfodralmakaregesäll", "Bas",
 160         "fiolsfodral", "fodral", "makare", "gesäll" }, new int[] { 0, 0, 3, 8,
 161         14, 20 }, new int[] { 26, 3, 14, 14, 20, 26 }, new int[] { 1, 0, 0, 0,
 162         0, 0 });
 163   }
 164
 165   public void testReset() throws Exception {
 166     String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
 167         "Aufgabe", "Überwachung" };
 168
 169     Tokenizer wsTokenizer = new WhitespaceTokenizer(TEST_VERSION_CURRENT, new StringReader(
 170         "Rindfleischüberwachungsgesetz"));
 171     DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(TEST_VERSION_CURRENT,
 172         wsTokenizer, dict,
 173         CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
 174         CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
 175         CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
 176
 177     CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
 178     assertTrue(tf.incrementToken());
 179     assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
 180     assertTrue(tf.incrementToken());
 181     assertEquals("Rind", termAtt.toString());
 182     wsTokenizer.reset(new StringReader("Rindfleischüberwachungsgesetz"));
 183     tf.reset();
 184     assertTrue(tf.incrementToken());
 185     assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
 186   }
 187
 188 }