1 <!doctype html public "-//w3c//dtd html 4.0 transitional//en">
3 Licensed to the Apache Software Foundation (ASF) under one or more
4 contributor license agreements. See the NOTICE file distributed with
5 this work for additional information regarding copyright ownership.
6 The ASF licenses this file to You under the Apache License, Version 2.0
7 (the "License"); you may not use this file except in compliance with
8 the License. You may obtain a copy of the License at
10 http://www.apache.org/licenses/LICENSE-2.0
12 Unless required by applicable law or agreed to in writing, software
13 distributed under the License is distributed on an "AS IS" BASIS,
14 WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 See the License for the specific language governing permissions and
16 limitations under the License.
20 <title>CompoundWordTokenFilter</title>
21 <meta http-equiv="Content-Type" content="text/html; charset=iso-8859-1"></meta>
24 A filter that decomposes compound words you find in many Germanic
25 languages into the word parts. This example shows what it does:
28 <th>Input token stream</th>
31 <td>Rindfleischüberwachungsgesetz Drahtschere abba</td>
37 <th>Output token stream</th>
40 <td>(Rindfleischüberwachungsgesetz,0,29)</td>
43 <td>(Rind,0,4,posIncr=0)</td>
46 <td>(fleisch,4,11,posIncr=0)</td>
49 <td>(überwachung,11,22,posIncr=0)</td>
52 <td>(gesetz,23,29,posIncr=0)</td>
55 <td>(Drahtschere,30,41)</td>
58 <td>(Draht,30,35,posIncr=0)</td>
61 <td>(schere,35,41,posIncr=0)</td>
68 The input token is always preserved and the filters do not alter the case of word parts. There are two variants of the
71 <li><i>HyphenationCompoundWordTokenFilter</i>: it uses a
72 hyphenation grammar based approach to find potential word parts of a
74 <li><i>DictionaryCompoundWordTokenFilter</i>: it uses a
75 brute-force dictionary-only based approach to find the word parts of a given
79 <h3>Compound word token filters</h3>
80 <h4>HyphenationCompoundWordTokenFilter</h4>
82 org.apache.lucene.analysis.compound.HyphenationCompoundWordTokenFilter
83 HyphenationCompoundWordTokenFilter} uses hyphenation grammars to find
84 potential subwords that are worth checking against the dictionary. It can be used
85 without a dictionary as well but then produces a lot of "nonword" tokens.
86 The quality of the output tokens is directly connected to the quality of the
87 grammar file you use. For languages like German they are quite good.
89 Unfortunately we cannot bundle the hyphenation grammar files with Lucene
90 because they do not use an ASF-compatible license (they use the LaTeX
91 Project Public License instead). You can find the XML based grammar
93 <a href="http://offo.sourceforge.net/hyphenation/index.html">Objects
94 For Formatting Objects</a>
95 (OFFO) Sourceforge project (direct link to download the pattern files:
96 <a href="http://downloads.sourceforge.net/offo/offo-hyphenation.zip">http://downloads.sourceforge.net/offo/offo-hyphenation.zip</a>
97 ). The files you need are in the subfolder
98 <i>offo-hyphenation/hyph/</i>
101 Credits for the hyphenation code go to the
102 <a href="http://xmlgraphics.apache.org/fop/">Apache FOP project</a>
105 <h4>DictionaryCompoundWordTokenFilter</h4>
107 org.apache.lucene.analysis.compound.DictionaryCompoundWordTokenFilter
108 DictionaryCompoundWordTokenFilter} uses a dictionary-only approach to
109 find subwords in a compound word. It is much slower than the one that
110 uses the hyphenation grammars. You can use it as a first start to
111 see if your dictionary is good or not because it is much simpler in design.
114 The output quality of both token filters is directly connected to the
115 quality of the dictionary you use. They are language dependent of course.
116 You should always use a dictionary
117 that fits the text you want to index. If you index medical text for
118 example then you should use a dictionary that contains medical words.
119 Good starting points for general text are the dictionaries you find at the
120 <a href="http://wiki.services.openoffice.org/wiki/Dictionaries">OpenOffice
124 <h3>Which variant should I use?</h3>
125 This decision matrix should help you:
128 <th>Token filter</th>
129 <th>Output quality</th>
133 <td>HyphenationCompoundWordTokenFilter</td>
134 <td>good if grammar file is good – acceptable otherwise</td>
138 <td>DictionaryCompoundWordTokenFilter</td>
145 public void testHyphenationCompoundWordsDE() throws Exception {
146 String[] dict = { "Rind", "Fleisch", "Draht", "Schere", "Gesetz",
147 "Aufgabe", "Überwachung" };
149 Reader reader = new FileReader("de_DR.xml");
151 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
152 .getHyphenationTree(reader);
154 HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
155 new WhitespaceTokenizer(new StringReader(
156 "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator,
157 dict, CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
158 CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
159 CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);
161 CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
162 while (tf.incrementToken()) {
163 System.out.println(t);
167 public void testHyphenationCompoundWordsWithoutDictionaryDE() throws Exception {
168 Reader reader = new FileReader("de_DR.xml");
170 HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
171 .getHyphenationTree(reader);
173 HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
174 new WhitespaceTokenizer(new StringReader(
175 "Rindfleischüberwachungsgesetz Drahtschere abba")), hyphenator);
177 CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
178 while (tf.incrementToken()) {
179 System.out.println(t);
183 public void testDumbCompoundWordsSE() throws Exception {
184 String[] dict = { "Bil", "Dörr", "Motor", "Tak", "Borr", "Slag", "Hammar",
185 "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
186 "Sko", "Vind", "Rute", "Torkare", "Blad" };
188 DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
189 new WhitespaceTokenizer(
191 "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba")),
193 CharTermAttribute t = tf.addAttribute(CharTermAttribute.class);
194 while (tf.incrementToken()) {
195 System.out.println(t);