package org.apache.lucene.analysis.compound;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.IOException;
import java.util.Arrays;
import java.util.Collection;
import java.util.LinkedList;
import java.util.Locale;
import java.util.Set;

import org.apache.lucene.analysis.CharArraySet;
import org.apache.lucene.analysis.Token;
import org.apache.lucene.analysis.TokenFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
import org.apache.lucene.util.Version;
/**
 * Base class for decomposition token filters. <a name="version"/>
 * <p>
 * You must specify the required {@link Version} compatibility when creating
 * CompoundWordTokenFilterBase:
 * <ul>
 *   <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
 *       supplementary characters in strings and char arrays provided as compound word
 *       dictionaries.</li>
 * </ul>
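 * <p>
 * A rough usage sketch (assumes the DictionaryCompoundWordTokenFilter subclass from this
 * package wrapped around a whitespace tokenizer; the dictionary entries are illustrative only):
 * <pre>
 *   TokenStream source = new WhitespaceTokenizer(matchVersion, reader);
 *   TokenStream stream = new DictionaryCompoundWordTokenFilter(matchVersion, source,
 *       new String[] { "Donau", "Dampf", "Schiff" });
 * </pre>
 */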
public abstract class CompoundWordTokenFilterBase extends TokenFilter {
  /**
   * The default for minimal word length that gets decomposed
   */
  public static final int DEFAULT_MIN_WORD_SIZE = 5;

  /**
   * The default for minimal length of subwords that get propagated to the output of this filter
   */
  public static final int DEFAULT_MIN_SUBWORD_SIZE = 2;

  /**
   * The default for maximal length of subwords that get propagated to the output of this filter
   */
  public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;

  protected final CharArraySet dictionary;
  protected final LinkedList<Token> tokens;
  protected final int minWordSize;
  protected final int minSubwordSize;
  protected final int maxSubwordSize;
  protected final boolean onlyLongestMatch;

  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);

  private final Token wrapper = new Token();
  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], int, int, int, boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, makeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[])} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
    this(Version.LUCENE_30, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary) {
    this(Version.LUCENE_30, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  /**
   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, int, int, int, boolean)} instead
   */
  @Deprecated
  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    this(matchVersion, input, makeDictionary(dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
    this(matchVersion, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
    this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
    this(matchVersion, input, makeDictionary(dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }

  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
    this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
  }
  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
    super(input);

    this.tokens = new LinkedList<Token>();
    this.minWordSize = minWordSize;
    this.minSubwordSize = minSubwordSize;
    this.maxSubwordSize = maxSubwordSize;
    this.onlyLongestMatch = onlyLongestMatch;

    if (dictionary == null || dictionary instanceof CharArraySet) {
      // a CharArraySet (or null) is used as-is
      this.dictionary = (CharArraySet) dictionary;
    } else {
      // otherwise copy the entries into a lowercased CharArraySet
      this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
      addAllLowerCase(this.dictionary, dictionary);
    }
  }
  /**
   * Create a set of words from an array.
   * The resulting Set does case-insensitive matching.
   * TODO We should look for a faster dictionary lookup approach.
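   *
   * A short usage sketch (the dictionary entries are made up for illustration):
   * <pre>
   *   Set&lt;?&gt; dict = CompoundWordTokenFilterBase.makeDictionary(
   *       new String[] { "Donau", "Dampf", "Schiff" });
   * </pre>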
   * @return {@link Set} of lowercased terms
   */
  public static final Set<?> makeDictionary(final String[] dictionary) {
    return makeDictionary(Version.LUCENE_30, dictionary);
  }
  public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
    if (dictionary == null) {
      return null;
    }
    // is the below really case insensitive?
    CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
    addAllLowerCase(dict, Arrays.asList(dictionary));
    return dict;
  }
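  /** Copies the state of a buffered {@link Token} back into this stream's attributes. */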
  private final void setToken(final Token token) throws IOException {
    clearAttributes();
    termAtt.copyBuffer(token.buffer(), 0, token.length());
    flagsAtt.setFlags(token.getFlags());
    typeAtt.setType(token.type());
    offsetAtt.setOffset(token.startOffset(), token.endOffset());
    posIncAtt.setPositionIncrement(token.getPositionIncrement());
    payloadAtt.setPayload(token.getPayload());
  }
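  /*
   * Token flow in incrementToken(): subword tokens queued by decompose() are returned
   * first; only when the queue is empty is the next token pulled from the input, copied
   * into the reusable wrapper, and decomposed, refilling the queue for subsequent calls.
   */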
  @Override
  public final boolean incrementToken() throws IOException {
    if (tokens.size() > 0) {
      setToken(tokens.removeFirst());
      return true;
    }

    if (!input.incrementToken())
      return false;

    wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
    wrapper.setStartOffset(offsetAtt.startOffset());
    wrapper.setEndOffset(offsetAtt.endOffset());
    wrapper.setFlags(flagsAtt.getFlags());
    wrapper.setType(typeAtt.type());
    wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
    wrapper.setPayload(payloadAtt.getPayload());

    decompose(wrapper);

    if (tokens.size() > 0) {
      setToken(tokens.removeFirst());
      return true;
    } else {
      return false;
    }
  }
  protected static final void addAllLowerCase(CharArraySet target, Collection<?> col) {
    for (Object obj : col) {
      String string = (String) obj;
      target.add(string.toLowerCase(Locale.ENGLISH));
    }
  }
  protected static char[] makeLowerCaseCopy(final char[] buffer) {
    char[] result = new char[buffer.length];
    System.arraycopy(buffer, 0, result, 0, buffer.length);

    for (int i = 0; i < buffer.length; ++i) {
      result[i] = Character.toLowerCase(buffer[i]);
    }

    return result;
  }
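  /**
   * Creates a subword token covering {@code length} characters of the prototype's buffer,
   * starting at {@code offset}. Start/end offsets are shifted relative to the prototype and
   * the position increment is set to 0, so subwords stack at the original token's position.
   */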
  protected final Token createToken(final int offset, final int length,
      final Token prototype) {
    int newStart = prototype.startOffset() + offset;
    Token t = prototype.clone(prototype.buffer(), offset, length, newStart, newStart + length);
    t.setPositionIncrement(0);
    return t;
  }
  protected void decompose(final Token token) {
    // In any case we give the original token back
    tokens.add((Token) token.clone());

    // Only words longer than minWordSize get processed
    if (token.length() < this.minWordSize) {
      return;
    }

    decomposeInternal(token);
  }
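  /**
   * Decomposes the given token into subwords; implementations are expected to add the
   * subword {@link Token}s they find (typically built with {@link #createToken}) to
   * {@link #tokens}. Only called for tokens of at least {@code minWordSize} characters.
   */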
  protected abstract void decomposeInternal(final Token token);
  @Override
  public void reset() throws IOException {
    super.reset();
    tokens.clear();
  }
}