+++ /dev/null
-package org.apache.lucene.analysis.compound;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.IOException;
-import java.util.Arrays;
-import java.util.Collection;
-import java.util.LinkedList;
-import java.util.Locale;
-import java.util.Set;
-
-import org.apache.lucene.analysis.CharArraySet;
-import org.apache.lucene.analysis.Token;
-import org.apache.lucene.analysis.TokenFilter;
-import org.apache.lucene.analysis.TokenStream;
-import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.PayloadAttribute;
-import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.Version;
-
-/**
- * Base class for decomposition token filters. <a name="version"/>
- * <p>
- * You must specify the required {@link Version} compatibility when creating
- * CompoundWordTokenFilterBase:
- * <ul>
- * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
- * supplementary characters in strings and char arrays provided as compound word
- * dictionaries.
- * </ul>
- */
-public abstract class CompoundWordTokenFilterBase extends TokenFilter {
- /**
- * The default for minimal word length that gets decomposed
- */
- public static final int DEFAULT_MIN_WORD_SIZE = 5;
-
- /**
- * The default for minimal length of subwords that get propagated to the output of this filter
- */
- public static final int DEFAULT_MIN_SUBWORD_SIZE = 2;
-
- /**
- * The default for maximal length of subwords that get propagated to the output of this filter
- */
- public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
-
- protected final CharArraySet dictionary;
- protected final LinkedList<Token> tokens;
- protected final int minWordSize;
- protected final int minSubwordSize;
- protected final int maxSubwordSize;
- protected final boolean onlyLongestMatch;
-
- private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
- private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
- private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
- private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
- private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
- private final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
-
- private final Token wrapper = new Token();
- /**
- * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], int, int, int, boolean)} instead
- */
- @Deprecated
- protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
- this(Version.LUCENE_30, input, makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
- }
-
- /**
- * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], boolean)} instead
- */
- @Deprecated
- protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
- this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
- }
-
- /**
- * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, boolean)} instead
- */
- @Deprecated
- protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
- this(Version.LUCENE_30, input, dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
- }
-
- /**
- * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[])} instead
- */
- @Deprecated
- protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
- this(Version.LUCENE_30, input, makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
- }
-
- /**
- * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set)} instead
- */
- @Deprecated
- protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary) {
- this(Version.LUCENE_30, input, dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
- }
-
- /**
- * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, int, int, int, boolean)} instead
- */
- @Deprecated
- protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
- this(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
- }
-
- protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
- this(matchVersion, input,makeDictionary(dictionary),minWordSize,minSubwordSize,maxSubwordSize, onlyLongestMatch);
- }
-
- protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
- this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
- }
-
- protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
- this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
- }
-
- protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
- this(matchVersion, input,makeDictionary(dictionary),DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
- }
-
- protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
- this(matchVersion, input,dictionary,DEFAULT_MIN_WORD_SIZE,DEFAULT_MIN_SUBWORD_SIZE,DEFAULT_MAX_SUBWORD_SIZE, false);
- }
-
- protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
- super(input);
-
- this.tokens=new LinkedList<Token>();
- this.minWordSize=minWordSize;
- this.minSubwordSize=minSubwordSize;
- this.maxSubwordSize=maxSubwordSize;
- this.onlyLongestMatch=onlyLongestMatch;
-
- if (dictionary==null || dictionary instanceof CharArraySet) {
- this.dictionary = (CharArraySet) dictionary;
- } else {
- this.dictionary = new CharArraySet(matchVersion, dictionary.size(), false);
- addAllLowerCase(this.dictionary, dictionary);
- }
- }
-
- /**
- * Create a set of words from an array
- * The resulting Set does case insensitive matching
- * TODO We should look for a faster dictionary lookup approach.
- * @param dictionary
- * @return {@link Set} of lowercased terms
- */
- public static final Set<?> makeDictionary(final String[] dictionary) {
- return makeDictionary(Version.LUCENE_30, dictionary);
- }
-
- public static final Set<?> makeDictionary(final Version matchVersion, final String[] dictionary) {
- if (dictionary == null) {
- return null;
- }
- // is the below really case insensitive?
- CharArraySet dict = new CharArraySet(matchVersion, dictionary.length, false);
- addAllLowerCase(dict, Arrays.asList(dictionary));
- return dict;
- }
-
- private final void setToken(final Token token) throws IOException {
- clearAttributes();
- termAtt.copyBuffer(token.buffer(), 0, token.length());
- flagsAtt.setFlags(token.getFlags());
- typeAtt.setType(token.type());
- offsetAtt.setOffset(token.startOffset(), token.endOffset());
- posIncAtt.setPositionIncrement(token.getPositionIncrement());
- payloadAtt.setPayload(token.getPayload());
- }
-
- @Override
- public final boolean incrementToken() throws IOException {
- if (tokens.size() > 0) {
- setToken(tokens.removeFirst());
- return true;
- }
-
- if (!input.incrementToken())
- return false;
-
- wrapper.copyBuffer(termAtt.buffer(), 0, termAtt.length());
- wrapper.setStartOffset(offsetAtt.startOffset());
- wrapper.setEndOffset(offsetAtt.endOffset());
- wrapper.setFlags(flagsAtt.getFlags());
- wrapper.setType(typeAtt.type());
- wrapper.setPositionIncrement(posIncAtt.getPositionIncrement());
- wrapper.setPayload(payloadAtt.getPayload());
-
- decompose(wrapper);
-
- if (tokens.size() > 0) {
- setToken(tokens.removeFirst());
- return true;
- } else {
- return false;
- }
- }
-
- protected static final void addAllLowerCase(CharArraySet target, Collection<?> col) {
- for (Object obj : col) {
- String string = (String) obj;
- target.add(string.toLowerCase(Locale.ENGLISH));
- }
- }
-
- protected static char[] makeLowerCaseCopy(final char[] buffer) {
- char[] result=new char[buffer.length];
- System.arraycopy(buffer, 0, result, 0, buffer.length);
-
- for (int i=0;i<buffer.length;++i) {
- result[i]=Character.toLowerCase(buffer[i]);
- }
-
- return result;
- }
-
- protected final Token createToken(final int offset, final int length,
- final Token prototype) {
- int newStart = prototype.startOffset() + offset;
- Token t = prototype.clone(prototype.buffer(), offset, length, newStart, newStart+length);
- t.setPositionIncrement(0);
- return t;
- }
-
- protected void decompose(final Token token) {
- // In any case we give the original token back
- tokens.add((Token) token.clone());
-
- // Only words longer than minWordSize get processed
- if (token.length() < this.minWordSize) {
- return;
- }
-
- decomposeInternal(token);
- }
-
- protected abstract void decomposeInternal(final Token token);
-
- @Override
- public void reset() throws IOException {
- super.reset();
- tokens.clear();
- }
-}