X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java?ds=sidebyside

diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
new file mode 100644
index 0000000..fd1a65a
--- /dev/null
+++ b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/compound/CompoundWordTokenFilterBase.java
@@ -0,0 +1,236 @@
+package org.apache.lucene.analysis.compound;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.IOException;
+import java.util.Arrays;
+import java.util.Collection;
+import java.util.LinkedList;
+import java.util.Locale;
+import java.util.Set;
+
+import org.apache.lucene.analysis.CharArraySet;
+import org.apache.lucene.analysis.TokenFilter;
+import org.apache.lucene.analysis.TokenStream;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
+import org.apache.lucene.util.AttributeSource;
+import org.apache.lucene.util.Version;
+
+/**
+ * Base class for decomposition token filters.
+ * <p>
+ * You must specify the required {@link Version} compatibility when creating
+ * CompoundWordTokenFilterBase:
+ * <ul>
+ * <li>As of 3.1, CompoundWordTokenFilterBase correctly handles Unicode 4.0
+ * supplementary characters in strings and char arrays provided as compound word
+ * dictionaries.
+ * </ul>
+ * <p>If you pass in a {@link org.apache.lucene.analysis.CharArraySet} as dictionary,
+ * it should be case-insensitive, unless it contains only lowercased entries and you
+ * have a {@link org.apache.lucene.analysis.LowerCaseFilter} before this filter in your analysis chain.
+ * For optimal performance (this filter does lots of lookups in the dictionary),
+ * you should use the latter analysis chain/CharArraySet. Be aware: If you supply arbitrary
+ * {@link Set Sets} to the ctors or {@code String[]} dictionaries, they will be automatically
+ * transformed to case-insensitive!
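+ * <p>
+ * A typical chain, sketched here with {@code DictionaryCompoundWordTokenFilter}
+ * (one concrete subclass in this package); the {@code reader} variable and the
+ * sample dictionary words are illustrative placeholders:
+ * <pre>
+ *   Set&lt;String&gt; dict = new HashSet&lt;String&gt;(
+ *       Arrays.asList("donau", "dampf", "schiff"));
+ *   TokenStream stream = new WhitespaceTokenizer(Version.LUCENE_35, reader);
+ *   stream = new LowerCaseFilter(Version.LUCENE_35, stream);
+ *   stream = new DictionaryCompoundWordTokenFilter(Version.LUCENE_35, stream, dict);
+ * </pre>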
+ */
+public abstract class CompoundWordTokenFilterBase extends TokenFilter {
+  /**
+   * The default for minimal word length that gets decomposed
+   */
+  public static final int DEFAULT_MIN_WORD_SIZE = 5;
+
+  /**
+   * The default for minimal length of subwords that get propagated to the output of this filter
+   */
+  public static final int DEFAULT_MIN_SUBWORD_SIZE = 2;
+
+  /**
+   * The default for maximal length of subwords that get propagated to the output of this filter
+   */
+  public static final int DEFAULT_MAX_SUBWORD_SIZE = 15;
+
+  protected final CharArraySet dictionary;
+  protected final LinkedList<CompoundToken> tokens;
+  protected final int minWordSize;
+  protected final int minSubwordSize;
+  protected final int maxSubwordSize;
+  protected final boolean onlyLongestMatch;
+
+  protected final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+  protected final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+  private final PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);
+
+  private AttributeSource.State current;
+
+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], int, int, int, boolean)} instead
+   */
+  @Deprecated
+  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+  }
+
+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[], boolean)} instead
+   */
+  @Deprecated
+  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
+    this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+  }
+
+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, boolean)} instead
+   */
+  @Deprecated
+  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
+    this(Version.LUCENE_30, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+  }
+
+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, String[])} instead
+   */
+  @Deprecated
+  protected CompoundWordTokenFilterBase(TokenStream input, String[] dictionary) {
+    this(Version.LUCENE_30, input, makeDictionary(Version.LUCENE_30, dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
+  }
+
+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set)} instead
+   */
+  @Deprecated
+  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary) {
+    this(Version.LUCENE_30, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
+  }
+
+  /**
+   * @deprecated use {@link #CompoundWordTokenFilterBase(Version, TokenStream, Set, int, int, int, boolean)} instead
+   */
+  @Deprecated
+  protected CompoundWordTokenFilterBase(TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    this(Version.LUCENE_30, input, dictionary, minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+  }
+
+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    this(matchVersion, input, makeDictionary(matchVersion, dictionary), minWordSize, minSubwordSize, maxSubwordSize, onlyLongestMatch);
+  }
+
+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary, boolean onlyLongestMatch) {
+    this(matchVersion, input, makeDictionary(matchVersion, dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+  }
+
+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, boolean onlyLongestMatch) {
+    this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, onlyLongestMatch);
+  }
+
+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, String[] dictionary) {
+    this(matchVersion, input, makeDictionary(matchVersion, dictionary), DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
+  }
+
+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary) {
+    this(matchVersion, input, dictionary, DEFAULT_MIN_WORD_SIZE, DEFAULT_MIN_SUBWORD_SIZE, DEFAULT_MAX_SUBWORD_SIZE, false);
+  }
+
+  protected CompoundWordTokenFilterBase(Version matchVersion, TokenStream input, Set<?> dictionary, int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
+    super(input);
+
+    this.tokens = new LinkedList<CompoundToken>();
+    this.minWordSize = minWordSize;
+    this.minSubwordSize = minSubwordSize;
+    this.maxSubwordSize = maxSubwordSize;
+    this.onlyLongestMatch = onlyLongestMatch;
+
+    if (dictionary == null || dictionary instanceof CharArraySet) {
+      this.dictionary = (CharArraySet) dictionary;
+    } else {
+      this.dictionary = new CharArraySet(matchVersion, dictionary, true);
+    }
+  }
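+  // Illustrative note (the word list is a placeholder): the branch above means
+  // a plain java.util.Set handed to any constructor ends up wrapped
+  // case-insensitively, equivalent to
+  //
+  //   Set<String> words = new HashSet<String>(Arrays.asList("kaffe", "fin"));
+  //   CharArraySet dict = new CharArraySet(matchVersion, words, true); // ignoreCase == true
+  //
+  // while a CharArraySet is used as-is, keeping whatever case sensitivity it
+  // was built with.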
+
+  /** @deprecated Only available for backwards compatibility. */
+  @Deprecated
+  public static CharArraySet makeDictionary(final Version matchVersion, final String[] dictionary) {
+    if (dictionary == null) {
+      return null;
+    }
+    return new CharArraySet(matchVersion, Arrays.asList(dictionary), true);
+  }
+
+  @Override
+  public final boolean incrementToken() throws IOException {
+    if (!tokens.isEmpty()) {
+      assert current != null;
+      CompoundToken token = tokens.removeFirst();
+      restoreState(current); // keep all other attributes untouched
+      termAtt.setEmpty().append(token.txt);
+      offsetAtt.setOffset(token.startOffset, token.endOffset);
+      posIncAtt.setPositionIncrement(0);
+      return true;
+    }
+
+    current = null; // not really needed, but for safety
+    if (input.incrementToken()) {
+      // Only words at least minWordSize long get processed
+      if (termAtt.length() >= this.minWordSize) {
+        decompose();
+        // only capture the state if we really need it for producing new tokens
+        if (!tokens.isEmpty()) {
+          current = captureState();
+        }
+      }
+      // return original token:
+      return true;
+    } else {
+      return false;
+    }
+  }
+
+  /** Decomposes the current {@link #termAtt} and places {@link CompoundToken} instances in the {@link #tokens} list.
+   * The original token may not be placed in the list, as it is automatically passed through this filter.
+   */
+  protected abstract void decompose();
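+  // A minimal sketch of what a concrete decompose() can look like: a greedy
+  // dictionary scan in the spirit of DictionaryCompoundWordTokenFilter (the
+  // exact loop bounds here are illustrative, not normative):
+  //
+  //   protected void decompose() {
+  //     final int len = termAtt.length();
+  //     for (int i = 0; i <= len - this.minSubwordSize; i++) {
+  //       for (int j = this.minSubwordSize; j <= this.maxSubwordSize && i + j <= len; j++) {
+  //         if (dictionary.contains(termAtt.buffer(), i, j)) {
+  //           tokens.add(new CompoundToken(i, j));
+  //         }
+  //       }
+  //     }
+  //   }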
+
+  @Override
+  public void reset() throws IOException {
+    super.reset();
+    tokens.clear();
+    current = null;
+  }
+
+  /**
+   * Helper class to hold decompounded token information
+   */
+  protected class CompoundToken {
+    public final CharSequence txt;
+    public final int startOffset, endOffset;
+
+    /** Construct the compound token based on a slice of the current {@link CompoundWordTokenFilterBase#termAtt}. */
+    public CompoundToken(int offset, int length) {
+      final int newStart = CompoundWordTokenFilterBase.this.offsetAtt.startOffset() + offset;
+      this.txt = CompoundWordTokenFilterBase.this.termAtt.subSequence(offset, offset + length);
+      // TODO: This ignores the original endOffset; if a CharFilter/Tokenizer/Filter removed
+      // chars from the term, offsets may not match correctly (other filters producing tokens
+      // may also have this problem):
+      this.startOffset = newStart;
+      this.endOffset = newStart + length;
+    }
+
+  }
+}