X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java diff --git a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java b/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java deleted file mode 100644 index 1cc22a4..0000000 --- a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java +++ /dev/null @@ -1,317 +0,0 @@ -package org.apache.lucene.analysis.synonym; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.IOException; -import java.io.StringReader; -import java.util.ArrayList; -import java.util.Arrays; -import java.util.HashMap; -import java.util.HashSet; -import java.util.Set; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.analysis.TokenStream; -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; -import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute; -import org.apache.lucene.store.ByteArrayDataOutput; -import org.apache.lucene.util.BytesRef; -import org.apache.lucene.util.BytesRefHash; -import org.apache.lucene.util.CharsRef; -import org.apache.lucene.util.UnicodeUtil; -import org.apache.lucene.util.fst.ByteSequenceOutputs; -import org.apache.lucene.util.fst.FST; - -/** - * A map of synonyms, keys and values are phrases. - * @lucene.experimental - */ -public class SynonymMap { - /** for multiword support, you must separate words with this separator */ - public static final char WORD_SEPARATOR = 0; - /** map> */ - public final FST fst; - /** map */ - public final BytesRefHash words; - /** maxHorizontalContext: maximum context we need on the tokenstream */ - public final int maxHorizontalContext; - - public SynonymMap(FST fst, BytesRefHash words, int maxHorizontalContext) { - this.fst = fst; - this.words = words; - this.maxHorizontalContext = maxHorizontalContext; - } - - /** - * Builds an FSTSynonymMap. - *

- * Call add() until you have added all the mappings, then call build() to get an FSTSynonymMap - * @lucene.experimental - */ - public static class Builder { - private final HashMap workingSet = new HashMap(); - private final BytesRefHash words = new BytesRefHash(); - private final BytesRef utf8Scratch = new BytesRef(8); - private int maxHorizontalContext; - private final boolean dedup; - - /** If dedup is true then identical rules (same input, - * same output) will be added only once. */ - public Builder(boolean dedup) { - this.dedup = dedup; - } - - private static class MapEntry { - boolean includeOrig; - // we could sort for better sharing ultimately, but it could confuse people - ArrayList ords = new ArrayList(); - } - - /** Sugar: just joins the provided terms with {@link - * SynonymMap#WORD_SEPARATOR}. reuse and its chars - * must not be null. */ - public static CharsRef join(String[] words, CharsRef reuse) { - int upto = 0; - char[] buffer = reuse.chars; - for(String word : words) { - if (upto > 0) { - if (upto >= buffer.length) { - reuse.grow(upto); - buffer = reuse.chars; - } - buffer[upto++] = SynonymMap.WORD_SEPARATOR; - } - - final int wordLen = word.length(); - final int needed = upto + wordLen; - if (needed > buffer.length) { - reuse.grow(needed); - buffer = reuse.chars; - } - - word.getChars(0, wordLen, buffer, upto); - upto += wordLen; - } - - return reuse; - } - - /** Sugar: analyzes the text with the analyzer and - * separates by {@link SynonymMap#WORD_SEPARATOR}. - * reuse and its chars must not be null. */ - public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException { - TokenStream ts = analyzer.reusableTokenStream("", new StringReader(text)); - CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class); - PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class); - ts.reset(); - reuse.length = 0; - while (ts.incrementToken()) { - int length = termAtt.length(); - if (length == 0) { - throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token"); - } - if (posIncAtt.getPositionIncrement() != 1) { - throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1"); - } - reuse.grow(reuse.length + length + 1); /* current + word + separator */ - int end = reuse.offset + reuse.length; - if (reuse.length > 0) { - reuse.chars[end++] = SynonymMap.WORD_SEPARATOR; - reuse.length++; - } - System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length); - reuse.length += length; - } - ts.end(); - ts.close(); - if (reuse.length == 0) { - throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer"); - } - return reuse; - } - - /** only used for asserting! */ - private boolean hasHoles(CharsRef chars) { - final int end = chars.offset + chars.length; - for(int idx=chars.offset+1;idx 0 (got " + numInputWords + ")"); - } - if (input.length <= 0) { - throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")"); - } - if (numOutputWords <= 0) { - throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")"); - } - if (output.length <= 0) { - throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")"); - } - - assert !hasHoles(input): "input has holes: " + input; - assert !hasHoles(output): "output has holes: " + output; - - //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords); - final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch); - // lookup in hash - int ord = words.add(utf8Scratch, hashCode); - if (ord < 0) { - // already exists in our hash - ord = (-ord)-1; - //System.out.println(" output=" + output + " old ord=" + ord); - } else { - //System.out.println(" output=" + output + " new ord=" + ord); - } - - MapEntry e = workingSet.get(input); - if (e == null) { - e = new MapEntry(); - workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map - } - - e.ords.add(ord); - e.includeOrig |= includeOrig; - maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords); - maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords); - } - - private int countWords(CharsRef chars) { - int wordCount = 1; - int upto = chars.offset; - final int limit = chars.offset + chars.length; - while(upto < limit) { - if (chars.chars[upto++] == SynonymMap.WORD_SEPARATOR) { - wordCount++; - } - } - return wordCount; - } - - /** - * Add a phrase->phrase synonym mapping. - * Phrases are character sequences where words are - * separated with character zero (\u0000). Empty words - * (two \u0000s in a row) are not allowed in the input nor - * the output! - * - * @param input input phrase - * @param output output phrase - * @param includeOrig true if the original should be included - */ - public void add(CharsRef input, CharsRef output, boolean includeOrig) { - add(input, countWords(input), output, countWords(output), includeOrig); - } - - /** - * Builds an {@link SynonymMap} and returns it. - */ - public SynonymMap build() throws IOException { - ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton(); - // TODO: are we using the best sharing options? - org.apache.lucene.util.fst.Builder builder = - new org.apache.lucene.util.fst.Builder(FST.INPUT_TYPE.BYTE4, outputs); - - BytesRef scratch = new BytesRef(64); - ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput(); - - final Set dedupSet; - - if (dedup) { - dedupSet = new HashSet(); - } else { - dedupSet = null; - } - - final byte[] spare = new byte[5]; - - Set keys = workingSet.keySet(); - CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]); - Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator()); - - //System.out.println("fmap.build"); - for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) { - CharsRef input = sortedKeys[keyIdx]; - MapEntry output = workingSet.get(input); - - int numEntries = output.ords.size(); - // output size, assume the worst case - int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry - - scratch.grow(estimatedSize); - scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length); - assert scratch.offset == 0; - - // now write our output data: - int count = 0; - for (int i = 0; i < numEntries; i++) { - if (dedupSet != null) { - // box once - final Integer ent = output.ords.get(i); - if (dedupSet.contains(ent)) { - continue; - } - dedupSet.add(ent); - } - scratchOutput.writeVInt(output.ords.get(i)); - count++; - } - - final int pos = scratchOutput.getPosition(); - scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1)); - final int pos2 = scratchOutput.getPosition(); - final int vIntLen = pos2-pos; - - // Move the count + includeOrig to the front of the byte[]: - System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen); - System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos); - System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen); - - if (dedupSet != null) { - dedupSet.clear(); - } - - scratch.length = scratchOutput.getPosition() - scratch.offset; - //System.out.println(" add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count); - builder.add(input, new BytesRef(scratch)); - } - - FST fst = builder.finish(); - return new SynonymMap(fst, words, maxHorizontalContext); - } - } -}