lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SynonymMap.java

   1 package org.apache.lucene.analysis.synonym;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.StringReader;
  22 import java.util.ArrayList;
  23 import java.util.Arrays;
  24 import java.util.HashMap;
  25 import java.util.HashSet;
  26 import java.util.Set;
  27
  28 import org.apache.lucene.analysis.Analyzer;
  29 import org.apache.lucene.analysis.TokenStream;
  30 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  31 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  32 import org.apache.lucene.store.ByteArrayDataOutput;
  33 import org.apache.lucene.util.BytesRef;
  34 import org.apache.lucene.util.BytesRefHash;
  35 import org.apache.lucene.util.CharsRef;
  36 import org.apache.lucene.util.UnicodeUtil;
  37 import org.apache.lucene.util.fst.ByteSequenceOutputs;
  38 import org.apache.lucene.util.fst.FST;
  39
  40 /**
 * A map of synonyms; both keys and values are phrases.
  42  * @lucene.experimental
  43  */
  44 public class SynonymMap {
  45   /** for multiword support, you must separate words with this separator */
  46   public static final char WORD_SEPARATOR = 0;
  47   /** map<input word, list<ord>> */
  48   public final FST<BytesRef> fst;
  49   /** map<ord, outputword> */
  50   public final BytesRefHash words;
  51   /** maxHorizontalContext: maximum context we need on the tokenstream */
  52   public final int maxHorizontalContext;
  53
  /**
   * Creates a synonym map from already-built parts; each argument is
   * stored as-is in the public field of the same name.
   *
   * @param fst maps each input phrase to its list of output ords
   * @param words maps each ord to its output phrase
   * @param maxHorizontalContext maximum context needed on the token stream
   */
  public SynonymMap(FST<BytesRef> fst, BytesRefHash words, int maxHorizontalContext) {
    this.fst = fst;
    this.words = words;
    this.maxHorizontalContext = maxHorizontalContext;
  }
  59
  60   /**
   * Builds a {@link SynonymMap}.
   * <p>
   * Call add() until you have added all the mappings, then call build() to get the {@link SynonymMap}.
  64    * @lucene.experimental
  65    */
  66   public static class Builder {
  67     private final HashMap<CharsRef,MapEntry> workingSet = new HashMap<CharsRef,MapEntry>();
  68     private final BytesRefHash words = new BytesRefHash();
  69     private final BytesRef utf8Scratch = new BytesRef(8);
  70     private int maxHorizontalContext;
  71     private final boolean dedup;
  72
    /**
     * Creates an empty builder.
     *
     * @param dedup if true then identical rules (same input,
     *  same output) will be added only once.
     */
    public Builder(boolean dedup) {
      this.dedup = dedup;
    }
  78
    /** Outputs accumulated for a single input phrase while the map is being built. */
    private static class MapEntry {
      // true once any rule added for this input asked to keep the original
      boolean includeOrig;
      // ords of the output phrases, in the order the rules were added;
      // we could sort for better sharing ultimately, but it could confuse people
      ArrayList<Integer> ords = new ArrayList<Integer>();
    }
  84
  85     /** Sugar: just joins the provided terms with {@link
  86      *  SynonymMap#WORD_SEPARATOR}.  reuse and its chars
  87      *  must not be null. */
  88     public static CharsRef join(String[] words, CharsRef reuse) {
  89       int upto = 0;
  90       char[] buffer = reuse.chars;
  91       for(String word : words) {
  92         if (upto > 0) {
  93           if (upto >= buffer.length) {
  94             reuse.grow(upto);
  95             buffer = reuse.chars;
  96           }
  97           buffer[upto++] = SynonymMap.WORD_SEPARATOR;
  98         }
  99
 100         final int wordLen =  word.length();
 101         final int needed = upto + wordLen;
 102         if (needed > buffer.length) {
 103           reuse.grow(needed);
 104           buffer = reuse.chars;
 105         }
 106
 107         word.getChars(0, wordLen, buffer, upto);
 108         upto += wordLen;
 109       }
 110
 111       return reuse;
 112     }
 113
 114     /** Sugar: analyzes the text with the analyzer and
 115      *  separates by {@link SynonymMap#WORD_SEPARATOR}.
 116      *  reuse and its chars must not be null. */
 117     public static CharsRef analyze(Analyzer analyzer, String text, CharsRef reuse) throws IOException {
 118       TokenStream ts = analyzer.reusableTokenStream("", new StringReader(text));
 119       CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
 120       PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
 121       ts.reset();
 122       reuse.length = 0;
 123       while (ts.incrementToken()) {
 124         int length = termAtt.length();
 125         if (length == 0) {
 126           throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
 127         }
 128         if (posIncAtt.getPositionIncrement() != 1) {
 129           throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
 130         }
 131         reuse.grow(reuse.length + length + 1); /* current + word + separator */
 132         int end = reuse.offset + reuse.length;
 133         if (reuse.length > 0) {
 134           reuse.chars[end++] = SynonymMap.WORD_SEPARATOR;
 135           reuse.length++;
 136         }
 137         System.arraycopy(termAtt.buffer(), 0, reuse.chars, end, length);
 138         reuse.length += length;
 139       }
 140       ts.end();
 141       ts.close();
 142       if (reuse.length == 0) {
 143         throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
 144       }
 145       return reuse;
 146     }
 147
 148     /** only used for asserting! */
 149     private boolean hasHoles(CharsRef chars) {
 150       final int end = chars.offset + chars.length;
 151       for(int idx=chars.offset+1;idx<end;idx++) {
 152         if (chars.chars[idx] == SynonymMap.WORD_SEPARATOR && chars.chars[idx-1] == SynonymMap.WORD_SEPARATOR) {
 153           return true;
 154         }
 155       }
 156       if (chars.chars[chars.offset] == '\u0000') {
 157         return true;
 158       }
 159       if (chars.chars[chars.offset + chars.length - 1] == '\u0000') {
 160         return true;
 161       }
 162
 163       return false;
 164     }
 165
 166     // NOTE: while it's tempting to make this public, since
 167     // caller's parser likely knows the
 168     // numInput/numOutputWords, sneaky exceptions, much later
 169     // on, will result if these values are wrong; so we always
 170     // recompute ourselves to be safe:
 171     private void add(CharsRef input, int numInputWords, CharsRef output, int numOutputWords, boolean includeOrig) {
 172       // first convert to UTF-8
 173       if (numInputWords <= 0) {
 174         throw new IllegalArgumentException("numInputWords must be > 0 (got " + numInputWords + ")");
 175       }
 176       if (input.length <= 0) {
 177         throw new IllegalArgumentException("input.length must be > 0 (got " + input.length + ")");
 178       }
 179       if (numOutputWords <= 0) {
 180         throw new IllegalArgumentException("numOutputWords must be > 0 (got " + numOutputWords + ")");
 181       }
 182       if (output.length <= 0) {
 183         throw new IllegalArgumentException("output.length must be > 0 (got " + output.length + ")");
 184       }
 185
 186       assert !hasHoles(input): "input has holes: " + input;
 187       assert !hasHoles(output): "output has holes: " + output;
 188
 189       //System.out.println("fmap.add input=" + input + " numInputWords=" + numInputWords + " output=" + output + " numOutputWords=" + numOutputWords);
 190       final int hashCode = UnicodeUtil.UTF16toUTF8WithHash(output.chars, output.offset, output.length, utf8Scratch);
 191       // lookup in hash
 192       int ord = words.add(utf8Scratch, hashCode);
 193       if (ord < 0) {
 194         // already exists in our hash
 195         ord = (-ord)-1;
 196         //System.out.println("  output=" + output + " old ord=" + ord);
 197       } else {
 198         //System.out.println("  output=" + output + " new ord=" + ord);
 199       }
 200
 201       MapEntry e = workingSet.get(input);
 202       if (e == null) {
 203         e = new MapEntry();
 204         workingSet.put(new CharsRef(input), e); // make a copy, since we will keep around in our map
 205       }
 206
 207       e.ords.add(ord);
 208       e.includeOrig |= includeOrig;
 209       maxHorizontalContext = Math.max(maxHorizontalContext, numInputWords);
 210       maxHorizontalContext = Math.max(maxHorizontalContext, numOutputWords);
 211     }
 212
 213     private int countWords(CharsRef chars) {
 214       int wordCount = 1;
 215       int upto = chars.offset;
 216       final int limit = chars.offset + chars.length;
 217       while(upto < limit) {
 218         if (chars.chars[upto++] == SynonymMap.WORD_SEPARATOR) {
 219           wordCount++;
 220         }
 221       }
 222       return wordCount;
 223     }
 224
 225     /**
 226      * Add a phrase->phrase synonym mapping.
 227      * Phrases are character sequences where words are
 228      * separated with character zero (\u0000).  Empty words
 229      * (two \u0000s in a row) are not allowed in the input nor
 230      * the output!
 231      *
 232      * @param input input phrase
 233      * @param output output phrase
 234      * @param includeOrig true if the original should be included
 235      */
 236     public void add(CharsRef input, CharsRef output, boolean includeOrig) {
 237       add(input, countWords(input), output, countWords(output), includeOrig);
 238     }
 239
 240     /**
 241      * Builds an {@link SynonymMap} and returns it.
 242      */
 243     public SynonymMap build() throws IOException {
 244       ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
 245       // TODO: are we using the best sharing options?
 246       org.apache.lucene.util.fst.Builder<BytesRef> builder =
 247         new org.apache.lucene.util.fst.Builder<BytesRef>(FST.INPUT_TYPE.BYTE4, outputs);
 248
 249       BytesRef scratch = new BytesRef(64);
 250       ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();
 251
 252       final Set<Integer> dedupSet;
 253
 254       if (dedup) {
 255         dedupSet = new HashSet<Integer>();
 256       } else {
 257         dedupSet = null;
 258       }
 259
 260       final byte[] spare = new byte[5];
 261
 262       Set<CharsRef> keys = workingSet.keySet();
 263       CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
 264       Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());
 265
 266       //System.out.println("fmap.build");
 267       for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
 268         CharsRef input = sortedKeys[keyIdx];
 269         MapEntry output = workingSet.get(input);
 270
 271         int numEntries = output.ords.size();
 272         // output size, assume the worst case
 273         int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
 274
 275         scratch.grow(estimatedSize);
 276         scratchOutput.reset(scratch.bytes, scratch.offset, scratch.bytes.length);
 277         assert scratch.offset == 0;
 278
 279         // now write our output data:
 280         int count = 0;
 281         for (int i = 0; i < numEntries; i++) {
 282           if (dedupSet != null) {
 283             // box once
 284             final Integer ent = output.ords.get(i);
 285             if (dedupSet.contains(ent)) {
 286               continue;
 287             }
 288             dedupSet.add(ent);
 289           }
 290           scratchOutput.writeVInt(output.ords.get(i));
 291           count++;
 292         }
 293
 294         final int pos = scratchOutput.getPosition();
 295         scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
 296         final int pos2 = scratchOutput.getPosition();
 297         final int vIntLen = pos2-pos;
 298
 299         // Move the count + includeOrig to the front of the byte[]:
 300         System.arraycopy(scratch.bytes, pos, spare, 0, vIntLen);
 301         System.arraycopy(scratch.bytes, 0, scratch.bytes, vIntLen, pos);
 302         System.arraycopy(spare, 0, scratch.bytes, 0, vIntLen);
 303
 304         if (dedupSet != null) {
 305           dedupSet.clear();
 306         }
 307
 308         scratch.length = scratchOutput.getPosition() - scratch.offset;
 309         //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
 310         builder.add(input, new BytesRef(scratch));
 311       }
 312
 313       FST<BytesRef> fst = builder.finish();
 314       return new SynonymMap(fst, words, maxHorizontalContext);
 315     }
 316   }
 317 }