lucene-java-3.5.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ICUTokenizer.java

   1 package org.apache.lucene.analysis.icu.segmentation;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.Reader;
  22
  23 import org.apache.lucene.analysis.Tokenizer;
  24 import org.apache.lucene.analysis.icu.tokenattributes.ScriptAttribute;
  25 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  26 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  27 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  28
  29 import com.ibm.icu.lang.UCharacter;
  30 import com.ibm.icu.text.BreakIterator;
  31
  32 /**
  33  * Breaks text into words according to UAX #29: Unicode Text Segmentation
  34  * (http://www.unicode.org/reports/tr29/)
  35  * <p>
  36  * Words are broken across script boundaries, then segmented according to
  37  * the BreakIterator and typing provided by the {@link ICUTokenizerConfig}
  38  * </p>
  39  * @see ICUTokenizerConfig
  40  * @lucene.experimental
  41  */
  42 public final class ICUTokenizer extends Tokenizer {
  43   private static final int IOBUFFER = 4096;
  44   private final char buffer[] = new char[IOBUFFER];
  45   /** true length of text in the buffer */
  46   private int length = 0;
  47   /** length in buffer that can be evaluated safely, up to a safe end point */
  48   private int usableLength = 0;
  49   /** accumulated offset of previous buffers for this reader, for offsetAtt */
  50   private int offset = 0;
  51
  52   private final CompositeBreakIterator breaker; /* tokenizes a char[] of text */
  53   private final ICUTokenizerConfig config;
  54   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  55   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  56   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  57   private final ScriptAttribute scriptAtt = addAttribute(ScriptAttribute.class);
  58
  59   /**
  60    * Construct a new ICUTokenizer that breaks text into words from the given
  61    * Reader.
  62    * <p>
  63    * The default script-specific handling is used.
  64    *
  65    * @param input Reader containing text to tokenize.
  66    * @see DefaultICUTokenizerConfig
  67    */
  68   public ICUTokenizer(Reader input) {
  69     this(input, new DefaultICUTokenizerConfig());
  70   }
  71
  72   /**
  73    * Construct a new ICUTokenizer that breaks text into words from the given
  74    * Reader, using a tailored BreakIterator configuration.
  75    *
  76    * @param input Reader containing text to tokenize.
  77    * @param config Tailored BreakIterator configuration
  78    */
  79   public ICUTokenizer(Reader input, ICUTokenizerConfig config) {
  80     super(input);
  81     this.config = config;
  82     breaker = new CompositeBreakIterator(config);
  83   }
  84
  85   @Override
  86   public boolean incrementToken() throws IOException {
  87     clearAttributes();
  88     if (length == 0)
  89       refill();
  90     while (!incrementTokenBuffer()) {
  91       refill();
  92       if (length <= 0) // no more bytes to read;
  93         return false;
  94     }
  95     return true;
  96   }
  97
  98   @Override
  99   public void reset() throws IOException {
 100     super.reset();
 101     breaker.setText(buffer, 0, 0);
 102     length = usableLength = offset = 0;
 103   }
 104
 105   @Override
 106   public void reset(Reader input) throws IOException {
 107     super.reset(input);
 108     reset();
 109   }
 110
 111   @Override
 112   public void end() throws IOException {
 113     final int finalOffset = (length < 0) ? offset : offset + length;
 114     offsetAtt.setOffset(finalOffset, finalOffset);
 115   }
 116
 117   /*
 118    * This tokenizes text based upon the longest matching rule, and because of
 119    * this, isn't friendly to a Reader.
 120    *
 121    * Text is read from the input stream in 4kB chunks. Within a 4kB chunk of
 122    * text, the last unambiguous break point is found (in this implementation:
 123    * white space character) Any remaining characters represent possible partial
 124    * words, so are appended to the front of the next chunk.
 125    *
 126    * There is the possibility that there are no unambiguous break points within
 127    * an entire 4kB chunk of text (binary data). So there is a maximum word limit
 128    * of 4kB since it will not try to grow the buffer in this case.
 129    */
 130
 131   /**
 132    * Returns the last unambiguous break position in the text.
 133    *
 134    * @return position of character, or -1 if one does not exist
 135    */
 136   private int findSafeEnd() {
 137     for (int i = length - 1; i >= 0; i--)
 138       if (UCharacter.isWhitespace(buffer[i]))
 139         return i + 1;
 140     return -1;
 141   }
 142
 143   /**
 144    * Refill the buffer, accumulating the offset and setting usableLength to the
 145    * last unambiguous break position
 146    *
 147    * @throws IOException
 148    */
 149   private void refill() throws IOException {
 150     offset += usableLength;
 151     int leftover = length - usableLength;
 152     System.arraycopy(buffer, usableLength, buffer, 0, leftover);
 153     int requested = buffer.length - leftover;
 154     int returned = input.read(buffer, leftover, requested);
 155     length = returned < 0 ? leftover : returned + leftover;
 156     if (returned < requested) /* reader has been emptied, process the rest */
 157       usableLength = length;
 158     else { /* still more data to be read, find a safe-stopping place */
 159       usableLength = findSafeEnd();
 160       if (usableLength < 0)
 161         usableLength = length; /*
 162                                 * more than IOBUFFER of text without space,
 163                                 * gonna possibly truncate tokens
 164                                 */
 165     }
 166
 167     breaker.setText(buffer, 0, Math.max(0, usableLength));
 168   }
 169
 170   /*
 171    * return true if there is a token from the buffer, or null if it is
 172    * exhausted.
 173    */
 174   private boolean incrementTokenBuffer() {
 175     int start = breaker.current();
 176     if (start == BreakIterator.DONE)
 177       return false; // BreakIterator exhausted
 178
 179     // find the next set of boundaries, skipping over non-tokens (rule status 0)
 180     int end = breaker.next();
 181     while (start != BreakIterator.DONE && breaker.getRuleStatus() == 0) {
 182       start = end;
 183       end = breaker.next();
 184     }
 185
 186     if (start == BreakIterator.DONE)
 187       return false; // BreakIterator exhausted
 188
 189     termAtt.copyBuffer(buffer, start, end - start);
 190     offsetAtt.setOffset(correctOffset(offset + start), correctOffset(offset + end));
 191     typeAtt.setType(config.getType(breaker.getScriptCode(), breaker.getRuleStatus()));
 192     scriptAtt.setCode(breaker.getScriptCode());
 193
 194     return true;
 195   }
 196 }