lucene-java-3.5.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/CompositeBreakIterator.java

   1 package org.apache.lucene.analysis.icu.segmentation;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import com.ibm.icu.lang.UScript;
  21 import com.ibm.icu.text.BreakIterator;
  22
  23 /**
  24  * An internal BreakIterator for multilingual text, following recommendations
  25  * from: UAX #29: Unicode Text Segmentation. (http://unicode.org/reports/tr29/)
  26  * <p>
  27  * See http://unicode.org/reports/tr29/#Tailoring for the motivation of this
  28  * design.
  29  * <p>
  30  * Text is first divided into script boundaries. The processing is then
  31  * delegated to the appropriate break iterator for that specific script.
  32  * <p>
  33  * This break iterator also allows you to retrieve the ISO 15924 script code
  34  * associated with a piece of text.
  35  * <p>
  36  * See also UAX #29, UTR #24
  37  * @lucene.experimental
  38  */
  39 final class CompositeBreakIterator {
  40   private final ICUTokenizerConfig config;
  41   private final BreakIteratorWrapper wordBreakers[] = new BreakIteratorWrapper[UScript.CODE_LIMIT];
  42
  43   private BreakIteratorWrapper rbbi;
  44   private final ScriptIterator scriptIterator = new ScriptIterator();
  45
  46   private char text[];
  47
  48   CompositeBreakIterator(ICUTokenizerConfig config) {
  49     this.config = config;
  50   }
  51
  52   /**
  53    * Retrieve the next break position. If the RBBI range is exhausted within the
  54    * script boundary, examine the next script boundary.
  55    *
  56    * @return the next break position or BreakIterator.DONE
  57    */
  58   int next() {
  59     int next = rbbi.next();
  60     while (next == BreakIterator.DONE && scriptIterator.next()) {
  61       rbbi = getBreakIterator(scriptIterator.getScriptCode());
  62       rbbi.setText(text, scriptIterator.getScriptStart(),
  63           scriptIterator.getScriptLimit() - scriptIterator.getScriptStart());
  64       next = rbbi.next();
  65     }
  66     return (next == BreakIterator.DONE) ? BreakIterator.DONE : next
  67         + scriptIterator.getScriptStart();
  68   }
  69
  70   /**
  71    * Retrieve the current break position.
  72    *
  73    * @return the current break position or BreakIterator.DONE
  74    */
  75   int current() {
  76     final int current = rbbi.current();
  77     return (current == BreakIterator.DONE) ? BreakIterator.DONE : current
  78         + scriptIterator.getScriptStart();
  79   }
  80
  81   /**
  82    * Retrieve the rule status code (token type) from the underlying break
  83    * iterator
  84    *
  85    * @return rule status code (see RuleBasedBreakIterator constants)
  86    */
  87   int getRuleStatus() {
  88     return rbbi.getRuleStatus();
  89   }
  90
  91   /**
  92    * Retrieve the UScript script code for the current token. This code can be
  93    * decoded with UScript into a name or ISO 15924 code.
  94    *
  95    * @return UScript script code for the current token.
  96    */
  97   int getScriptCode() {
  98     return scriptIterator.getScriptCode();
  99   }
 100
 101   /**
 102    * Set a new region of text to be examined by this iterator
 103    *
 104    * @param text buffer of text
 105    * @param start offset into buffer
 106    * @param length maximum length to examine
 107    */
 108   void setText(final char text[], int start, int length) {
 109     this.text = text;
 110     scriptIterator.setText(text, start, length);
 111     if (scriptIterator.next()) {
 112       rbbi = getBreakIterator(scriptIterator.getScriptCode());
 113       rbbi.setText(text, scriptIterator.getScriptStart(),
 114           scriptIterator.getScriptLimit() - scriptIterator.getScriptStart());
 115     } else {
 116       rbbi = getBreakIterator(UScript.COMMON);
 117       rbbi.setText(text, 0, 0);
 118     }
 119   }
 120
 121   private BreakIteratorWrapper getBreakIterator(int scriptCode) {
 122     if (wordBreakers[scriptCode] == null)
 123       wordBreakers[scriptCode] = BreakIteratorWrapper.wrap(config.getBreakIterator(scriptCode));
 124     return wordBreakers[scriptCode];
 125   }
 126 }