--- /dev/null
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Copyright (C) 1999-2010, International Business Machines
+ * Corporation and others. All Rights Reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
+ * Software, and to permit persons to whom the Software is furnished to do so,
+ * provided that the above copyright notice(s) and this permission notice appear
+ * in all copies of the Software and that both the above copyright notice(s) and
+ * this permission notice appear in supporting documentation.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
+ * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
+ * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
+ * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
+ * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
+ * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
+ *
+ * Except as contained in this notice, the name of a copyright holder shall not
+ * be used in advertising or otherwise to promote the sale, use or other
+ * dealings in this Software without prior written authorization of the
+ * copyright holder.
+ */
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.lang.UScript;
+import com.ibm.icu.text.UTF16;
+
+/**
+ * An iterator that locates ISO 15924 script boundaries in text.
+ * <p>
+ * This is not the same as simply looking at the Unicode block, or even the
+ * Script property. Some characters are 'common' across multiple scripts, and
+ * some 'inherit' the script value of text surrounding them.
+ * <p>
+ * This is similar to ICU (internal-only) UScriptRun, with the following
+ * differences:
+ * <ul>
+ * <li>Doesn't attempt to match paired punctuation. For tokenization purposes, this
+ * is not necessary. It's also quite expensive.
+ * <li>Non-spacing marks inherit the script of their base character, following
+ * recommendations from UTR #24.
+ * </ul>
+ * @lucene.experimental
+ */
+final class ScriptIterator {
+  /**
+   * Precomputed script codes for code points 0..127, so that the common
+   * Basic Latin case is answered with a plain array read.
+   */
+  private static final int[] BASIC_LATIN_SCRIPTS = new int[128];
+
+  static {
+    for (int cp = 0; cp < BASIC_LATIN_SCRIPTS.length; cp++) {
+      BASIC_LATIN_SCRIPTS[cp] = UScript.getScript(cp);
+    }
+  }
+
+  /** Buffer holding the text under examination. */
+  private char[] text;
+  /** Offset into {@link #text} where the examined region begins. */
+  private int start;
+  /** Offset into {@link #text} just past the examined region. */
+  private int limit;
+  /** Offset into {@link #text} of the next code point to be examined. */
+  private int index;
+
+  /** Offset at which the current script run begins. */
+  private int scriptStart;
+  /** Offset just past the current script run. */
+  private int scriptLimit;
+  /** UScript code assigned to the current script run. */
+  private int scriptCode;
+
+  /**
+   * Points this iterator at a new region of text and resets its state so
+   * that the following call to {@link #next()} yields the first run.
+   *
+   * @param text text buffer to examine
+   * @param start offset into buffer
+   * @param length maximum length to examine
+   */
+  void setText(char text[], int start, int length) {
+    this.text = text;
+    this.start = start;
+    this.limit = start + length;
+
+    // Nothing has been consumed yet: the cursor and the (empty) current run
+    // all sit at the beginning of the region.
+    this.index = start;
+    this.scriptStart = start;
+    this.scriptLimit = start;
+    this.scriptCode = UScript.INVALID_CODE;
+  }
+
+  /**
+   * Advances to the following script run, if there is one.
+   * <p>
+   * Each run starts out as {@code UScript.COMMON} and takes on the first
+   * script code above {@code UScript.INHERITED} that is encountered while
+   * the run is being extended.
+   *
+   * @return true if a run was found, false if the previous run already
+   *         reached the end of the region
+   */
+  boolean next() {
+    if (scriptLimit >= limit) {
+      return false;
+    }
+
+    // The new run begins exactly where the previous one stopped.
+    scriptStart = scriptLimit;
+    scriptCode = UScript.COMMON;
+
+    while (index < limit) {
+      // UTF16.charAt takes an offset relative to 'start', not an absolute one.
+      final int codePoint = UTF16.charAt(text, start, limit, index - start);
+      final int codePointScript = getScript(codePoint);
+
+      /*
+       * UTR #24 recommends never breaking between a non-spacing mark and its
+       * base character: for boundary determination the mark takes the script
+       * of its base, whatever its own script value is. A non-spacing mark
+       * therefore always extends the current run.
+       */
+      final boolean extendsRun = isSameScript(scriptCode, codePointScript)
+          || UCharacter.getType(codePoint) == UCharacter.NON_SPACING_MARK;
+      if (!extendsRun) {
+        break;
+      }
+
+      index += UTF16.getCharCount(codePoint);
+
+      // A run that so far is only Common/Inherited adopts the first real
+      // script it meets.
+      if (scriptCode <= UScript.INHERITED && codePointScript > UScript.INHERITED) {
+        scriptCode = codePointScript;
+      }
+    }
+
+    scriptLimit = index;
+    return true;
+  }
+
+  /**
+   * Returns the offset at which the current script run begins.
+   *
+   * @return start position of script run
+   */
+  int getScriptStart() {
+    return scriptStart;
+  }
+
+  /**
+   * Returns the offset of the first character following the current script
+   * run.
+   *
+   * @return position of the first character after this script run
+   */
+  int getScriptLimit() {
+    return scriptLimit;
+  }
+
+  /**
+   * Returns the UScript script code of the current script run.
+   *
+   * @return code for the script of the current run
+   */
+  int getScriptCode() {
+    return scriptCode;
+  }
+
+  /**
+   * Tells whether two script codes may share a run: they are equal, or at
+   * least one of them is not above {@code UScript.INHERITED}.
+   */
+  private static boolean isSameScript(int scriptOne, int scriptTwo) {
+    if (scriptOne == scriptTwo) {
+      return true;
+    }
+    return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED;
+  }
+
+  /**
+   * Looks up the script of a code point, reading the precomputed table for
+   * Basic Latin and delegating to {@code UScript.getScript()} otherwise.
+   */
+  private static int getScript(int codepoint) {
+    final boolean inTable = codepoint >= 0 && codepoint < BASIC_LATIN_SCRIPTS.length;
+    return inTable ? BASIC_LATIN_SCRIPTS[codepoint] : UScript.getScript(codepoint);
+  }
+}