1 package org.apache.lucene.analysis.icu.segmentation;
4 * Copyright (C) 1999-2010, International Business Machines
5 * Corporation and others. All Rights Reserved.
7 * Permission is hereby granted, free of charge, to any person obtaining a copy
8 * of this software and associated documentation files (the "Software"), to deal
9 * in the Software without restriction, including without limitation the rights
10 * to use, copy, modify, merge, publish, distribute, and/or sell copies of the
11 * Software, and to permit persons to whom the Software is furnished to do so,
12 * provided that the above copyright notice(s) and this permission notice appear
13 * in all copies of the Software and that both the above copyright notice(s) and
14 * this permission notice appear in supporting documentation.
16 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS.
19 * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE
20 * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR
21 * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER
22 * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT
23 * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE.
25 * Except as contained in this notice, the name of a copyright holder shall not
26 * be used in advertising or otherwise to promote the sale, use or other
27 * dealings in this Software without prior written authorization of the
31 import com.ibm.icu.lang.UCharacter;
32 import com.ibm.icu.lang.UScript;
33 import com.ibm.icu.text.UTF16;
36 * An iterator that locates ISO 15924 script boundaries in text.
38 * This is not the same as simply looking at the Unicode block, or even the
39 * Script property. Some characters are 'common' across multiple scripts, and
40 * some 'inherit' the script value of text surrounding them.
42 * This is similar to ICU (internal-only) UScriptRun, with the following
45 * <li>Doesn't attempt to match paired punctuation. For tokenization purposes, this
46 * is not necessary. It's also quite expensive.
47 * <li>Non-spacing marks inherit the script of their base character, following
48 * recommendations from UTR #24.
50 * @lucene.experimental
52 final class ScriptIterator {
58 private int scriptStart;
59 private int scriptLimit;
60 private int scriptCode;
63 * Get the start of this script run
65 * @return start position of script run
67 int getScriptStart() {
72 * Get the index of the first character after the end of this script run
74 * @return position of the first character after this script run
76 int getScriptLimit() {
81 * Get the UScript script code for this script run
83 * @return code for the script of the current run
90 * Iterates to the next script run, returning true if one exists.
92 * @return true if there is another script run, false otherwise.
95 if (scriptLimit >= limit)
98 scriptCode = UScript.COMMON;
99 scriptStart = scriptLimit;
101 while (index < limit) {
102 final int ch = UTF16.charAt(text, start, limit, index - start);
103 final int sc = getScript(ch);
106 * From UTR #24: Implementations that determine the boundaries between
107 * characters of given scripts should never break between a non-spacing
108 * mark and its base character. Thus for boundary determinations and
109 * similar sorts of processing, a non-spacing mark — whatever its script
110 * value — should inherit the script value of its base character.
112 if (isSameScript(scriptCode, sc)
113 || UCharacter.getType(ch) == UCharacter.NON_SPACING_MARK) {
114 index += UTF16.getCharCount(ch);
117 * Inherited or Common becomes the script code of the surrounding text.
119 if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) {
132 /** Determine if two scripts are compatible. */
133 private static boolean isSameScript(int scriptOne, int scriptTwo) {
134 return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED
135 || scriptOne == scriptTwo;
139 * Set a new region of text to be examined by this iterator
141 * @param text text buffer to examine
142 * @param start offset into buffer
143 * @param length maximum length to examine
145 void setText(char text[], int start, int length) {
149 this.limit = start + length;
150 this.scriptStart = start;
151 this.scriptLimit = start;
152 this.scriptCode = UScript.INVALID_CODE;
155 /** linear fast-path for basic latin case */
156 private static final int basicLatin[] = new int[128];
159 for (int i = 0; i < basicLatin.length; i++)
160 basicLatin[i] = UScript.getScript(i);
163 /** fast version of UScript.getScript(). Basic Latin is an array lookup */
164 private static int getScript(int codepoint) {
165 if (0 <= codepoint && codepoint < basicLatin.length)
166 return basicLatin[codepoint];
168 return UScript.getScript(codepoint);