X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java?ds=sidebyside diff --git a/lucene-java-3.4.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java b/lucene-java-3.4.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java deleted file mode 100644 index 4c327bc..0000000 --- a/lucene-java-3.4.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/ScriptIterator.java +++ /dev/null @@ -1,170 +0,0 @@ -package org.apache.lucene.analysis.icu.segmentation; - -/** - * Copyright (C) 1999-2010, International Business Machines - * Corporation and others. All Rights Reserved. - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to deal - * in the Software without restriction, including without limitation the rights - * to use, copy, modify, merge, publish, distribute, and/or sell copies of the - * Software, and to permit persons to whom the Software is furnished to do so, - * provided that the above copyright notice(s) and this permission notice appear - * in all copies of the Software and that both the above copyright notice(s) and - * this permission notice appear in supporting documentation. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT OF THIRD PARTY RIGHTS. 
- * IN NO EVENT SHALL THE COPYRIGHT HOLDER OR HOLDERS INCLUDED IN THIS NOTICE BE - * LIABLE FOR ANY CLAIM, OR ANY SPECIAL INDIRECT OR CONSEQUENTIAL DAMAGES, OR - * ANY DAMAGES WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER - * IN AN ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT - * OF OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. - * - * Except as contained in this notice, the name of a copyright holder shall not - * be used in advertising or otherwise to promote the sale, use or other - * dealings in this Software without prior written authorization of the - * copyright holder. - */ - -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.lang.UScript; -import com.ibm.icu.text.UTF16; - -/** - * An iterator that locates ISO 15924 script boundaries in text. - *

- * This is not the same as simply looking at the Unicode block, or even the - * Script property. Some characters are 'common' across multiple scripts, and - * some 'inherit' the script value of text surrounding them. - *

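 - * This is similar to ICU (internal-only) UScriptRun, with the following - * differences: no attempt is made to match paired punctuation, and a - * non-spacing mark is never split from the base character preceding it - * (following the recommendation in UTR #24). - *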

- * @lucene.experimental - */ -final class ScriptIterator { - private char text[]; - private int start; - private int limit; - private int index; - - private int scriptStart; - private int scriptLimit; - private int scriptCode; - - /** - * Get the start of this script run - * - * @return start position of script run - */ - int getScriptStart() { - return scriptStart; - } - - /** - * Get the index of the first character after the end of this script run - * - * @return position of the first character after this script run - */ - int getScriptLimit() { - return scriptLimit; - } - - /** - * Get the UScript script code for this script run - * - * @return code for the script of the current run - */ - int getScriptCode() { - return scriptCode; - } - - /** - * Iterates to the next script run, returning true if one exists. - * - * @return true if there is another script run, false otherwise. - */ - boolean next() { - if (scriptLimit >= limit) - return false; - - scriptCode = UScript.COMMON; - scriptStart = scriptLimit; - - while (index < limit) { - final int ch = UTF16.charAt(text, start, limit, index - start); - final int sc = getScript(ch); - - /* - * From UTR #24: Implementations that determine the boundaries between - * characters of given scripts should never break between a non-spacing - * mark and its base character. Thus for boundary determinations and - * similar sorts of processing, a non-spacing mark — whatever its script - * value — should inherit the script value of its base character. - */ - if (isSameScript(scriptCode, sc) - || UCharacter.getType(ch) == UCharacter.NON_SPACING_MARK) { - index += UTF16.getCharCount(ch); - - /* - * Inherited or Common becomes the script code of the surrounding text. - */ - if (scriptCode <= UScript.INHERITED && sc > UScript.INHERITED) { - scriptCode = sc; - } - - } else { - break; - } - } - - scriptLimit = index; - return true; - } - - /** Determine if two scripts are compatible. 
*/ - private static boolean isSameScript(int scriptOne, int scriptTwo) { - return scriptOne <= UScript.INHERITED || scriptTwo <= UScript.INHERITED - || scriptOne == scriptTwo; - } - - /** - * Set a new region of text to be examined by this iterator - * - * @param text text buffer to examine - * @param start offset into buffer - * @param length maximum length to examine - */ - void setText(char text[], int start, int length) { - this.text = text; - this.start = start; - this.index = start; - this.limit = start + length; - this.scriptStart = start; - this.scriptLimit = start; - this.scriptCode = UScript.INVALID_CODE; - } - - /** linear fast-path for basic latin case */ - private static final int basicLatin[] = new int[128]; - - static { - for (int i = 0; i < basicLatin.length; i++) - basicLatin[i] = UScript.getScript(i); - } - - /** fast version of UScript.getScript(). Basic Latin is an array lookup */ - private static int getScript(int codepoint) { - if (0 <= codepoint && codepoint < basicLatin.length) - return basicLatin[codepoint]; - else - return UScript.getScript(codepoint); - } -}