X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java diff --git a/lucene-java-3.4.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java b/lucene-java-3.4.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java deleted file mode 100644 index ffd4c33..0000000 --- a/lucene-java-3.4.0/lucene/contrib/icu/src/java/org/apache/lucene/analysis/icu/segmentation/LaoBreakIterator.java +++ /dev/null @@ -1,226 +0,0 @@ -package org.apache.lucene.analysis.icu.segmentation; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.text.CharacterIterator; - -import com.ibm.icu.lang.UCharacter; -import com.ibm.icu.text.BreakIterator; -import com.ibm.icu.text.RuleBasedBreakIterator; -import com.ibm.icu.text.UnicodeSet; - -/** - * Syllable iterator for Lao text. - *

- * This breaks Lao text into syllables according to: - * Syllabification of Lao Script for Line Breaking - * Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, - * Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP. - *

- *

- * Most work is accomplished with RBBI rules, however some additional special logic is needed - * that cannot be coded in a grammar, and this is implemented here. - *

- * For example, what appears to be a final consonant might instead be part of the next syllable. - * Rules match in a greedy fashion, leaving an illegal sequence that matches no rules. - *

- * Take for instance the text ກວ່າດອກ - * The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal. - * What LaoBreakIterator does, according to the paper: - *

    - *
  1. backtrack and remove the ດ from the last syllable, placing it on the current syllable. - *
  2. verify the modified previous syllable (ກວ່າ ) is still legal. - *
  3. verify the modified current syllable (ດອກ) is now legal. - *
  4. If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character. - *
- *

- * Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper. - * This is the issue of combining marks being in the wrong order (typos). - * @lucene.experimental - */ -public class LaoBreakIterator extends BreakIterator { - RuleBasedBreakIterator rules; - CharArrayIterator text; - - CharArrayIterator working = new CharArrayIterator(); - int workingOffset = 0; - - CharArrayIterator verifyText = new CharArrayIterator(); - RuleBasedBreakIterator verify; - - private static final UnicodeSet laoSet; - static { - laoSet = new UnicodeSet("[:Lao:]"); - laoSet.compact(); - laoSet.freeze(); - } - - public LaoBreakIterator(RuleBasedBreakIterator rules) { - this.rules = (RuleBasedBreakIterator) rules.clone(); - this.verify = (RuleBasedBreakIterator) rules.clone(); - } - - @Override - public int current() { - int current = rules.current(); - return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current; - } - - @Override - public int first() { - working.setText(this.text.getText(), this.text.getStart(), this.text.getLength()); - rules.setText(working); - workingOffset = 0; - int first = rules.first(); - return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first; - } - - @Override - public int following(int offset) { - throw new UnsupportedOperationException(); - } - - @Override - public CharacterIterator getText() { - return text; - } - - @Override - public int last() { - throw new UnsupportedOperationException(); - } - - @Override - public int next() { - int current = current(); - int next = rules.next(); - if (next == BreakIterator.DONE) - return next; - else - next += workingOffset; - - char c = working.current(); - int following = rules.next(); // lookahead - if (following != BreakIterator.DONE) { - following += workingOffset; - if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) { - workingOffset = next - 1; - working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset); - return next - 1; - } - rules.previous(); // undo the lookahead - } - - return next; - } - - @Override - public int next(int n) { - if (n < 0) - throw new UnsupportedOperationException("Backwards traversal is unsupported"); - - int result = current(); - while (n > 0) { - result = next(); - --n; - } - return result; - } - - @Override - public int previous() { - throw new UnsupportedOperationException("Backwards traversal is unsupported"); - } - - @Override - public void setText(CharacterIterator text) { - if (!(text instanceof CharArrayIterator)) - throw new UnsupportedOperationException("unsupported CharacterIterator"); - this.text = (CharArrayIterator) text; - ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength()); - working.setText(this.text.getText(), this.text.getStart(), this.text.getLength()); - rules.setText(working); - workingOffset = 0; - } - - @Override - public void setText(String newText) { - CharArrayIterator ci = new CharArrayIterator(); - ci.setText(newText.toCharArray(), 0, newText.length()); - setText(ci); - } - - private boolean verifyPushBack(int current, int next) { - int shortenedSyllable = next - current - 1; - - verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable); - verify.setText(verifyText); - if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0) - return false; - - - verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1); - verify.setText(verifyText); - - return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0); - } - - // TODO: only bubblesort around runs of combining marks, instead of the entire text. - private void ccReorder(char[] text, int start, int length) { - boolean reordered; - do { - int prevCC = 0; - reordered = false; - for (int i = start; i < start + length; i++) { - final char c = text[i]; - final int cc = UCharacter.getCombiningClass(c); - if (cc > 0 && cc < prevCC) { - // swap - text[i] = text[i - 1]; - text[i - 1] = c; - reordered = true; - } else { - prevCC = cc; - } - } - - } while (reordered == true); - } - - /** - * Clone method. Creates another LaoBreakIterator with the same behavior - * and current state as this one. - * @return The clone. - */ - @Override - public Object clone() { - LaoBreakIterator other = (LaoBreakIterator) super.clone(); - other.rules = (RuleBasedBreakIterator) rules.clone(); - other.verify = (RuleBasedBreakIterator) verify.clone(); - if (text != null) - other.text = (CharArrayIterator) text.clone(); - if (working != null) - other.working = (CharArrayIterator) working.clone(); - if (verifyText != null) - other.verifyText = (CharArrayIterator) verifyText.clone(); - return other; - } -}