--- /dev/null
+package org.apache.lucene.analysis.icu.segmentation;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.text.CharacterIterator;
+
+import com.ibm.icu.lang.UCharacter;
+import com.ibm.icu.text.BreakIterator;
+import com.ibm.icu.text.RuleBasedBreakIterator;
+import com.ibm.icu.text.UnicodeSet;
+
+/**
+ * Syllable iterator for Lao text.
+ * <p>
+ * This breaks Lao text into syllables according to:
+ * <i>Syllabification of Lao Script for Line Breaking</i>
+ * Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
+ * Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
+ * <ul>
+ * <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
+ * <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
+ * </ul>
+ * <p>
+ * Most work is accomplished with RBBI rules; however, some additional special logic that
+ * cannot be coded in a grammar is needed, and it is implemented here.
+ * <p>
+ * For example, what appears to be a final consonant might instead be part of the next syllable.
+ * Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
+ * <p>
+ * Take, for instance, the text ກວ່າດອກ.
+ * The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
+ * What LaoBreakIterator does, according to the paper:
+ * <ol>
+ * <li>backtrack and remove the ດ from the last syllable, placing it on the current syllable.
+ * <li>verify the modified previous syllable (ກວ່າ ) is still legal.
+ * <li>verify the modified current syllable (ດອກ) is now legal.
+ * <li>If step 2 or 3 fails, restore the ດ to the last syllable and skip the current character.
+ * </ol>
+ * <p>
+ * Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
+ * This is the issue of combining marks being in the wrong order (typos).
+ * @lucene.experimental
+ */
+public class LaoBreakIterator extends BreakIterator {
+  /** Private clone of the syllable rules; iterates over {@link #working}. */
+  RuleBasedBreakIterator rules;
+  /** The text most recently supplied through {@link #setText(CharacterIterator)}. */
+  CharArrayIterator text;
+
+  /** Window onto {@link #text} that {@link #rules} is run against. */
+  CharArrayIterator working = new CharArrayIterator();
+  /** Offset added to positions reported by {@link #rules} to express them relative to {@link #text}. */
+  int workingOffset = 0;
+
+  /** Scratch window onto {@link #text} used only by {@link #verifyPushBack(int, int)}. */
+  CharArrayIterator verifyText = new CharArrayIterator();
+  /** Second private clone of the rules, used only to re-check candidate syllables. */
+  RuleBasedBreakIterator verify;
+
+  /** Frozen set of all characters in the Lao script. */
+  private static final UnicodeSet laoSet;
+  static {
+    laoSet = new UnicodeSet("[:Lao:]");
+    laoSet.compact();
+    laoSet.freeze();
+  }
+
+  /**
+   * Creates a new iterator driven by the given rules.
+   * The argument is cloned twice (once for iteration, once for verification),
+   * so the caller's instance is not retained.
+   *
+   * @param rules syllable break rules
+   */
+  public LaoBreakIterator(RuleBasedBreakIterator rules) {
+    this.rules = (RuleBasedBreakIterator) rules.clone();
+    this.verify = (RuleBasedBreakIterator) rules.clone();
+  }
+
+  /**
+   * Returns the current boundary of the underlying rules shifted by the working offset,
+   * or {@link BreakIterator#DONE} if the underlying rules report DONE.
+   */
+  @Override
+  public int current() {
+    int current = rules.current();
+    return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
+  }
+
+  /**
+   * Resets the working window to cover the whole text, rebinds the rules to it,
+   * clears the working offset and returns the first boundary.
+   */
+  @Override
+  public int first() {
+    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
+    rules.setText(working);
+    workingOffset = 0;
+    int first = rules.first();
+    return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
+  }
+
+  /**
+   * Not supported.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public int following(int offset) {
+    throw new UnsupportedOperationException();
+  }
+
+  /** Returns the iterator passed to the last {@link #setText(CharacterIterator)} call. */
+  @Override
+  public CharacterIterator getText() {
+    return text;
+  }
+
+  /**
+   * Not supported.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public int last() {
+    throw new UnsupportedOperationException();
+  }
+
+  /**
+   * Advances to the next syllable boundary.
+   * <p>
+   * After the rules produce a boundary, one more boundary is requested as a lookahead.
+   * If that lookahead span has rule status 0, the character read from the working window
+   * is Lao, and {@link #verifyPushBack(int, int)} accepts the move, the boundary is pulled
+   * back by one character and the working window is re-anchored there. Otherwise the
+   * lookahead is undone and the boundary found by the rules is returned unchanged.
+   *
+   * @return the next boundary relative to the text, or {@link BreakIterator#DONE}
+   */
+  @Override
+  public int next() {
+    int current = current();
+    int next = rules.next();
+    if (next == BreakIterator.DONE) {
+      return next;
+    }
+    next += workingOffset;
+
+    // NOTE(review): presumably the first character after the boundary just found,
+    // assuming rules.next() leaves the shared working iterator positioned there -- confirm.
+    char c = working.current();
+    int following = rules.next(); // lookahead
+    if (following != BreakIterator.DONE) {
+      if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
+        // move the last character of the previous syllable onto the current one
+        workingOffset = next - 1;
+        working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
+        return next - 1;
+      }
+      rules.previous(); // undo the lookahead
+    }
+
+    return next;
+  }
+
+  /**
+   * Advances {@code n} boundaries forward by calling {@link #next()} repeatedly,
+   * stopping early once {@link BreakIterator#DONE} is reached.
+   *
+   * @param n number of boundaries to advance; 0 returns {@link #current()}
+   * @return the boundary reached, or {@link BreakIterator#DONE} if the end was hit first
+   * @throws UnsupportedOperationException if {@code n} is negative
+   */
+  @Override
+  public int next(int n) {
+    if (n < 0) {
+      throw new UnsupportedOperationException("Backwards traversal is unsupported");
+    }
+
+    int result = current();
+    for (int i = 0; i < n; i++) {
+      result = next();
+      if (result == BreakIterator.DONE) {
+        break; // nothing further to find; additional calls would only return DONE again
+      }
+    }
+    return result;
+  }
+
+  /**
+   * Not supported.
+   *
+   * @throws UnsupportedOperationException always
+   */
+  @Override
+  public int previous() {
+    throw new UnsupportedOperationException("Backwards traversal is unsupported");
+  }
+
+  /**
+   * Sets the text to iterate over. Only {@code CharArrayIterator} is accepted.
+   * <p>
+   * Note that the backing char array of the iterator is modified in place:
+   * misordered combining marks are reordered before iteration starts.
+   *
+   * @param text the text to analyze
+   * @throws UnsupportedOperationException if {@code text} is not a {@code CharArrayIterator}
+   */
+  @Override
+  public void setText(CharacterIterator text) {
+    if (!(text instanceof CharArrayIterator)) {
+      throw new UnsupportedOperationException("unsupported CharacterIterator");
+    }
+    this.text = (CharArrayIterator) text;
+    ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
+    working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
+    rules.setText(working);
+    workingOffset = 0;
+  }
+
+  /**
+   * Sets the text to iterate over from a String.
+   * The characters are copied into a fresh array, wrapped in a {@code CharArrayIterator}
+   * and handed to {@link #setText(CharacterIterator)}.
+   *
+   * @param newText the text to analyze
+   */
+  @Override
+  public void setText(String newText) {
+    CharArrayIterator ci = new CharArrayIterator();
+    ci.setText(newText.toCharArray(), 0, newText.length());
+    setText(ci);
+  }
+
+  /**
+   * Checks whether the last character of the syllable spanning {@code [current, next)}
+   * may be moved to the start of the following syllable.
+   * <p>
+   * Two checks are made with the verification rules, both of which treat rule status 0
+   * as a failure: the shortened syllable must be matched in full as a single span, and
+   * the text starting at the moved character must yield a boundary.
+   *
+   * @param current start of the previous syllable, relative to the text
+   * @param next end of the previous syllable, relative to the text
+   * @return true if both the shortened and the extended syllable pass verification
+   */
+  private boolean verifyPushBack(int current, int next) {
+    int shortenedSyllable = next - current - 1;
+
+    // the previous syllable, minus its last character, must still be one complete match
+    verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
+    verify.setText(verifyText);
+    if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0) {
+      return false;
+    }
+
+    // the remainder of the text, starting at the moved character, must now match a rule
+    verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
+    verify.setText(verifyText);
+
+    return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
+  }
+
+  // TODO: only bubblesort around runs of combining marks, instead of the entire text.
+  /**
+   * Reorders combining marks in place with repeated bubble-sort passes: whenever a char
+   * with a nonzero combining class follows one with a higher combining class, the two
+   * are swapped. Passes repeat until one completes without a swap.
+   *
+   * @param text backing array, modified in place
+   * @param start index of the first char to consider
+   * @param length number of chars to consider
+   */
+  private void ccReorder(char[] text, int start, int length) {
+    boolean reordered;
+    do {
+      int prevCC = 0;
+      reordered = false;
+      for (int i = start; i < start + length; i++) {
+        final char c = text[i];
+        final int cc = UCharacter.getCombiningClass(c);
+        if (cc > 0 && cc < prevCC) {
+          // swap; prevCC is left alone because the higher-class char now sits at i
+          text[i] = text[i - 1];
+          text[i - 1] = c;
+          reordered = true;
+        } else {
+          prevCC = cc;
+        }
+      }
+
+    } while (reordered);
+  }
+
+  /**
+   * Clone method. Creates another LaoBreakIterator with the same behavior
+   * and current state as this one.
+   * @return The clone.
+   */
+  @Override
+  public Object clone() {
+    LaoBreakIterator other = (LaoBreakIterator) super.clone();
+    other.rules = (RuleBasedBreakIterator) rules.clone();
+    other.verify = (RuleBasedBreakIterator) verify.clone();
+    if (text != null) {
+      other.text = (CharArrayIterator) text.clone();
+    }
+    if (working != null) {
+      other.working = (CharArrayIterator) working.clone();
+    }
+    if (verifyText != null) {
+      other.verifyText = (CharArrayIterator) verifyText.clone();
+    }
+    return other;
+  }
+}