1 package org.apache.lucene.analysis.icu.segmentation;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.text.CharacterIterator;
22 import com.ibm.icu.lang.UCharacter;
23 import com.ibm.icu.text.BreakIterator;
24 import com.ibm.icu.text.RuleBasedBreakIterator;
25 import com.ibm.icu.text.UnicodeSet;
28 * Syllable iterator for Lao text.
30 * This breaks Lao text into syllables according to:
31 * <i>Syllabification of Lao Script for Line Breaking</i>
32 * Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
33 * Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP.
35 * <li>http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
36 * <li>http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
39 * Most of the work is accomplished with RBBI rules; however, some additional special logic is needed
40 * that cannot be coded in a grammar, and that logic is implemented here.
42 * For example, what appears to be a final consonant might instead be part of the next syllable.
43 * Rules match in a greedy fashion, leaving an illegal sequence that matches no rules.
45 * Take for instance the text ກວ່າດອກ
46 * The first rule greedily matches ກວ່າດ, but then ອກ is encountered, which is illegal.
47 * What LaoBreakIterator does, according to the paper:
49 * <li>backtrack and remove the ດ from the last syllable, placing it on the current syllable.
50 * <li>verify the modified previous syllable (ກວ່າ ) is still legal.
51 * <li>verify the modified current syllable (ດອກ) is now legal.
52 * <li>If 2 or 3 fails, then restore the ດ to the last syllable and skip the current character.
55 * Finally, LaoBreakIterator also takes care of the second concern mentioned in the paper.
56 * This is the issue of combining marks being in the wrong order (typos).
57 * @lucene.experimental
59 public class LaoBreakIterator extends BreakIterator {
// Rule-driven iterator that performs the main segmentation; the constructor assigns it a clone of its argument.
60 RuleBasedBreakIterator rules;
// The caller-supplied text, stored by setText(CharacterIterator).
61 CharArrayIterator text;
// Iterator over the same char[] as `text` (see the setText calls on it); this is what `rules` is given to iterate.
63 CharArrayIterator working = new CharArrayIterator();
// Added to positions reported by `rules` to produce the offsets this class returns.
64 int workingOffset = 0;
// Scratch iterator that verifyPushBack fills and hands to `verify`.
66 CharArrayIterator verifyText = new CharArrayIterator();
// A second clone of the constructor's argument, used by verifyPushBack instead of `rules`.
67 RuleBasedBreakIterator verify;
// Set built from the "[:Lao:]" pattern; next() consults it before attempting a push-back.
69 private static final UnicodeSet laoSet;
// NOTE(review): the embedded numbering skips around this line, so the enclosing
// initializer is presumably not fully visible in this view — confirm against the full file.
71 laoSet = new UnicodeSet("[:Lao:]");
// Creates an iterator driven by the given rules. The argument is cloned twice,
// so `rules` and `verify` each hold their own clone rather than the caller's object.
76 public LaoBreakIterator(RuleBasedBreakIterator rules) {
77 this.rules = (RuleBasedBreakIterator) rules.clone();
78 this.verify = (RuleBasedBreakIterator) rules.clone();
// Returns the current boundary: the position reported by `rules` shifted by
// workingOffset, or DONE unchanged when `rules` reports DONE.
82 public int current() {
83 int current = rules.current();
84 return current == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + current;
// NOTE(review): the method header is not visible in this view; the body reads like first() — confirm.
// Point `working` at the full range of `text` again and hand it back to `rules`.
89 working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
90 rules.setText(working);
// Ask the rules for their first boundary; DONE passes through, anything else is shifted by workingOffset.
92 int first = rules.first();
93 return first == BreakIterator.DONE ? BreakIterator.DONE : workingOffset + first;
// Not supported by this iterator: always throws UnsupportedOperationException.
97 public int following(int offset) {
98 throw new UnsupportedOperationException();
// Accessor for the text being iterated, typed as CharacterIterator.
102 public CharacterIterator getText() {
// NOTE(review): the embedded numbering jumps from 102 to 108, so this throw presumably
// belongs to a later method whose header is not visible here — confirm against the full file.
108 throw new UnsupportedOperationException();
// NOTE(review): the method header and the return statements are not visible in this view;
// the body reads like the no-argument next() — confirm against the full file.
// Remember the boundary we are leaving (already shifted by workingOffset), then advance the rules.
113 int current = current();
114 int next = rules.next();
// The rules have no further boundary.
115 if (next == BreakIterator.DONE)
// Convert the rule-relative boundary into this class's offset space.
118 next += workingOffset;
// Character read from `working` at this point; it is tested against laoSet below.
120 char c = working.current();
121 int following = rules.next(); // lookahead
122 if (following != BreakIterator.DONE) {
123 following += workingOffset;
// Attempt a push-back only when the looked-ahead segment has rule status 0 (presumably
// "matched no rule" — confirm against the rule file), c is in laoSet, and
// verifyPushBack accepts moving one char out of the segment [current, next).
124 if (rules.getRuleStatus() == 0 && laoSet.contains(c) && verifyPushBack(current, next)) {
// Restart the working window one char before the boundary just found.
125 workingOffset = next - 1;
126 working.setText(text.getText(), text.getStart() + workingOffset, text.getLength() - workingOffset);
129 rules.previous(); // undo the lookahead
// Overload of next taking a count n.
// NOTE(review): several lines of this method are not visible in this view.
136 public int next(int n) {
// Presumably guarded by a check on n that is not visible here — confirm.
138 throw new UnsupportedOperationException("Backwards traversal is unsupported");
// Starting value of the result: the current boundary.
140 int result = current();
// Backwards traversal is not supported: always throws UnsupportedOperationException.
149 public int previous() {
150 throw new UnsupportedOperationException("Backwards traversal is unsupported");
// Installs new text to segment. Only CharArrayIterator is accepted; any other
// CharacterIterator is rejected with UnsupportedOperationException.
154 public void setText(CharacterIterator text) {
155 if (!(text instanceof CharArrayIterator))
156 throw new UnsupportedOperationException("unsupported CharacterIterator");
157 this.text = (CharArrayIterator) text;
// Reorder combining marks in the array returned by text.getText() before segmenting.
158 ccReorder(this.text.getText(), this.text.getStart(), this.text.getLength());
// Start with the working iterator covering the whole text and give it to the rules.
159 working.setText(this.text.getText(), this.text.getStart(), this.text.getLength());
160 rules.setText(working);
// String overload: copies the string into a fresh char[] and wraps it in a CharArrayIterator.
// NOTE(review): the statement that passes `ci` on is not visible in this view.
165 public void setText(String newText) {
166 CharArrayIterator ci = new CharArrayIterator();
167 ci.setText(newText.toCharArray(), 0, newText.length());
// Decides whether the last char of the segment [current, next) may be moved to the
// start of the following segment. Both offsets are relative to text.getStart().
171 private boolean verifyPushBack(int current, int next) {
// Length of the segment once its last char is removed.
172 int shortenedSyllable = next - current - 1;
// Check 1: run `verify` over only the shortened segment.
174 verifyText.setText(text.getText(), text.getStart() + current, shortenedSyllable);
175 verify.setText(verifyText);
// Fails when the first boundary is not exactly at the shortened length or the rule status is 0.
// NOTE(review): the statement executed on failure is not visible in this view.
176 if (verify.next() != shortenedSyllable || verify.getRuleStatus() == 0)
// Check 2: run `verify` over the rest of the text, starting at the pushed-back char (offset next - 1).
180 verifyText.setText(text.getText(), text.getStart() + next - 1, text.getLength() - next + 1);
181 verify.setText(verifyText);
// Accept only if a boundary is found and its rule status is non-zero.
183 return (verify.next() != BreakIterator.DONE && verify.getRuleStatus() != 0);
186 // TODO: only bubblesort around runs of combining marks, instead of the entire text.
// Reorders chars of text[start, start + length) in place, based on their combining class.
// NOTE(review): the loop scaffolding, the prevCC/reordered bookkeeping and part of the
// swap are not visible in this view; comments below cover only what is shown.
187 private void ccReorder(char[] text, int start, int length) {
192 for (int i = start; i < start + length; i++) {
193 final char c = text[i];
194 final int cc = UCharacter.getCombiningClass(c);
// Out of order: a char with a non-zero combining class that is lower than prevCC.
195 if (cc > 0 && cc < prevCC) {
// Copy the preceding char into this slot (presumably one half of a swap — confirm).
197 text[i] = text[i - 1];
// Repeat while `reordered` is true.
205 } while (reordered == true);
209 * Clone method. Creates another LaoBreakIterator with the same behavior
210 * and current state as this one.
// Copies this iterator: starts from super.clone() and then replaces the rule iterators
// and text iterators on the copy with clones of this instance's own.
// NOTE(review): some guards and the return statement are not visible in this view.
214 public Object clone() {
215 LaoBreakIterator other = (LaoBreakIterator) super.clone();
216 other.rules = (RuleBasedBreakIterator) rules.clone();
217 other.verify = (RuleBasedBreakIterator) verify.clone();
219 other.text = (CharArrayIterator) text.clone();
221 other.working = (CharArrayIterator) working.clone();
// verifyText is only cloned when it is non-null.
222 if (verifyText != null)
223 other.verifyText = (CharArrayIterator) verifyText.clone();