2 # Licensed to the Apache Software Foundation (ASF) under one or more
\r
3 # contributor license agreements. See the NOTICE file distributed with
\r
4 # this work for additional information regarding copyright ownership.
\r
5 # The ASF licenses this file to You under the Apache License, Version 2.0
\r
6 # (the "License"); you may not use this file except in compliance with
\r
7 # the License. You may obtain a copy of the License at
\r
9 # http://www.apache.org/licenses/LICENSE-2.0
\r
11 # Unless required by applicable law or agreed to in writing, software
\r
12 # distributed under the License is distributed on an "AS IS" BASIS,
\r
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
\r
14 # See the License for the specific language governing permissions and
\r
15 # limitations under the License.
\r
17 # Parses Lao text, with each syllable as a token.
\r
19 # The definition of a Lao syllable is based on:
\r
21 # Syllabification of Lao Script for Line Breaking
\r
22 # Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak,
\r
23 # Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP
\r
24 # http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf
\r
25 # http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf
\r
28 # There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper.
\r
29 # For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work.
\r
31 # Syllable structure, where X is the nuclear consonant:
\r
37 # +----+----+----+----+----+----+----+-----+
\r
38 # | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 |
\r
39 # +----+----+----+----+----+----+----+-----+
\r
45 # X0 represents a vowel which occurs before the nuclear consonant.
\r
46 # It can always define the beginning of a syllable.
# Covers U+0EC0..U+0EC4 (LAO VOWEL SIGN E through LAO VOWEL SIGN AI).
\r
47 $X0 = [\u0EC0-\u0EC4];
\r
48 # X1 is a combination consonant which comes before the nuclear consonant,
\r
49 # but only if the nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ}
# NOTE(review): no $X1 definition is visible between this comment and $X,
# although the rules below reference $X1 - confirm it is defined elsewhere.
\r
51 # X represents the nuclear consonant.
# Covers U+0E81..U+0EAE plus U+0EDC (LAO HO NO) and U+0EDD (LAO HO MO).
\r
52 $X = [\u0E81-\u0EAE\u0EDC\u0EDD];
\r
53 # X2 is a combination consonant which comes after the nuclear consonant,
\r
54 # which is placed under or next to the nuclear consonant.
# Note: U+0EA7 also appears in $X6 and $X8, and U+0EA5 also appears in $X9,
# so these classes overlap.
\r
55 $X2 = [\u0EBC\u0EA3\u0EA7\u0EA5];
\r
56 # X3 represents a vowel which occurs under the nuclear consonant.
# U+0EB8 (LAO VOWEL SIGN U) and U+0EB9 (LAO VOWEL SIGN UU).
\r
57 $X3 = [\u0EB8\u0EB9];
\r
58 # X4 represents a vowel which occurs above the nuclear consonant.
# U+0EB4..U+0EB7 plus U+0ECD, U+0EBB and U+0EB1; the narrower $X4_* classes
# used by individual rules are subsets of this set.
\r
59 $X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1];
\r
60 # X5 represents a tone mark which occurs above the nuclear consonant or upper vowel.
# Covers U+0EC8..U+0ECB (LAO TONE MAI EK through LAO TONE MAI CATAWA).
\r
61 $X5 = [\u0EC8-\u0ECB];
\r
62 # X6 represents a consonant vowel, which occurs after the nuclear consonant.
\r
63 # It functions when the syllable doesn’t have any vowels, and it always exists with X8.
\r
64 $X6 = [\u0EA7\u0EAD\u0EBD];
\r
65 # X7 represents a final vowel.
\r
66 # However, X7_1 always represents the end of a syllable, and it never exists with a tone mark.
# U+0EB0, U+0EB2, U+0EB3 (LAO VOWEL SIGN A, AA, AM).
\r
67 $X7 = [\u0EB0\u0EB2\u0EB3];
\r
68 # X8 represents an alternate consonant.
\r
69 $X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7];
\r
70 # X9 represents alternate consonants used to pronounce foreign terms; they always exist with X10_3.
\r
71 $X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5];
\r
72 # X10 represents a sign mark.
\r
73 # It always occurs at the end of a syllable, but most people keep it separate from the syllable.
# U+0EAF (LAO ELLIPSIS), U+0EC6 (LAO KO LA), U+0ECC (LAO CANCELLATION MARK).
\r
74 $X10 = [\u0EAF\u0EC6\u0ECC];
\r
# Subsets of $X4 (vowels above the nuclear consonant) used by Rule1_2 and Rule1_3:
# $X4_1_2 is U+0EB4..U+0EB5 and $X4_3_4 is U+0EB6..U+0EB7.
78 $X4_1_2 = [\u0EB4\u0EB5];
\r
79 $X4_3_4 = [\u0EB6\u0EB7];
\r
# Rule 1: syllables that begin with the leading vowel $X0_1.
# Every variant shares the prefix `$X0_1 ($X1)? $X ($X2)?` and differs only in
# what is allowed after the nuclear consonant. All variants except Rule1_4 may
# end with the optional tail `($X9 $X10_3)? ($X10_2)? ($X10_1)?`; Rule1_4 must
# end with $X7_1.
# NOTE(review): $X0_1, $X1, $X4_6, $X4_7, $X6_2, $X6_3, $X7_1, $X7_2 and
# $X10_1..$X10_3 are not defined in the visible part of the file - confirm
# they are defined before this point.
90 $Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
91 $Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
92 $Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
93 $Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1;
\r
94 $Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
95 $Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
96 $Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
# $Rule1 matches any one of the seven variants above.
98 $Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7);
\r
# Rule 2: syllables that begin with the leading vowel $X0_2.
# All variants share the prefix `$X0_2 ($X1)? $X ($X2)?`. Rule2_2 must end with
# $X7_1; the other two may end with the optional tail
# `($X9 $X10_3)? ($X10_2)? ($X10_1)?`.
# NOTE(review): $X0_2 is not defined in the visible part of the file - confirm.
103 $Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
104 $Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1;
\r
105 $Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
# $Rule2 matches any one of the three variants above.
107 $Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3);
\r
# Rule 3: syllables that begin with the leading vowel $X0_3.
# All variants share the prefix `$X0_3 ($X1)? $X ($X2)?`. Rule3_2 must end with
# $X7_1. Rule3_3 must end with one of $X8_3 / $X8_8 and, unlike Rule1_6 and
# Rule2_3, does not allow the optional `($X9 $X10_3)? ($X10_2)? ($X10_1)?` tail.
# NOTE(review): $X0_3, $X8_3 and $X8_8 are not defined in the visible part of
# the file - confirm.
114 $Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
115 $Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1;
\r
116 $Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8);
\r
# $Rule3 matches any one of the three variants above.
118 $Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3);
\r
# Rules 4 and 5: syllables that begin with the leading vowel $X0_4 or $X0_5.
# The two rules are identical apart from that leading vowel: optional $X1, the
# nuclear consonant, then optional $X2, $X5, $X6_1 and the optional sign-mark tail.
# NOTE(review): $X0_4, $X0_5 and $X6_1 are not defined in the visible part of
# the file - confirm.
124 $Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
129 $Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
# Rules 6-14: syllables with no leading ($X0) vowel. Each starts with
# `($X1)? $X ($X2)?` and is distinguished by the element required after that.
# NOTE(review): $X4_5, $X4_6, $X4_7, $X6_1 and $X7_1..$X7_3 are not defined in
# the visible part of the file - confirm.

# Rule 6: requires a below-consonant vowel ($X3).
132 $Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
# $X4_1_4 is the U+0EB4..U+0EB7 subset of $X4 (the union of $X4_1_2 and $X4_3_4).
135 $X4_1_4 = [\u0EB4-\u0EB7];
\r
# Rule 7: requires an above-consonant vowel from $X4_1_4.
137 $Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
# Rule 8: requires $X4_5, optionally followed by $X7_2.
142 $Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
# Rule 9: requires $X4_6, followed either by $X8 (with the optional tail)
# or by $X6_1 $X7_1 (which ends the syllable).
146 $Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
147 $Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1;
\r
149 $Rule9 = ($Rule9_1 | $Rule9_2);
\r
# Rule 10: requires $X4_7 and a following $X8, with an optional $X6_1 between them.
152 $Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
# Rule 11: no vowel sign; requires a consonant vowel ($X6) followed by $X8.
155 $Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
# Rule 12: ends with $X7_1; nothing may follow it.
158 $Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1;
\r
# Rule 13: requires $X7_2, optionally followed by $X8.
161 $Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
# Rule 14: requires $X7_3.
166 $Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?;
\r
# A Lao syllable is anything matched by one of the fourteen rules above.
168 $LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14);
\r
# Characters whose Line_Break property is Word_Joiner.
170 $WordJoin = [:Line_Break=Word_Joiner:];
\r
# One syllable followed by any number of (word joiner, syllable) pairs, so
# syllables connected by a word joiner are matched as a single unit.
172 $LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*;
\r
175 # default numerical definitions
# Character classes taken from the Unicode Word_Break property.
\r
177 $Extend = [\p{Word_Break = Extend}];
\r
178 $Format = [\p{Word_Break = Format}];
\r
179 $MidNumLet = [\p{Word_Break = MidNumLet}];
\r
180 $MidNum = [\p{Word_Break = MidNum}];
\r
181 $Numeric = [\p{Word_Break = Numeric}];
\r
182 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
\r
# The *Ex variants match the base class followed by any number of Extend or
# Format characters, so those trailing characters stay attached to it.
183 $MidNumLetEx = $MidNumLet ($Extend | $Format)*;
\r
184 $MidNumEx = $MidNum ($Extend | $Format)*;
\r
185 $NumericEx = $Numeric ($Extend | $Format)*;
\r
186 $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
\r
# Break rules. The number in braces is the rule status value attached to text
# matched by that rule.
# NOTE(review): 200 and 100 appear to correspond to ICU's UBRK_WORD_LETTER and
# UBRK_WORD_NUMBER status ranges - confirm against the consuming tokenizer.
# NOTE(review): no `!!forward;` or other `!!` directive is visible before these
# rules - confirm whether one exists elsewhere in the file.

# A run of Lao syllables (optionally connected by word joiners) is one token.
190 $LaoJoinedSyllableEx {200};
\r
191 # default numeric rules
# A number: digits, optionally followed by ExtendNumLet, and then any number of
# further digit groups, each optionally preceded by a single MidNum or
# MidNumLet separator.
\r
192 $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
\r