2 # Licensed to the Apache Software Foundation (ASF) under one or more
\r
3 # contributor license agreements. See the NOTICE file distributed with
\r
4 # this work for additional information regarding copyright ownership.
\r
5 # The ASF licenses this file to You under the Apache License, Version 2.0
\r
6 # (the "License"); you may not use this file except in compliance with
\r
7 # the License. You may obtain a copy of the License at
\r
9 # http://www.apache.org/licenses/LICENSE-2.0
\r
11 # Unless required by applicable law or agreed to in writing, software
\r
12 # distributed under the License is distributed on an "AS IS" BASIS,
\r
13 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
\r
14 # See the License for the specific language governing permissions and
\r
15 # limitations under the License.
\r
18 # Parses Khmer text, with orthographic syllable as token.
\r
20 # The definition of Khmer orthographic syllable is taken from the Unicode Standard.
\r
# Character classes from which a Khmer orthographic syllable is assembled.
# Each class is tagged with a one-letter abbreviation (B, R, C, S, V, Z, O);
# the same letters are used in the pattern sketch above $KhmerSyllableEx.
22 # B = base character (consonant, independent vowel, etc)
\r
23 $KhmerBase = [\u1780-\u17B3];
\r
# R = robat (U+17CC KHMER SIGN ROBAT)
25 $KhmerRobat = [\u17CC];
\r
26 # C = consonant shifter
\r
27 $KhmerShifter = [\u17C9\u17CA];
\r
28 # S = subscript consonant or independent vowel sign
\r
# (spelled as U+17D2 KHMER SIGN COENG followed by a base character)
29 $KhmerSub = ([\u17D2] $KhmerBase);
\r
30 # V = dependent vowel sign
\r
31 $KhmerVowel = [\u17B4-\u17C5];
\r
32 # Z = zero-width joiner or non-joiner
\r
33 $KhmerZWC = [\u200C\u200D];
\r
34 # O = any other sign
\r
35 $KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD];
\r
# Characters whose Line_Break property is Word_Joiner; used below to glue
# adjacent syllables into a single match.
37 $WordJoin = [:Line_Break=Word_Joiner:];
\r
# One orthographic syllable, in the letter notation above:
#   B (R | C)? (S R?)* (Z? V)? O? S?
# i.e. a base, an optional robat or shifter, any number of subscripts (each
# optionally followed by a robat), an optional vowel (optionally preceded by
# ZWJ/ZWNJ), an optional sign, and an optional trailing subscript.
39 $KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;
\r
# A syllable followed by zero or more (word-joiner, syllable) pairs, so that
# syllables connected by a word-joiner character are not split apart.
41 $KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;
\r
44 # default numerical definitions
\r
# Character classes selected by Unicode Word_Break property value.
# NOTE(review): "default" presumably means these mirror ICU's stock word-break
# rules -- confirm against the ICU default word.txt rules.
46 $Extend = [\p{Word_Break = Extend}];
\r
47 $Format = [\p{Word_Break = Format}];
\r
48 $MidNumLet = [\p{Word_Break = MidNumLet}];
\r
49 $MidNum = [\p{Word_Break = MidNum}];
\r
50 $Numeric = [\p{Word_Break = Numeric}];
\r
51 $ExtendNumLet = [\p{Word_Break = ExtendNumLet}];
\r
# "...Ex" variants: the same class followed by any number of Extend or Format
# characters, so that such trailing characters stay attached to the character
# they follow instead of interrupting a match.
52 $MidNumLetEx = $MidNumLet ($Extend | $Format)*;
\r
53 $MidNumEx = $MidNum ($Extend | $Format)*;
\r
54 $NumericEx = $Numeric ($Extend | $Format)*;
\r
55 $ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;
\r
# Break rules. The trailing {nnn} on each rule is the rule status value that
# the break iterator reports for text matched by that rule.
#
# Khmer rule: a run of word-joiner-connected syllables is matched as one unit.
# 200 is the value of ICU's UBRK_WORD_LETTER status; how the consumer maps it
# to a token type is not visible here -- TODO confirm in the tokenizer that
# loads these rules.
58 $KhmerJoinedSyllableEx {200};
\r
60 # default numeric rules
\r
# A number: a Numeric (optionally followed by an ExtendNumLet), then any
# number of further Numerics, each optionally preceded by a single MidNum or
# MidNumLet separator and optionally followed by an ExtendNumLet.
# 100 is the value of ICU's UBRK_WORD_NUMBER status.
61 $NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};
\r