#
# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

#
# Break rules for Khmer text: each orthographic syllable becomes one token.
#
# The shape of an orthographic syllable follows the definition given in the
# Unicode Standard. Using the one-letter labels attached to the character
# classes below, a syllable is:
#
#     B (R | C)? (S R?)* (Z? V)? O? S?
#

# ---------------------------------------------------------------------------
# Khmer character classes
# ---------------------------------------------------------------------------

# B: base character (consonant, independent vowel, etc.)
$KhmerBase    = [\u1780-\u17B3];

# R: robat
$KhmerRobat   = [\u17CC];

# C: consonant shifter
$KhmerShifter = [\u17C9\u17CA];

# S: subscript form, i.e. U+17D2 followed by a base character
#    (subscript consonant or independent vowel sign)
$KhmerSub     = ([\u17D2] $KhmerBase);

# V: dependent vowel sign
$KhmerVowel   = [\u17B4-\u17C5];

# Z: zero-width non-joiner / zero-width joiner
$KhmerZWC     = [\u200C\u200D];

# O: any other sign
$KhmerSign    = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD];

# Characters whose Line_Break property is Word_Joiner; used below to glue
# adjacent syllables into a single token.
$WordJoin     = [:Line_Break=Word_Joiner:];

# ---------------------------------------------------------------------------
# Syllable expressions
# ---------------------------------------------------------------------------

# One orthographic syllable, in order:
#   base, optional robat-or-shifter, any number of subscripts (each optionally
#   followed by a robat), optional vowel (optionally preceded by ZWJ/ZWNJ),
#   optional other sign, optional trailing subscript.
$KhmerSyllableEx = $KhmerBase
                   ($KhmerRobat | $KhmerShifter)?
                   ($KhmerSub $KhmerRobat?)*
                   ($KhmerZWC? $KhmerVowel)?
                   $KhmerSign?
                   $KhmerSub?;

# A syllable, extended across any following "word joiner + syllable" pairs so
# that explicitly joined syllables are not split apart.
$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;

# ---------------------------------------------------------------------------
# Default numeric definitions (based on the Word_Break property)
# ---------------------------------------------------------------------------

$Extend       = [\p{Word_Break = Extend}];
$Format       = [\p{Word_Break = Format}];
$MidNumLet    = [\p{Word_Break = MidNumLet}];
$MidNum       = [\p{Word_Break = MidNum}];
$Numeric      = [\p{Word_Break = Numeric}];
$ExtendNumLet = [\p{Word_Break = ExtendNumLet}];

# "Ex" variants: the class itself plus any trailing Extend/Format characters,
# so those characters stay attached to whatever precedes them.
$MidNumLetEx    = $MidNumLet    ($Extend | $Format)*;
$MidNumEx       = $MidNum       ($Extend | $Format)*;
$NumericEx      = $Numeric      ($Extend | $Format)*;
$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;

# ---------------------------------------------------------------------------
# Forward rules
# ---------------------------------------------------------------------------

!!forward;

# A (possibly joined) Khmer syllable is one token, reported with rule status 200.
# NOTE(review): 200 coincides with ICU's UBRK_WORD_LETTER; how the consumer of
# this rule set interprets the status is not visible here -- confirm.
$KhmerJoinedSyllableEx {200};

# Default numeric rule: runs of numerics, optionally separated by a single
# MidNum/MidNumLet character and optionally followed by ExtendNumLet
# characters, form one token reported with rule status 100.
# NOTE(review): 100 coincides with ICU's UBRK_WORD_NUMBER -- confirm usage.
$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};