+++ /dev/null
-#\r
-# Licensed to the Apache Software Foundation (ASF) under one or more\r
-# contributor license agreements. See the NOTICE file distributed with\r
-# this work for additional information regarding copyright ownership.\r
-# The ASF licenses this file to You under the Apache License, Version 2.0\r
-# (the "License"); you may not use this file except in compliance with\r
-# the License. You may obtain a copy of the License at\r
-#\r
-# http://www.apache.org/licenses/LICENSE-2.0\r
-#\r
-# Unless required by applicable law or agreed to in writing, software\r
-# distributed under the License is distributed on an "AS IS" BASIS,\r
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.\r
-# See the License for the specific language governing permissions and\r
-# limitations under the License.\r
-#\r
-# \r
-# Parses Khmer text, with the orthographic syllable as the token; syllables joined by Word_Joiner characters stay in one token.\r
-#\r
-# The definition of a Khmer orthographic syllable is taken from the Unicode Standard:\r
-# syllable = B (R | C)? (S R?)* (Z? V)? O? S?, using the letter codes defined below.\r
-# B = base character (consonant, independent vowel, etc)\r
-$KhmerBase = [\u1780-\u17B3];\r
-# R = robat (U+17CC)\r
-$KhmerRobat = [\u17CC];\r
-# C = consonant shifter (U+17C9, U+17CA)\r
-$KhmerShifter = [\u17C9\u17CA];\r
-# S = subscript: U+17D2 followed by a base character (B)\r
-$KhmerSub = ([\u17D2] $KhmerBase);\r
-# V = dependent vowel sign (U+17B4-U+17C5)\r
-$KhmerVowel = [\u17B4-\u17C5];\r
-# Z = zero-width non-joiner (U+200C) or zero-width joiner (U+200D)\r
-$KhmerZWC = [\u200C\u200D];\r
-# O = any other sign\r
-$KhmerSign = [\u17C6-\u17C8\u17CB\u17CD-\u17D1\u17DC\u17DD]; \r
-\r
-$WordJoin = [:Line_Break=Word_Joiner:];\r
-\r
-$KhmerSyllableEx = $KhmerBase ($KhmerRobat | $KhmerShifter)? ($KhmerSub ($KhmerRobat)?)* (($KhmerZWC)? $KhmerVowel)? ($KhmerSign)? ($KhmerSub)?;\r
-\r
-$KhmerJoinedSyllableEx = $KhmerSyllableEx ($WordJoin $KhmerSyllableEx)*;\r
-\r
-#\r
-# default numeric definitions, expressed as Unicode Word_Break property classes\r
-# Each *Ex variant is the base class followed by any number of Extend or Format characters.\r
-$Extend = [\p{Word_Break = Extend}];\r
-$Format = [\p{Word_Break = Format}];\r
-$MidNumLet = [\p{Word_Break = MidNumLet}];\r
-$MidNum = [\p{Word_Break = MidNum}];\r
-$Numeric = [\p{Word_Break = Numeric}];\r
-$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; \r
-$MidNumLetEx = $MidNumLet ($Extend | $Format)*;\r
-$MidNumEx = $MidNum ($Extend | $Format)*;\r
-$NumericEx = $Numeric ($Extend | $Format)*;\r
-$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*;\r
-\r
-!!forward;\r
-$KhmerJoinedSyllableEx {200};\r
-\r
-# default numeric rule (status 100): Numeric characters, each optionally followed by ExtendNumLet, with at most one MidNum/MidNumLet between consecutive Numerics\r
-$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};\r