X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/icu/src/data/uax29/Lao.rbbi diff --git a/lucene-java-3.4.0/lucene/contrib/icu/src/data/uax29/Lao.rbbi b/lucene-java-3.4.0/lucene/contrib/icu/src/data/uax29/Lao.rbbi deleted file mode 100644 index 27dcaca..0000000 --- a/lucene-java-3.4.0/lucene/contrib/icu/src/data/uax29/Lao.rbbi +++ /dev/null @@ -1,192 +0,0 @@ -# -# Licensed to the Apache Software Foundation (ASF) under one or more -# contributor license agreements. See the NOTICE file distributed with -# this work for additional information regarding copyright ownership. -# The ASF licenses this file to You under the Apache License, Version 2.0 -# (the "License"); you may not use this file except in compliance with -# the License. You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. -# -# Parses Lao text, with syllable as token. -# -# The definition of Lao syllable is based from: -# -# Syllabification of Lao Script for Line Breaking -# Phonpasit Phissamay, Valaxay Dalolay, Chitaphone Chanhsililath, Oulaiphone Silimasak, -# Sarmad Hussain, Nadir Durrani, Science Technology and Environment Agency, CRULP -# http://www.panl10n.net/english/final%20reports/pdf%20files/Laos/LAO06.pdf -# http://www.panl10n.net/Presentations/Cambodia/Phonpassit/LineBreakingAlgo.pdf -# -# NOTE: -# There are some ambiguities in Lao syllabification without additional processing, as mentioned in the paper. -# For this reason, this RBBI grammar really only works with LaoBreakIterator, as it does this additional work. -# -# Syllable structure, where X is the nuclear consonant: -# -# +----+ -# | X5 | -# +----+ -# | X4 | -# +----+----+----+----+----+----+----+-----+ -# | X0 | X1 | X | X6 | X7 | X8 | X9 | X10 | -# +----+----+----+----+----+----+----+-----+ -# | X2 | -# +----+ -# | X3 | -# +----+ -# -# X0 represents a vowel which occurs before the nuclear consonant. -# It can always define the beginning of syllable. -$X0 = [\u0EC0-\u0EC4]; -# X1 is a combination consonant which comes before the nuclear consonant, -# but only if nuclear consonant is one of {ງ ຍ ລ ວ ຼ ມ ນ ຣ} -$X1 = [\u0EAB]; -# X represents the nuclear consonant. -$X = [\u0E81-\u0EAE\u0EDC\u0EDD]; -# X2 is a combination consonant which comes after the nuclear consonant, -# which is placed under or next to the nuclear consonant. -$X2 = [\u0EBC\u0EA3\u0EA7\u0EA5]; -# X3 represents a vowel which occurs under the nuclear consonant. -$X3 = [\u0EB8\u0EB9]; -# X4 represents a vowel which occurs above the nuclear consonant. -$X4 = [\u0EB4-\u0EB7\u0ECD\u0EBB\u0EB1]; -# X5 represents a tone mark which occurs above the nuclear consonant or upper vowel. -$X5 = [\u0EC8-\u0ECB]; -# X6 represents a consonant vowel, which occurs after the nuclear consonant. -# It functions when the syllable doesn’t have any vowels. And it always exists with X8. -$X6 = [\u0EA7\u0EAD\u0EBD]; -# X7 represents a final vowel. -# However X7_1 always represents the end of syllable and it never exists with tone mark. -$X7 = [\u0EB0\u0EB2\u0EB3]; -# X8 represents an alternate consonant. -$X8 = [\u0E81\u0E87\u0E8D\u0E94\u0E99\u0EA1\u0E9A\u0EA7]; -# X9 represents alternate consonants to pronounce foreign terms, it always exist with X10_3. -$X9 = [\u0E88\u0EAA\u0E8A\u0E9E\u0E9F\u0EA5]; -# X10 represents a sign mark. -# It always occurs at the end of a syllable, but mostly people keep it separate from syllable. -$X10 = [\u0EAF\u0EC6\u0ECC]; - -# Section 1 -$X0_1 = [\u0EC0]; -$X4_1_2 = [\u0EB4\u0EB5]; -$X4_3_4 = [\u0EB6\u0EB7]; -$X4_6 = [\u0EBB]; -$X4_7 = [\u0EB1]; -$X6_2 = [\u0EAD]; -$X6_3 = [\u0EBD]; -$X7_1 = [\u0EB0]; -$X7_2 = [\u0EB2]; -$X10_1 = [\u0EAF]; -$X10_2 = [\u0EC6]; -$X10_3 = [\u0ECC]; - -$Rule1_1 = $X0_1 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; -$Rule1_2 = $X0_1 ($X1)? $X ($X2)? $X4_1_2 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; -$Rule1_3 = $X0_1 ($X1)? $X ($X2)? $X4_3_4 ($X5)? $X6_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; -$Rule1_4 = $X0_1 ($X1)? $X ($X2)? ($X7_2)? $X7_1; -$Rule1_5 = $X0_1 ($X1)? $X ($X2)? $X4_6 ($X5)? $X7_2 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; -$Rule1_6 = $X0_1 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; -$Rule1_7 = $X0_1 ($X1)? $X ($X2)? ($X4_7)? ($X5)? $X6_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; - -$Rule1 = ($Rule1_1 | $Rule1_2 | $Rule1_3 | $Rule1_4 | $Rule1_5 | $Rule1_6 | $Rule1_7); - -# Section 2 -$X0_2 = [\u0EC1]; - -$Rule2_1 = $X0_2 ($X1)? $X ($X2)? ($X5)? ($X6)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; -$Rule2_2 = $X0_2 ($X1)? $X ($X2)? $X7_1; -$Rule2_3 = $X0_2 ($X1)? $X ($X2)? $X4_7 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; - -$Rule2 = ($Rule2_1 | $Rule2_2 | $Rule2_3); - -# Section 3 -$X0_3 = [\u0EC2]; -$X8_3 = [\u0E8D]; -$X8_8 = [\u0EA7]; - -$Rule3_1 = $X0_3 ($X1)? $X ($X2)? ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; -$Rule3_2 = $X0_3 ($X1)? $X ($X2)? $X7_1; -$Rule3_3 = $X0_3 ($X1)? $X ($X2)? $X4_7 ($X5)? ($X8_3 | $X8_8); - -$Rule3 = ($Rule3_1 | $Rule3_2 | $Rule3_3); - -# Section 4 -$X0_4 = [\u0EC4]; -$X6_1 = [\u0EA7]; - -$Rule4 = $X0_4 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; - -# Section 5 -$X0_5 = [\u0EC3]; - -$Rule5 = $X0_5 ($X1)? $X ($X2)? ($X5)? ($X6_1)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; - -# Section 6 -$Rule6 = ($X1)? $X ($X2)? $X3 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; - -# Section 7 -$X4_1_4 = [\u0EB4-\u0EB7]; - -$Rule7 = ($X1)? $X ($X2)? $X4_1_4 ($X5)? ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; - -# Section 8 -$X4_5 = [\u0ECD]; - -$Rule8 = ($X1)? $X ($X2)? $X4_5 ($X5)? ($X7_2)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; - -# Section 9 - -$Rule9_1 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; -$Rule9_2 = ($X1)? $X ($X2)? $X4_6 ($X5)? $X6_1 $X7_1; - -$Rule9 = ($Rule9_1 | $Rule9_2); - -# Section 10 -$Rule10 = ($X1)? $X ($X2)? $X4_7 ($X5)? ($X6_1)? $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; - -# Section 11 -$Rule11 = ($X1)? $X ($X2)? ($X5)? $X6 $X8 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; - -# Section 12 -$Rule12 = ($X1)? $X ($X2)? ($X5)? $X7_1; - -# Section 13 -$Rule13 = ($X1)? $X ($X2)? ($X5)? $X7_2 ($X8)? ($X9 $X10_3)? ($X10_2)? ($X10_1)?; - -# Section 14 -$X7_3 = [\u0EB3]; - -$Rule14 = ($X1)? $X ($X2)? ($X5)? $X7_3 ($X9 $X10_3)? ($X10_2)? ($X10_1)?; - -$LaoSyllableEx = ($Rule1 | $Rule2 | $Rule3 | $Rule4 | $Rule5 | $Rule6 | $Rule7 | $Rule8 | $Rule9 | $Rule10 | $Rule11 | $Rule12 | $Rule13 | $Rule14); - -$WordJoin = [:Line_Break=Word_Joiner:]; - -$LaoJoinedSyllableEx = $LaoSyllableEx ($WordJoin $LaoSyllableEx)*; - -# -# default numerical definitions -# -$Extend = [\p{Word_Break = Extend}]; -$Format = [\p{Word_Break = Format}]; -$MidNumLet = [\p{Word_Break = MidNumLet}]; -$MidNum = [\p{Word_Break = MidNum}]; -$Numeric = [\p{Word_Break = Numeric}]; -$ExtendNumLet = [\p{Word_Break = ExtendNumLet}]; -$MidNumLetEx = $MidNumLet ($Extend | $Format)*; -$MidNumEx = $MidNum ($Extend | $Format)*; -$NumericEx = $Numeric ($Extend | $Format)*; -$ExtendNumLetEx = $ExtendNumLet ($Extend | $Format)*; - -!!forward; - -$LaoJoinedSyllableEx {200}; -# default numeric rules -$NumericEx $ExtendNumLetEx? (($MidNumEx | $MidNumLetEx)? $NumericEx $ExtendNumLetEx?)* {100};