pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / icu / src / data / utr30 / DiacriticFolding.txt
diff --git a/lucene-java-3.5.0/lucene/contrib/icu/src/data/utr30/DiacriticFolding.txt b/lucene-java-3.5.0/lucene/contrib/icu/src/data/utr30/DiacriticFolding.txt
new file mode 100644 (file)
index 0000000..5e5f2de
--- /dev/null
@@ -0,0 +1,563 @@
+# Copyright 2001-2010 Unicode, Inc.
+# 
+# Disclaimer
+# 
+# This source code is provided as is by Unicode, Inc. No claims are
+# made as to fitness for any particular purpose. No warranties of any
+# kind are expressed or implied. The recipient agrees to determine
+# applicability of information provided. If this file has been
+# purchased on magnetic or optical media from Unicode, Inc., the
+# sole remedy for any claim will be exchange of defective media
+# within 90 days of receipt.
+# 
+# Limitations on Rights to Redistribute This Code
+# 
+# Unicode, Inc. hereby grants the right to freely use the information
+# supplied in this file in the creation of products supporting the
+# Unicode Standard, and to make copies of this file in any form
+# for internal or external distribution as long as this notice
+# remains attached.
+
+### Custom Normalization mappings for UTR#30 
+### (http://www.unicode.org/reports/tr30/tr30-4.html)
+###
+### Created from Unicode 5.2 UCD
+###
+
+# Removes diacritics, as defined by [:Diacritic:]
+# These may or may not be combining marks
+005E>
+0060>
+00B7>
+02B9..02D7>
+02DE>
+02DF>
+02E5..033F>
+0342>
+0346..034E>
+0350..0357>
+035D..0362>
+0375>
+0483..0487>
+0559>
+0591..05A1>
+05A3..05BD>
+05BF>
+05C1>
+05C2>
+05C4>
+064B..0652>
+0657>
+0658>
+06DF>
+06E0>
+06E5>
+06E6>
+06EA..06EC>
+0730..074A>
+07A6..07B0>
+07EB..07F5>
+0818>
+0819>
+093C>
+094D>
+0951..0954>
+0971>
+09BC>
+09CD>
+0A3C>
+0A4D>
+0ABC>
+0ACD>
+0B3C>
+0B4D>
+0BCD>
+0C4D>
+0CBC>
+0CCD>
+0D4D>
+0DCA>
+0E47..0E4C>
+0E4E>
+0EC8..0ECC>
+0F18>
+0F19>
+0F35>
+0F37>
+0F39>
+0F3E>
+0F3F>
+0F82..0F84>
+0F86>
+0F87>
+0FC6>
+1037>
+1039>
+103A>
+1087..108D>
+108F>
+109A>
+109B>
+17C9..17D3>
+17DD>
+1939..193B>
+1A75..1A7C>
+1A7F>
+1B34>
+1B44>
+1B6B..1B73>
+1BAA>
+1C36>
+1C37>
+1C78..1C7D>
+1CD0..1CE8>
+1CED>
+1D2F>
+1D3B>
+1D4E>
+1DC4..1DCF>
+1DFD..1DFF>
+2CEF..2CF1>
+2E2F>
+302A..302F>
+3099>
+309A>
+30FC>
+A66F>
+A67C>
+A67D>
+A67F>
+A6F0>
+A6F1>
+A717..A721>
+A788>
+A8C4>
+A8E0..A8F1>
+A92B..A92E>
+A953>
+A9B3>
+A9C0>
+AA7B>
+AABF..AAC2>
+ABEC>
+ABED>
+FB1E>
+FE20..FE26>
+110B9>
+110BA>
+1D167..1D169>
+1D16D..1D172>
+1D17B..1D182>
+1D185..1D18B>
+1D1AA..1D1AD>
+
+# Latin script "composed" characters that do not decompose further, so decompose them here
+# These are from ASCIIFoldingFilter
+00E6>0061 0065
+00F0>0064
+00F8>006F
+00FE>0074 0068
+0111>0064
+0127>0068
+0131>0069
+0138>0071
+0142>006C
+014B>006E
+0153>006F 0065
+0167>0074
+0180>0062
+0183>0062
+0185>0062
+0188>0063
+018C>0064
+018D>0064
+0192>0066
+0195>0068 0076
+0199>006B
+019A>006C
+#019B>
+019E>006E
+#01A3>
+01A5>0070
+#01A8>
+#01AA>
+01AB>0074
+01AD>0074
+01B4>0079
+01B6>007A
+#01B9>
+#01BA>
+01BB>0032
+01BD>0035
+#01BE>
+01BF>0077
+01C0>007C
+01C1>007C 007C
+#01C2>
+01C3>0021
+01DD>0065
+01E5>0047
+021D>007A
+0221>0064
+0223>006F 0075
+0225>007A
+0234>006C
+0235>006E
+0236>0074
+0237>006A
+0238>0064 0062
+0239>0071 0070
+023C>0063
+023F>0073
+0240>007A
+#0242>
+0247>0065
+0249>006A
+024B>0071
+024D>0072
+024F>0079
+0250>0061
+0251>0061
+0252>0061
+0253>0062
+0254>006F
+0255>0063
+0256>0064
+0257>0064
+0258>0065
+0259>0061
+025A>0061
+025B>0065
+025C>0065
+025D>0065
+025E>0065
+025F>006A
+0260>0067
+0261>0067
+0262>0047
+#0263>
+#0264>
+0265>0068
+0266>0068
+#0267>
+0268>0069
+0269>0069
+026A>0049
+026B>006C
+026C>006C
+026D>006C
+#026E>
+026F>006D
+0270>006D
+0271>006D
+0272>006E
+0273>006E
+0274>004E
+0275>006F
+0276>004F 0045
+#0277>
+#0278>
+#0279>
+#027A>
+#027B>
+027C>0072
+027D>0072
+027E>0072
+027F>0072
+0280>0052
+0281>0052
+0282>0073
+#0283>
+0284>006A
+#0285>
+#0286>
+0287>0074
+0288>0074
+0289>0075
+#028A>
+028B>0076
+028C>0076
+028D>0077
+028E>0079
+028F>0059
+0290>007A
+0291>007A
+#0292>
+#0293>
+#0294>
+#0295>
+#0296>
+0297>0043
+0298>006F
+0299>0042
+029A>0065
+029B>0047
+029C>0048
+029D>006A
+029E>006B
+029F>004C
+02A0>0071
+#02A1>
+#02A2>
+02A3>0064 007A
+#02A4>
+02A5>0064 007A
+02A6>0074 0073
+#02A7>
+02A8>0074 0063
+02A9>0066 006E
+02AA>006C 0073
+02AB>006C 007A
+02AC>0077 0077
+#02AD>
+02AE>0068
+02AF>0068
+1D00>0041
+1D01>0041 0045
+1D02>0061 0065
+1D03>0042
+1D04>0043
+1D05>0044
+1D06>0044
+1D07>0045
+1D08>0065
+1D09>0069
+1D0A>004A
+1D0B>004B
+1D0C>004C
+1D0D>004D
+1D0E>004E
+1D0F>004F
+1D10>004F
+1D11>006F
+#1D12>
+1D13>006F
+1D14>006F 0065
+1D15>004F 0055
+1D16>006F
+1D17>006F
+1D18>0050
+1D19>0052
+1D1A>0052
+1D1B>0054
+1D1C>0055
+1D1D>0075
+1D1E>0075
+1D1F>006D
+1D20>0056
+1D21>0057
+1D22>005A
+#1D23>
+#1D24>
+#1D25>
+1D6B>0075 0065
+1D6C>0062
+1D6D>0064
+1D6E>0066
+1D6F>006D
+1D70>006E
+1D71>0070
+1D72>0072
+1D73>0072
+1D74>0073
+1D75>0074
+1D76>007A
+1D77>0067
+1D79>0067
+1D7A>0074 0068
+1D7B>0049
+1D7C>0069
+1D7D>0070
+1D7E>0055
+#1D7F>
+1D80>0062
+1D81>0064
+1D82>0066
+1D83>0067
+1D84>006B
+1D85>006C
+1D86>006D
+1D87>006E
+1D88>0070
+1D89>0072
+1D8A>0073
+#1D8B>
+1D8C>0076
+1D8D>0078
+1D8E>007A
+1D8F>0061
+1D90>0061
+1D91>0064
+1D92>0065
+1D93>0065
+1D94>0065
+1D95>0061
+1D96>0069
+1D97>006F
+#1D98>
+1D99>0075
+#1D9A>
+1E9C>0073
+1E9D>0073
+1E9F>0064
+1EFB>006C 006C
+1EFD>0076
+1EFF>0079
+214E>0066
+#2180>
+#2181>
+#2182>
+2184>0063
+#2185>
+#2186>
+#2187>
+#2188>
+2C61>006C
+2C65>0061
+2C66>0074
+2C68>0068
+2C6A>006B
+2C6C>007A
+2C71>0076
+2C73>0077
+2C74>0076
+2C76>0068
+#2C77>
+2C78>0065
+#2C79>
+2C7A>006F
+2C7B>0045
+#A723>
+#A725>
+#A727>
+A729>0074 007A
+#A72B>
+#A72D>
+#A72F>
+A730>0046
+A731>0053
+A733>0061 0061
+A735>0061 006F
+A737>0061 0075
+A739>0061 0076
+A73B>0061 0076
+A73D>0061 0079
+A73F>0063
+A741>006B
+A743>006B
+A745>006B
+A747>006C
+A749>006C
+A74B>006F
+A74D>006F
+A74F>006F 006F
+A751>0070
+A753>0070
+A755>0070
+A757>0071
+A759>0071
+A75B>0072
+#A75D>
+A75F>0076
+A761>0076 0079
+A763>007A
+A765>0074 0068
+A767>0074 0068
+A769>0076 
+#A76B>
+#A76D>
+#A76F>
+#A771>
+#A772>
+#A773>
+#A774>
+#A775>
+#A776>
+#A777>
+#A778>
+A77A>0064
+A77C>0066
+A77F>0067
+A781>006C
+A783>0072
+A785>0053
+A787>0074
+A78C>0027
+A7FB>0046
+A7FC>0070
+A7FD>004D
+A7FE>0049
+A7FF>004D
+
+# Cyrillic script "composed" characters that do not decompose further, so decompose them here
+# These are from UTR#30 DiacriticFolding.txt
+
+047D>0461
+048B>0439
+048F>0440
+0491>0433
+0493>0433
+0495>0433
+0497>0436
+0499>0437
+049B>043A
+049D>043A
+049F>043A
+04A3>043D
+04A7>043F
+04AB>0441
+04AD>0442
+04B1>04AF
+04B3>0425
+04B7>04BC
+04B9>0447
+04BF>04BC
+04C4>043A
+04C6>043B
+04C8>043D
+04CA>043D
+04CC>04BC
+04CE>043C
+
+# Additional signs and diacritics, from examination of [:Mark:]&[:Lm:]
+0358..035C>
+05A2>
+05C5>
+05C7>
+0610..061A>
+0640>
+06D6..06DE>
+06E1..06E4>
+06E7..06E9>
+06ED>
+0653..0656>
+0659..065F>
+0670>
+0711>
+07FA>
+0816..0817>
+081B..0823>
+0825..0827>
+0829>
+082A..082D>
+0900>0901
+1714>
+1734>
+1DC0..1DC3>
+1DD0..1DE6>
+20D0..20F0>
+2DE0..2DFF>
+A670..A672>
+A802>
+10A3F>
+11046>
+1D165..1D166>
+1D242..1D244>
+
+# Additional Arabic/Hebrew decompositions
+05F3>0027
+05F4>0022
+0629>0647
+0649>064A
+06A9>0643
+06CC>064A