1 # -*- coding: utf-8 -*-
2 # ====================================================================
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 # ====================================================================
16 # Port of java/org/apache/lucene/analysis/icu/ICUNormalizer2Filter.java
17 # using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
20 from icu import Normalizer2, UNormalizationMode2
21 except ImportError, e:
24 from unittest import main
25 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
class TestICUNormalizer2Filter(BaseTokenStreamTestCase):
    """Token-stream tests for ICUNormalizer2Filter.

    Each test wraps the filter around a WhitespaceTokenizer inside a
    throw-away analyzer and checks the emitted tokens with
    _assertAnalyzesTo().
    """

    # NOTE(review): PythonAnalyzer, WhitespaceTokenizer and Version are not
    # imported in the visible part of this file - presumably a file-level
    # lucene import provides them; confirm.

    def testDefaults(self):
        # Filter constructed with the token stream only, i.e. with whatever
        # normalizer ICUNormalizer2Filter uses when none is passed.

        from lucene.ICUNormalizer2Filter import ICUNormalizer2Filter

        class analyzer(PythonAnalyzer):
            def tokenStream(_self, fieldName, reader):
                return ICUNormalizer2Filter(WhitespaceTokenizer(Version.LUCENE_CURRENT, reader))

        # The assertions below need an analyzer instance, not the class.
        a = analyzer()

        # case folding
        self._assertAnalyzesTo(a, "This is a test",
                               [ "this", "is", "a", "test" ])

        # case folding: sharp s expands to "ss"
        self._assertAnalyzesTo(a, "Ruß", [ "russ" ])

        # case folding: both capital sigma and final sigma come out as
        # the plain small sigma
        self._assertAnalyzesTo(a, u"ΜΆΪΟΣ", [ u"μάϊοσ" ])
        self._assertAnalyzesTo(a, u"Μάϊος", [ u"μάϊοσ" ])

        # supplementary case folding
        self._assertAnalyzesTo(a, u"𐐖", [ u"𐐾" ])

        # normalization: the ligature characters are expanded into plain
        # letter sequences
        self._assertAnalyzesTo(a, u"ﴳﴺﰧ", [ u"طمطمطم" ])

        # removal of default ignorables
        # The input is KA + VIRAMA + ZERO WIDTH JOINER + SSA; the joiner is
        # invisible in most editors, so the code points are spelled out to
        # keep it from being silently lost.  The expected token is the same
        # sequence without the joiner.
        self._assertAnalyzesTo(a, u"\u0915\u094D\u200D\u0937",
                               [ u"\u0915\u094D\u0937" ])

    def testAlternate(self):
        # Filter constructed with an explicitly supplied Normalizer2.

        from lucene.ICUNormalizer2Filter import ICUNormalizer2Filter

        class analyzer(PythonAnalyzer):
            # specify nfc with decompose to get nfd
            def tokenStream(_self, fieldName, reader):
                return ICUNormalizer2Filter(WhitespaceTokenizer(Version.LUCENE_CURRENT, reader),
                                            Normalizer2.getInstance(None, "nfc", UNormalizationMode2.DECOMPOSE))

        # The assertion below needs an analyzer instance, not the class.
        a = analyzer()

        # decompose EAcute into E + combining Acute
        self._assertAnalyzesTo(a, u"\u00E9", [ u"\u0065\u0301" ])
77 if __name__ == "__main__":
85 if '-loop' in sys.argv:
86 sys.argv.remove('-loop')