1 # -*- coding: utf-8 -*-
2 # ====================================================================
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 # ====================================================================
16 # Port of java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
17 # using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
20 from icu import Normalizer2, UNormalizationMode2
21 except ImportError, e:
24 from unittest import main
25 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
class TestICUFoldingFilter(BaseTokenStreamTestCase):
    """Token-level checks for ICUFoldingFilter applied to whitespace tokens."""

    def testDefaults(self):
        """Feed sample strings through the filter and compare emitted tokens.

        Each call to ``_assertAnalyzesTo`` pairs one input string with the
        list of tokens expected after whitespace tokenization followed by
        ICUFoldingFilter.
        """
        # Imported inside the test rather than at module level, as in the
        # original; presumably so the module still loads when PyICU is not
        # installed -- TODO confirm.
        from lucene.ICUFoldingFilter import ICUFoldingFilter

        class _analyzer(PythonAnalyzer):
            # The receiver is named `_self` so the enclosing test's `self`
            # is not shadowed inside this nested class.
            def tokenStream(_self, fieldName, reader):
                return ICUFoldingFilter(
                    WhitespaceTokenizer(Version.LUCENE_CURRENT, reader))

        # Every assertion below passes `a`; bind it to an analyzer instance
        # so the name is defined before first use.
        a = _analyzer()

        # case folding
        self._assertAnalyzesTo(a, "This is a test",
                               [ "this", "is", "a", "test" ])

        # case folding
        self._assertAnalyzesTo(a, u"Ruß", [ "russ" ])

        # case folding with accent removal
        self._assertAnalyzesTo(a, u"ΜΆΪΟΣ", [ u"μαιοσ" ])
        self._assertAnalyzesTo(a, u"Μάϊος", [ u"μαιοσ" ])

        # supplementary case folding
        self._assertAnalyzesTo(a, u"𐐖", [ u"𐐾" ])

        # normalization
        self._assertAnalyzesTo(a, u"ﴳﴺﰧ", [ u"طمطمطم" ])

        # removal of default ignorables
        self._assertAnalyzesTo(a, u"क्ष", [ u"कष" ])

        # removal of latin accents (composed)
        self._assertAnalyzesTo(a, u"résumé", [ "resume" ])

        # removal of latin accents (decomposed)
        self._assertAnalyzesTo(a, u"re\u0301sume\u0301", [ u"resume" ])

        # fold native digits
        self._assertAnalyzesTo(a, u"৭০৬", [ "706" ])

        # ascii-folding-filter type stuff
        self._assertAnalyzesTo(a, u"đis is cræzy", [ "dis", "is", "craezy" ])
75 if __name__ == "__main__":
83 if '-loop' in sys.argv:
84 sys.argv.remove('-loop')