Configured build for Ubuntu and added Stempel polish analyzer
[pylucene.git] / test / test_ICUNormalizer2Filter.py
1 # -*- coding: utf-8 -*-
2 # ====================================================================
3 #   Licensed under the Apache License, Version 2.0 (the "License");
4 #   you may not use this file except in compliance with the License.
5 #   You may obtain a copy of the License at
6 #
7 #       http://www.apache.org/licenses/LICENSE-2.0
8 #
9 #   Unless required by applicable law or agreed to in writing, software
10 #   distributed under the License is distributed on an "AS IS" BASIS,
11 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 #   See the License for the specific language governing permissions and
13 #   limitations under the License.
14 # ====================================================================
15 #
16 #  Port of java/org/apache/lucene/analysis/icu/ICUNormalizer2Filter.java
17 #  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
18
# PyICU is optional: pull in the icu names when the package is installed.
# When it is missing, the ImportError is swallowed here and the __main__
# guard at the bottom of the file skips running the tests.
try:
    from icu import Normalizer2, UNormalizationMode2
except ImportError:
    # NOTE: the original used the Python-2-only form `except ImportError, e:`
    # which is a SyntaxError on Python 3 and bound an unused name.
    pass
23
24 from unittest import main
25 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
26
27 from lucene import *
28
29
class TestICUNormalizer2Filter(BaseTokenStreamTestCase):
    """Exercises ICUNormalizer2Filter: the default normalizer (case folding,
    normalization, removal of default ignorables) and an explicitly supplied
    alternate Normalizer2 instance (nfc in DECOMPOSE mode, i.e. nfd).
    """

    def testDefaults(self):

        from lucene.ICUNormalizer2Filter import ICUNormalizer2Filter

        class analyzer(PythonAnalyzer):
            def tokenStream(_self, fieldName, reader):
                source = WhitespaceTokenizer(Version.LUCENE_CURRENT, reader)
                return ICUNormalizer2Filter(source)

        a = analyzer()

        cases = [
            # case folding
            ("This is a test", [ "this", "is", "a", "test" ]),
            ("Ruß", [ "russ" ]),
            (u"ΜΆΪΟΣ", [ u"μάϊοσ" ]),
            (u"Μάϊος", [ u"μάϊοσ" ]),
            # supplementary case folding
            (u"𐐖", [ u"𐐾" ]),
            # normalization
            (u"ﴳﴺﰧ", [ u"طمطمطم" ]),
            # removal of default ignorables
            (u"क्‍ष", [ u"क्ष" ]),
        ]
        for text, expected in cases:
            self._assertAnalyzesTo(a, text, expected)

    def testAlternate(self):

        from lucene.ICUNormalizer2Filter import ICUNormalizer2Filter

        class analyzer(PythonAnalyzer):
            # requesting nfc with the DECOMPOSE mode yields nfd output
            def tokenStream(_self, fieldName, reader):
                source = WhitespaceTokenizer(Version.LUCENE_CURRENT, reader)
                nfd = Normalizer2.getInstance(None, "nfc",
                                              UNormalizationMode2.DECOMPOSE)
                return ICUNormalizer2Filter(source, nfd)

        a = analyzer()
        # EAcute decomposes into E followed by the combining acute accent
        self._assertAnalyzesTo(a, u"\u00E9", [ u"\u0065\u0301" ])
75
76
if __name__ == "__main__":
    import sys, lucene
    try:
        import icu
    except ImportError:
        # PyICU is not installed: silently skip this whole test module.
        pass
    else:
        lucene.initVM()
        if '-loop' in sys.argv:
            # Soak mode: re-run the suite forever (useful for hunting leaks
            # and intermittent JCC/ICU failures).
            sys.argv.remove('-loop')
            while True:
                try:
                    main()
                except SystemExit:
                    # unittest.main() always exits via SystemExit; swallow it
                    # so the loop re-runs the suite.
                    pass
                except Exception:
                    # Best-effort: keep looping on test errors.  Unlike the
                    # original bare 'except:', this lets KeyboardInterrupt
                    # actually break out of the loop.
                    pass
        else:
            main()