PyLucene 3.4.0-1 import
[pylucene.git] / test / test_ICUTransformFilter.py
1 # -*- coding: utf-8 -*-
2 # ====================================================================
3 #   Licensed under the Apache License, Version 2.0 (the "License");
4 #   you may not use this file except in compliance with the License.
5 #   You may obtain a copy of the License at
6 #
7 #       http://www.apache.org/licenses/LICENSE-2.0
8 #
9 #   Unless required by applicable law or agreed to in writing, software
10 #   distributed under the License is distributed on an "AS IS" BASIS,
11 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 #   See the License for the specific language governing permissions and
13 #   limitations under the License.
14 # ====================================================================
15 #
16 #  Port of java/org/apache/lucene/analysis/icu/ICUTransformFilter.java
17 #  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
18
19 try:
20     from icu import Transliterator, UTransDirection
21 except ImportError, e:
22     pass
23
24 from unittest import main
25 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
26
27 from lucene import *
28
29
class TestICUTransformFilter(BaseTokenStreamTestCase):
    """Exercise ICUTransformFilter with stock and rule-based transliterators."""

    def _checkToken(self, transform, input, expected):
        """Assert that ``input`` filtered through ``transform`` is ``expected``.

        ``input`` is wrapped in a StringReader, tokenized by a
        KeywordTokenizer and passed through an ICUTransformFilter built
        around ``transform``.  The resulting stream must contain exactly
        one token, equal to ``expected``.
        """

        # NOTE(review): imported here rather than at module level, presumably
        # so that this module can still be imported when PyICU is missing.
        from lucene.ICUTransformFilter import ICUTransformFilter
        ts = ICUTransformFilter(KeywordTokenizer(StringReader(input)),
                                transform)
        self._assertTokenStreamContents(ts, [expected])

    def _getTransliterator(self, name):
        """Return the forward-direction ICU Transliterator for ``name``."""

        return Transliterator.createInstance(name, UTransDirection.FORWARD)

    def testBasicFunctionality(self):
        """Check a handful of built-in ICU transforms, one token each."""

        self._checkToken(self._getTransliterator("Traditional-Simplified"),
                         u"簡化字", u"简化字")
        self._checkToken(self._getTransliterator("Katakana-Hiragana"),
                         u"ヒラガナ", u"ひらがな")
        # The expected value is the *halfwidth* katakana form of the input
        # (U+FF71 U+FF99 U+FF71 U+FF89 U+FF98 U+FF73).  It is spelled with
        # escapes so that it cannot be silently folded back to the
        # fullwidth form by Unicode normalization of this source file.
        self._checkToken(self._getTransliterator("Fullwidth-Halfwidth"),
                         u"アルアノリウ",
                         u"\uff71\uff99\uff71\uff89\uff98\uff73")
        self._checkToken(self._getTransliterator("Any-Latin"),
                         u"Αλφαβητικός Κατάλογος", u"Alphabētikós Katálogos")
        self._checkToken(self._getTransliterator("NFD; [:Nonspacing Mark:] Remove"),
                         u"Alphabētikós Katálogos", u"Alphabetikos Katalogos")
        self._checkToken(self._getTransliterator("Han-Latin"),
                         u"中国", u"zhōng guó")

    def testCustomFunctionality(self):
        """Check a transliterator compiled from simple replacement rules."""

        # convert a's to b's and b's to c's
        rules = "a > b; b > c;"
        self._checkToken(
            Transliterator.createFromRules("test", rules,
                                           UTransDirection.FORWARD),
            "abacadaba", "bcbcbdbcb")

    def testCustomFunctionality2(self):
        """Check a transliterator whose first rule has a leading context."""

        # convert an 'a' that directly follows a 'c' to 'b';
        # convert every other 'a' to 'd'
        rules = "c { a > b; a > d;"
        self._checkToken(
            Transliterator.createFromRules("test", rules,
                                           UTransDirection.FORWARD),
            "caa", "cbd")

    def testOptimizer2(self):
        """Check a compound transform ID ending in a lowercasing step."""

        self._checkToken(self._getTransliterator("Traditional-Simplified; Lower"),
                         "ABCDE", "abcde")
74
75
if __name__ == "__main__":
    # Script entry point: run the tests only when PyICU can be imported.
    import sys
    import lucene

    try:
        import icu
    except ImportError:
        # Without PyICU there is nothing to run; exit quietly.
        pass
    else:
        lucene.initVM()
        if '-loop' not in sys.argv:
            main()
        else:
            # '-loop' re-runs the suite forever.  The flag is stripped so
            # that unittest's own argument parsing never sees it.
            sys.argv.remove('-loop')
            while True:
                try:
                    main()
                except:
                    # Deliberately bare: swallow whatever main() raises,
                    # including its exit request, so the loop keeps going.
                    pass