1 # -*- coding: utf-8 -*-
2 # ====================================================================
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 # ====================================================================
16 # Port of java/org/apache/lucene/analysis/icu/ICUTransformFilter.java
17 # using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
20 from icu import Transliterator, UTransDirection
21 except ImportError, e:
24 from unittest import main
25 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
30 class TestICUTransformFilter(BaseTokenStreamTestCase):
def _checkToken(self, transform, input, expected):
    """Filter *input* with *transform* and check the resulting tokens.

    *input* is wrapped in a StringReader and a KeywordTokenizer, passed
    through ICUTransformFilter together with *transform*, and the
    resulting stream is compared against the one-element list
    ``[expected]`` via ``_assertTokenStreamContents``.
    """
    # Imported at call time, exactly as the original method did.
    from lucene.ICUTransformFilter import ICUTransformFilter

    # KeywordTokenizer and StringReader are expected to be in module
    # scope already (their import is not visible in this excerpt).
    tokenizer = KeywordTokenizer(StringReader(input))

    # NOTE(review): the constructor call was cut off after its first
    # argument; `transform` is the only parameter left unused, so it is
    # restored here as the second argument -- confirm against upstream.
    ts = ICUTransformFilter(tokenizer, transform)
    self._assertTokenStreamContents(ts, [expected])
def _getTransliterator(self, name):
    """Return ``Transliterator.createInstance`` for *name*, forward direction."""
    factory = Transliterator.createInstance
    return factory(name, UTransDirection.FORWARD)
43 def testBasicFunctionality(self):
45 self._checkToken(self._getTransliterator("Traditional-Simplified"),
47 self._checkToken(self._getTransliterator("Katakana-Hiragana"),
49 self._checkToken(self._getTransliterator("Fullwidth-Halfwidth"),
51 self._checkToken(self._getTransliterator("Any-Latin"),
52 u"Αλφαβητικός Κατάλογος", u"Alphabētikós Katálogos")
53 self._checkToken(self._getTransliterator("NFD; [:Nonspacing Mark:] Remove"),
54 u"Alphabētikós Katálogos", u"Alphabetikos Katalogos")
55 self._checkToken(self._getTransliterator("Han-Latin"),
def testCustomFunctionality(self):
    """Check a transliterator built from a custom rule string."""
    # Rule set: every "a" becomes "b" and every "b" becomes "c".
    ruleSet = "a > b; b > c;"
    translit = Transliterator.createFromRules(
        "test", ruleSet, UTransDirection.FORWARD)
    self._checkToken(translit, "abacadaba", "bcbcbdbcb")
def testCustomFunctionality2(self):
    """Check custom rules that depend on the preceding character."""
    # "c { a > b" rewrites an "a" that directly follows a "c" to "b";
    # the fallback rule "a > d" rewrites every other "a" to "d".
    # Hence "caa" is expected to come out as "cbd".
    rules = "c { a > b; a > d;"
    translit = Transliterator.createFromRules(
        "test", rules, UTransDirection.FORWARD)
    self._checkToken(translit, "caa", "cbd")
70 def testOptimizer2(self):
72 self._checkToken(self._getTransliterator("Traditional-Simplified; Lower"),
76 if __name__ == "__main__":
84 if '-loop' in sys.argv:
85 sys.argv.remove('-loop')