test/test_ICUFoldingFilter.py

   1 # -*- coding: utf-8 -*-
   2 # ====================================================================
   3 #   Licensed under the Apache License, Version 2.0 (the "License");
   4 #   you may not use this file except in compliance with the License.
   5 #   You may obtain a copy of the License at
   6 #
   7 #       http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 #   Unless required by applicable law or agreed to in writing, software
  10 #   distributed under the License is distributed on an "AS IS" BASIS,
  11 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 #   See the License for the specific language governing permissions and
  13 #   limitations under the License.
  14 # ====================================================================
  15 #
  16 #  Port of java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
  17 #  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
  18
  19 try:
  20     from icu import Normalizer2, UNormalizationMode2
  21 except ImportError, e:
  22     pass
  23
  24 from unittest import main
  25 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
  26
  27 from lucene import *
  28
  29
  30 class TestICUFoldingFilter(BaseTokenStreamTestCase):
  31
  32     def testDefaults(self):
  33
  34         from lucene.ICUFoldingFilter import ICUFoldingFilter
  35
  36         class _analyzer(PythonAnalyzer):
  37             def tokenStream(_self, fieldName, reader):
  38                 return ICUFoldingFilter(WhitespaceTokenizer(Version.LUCENE_CURRENT, reader))
  39
  40         a = _analyzer()
  41
  42         # case folding
  43         self._assertAnalyzesTo(a, "This is a test",
  44                                [ "this", "is", "a", "test" ])
  45
  46         # case folding
  47         self._assertAnalyzesTo(a, u"Ruß", [ "russ" ])
  48
  49         # case folding with accent removal
  50         self._assertAnalyzesTo(a, u"ΜΆΪΟΣ", [ u"μαιοσ" ])
  51         self._assertAnalyzesTo(a, u"Μάϊος", [ u"μαιοσ" ])
  52
  53         # supplementary case folding
  54         self._assertAnalyzesTo(a, u"𐐖", [ u"𐐾" ])
  55
  56         # normalization
  57         self._assertAnalyzesTo(a, u"ﴳﴺﰧ", [ u"طمطمطم" ])
  58
  59         # removal of default ignorables
  60         self._assertAnalyzesTo(a, u"क्‍ष", [ u"कष" ])
  61
  62         # removal of latin accents (composed)
  63         self._assertAnalyzesTo(a, u"résumé", [ "resume" ])
  64
  65         # removal of latin accents (decomposed)
  66         self._assertAnalyzesTo(a, u"re\u0301sume\u0301", [ u"resume" ])
  67
  68         # fold native digits
  69         self._assertAnalyzesTo(a, u"৭০৬", [ "706" ])
  70
  71         # ascii-folding-filter type stuff
  72         self._assertAnalyzesTo(a, u"đis is cræzy", [ "dis", "is", "craezy" ])
  73
  74
  75 if __name__ == "__main__":
  76     import sys, lucene
  77     try:
  78         import icu
  79     except ImportError:
  80         pass
  81     else:
  82         lucene.initVM()
  83         if '-loop' in sys.argv:
  84             sys.argv.remove('-loop')
  85             while True:
  86                 try:
  87                     main()
  88                 except:
  89                     pass
  90         else:
  91              main()