test/test_ICUNormalizer2Filter.py

   1 # -*- coding: utf-8 -*-
   2 # ====================================================================
   3 #   Licensed under the Apache License, Version 2.0 (the "License");
   4 #   you may not use this file except in compliance with the License.
   5 #   You may obtain a copy of the License at
   6 #
   7 #       http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 #   Unless required by applicable law or agreed to in writing, software
  10 #   distributed under the License is distributed on an "AS IS" BASIS,
  11 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 #   See the License for the specific language governing permissions and
  13 #   limitations under the License.
  14 # ====================================================================
  15 #
  16 #  Port of java/org/apache/lucene/analysis/icu/ICUNormalizer2Filter.java
  17 #  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
  18
  19 try:
  20     from icu import Normalizer2, UNormalizationMode2
  21 except ImportError, e:
  22     pass
  23
  24 from unittest import main
  25 from BaseTokenStreamTestCase import BaseTokenStreamTestCase
  26
  27 from lucene import *
  28
  29
  30 class TestICUNormalizer2Filter(BaseTokenStreamTestCase):
  31
  32     def testDefaults(self):
  33
  34         from lucene.ICUNormalizer2Filter import ICUNormalizer2Filter
  35
  36         class analyzer(PythonAnalyzer):
  37             def tokenStream(_self, fieldName, reader):
  38                 return ICUNormalizer2Filter(WhitespaceTokenizer(Version.LUCENE_CURRENT, reader))
  39
  40         a = analyzer()
  41
  42         # case folding
  43         self._assertAnalyzesTo(a, "This is a test",
  44                                [ "this", "is", "a", "test" ])
  45
  46         # case folding
  47         self._assertAnalyzesTo(a, "Ruß", [ "russ" ])
  48
  49         # case folding
  50         self._assertAnalyzesTo(a, u"ΜΆΪΟΣ", [ u"μάϊοσ" ])
  51         self._assertAnalyzesTo(a, u"Μάϊος", [ u"μάϊοσ" ])
  52
  53         # supplementary case folding
  54         self._assertAnalyzesTo(a, u"𐐖", [ u"𐐾" ])
  55
  56         # normalization
  57         self._assertAnalyzesTo(a, u"ﴳﴺﰧ", [ u"طمطمطم" ])
  58
  59         # removal of default ignorables
  60         self._assertAnalyzesTo(a, u"क्‍ष", [ u"क्ष" ])
  61
  62     def testAlternate(self):
  63
  64         from lucene.ICUNormalizer2Filter import ICUNormalizer2Filter
  65
  66         class analyzer(PythonAnalyzer):
  67             # specify nfc with decompose to get nfd
  68             def tokenStream(_self, fieldName, reader):
  69                 return ICUNormalizer2Filter(WhitespaceTokenizer(Version.LUCENE_CURRENT, reader),
  70                                             Normalizer2.getInstance(None, "nfc", UNormalizationMode2.DECOMPOSE))
  71
  72         a = analyzer()
  73         # decompose EAcute into E + combining Acute
  74         self._assertAnalyzesTo(a, u"\u00E9", [ u"\u0065\u0301" ])
  75
  76
  77 if __name__ == "__main__":
  78     import sys, lucene
  79     try:
  80         import icu
  81     except ImportError:
  82         pass
  83     else:
  84         lucene.initVM()
  85         if '-loop' in sys.argv:
  86             sys.argv.remove('-loop')
  87             while True:
  88                 try:
  89                     main()
  90                 except:
  91                     pass
  92         else:
  93              main()