python/ICUNormalizer2Filter.py

   1 # -*- coding: utf-8 -*-
   2 # ====================================================================
   3 #   Licensed under the Apache License, Version 2.0 (the "License");
   4 #   you may not use this file except in compliance with the License.
   5 #   You may obtain a copy of the License at
   6 #
   7 #       http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 #   Unless required by applicable law or agreed to in writing, software
  10 #   distributed under the License is distributed on an "AS IS" BASIS,
  11 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 #   See the License for the specific language governing permissions and
  13 #   limitations under the License.
  14 # ====================================================================
  15 #
  16 #  Port of java/org/apache/lucene/analysis/icu/ICUNormalizer2Filter.java
  17 #  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
  18 #
  19 #  Normalize token text with ICU's {@link com.ibm.icu.text.Normalizer2}
  20 #
  21 #  With this filter, you can normalize text in the following ways:
  22 #   - NFKC Normalization, Case Folding, and removing Ignorables (the default)
  23 #   - Using a standard Normalization mode (NFC, NFD, NFKC, NFKD)
  24 #   - Based on rules from a custom normalization mapping.
  25 #
  26 #  If you use the defaults, this filter is a simple way to standardize
  27 #  Unicode text in a language-independent way for search:
  28 #   - The case folding that it does can be seen as a replacement for
  29 #     LowerCaseFilter: For example, it handles cases such as the Greek
  30 #     sigma, so that "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
  31 #   - The normalization standardizes different forms of the same
  32 #     character in Unicode. For example, CJK full-width numbers will be
  33 #     standardized to their ASCII forms.
  34 #   - Ignorables such as Zero-Width Joiner and Variation Selectors are
  35 #     removed. These are typically modifier characters that affect display.
  36 #
  37 # ====================================================================
  38
  39 from lucene import PythonTokenFilter, CharTermAttribute
  40 from icu import Normalizer2, UNormalizationMode2, UNormalizationCheckResult
  41
  42
  43 class ICUNormalizer2Filter(PythonTokenFilter):
  44
  45     def __init__(self, input, normalizer=None):
  46         super(ICUNormalizer2Filter, self).__init__(input)
  47
  48         self.input = input
  49         self.termAtt = self.addAttribute(CharTermAttribute.class_);
  50
  51         if normalizer is None:
  52             normalizer = Normalizer2.getInstance(None, "nfkc_cf", UNormalizationMode2.COMPOSE)
  53         self.normalizer = normalizer
  54
  55     def incrementToken(self):
  56
  57         if self.input.incrementToken():
  58             text = self.termAtt.toString()
  59
  60             if self.normalizer.quickCheck(text) != UNormalizationCheckResult.YES:
  61                 self.termAtt.setEmpty()
  62                 self.termAtt.append(self.normalizer.normalize(text))
  63
  64             return True
  65
  66         return False