1 # -*- coding: utf-8 -*-
2 # ====================================================================
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 # ====================================================================
16 # Port of java/org/apache/lucene/analysis/icu/ICUNormalizer2Filter.java
17 # using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
# Normalize token text with ICU's Normalizer2 (com.ibm.icu.text.Normalizer2 in ICU4J).
21 # With this filter, you can normalize text in the following ways:
22 # - NFKC Normalization, Case Folding, and removing Ignorables (the default)
23 # - Using a standard Normalization mode (NFC, NFD, NFKC, NFKD)
24 # - Based on rules from a custom normalization mapping.
26 # If you use the defaults, this filter is a simple way to standardize
27 # Unicode text in a language-independent way for search:
28 # - The case folding that it does can be seen as a replacement for
29 # LowerCaseFilter: For example, it handles cases such as the Greek
30 # sigma, so that "Μάϊος" and "ΜΆΪΟΣ" will match correctly.
# - The normalization standardizes different forms of the same
32 # character in Unicode. For example, CJK full-width numbers will be
33 # standardized to their ASCII forms.
34 # - Ignorables such as Zero-Width Joiner and Variation Selectors are
35 # removed. These are typically modifier characters that affect display.
37 # ====================================================================
39 from lucene import PythonTokenFilter, CharTermAttribute
40 from icu import Normalizer2, UNormalizationMode2, UNormalizationCheckResult
class ICUNormalizer2Filter(PythonTokenFilter):
    """Token filter that rewrites each term with an ICU ``Normalizer2``.

    When no normalizer is supplied, the ICU "nfkc_cf" instance is used.
    """

    def __init__(self, input, normalizer=None):
        """Wrap *input* and prepare the normalizer.

        :param input: the upstream token stream whose terms are normalized.
        :param normalizer: a ``Normalizer2`` instance to apply; ``None``
            selects ``Normalizer2.getInstance(None, "nfkc_cf", COMPOSE)``.
        """
        super(ICUNormalizer2Filter, self).__init__(input)

        # incrementToken() reads self.input, so keep an explicit reference
        # to the wrapped stream on the Python side.
        self.input = input
        self.termAtt = self.addAttribute(CharTermAttribute.class_)

        if normalizer is None:
            normalizer = Normalizer2.getInstance(
                None, "nfkc_cf", UNormalizationMode2.COMPOSE)
        self.normalizer = normalizer

    def incrementToken(self):
        """Advance the wrapped stream by one token and normalize its term.

        :return: ``True`` if a token was produced, ``False`` once the
            wrapped stream is exhausted.
        """
        if not self.input.incrementToken():
            return False

        text = self.termAtt.toString()

        # Only rewrite the term when the quick check cannot confirm that
        # it is already in normalized form.
        if self.normalizer.quickCheck(text) != UNormalizationCheckResult.YES:
            self.termAtt.setEmpty()
            self.termAtt.append(self.normalizer.normalize(text))

        return True