1 # -*- coding: utf-8 -*-
2 # ====================================================================
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 # ====================================================================
16 # Port of java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
17 # using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
19 # A TokenFilter that applies search term folding to Unicode text,
20 # applying foldings from UTR#30 Character Foldings.
22 # This filter applies the following foldings from the report to unicode text:
26 # Canonical duplicates folding
28 # Diacritic removal (including stroke, hook, descender)
29 # Greek letterforms folding
31 # Hebrew Alternates folding
35 # Multigraph Expansions: All
36 # Native digit folding
39 # Positional forms folding
42 # Spacing Accents folding
45 # Suzhou Numeral folding
48 # Vertical forms folding
51 # Additionally, Default Ignorables are removed, and text is normalized to NFKC.
52 # All foldings, case folding, and normalization mappings are applied
53 # recursively to ensure a fully folded and normalized result.
55 # ====================================================================
import os

import lucene
from lucene.ICUNormalizer2Filter import ICUNormalizer2Filter
from icu import ResourceBundle, Normalizer2, UNormalizationMode2
# Build the path to the UTR#30 data file under the lucene package's
# 'resources' directory and register it with ICU as application data for
# the package name "utr30".  This runs once, at import time.
#
# NOTE(review): the final path component was missing from this call, which
# left the parenthesis unclosed; 'utr30.dat' matches the upstream PyLucene
# source -- confirm the resource is shipped under that name.
utr30 = os.path.join(lucene.__dir__, 'resources',
                     'org', 'apache', 'lucene', 'analysis', 'icu',
                     'utr30.dat')
ResourceBundle.setAppData("utr30", utr30)
class ICUFoldingFilter(ICUNormalizer2Filter):
    """Token filter applying the UTR#30 search term foldings.

    The filter is an ICUNormalizer2Filter whose normalizer is obtained
    from ICU under the package name and data name "utr30", in COMPOSE mode.
    """

    def __init__(self, input):
        """Create the folding filter on top of *input*.

        :param input: the object to be filtered; it is passed unchanged to
            the ICUNormalizer2Filter constructor (presumably the upstream
            token stream -- confirm against ICUNormalizer2Filter).
        """
        # Package name and data name are both "utr30".
        package_name = "utr30"
        data_name = "utr30"
        compose_mode = UNormalizationMode2.COMPOSE

        folding_normalizer = Normalizer2.getInstance(
            package_name, data_name, compose_mode)

        super(ICUFoldingFilter, self).__init__(input, folding_normalizer)