python/ICUFoldingFilter.py

   1 # -*- coding: utf-8 -*-
   2 # ====================================================================
   3 #   Licensed under the Apache License, Version 2.0 (the "License");
   4 #   you may not use this file except in compliance with the License.
   5 #   You may obtain a copy of the License at
   6 #
   7 #       http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 #   Unless required by applicable law or agreed to in writing, software
  10 #   distributed under the License is distributed on an "AS IS" BASIS,
  11 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 #   See the License for the specific language governing permissions and
  13 #   limitations under the License.
  14 # ====================================================================
  15 #
  16 #  Port of java/org/apache/lucene/analysis/icu/ICUFoldingFilter.java
  17 #  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
  18 #
  19 #  A TokenFilter that applies search term folding to Unicode text,
  20 #  applying foldings from UTR#30 Character Foldings.
  21 #
  22 #  This filter applies the following foldings from the report to unicode text:
  23 #
  24 #  Accent removal
  25 #  Case folding
  26 #  Canonical duplicates folding
  27 #  Dashes folding
  28 #  Diacritic removal (including stroke, hook, descender)
  29 #  Greek letterforms folding
  30 #  Han Radical folding
  31 #  Hebrew Alternates folding
  32 #  Jamo folding
  33 #  Letterforms folding
  34 #  Math symbol folding
  35 #  Multigraph Expansions: All
  36 #  Native digit folding
  37 #  No-break folding
  38 #  Overline folding
  39 #  Positional forms folding
  40 #  Small forms folding
  41 #  Space folding
  42 #  Spacing Accents folding
  43 #  Subscript folding
  44 #  Superscript folding
  45 #  Suzhou Numeral folding
  46 #  Symbol folding
  47 #  Underline folding
  48 #  Vertical forms folding
  49 #  Width folding
  50 #
  51 #  Additionally, Default Ignorables are removed, and text is normalized to NFKC.
  52 #  All foldings, case folding, and normalization mappings are applied
  53 #  recursively to ensure a fully folded and normalized result.
  54 #
  55 # ====================================================================
  56
  57 import os, lucene
  58
  59 from lucene.ICUNormalizer2Filter import ICUNormalizer2Filter
  60 from icu import ResourceBundle, Normalizer2, UNormalizationMode2
  61
  62 utr30 = os.path.join(lucene.__dir__, 'resources',
  63                      'org', 'apache', 'lucene', 'analysis', 'icu',
  64                      'utr30.dat')
  65 ResourceBundle.setAppData("utr30", utr30)
  66
  67
  68 class ICUFoldingFilter(ICUNormalizer2Filter):
  69
  70     def __init__(self, input):
  71
  72         normalizer = Normalizer2.getInstance("utr30", "utr30",
  73                                              UNormalizationMode2.COMPOSE)
  74         super(ICUFoldingFilter, self).__init__(input, normalizer)