python/ICUTransformFilter.py

   1 # -*- coding: utf-8 -*-
   2 # ====================================================================
   3 #   Licensed under the Apache License, Version 2.0 (the "License");
   4 #   you may not use this file except in compliance with the License.
   5 #   You may obtain a copy of the License at
   6 #
   7 #       http://www.apache.org/licenses/LICENSE-2.0
   8 #
   9 #   Unless required by applicable law or agreed to in writing, software
  10 #   distributed under the License is distributed on an "AS IS" BASIS,
  11 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12 #   See the License for the specific language governing permissions and
  13 #   limitations under the License.
  14 # ====================================================================
  15 #
  16 #  Port of java/org/apache/lucene/analysis/icu/ICUTransformFilter.java
  17 #  using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
  18 #
  19 #  A TokenFilter that transforms text with ICU.
  20 #
  21 #  ICU provides text-transformation functionality via its Transliteration API.
  22 #  Although script conversion is its most common use, a Transliterator can
  23 #  actually perform a more general class of tasks. In fact, Transliterator
  24 #  defines a very general API which specifies only that a segment of the input
  25 #  text is replaced by new text. The particulars of this conversion are
  26 #  determined entirely by subclasses of Transliterator.
  27 #
  28 #  Some useful transformations for search are built-in:
  29 #   - Conversion from Traditional to Simplified Chinese characters
  30 #   - Conversion from Hiragana to Katakana
  31 #   - Conversion from Fullwidth to Halfwidth forms.
  32 #   - Script conversions, for example Serbian Cyrillic to Latin
  33 #
#  Example usage: <blockquote>stream = ICUTransformFilter(stream,
#  Transliterator.createInstance("Traditional-Simplified"))</blockquote>
  36 #
  37 #  For more details, see the ICU User Guide at:
  38 #  http://userguide.icu-project.org/transforms/general
  39 #
  40 # ====================================================================
  41
  42 from lucene import PythonTokenFilter, CharTermAttribute
  43 from icu import Transliterator, UTransPosition
  44
  45
  46 class ICUTransformFilter(PythonTokenFilter):
  47
  48     # Create a new ICUTransformFilter that transforms text on the given
  49     # stream.
  50     #
  51     #  @param input {@link TokenStream} to filter.
  52     #  @param transform Transliterator to transform the text.
  53
  54     def __init__(self, input, transform):
  55
  56         super(ICUTransformFilter, self).__init__(input)
  57
  58         # Reusable position object
  59         self.position = UTransPosition()
  60
  61         # term attribute, will be updated with transformed text.
  62         self.termAtt = self.addAttribute(CharTermAttribute.class_)
  63
  64         self.input = input
  65         self.transform = transform
  66
  67     def incrementToken(self):
  68
  69         if self.input.incrementToken():
  70             text = self.termAtt.toString()
  71             length = len(text)
  72
  73             self.position.start = 0
  74             self.position.limit = length
  75             self.position.contextStart = 0
  76             self.position.contextLimit = length
  77
  78             text = self.transform.filteredTransliterate(text, self.position,
  79                                                         False)
  80             self.termAtt.setEmpty()
  81             self.termAtt.append(text)
  82
  83             return True
  84
  85         return False