1 # -*- coding: utf-8 -*-
2 # ====================================================================
3 # Licensed under the Apache License, Version 2.0 (the "License");
4 # you may not use this file except in compliance with the License.
5 # You may obtain a copy of the License at
7 # http://www.apache.org/licenses/LICENSE-2.0
9 # Unless required by applicable law or agreed to in writing, software
10 # distributed under the License is distributed on an "AS IS" BASIS,
11 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12 # See the License for the specific language governing permissions and
13 # limitations under the License.
14 # ====================================================================
16 # Port of java/org/apache/lucene/analysis/icu/ICUTransformFilter.java
17 # using IBM's C++ ICU wrapped by PyICU (http://pyicu.osafoundation.org)
19 # A TokenFilter that transforms text with ICU.
21 # ICU provides text-transformation functionality via its Transliteration API.
22 # Although script conversion is its most common use, a Transliterator can
23 # actually perform a more general class of tasks. In fact, Transliterator
24 # defines a very general API which specifies only that a segment of the input
25 # text is replaced by new text. The particulars of this conversion are
26 # determined entirely by subclasses of Transliterator.
28 # Some useful transformations for search are built-in:
29 # - Conversion from Traditional to Simplified Chinese characters
30 # - Conversion from Hiragana to Katakana
31 # - Conversion from Fullwidth to Halfwidth forms.
32 # - Script conversions, for example Serbian Cyrillic to Latin
# Example usage:
#     stream = ICUTransformFilter(
#         stream, Transliterator.createInstance("Traditional-Simplified"))
37 # For more details, see the ICU User Guide at:
38 # http://userguide.icu-project.org/transforms/general
40 # ====================================================================
42 from lucene import PythonTokenFilter, CharTermAttribute
43 from icu import Transliterator, UTransPosition
class ICUTransformFilter(PythonTokenFilter):
    """A TokenFilter that rewrites each token's term text with ICU.

    Every token pulled from the wrapped stream has its term text passed
    through the supplied ICU ``Transliterator``; the term attribute is then
    overwritten with the transformed text.
    """

    def __init__(self, input, transform):
        """Create a filter that transforms text on the given stream.

        :param input: the TokenStream to filter.
        :param transform: the Transliterator used to transform the text.
        """
        super(ICUTransformFilter, self).__init__(input)

        # Reusable position object, reset for every token.
        self.position = UTransPosition()

        # Term attribute, will be updated with the transformed text.
        self.termAtt = self.addAttribute(CharTermAttribute.class_)

        self.transform = transform

    def incrementToken(self):
        """Advance to the next token and transform its term text.

        :return: True if a token was produced (its term text now holds the
            transformed text), False once the wrapped stream is exhausted.
        """
        if not self.input.incrementToken():
            return False

        text = self.termAtt.toString()
        # NOTE(review): ICU positions are UTF-16 code-unit offsets; len()
        # matches that only when the text has no supplementary characters
        # (or on a narrow Python build) - confirm for the target runtime.
        length = len(text)

        # Both the transliteration range and its context span the whole term.
        self.position.start = 0
        self.position.limit = length
        self.position.contextStart = 0
        self.position.contextLimit = length

        # Third argument is ICU's "incremental" flag; False because the
        # complete term is already available.
        text = self.transform.filteredTransliterate(text, self.position,
                                                    False)

        self.termAtt.setEmpty()
        self.termAtt.append(text)

        return True