1 package org.apache.lucene.analysis.el;
4 * Copyright 2005 The Apache Software Foundation
6 * Licensed under the Apache License, Version 2.0 (the "License");
7 * you may not use this file except in compliance with the License.
8 * You may obtain a copy of the License at
10 * http://www.apache.org/licenses/LICENSE-2.0
12 * Unless required by applicable law or agreed to in writing, software
13 * distributed under the License is distributed on an "AS IS" BASIS,
14 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 * See the License for the specific language governing permissions and
16 * limitations under the License.
19 import java.io.IOException;
21 import org.apache.lucene.analysis.TokenFilter;
22 import org.apache.lucene.analysis.TokenStream;
23 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
24 import org.apache.lucene.util.CharacterUtils;
25 import org.apache.lucene.util.Version;
28 * Normalizes token text to lower case, removes some Greek diacritics,
29 * and standardizes final sigma to sigma.
31 * <p>You must specify the required {@link Version}
32 * compatibility when creating GreekLowerCaseFilter:
34 * <li> As of 3.1, supplementary characters are properly lowercased.
37 public final class GreekLowerCaseFilter extends TokenFilter {
38 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
39 private final CharacterUtils charUtils;
41 /** @deprecated Use {@link #GreekLowerCaseFilter(Version, TokenStream)} instead. */
43 public GreekLowerCaseFilter(TokenStream in) {
44 this(Version.LUCENE_30, in);
48 * Create a GreekLowerCaseFilter that normalizes Greek token text.
50 * @param matchVersion Lucene compatibility version,
51 * See <a href="#version">above</a>
52 * @param in TokenStream to filter
54 public GreekLowerCaseFilter(Version matchVersion, TokenStream in) {
56 this.charUtils = CharacterUtils.getInstance(matchVersion);
60 public boolean incrementToken() throws IOException {
61 if (input.incrementToken()) {
62 char[] chArray = termAtt.buffer();
63 int chLen = termAtt.length();
64 for (int i = 0; i < chLen;) {
65 i += Character.toChars(
66 lowerCase(charUtils.codePointAt(chArray, i)), chArray, i);
74 private int lowerCase(int codepoint) {
76 /* There are two lowercase forms of sigma:
77 * U+03C2: small final sigma (end of word)
78 * U+03C3: small sigma (otherwise)
80 * Standardize both to U+03C3
82 case '\u03C2': /* small final sigma */
83 return '\u03C3'; /* small sigma */
85 /* Some greek characters contain diacritics.
86 * This filter removes these, converting to the lowercase base form.
89 case '\u0386': /* capital alpha with tonos */
90 case '\u03AC': /* small alpha with tonos */
91 return '\u03B1'; /* small alpha */
93 case '\u0388': /* capital epsilon with tonos */
94 case '\u03AD': /* small epsilon with tonos */
95 return '\u03B5'; /* small epsilon */
97 case '\u0389': /* capital eta with tonos */
98 case '\u03AE': /* small eta with tonos */
99 return '\u03B7'; /* small eta */
101 case '\u038A': /* capital iota with tonos */
102 case '\u03AA': /* capital iota with dialytika */
103 case '\u03AF': /* small iota with tonos */
104 case '\u03CA': /* small iota with dialytika */
105 case '\u0390': /* small iota with dialytika and tonos */
106 return '\u03B9'; /* small iota */
108 case '\u038E': /* capital upsilon with tonos */
109 case '\u03AB': /* capital upsilon with dialytika */
110 case '\u03CD': /* small upsilon with tonos */
111 case '\u03CB': /* small upsilon with dialytika */
112 case '\u03B0': /* small upsilon with dialytika and tonos */
113 return '\u03C5'; /* small upsilon */
115 case '\u038C': /* capital omicron with tonos */
116 case '\u03CC': /* small omicron with tonos */
117 return '\u03BF'; /* small omicron */
119 case '\u038F': /* capital omega with tonos */
120 case '\u03CE': /* small omega with tonos */
121 return '\u03C9'; /* small omega */
123 /* The previous implementation did the conversion below.
124 * Only implemented for backwards compatibility with old indexes.
127 case '\u03A2': /* reserved */
128 return '\u03C2'; /* small final sigma */
131 return Character.toLowerCase(codepoint);