2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.apache.lucene.analysis.cn.smart.hhmm;
20 import org.apache.lucene.analysis.cn.smart.Utility;
21 import org.apache.lucene.analysis.cn.smart.WordType;
25 * Filters a {@link SegToken} by converting full-width Latin to half-width, then lowercasing Latin.
26 * Additionally, all punctuation is converted into {@link Utility#COMMON_DELIMITER}.
28 * @lucene.experimental
30 public class SegTokenFilter {
33 * Filters an input {@link SegToken}.
35 * Full-width Latin will be converted to half-width, then all Latin will be lowercased.
36 * All punctuation is converted into {@link Utility#COMMON_DELIMITER}.
39 * @param token input {@link SegToken}
40 * @return normalized {@link SegToken}
42 public SegToken filter(SegToken token) {
43 switch (token.wordType) {
44 case WordType.FULLWIDTH_NUMBER:
45 case WordType.FULLWIDTH_STRING: /* first convert full-width -> half-width */
46 for (int i = 0; i < token.charArray.length; i++) {
47 if (token.charArray[i] >= 0xFF10)
48 token.charArray[i] -= 0xFEE0;
50 if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A) /* lowercase latin */
51 token.charArray[i] += 0x0020;
55 for (int i = 0; i < token.charArray.length; i++) {
56 if (token.charArray[i] >= 0x0041 && token.charArray[i] <= 0x005A) /* lowercase latin */
57 token.charArray[i] += 0x0020;
60 case WordType.DELIMITER: /* convert all punctuation to Utility.COMMON_DELIMITER */
61 token.charArray = Utility.COMMON_DELIMITER;