2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.apache.lucene.analysis.cn.smart;
20 import java.util.Collections;
21 import java.util.List;
23 import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
24 import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
25 import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
28 * Segment a sentence of Chinese text into words.
29 * @lucene.experimental
33 private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();
35 private SegTokenFilter tokenFilter = new SegTokenFilter();
38 * Segment a sentence into words with {@link HHMMSegmenter}
40 * @param sentence input sentence
41 * @param startOffset start offset of sentence
42 * @return {@link List} of {@link SegToken}
44 public List<SegToken> segmentSentence(String sentence, int startOffset) {
46 List<SegToken> segTokenList = hhmmSegmenter.process(sentence);
47 // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
48 List<SegToken> result = Collections.emptyList();
50 if (segTokenList.size() > 2) // if its not an empty sentence
51 result = segTokenList.subList(1, segTokenList.size() - 1);
53 for (SegToken st : result)
54 convertSegToken(st, sentence, startOffset);
60 * Process a {@link SegToken} so that it is ready for indexing.
62 * This method calculates offsets and normalizes the token with {@link SegTokenFilter}.
64 * @param st input {@link SegToken}
65 * @param sentence associated Sentence
66 * @param sentenceStartOffset start offset of the sentence; added to the token's start and end offsets
67 * @return Lucene {@link SegToken}
69 public SegToken convertSegToken(SegToken st, String sentence,
70 int sentenceStartOffset) {
72 switch (st.wordType) {
75 case WordType.FULLWIDTH_NUMBER:
76 case WordType.FULLWIDTH_STRING:
77 st.charArray = sentence.substring(st.startOffset, st.endOffset)
84 st = tokenFilter.filter(st);
85 st.startOffset += sentenceStartOffset;
86 st.endOffset += sentenceStartOffset;