lucene-java-3.4.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordSegmenter.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.analysis.cn.smart;
  19
  20 import java.util.Collections;
  21 import java.util.List;
  22
  23 import org.apache.lucene.analysis.cn.smart.hhmm.HHMMSegmenter;
  24 import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
  25 import org.apache.lucene.analysis.cn.smart.hhmm.SegTokenFilter;
  26
  27 /**
  28  * Segment a sentence of Chinese text into words.
  29  * @lucene.experimental
  30  */
  31 class WordSegmenter {
  32
  33   private HHMMSegmenter hhmmSegmenter = new HHMMSegmenter();
  34
  35   private SegTokenFilter tokenFilter = new SegTokenFilter();
  36
  37   /**
  38    * Segment a sentence into words with {@link HHMMSegmenter}
  39    *
  40    * @param sentence input sentence
  41    * @param startOffset start offset of sentence
  42    * @return {@link List} of {@link SegToken}
  43    */
  44   public List<SegToken> segmentSentence(String sentence, int startOffset) {
  45
  46     List<SegToken> segTokenList = hhmmSegmenter.process(sentence);
  47     // tokens from sentence, excluding WordType.SENTENCE_BEGIN and WordType.SENTENCE_END
  48     List<SegToken> result = Collections.emptyList();
  49
  50     if (segTokenList.size() > 2) // if its not an empty sentence
  51       result = segTokenList.subList(1, segTokenList.size() - 1);
  52
  53     for (SegToken st : result)
  54       convertSegToken(st, sentence, startOffset);
  55
  56     return result;
  57   }
  58
  59   /**
  60    * Process a {@link SegToken} so that it is ready for indexing.
  61    *
  62    * This method calculates offsets and normalizes the token with {@link SegTokenFilter}.
  63    *
  64    * @param st input {@link SegToken}
  65    * @param sentence associated Sentence
  66    * @param sentenceStartOffset offset into sentence
  67    * @return Lucene {@link SegToken}
  68    */
  69   public SegToken convertSegToken(SegToken st, String sentence,
  70       int sentenceStartOffset) {
  71
  72     switch (st.wordType) {
  73       case WordType.STRING:
  74       case WordType.NUMBER:
  75       case WordType.FULLWIDTH_NUMBER:
  76       case WordType.FULLWIDTH_STRING:
  77         st.charArray = sentence.substring(st.startOffset, st.endOffset)
  78             .toCharArray();
  79         break;
  80       default:
  81         break;
  82     }
  83
  84     st = tokenFilter.filter(st);
  85     st.startOffset += sentenceStartOffset;
  86     st.endOffset += sentenceStartOffset;
  87     return st;
  88   }
  89 }