lucene-java-3.4.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/WordTokenFilter.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.analysis.cn.smart;
  19
  20 import java.io.IOException;
  21 import java.util.Iterator;
  22 import java.util.List;
  23
  24 import org.apache.lucene.analysis.TokenFilter;
  25 import org.apache.lucene.analysis.TokenStream;
  26 import org.apache.lucene.analysis.cn.smart.hhmm.SegToken;
  27 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  28 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  29 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  30
  31 /**
  32  * A {@link TokenFilter} that breaks sentences into words.
  33  * @lucene.experimental
  34  */
  35 public final class WordTokenFilter extends TokenFilter {
  36
  37   private WordSegmenter wordSegmenter;
  38
  39   private Iterator<SegToken> tokenIter;
  40
  41   private List<SegToken> tokenBuffer;
  42
  43   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  44   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  45   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  46
  47   /**
  48    * Construct a new WordTokenizer.
  49    *
  50    * @param in {@link TokenStream} of sentences
  51    */
  52   public WordTokenFilter(TokenStream in) {
  53     super(in);
  54     this.wordSegmenter = new WordSegmenter();
  55   }
  56
  57   @Override
  58   public boolean incrementToken() throws IOException {
  59     if (tokenIter == null || !tokenIter.hasNext()) {
  60       // there are no remaining tokens from the current sentence... are there more sentences?
  61       if (input.incrementToken()) {
  62         // a new sentence is available: process it.
  63         tokenBuffer = wordSegmenter.segmentSentence(termAtt.toString(), offsetAtt.startOffset());
  64         tokenIter = tokenBuffer.iterator();
  65         /*
  66          * it should not be possible to have a sentence with 0 words, check just in case.
  67          * returning EOS isn't the best either, but its the behavior of the original code.
  68          */
  69         if (!tokenIter.hasNext())
  70           return false;
  71       } else {
  72         return false; // no more sentences, end of stream!
  73       }
  74     }
  75     // WordTokenFilter must clear attributes, as it is creating new tokens.
  76     clearAttributes();
  77     // There are remaining tokens from the current sentence, return the next one.
  78     SegToken nextWord = tokenIter.next();
  79     termAtt.copyBuffer(nextWord.charArray, 0, nextWord.charArray.length);
  80     offsetAtt.setOffset(nextWord.startOffset, nextWord.endOffset);
  81     typeAtt.setType("word");
  82     return true;
  83   }
  84
  85   @Override
  86   public void reset() throws IOException {
  87     super.reset();
  88     tokenIter = null;
  89   }
  90 }