lucene-java-3.4.0/lucene/contrib/analyzers/smartcn/src/java/org/apache/lucene/analysis/cn/smart/SentenceTokenizer.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.analysis.cn.smart;
  19
  20 import java.io.IOException;
  21 import java.io.Reader;
  22
  23 import org.apache.lucene.analysis.Tokenizer;
  24 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  25 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  26 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  27 import org.apache.lucene.util.AttributeSource;
  28
  29 /**
  30  * Tokenizes input text into sentences.
  31  * <p>
  32  * The output tokens can then be broken into words with {@link WordTokenFilter}
  33  * </p>
  34  * @lucene.experimental
  35  */
  36 public final class SentenceTokenizer extends Tokenizer {
  37
  38   /**
  39    * End of sentence punctuation: 。，！？；,!?;
  40    */
  41   private final static String PUNCTION = "。，！？；,!?;";
  42
  43   private final StringBuilder buffer = new StringBuilder();
  44
  45   private int tokenStart = 0, tokenEnd = 0;
  46
  47   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
  48   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
  49   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
  50
  51   public SentenceTokenizer(Reader reader) {
  52     super(reader);
  53   }
  54
  55   public SentenceTokenizer(AttributeSource source, Reader reader) {
  56     super(source, reader);
  57   }
  58
  59   public SentenceTokenizer(AttributeFactory factory, Reader reader) {
  60     super(factory, reader);
  61   }
  62
  63   @Override
  64   public boolean incrementToken() throws IOException {
  65     clearAttributes();
  66     buffer.setLength(0);
  67     int ci;
  68     char ch, pch;
  69     boolean atBegin = true;
  70     tokenStart = tokenEnd;
  71     ci = input.read();
  72     ch = (char) ci;
  73
  74     while (true) {
  75       if (ci == -1) {
  76         break;
  77       } else if (PUNCTION.indexOf(ch) != -1) {
  78         // End of a sentence
  79         buffer.append(ch);
  80         tokenEnd++;
  81         break;
  82       } else if (atBegin && Utility.SPACES.indexOf(ch) != -1) {
  83         tokenStart++;
  84         tokenEnd++;
  85         ci = input.read();
  86         ch = (char) ci;
  87       } else {
  88         buffer.append(ch);
  89         atBegin = false;
  90         tokenEnd++;
  91         pch = ch;
  92         ci = input.read();
  93         ch = (char) ci;
  94         // Two spaces, such as CR, LF
  95         if (Utility.SPACES.indexOf(ch) != -1
  96             && Utility.SPACES.indexOf(pch) != -1) {
  97           // buffer.append(ch);
  98           tokenEnd++;
  99           break;
 100         }
 101       }
 102     }
 103     if (buffer.length() == 0)
 104       return false;
 105     else {
 106       termAtt.setEmpty().append(buffer);
 107       offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
 108       typeAtt.setType("sentence");
 109       return true;
 110     }
 111   }
 112
 113   @Override
 114   public void reset() throws IOException {
 115     super.reset();
 116     tokenStart = tokenEnd = 0;
 117   }
 118
 119   @Override
 120   public void reset(Reader input) throws IOException {
 121     super.reset(input);
 122     reset();
 123   }
 124
 125   @Override
 126   public void end() throws IOException {
 127     // set final offset
 128     final int finalOffset = correctOffset(tokenEnd);
 129     offsetAtt.setOffset(finalOffset, finalOffset);
 130   }
 131 }