--- /dev/null
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+package org.apache.lucene.analysis.cn.smart;
+
+import java.io.IOException;
+import java.io.Reader;
+
+import org.apache.lucene.analysis.Tokenizer;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
+import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
+import org.apache.lucene.util.AttributeSource;
+
+/**
+ * Tokenizes input text into sentences.
+ * <p>
+ * The output tokens can then be broken into words with {@link WordTokenFilter}.
+ * </p>
+ * @lucene.experimental
+ */
+public final class SentenceTokenizer extends Tokenizer {
+
+ /**
+ * End of sentence punctuation: 。,!?;,!?;
+ */
+ private final static String PUNCTION = "。,!?;,!?;";
+
+ private final StringBuilder buffer = new StringBuilder();
+
+ private int tokenStart = 0, tokenEnd = 0;
+
+ private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
+ private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
+ private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
+
+ public SentenceTokenizer(Reader reader) {
+ super(reader);
+ }
+
+ public SentenceTokenizer(AttributeSource source, Reader reader) {
+ super(source, reader);
+ }
+
+ public SentenceTokenizer(AttributeFactory factory, Reader reader) {
+ super(factory, reader);
+ }
+
+ @Override
+ public boolean incrementToken() throws IOException {
+ clearAttributes();
+ buffer.setLength(0);
+ int ci;
+ char ch, pch;
+ boolean atBegin = true;
+ tokenStart = tokenEnd;
+ ci = input.read();
+ ch = (char) ci;
+
+ while (true) {
+ if (ci == -1) {
+ break;
+ } else if (PUNCTION.indexOf(ch) != -1) {
+ // End of a sentence
+ buffer.append(ch);
+ tokenEnd++;
+ break;
+ } else if (atBegin && Utility.SPACES.indexOf(ch) != -1) {
+ tokenStart++;
+ tokenEnd++;
+ ci = input.read();
+ ch = (char) ci;
+ } else {
+ buffer.append(ch);
+ atBegin = false;
+ tokenEnd++;
+ pch = ch;
+ ci = input.read();
+ ch = (char) ci;
+ // Two spaces, such as CR, LF
+ if (Utility.SPACES.indexOf(ch) != -1
+ && Utility.SPACES.indexOf(pch) != -1) {
+ // buffer.append(ch);
+ tokenEnd++;
+ break;
+ }
+ }
+ }
+ if (buffer.length() == 0)
+ return false;
+ else {
+ termAtt.setEmpty().append(buffer);
+ offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
+ typeAtt.setType("sentence");
+ return true;
+ }
+ }
+
+ @Override
+ public void reset() throws IOException {
+ super.reset();
+ tokenStart = tokenEnd = 0;
+ }
+
+ @Override
+ public void reset(Reader input) throws IOException {
+ super.reset(input);
+ reset();
+ }
+
+ @Override
+ public void end() throws IOException {
+ // set final offset
+ final int finalOffset = correctOffset(tokenEnd);
+ offsetAtt.setOffset(finalOffset, finalOffset);
+ }
+}