2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.apache.lucene.analysis.cn.smart;
20 import java.io.IOException;
21 import java.io.Reader;
23 import org.apache.lucene.analysis.Tokenizer;
24 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
25 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
26 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
27 import org.apache.lucene.util.AttributeSource;
30 * Tokenizes input text into sentences.
32 * The output tokens can then be broken into words with {@link WordTokenFilter}.
34 * @lucene.experimental
36 public final class SentenceTokenizer extends Tokenizer {
39 * Sentence-ending punctuation, in both CJK/full-width and ASCII forms: 。,!?;,!?;
41 private final static String PUNCTION = "。,!?;,!?;";
43 private final StringBuilder buffer = new StringBuilder();
45 private int tokenStart = 0, tokenEnd = 0;
47 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
48 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
49 private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
51 public SentenceTokenizer(Reader reader) {
55 public SentenceTokenizer(AttributeSource source, Reader reader) {
56 super(source, reader);
59 public SentenceTokenizer(AttributeFactory factory, Reader reader) {
60 super(factory, reader);
64 public boolean incrementToken() throws IOException {
69 boolean atBegin = true;
70 tokenStart = tokenEnd;
77 } else if (PUNCTION.indexOf(ch) != -1) {
82 } else if (atBegin && Utility.SPACES.indexOf(ch) != -1) {
94 // Two whitespace characters in a row (both ch and pch are in Utility.SPACES), such as CR followed by LF
95 if (Utility.SPACES.indexOf(ch) != -1
96 && Utility.SPACES.indexOf(pch) != -1) {
103 if (buffer.length() == 0)
106 termAtt.setEmpty().append(buffer);
107 offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
108 typeAtt.setType("sentence");
114 public void reset() throws IOException {
116 tokenStart = tokenEnd = 0;
120 public void reset(Reader input) throws IOException {
126 public void end() throws IOException {
128 final int finalOffset = correctOffset(tokenEnd);
129 offsetAtt.setOffset(finalOffset, finalOffset);