+++ /dev/null
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-package org.apache.lucene.analysis.cn.smart;
-
-import java.io.IOException;
-import java.io.Reader;
-
-import org.apache.lucene.analysis.Tokenizer;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
-import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
-import org.apache.lucene.util.AttributeSource;
-
-/**
- * Tokenizes input text into sentences.
- * <p>
- * Each call to {@link #incrementToken()} emits one token of type
- * {@code "sentence"}. A sentence ends at one of the characters in
- * {@link #PUNCTUATION} (which is kept as the last character of the token), at
- * two consecutive characters from {@code Utility.SPACES}, or at end of input.
- * </p>
- * <p>
- * The output tokens can then be broken into words with {@link WordTokenFilter}
- * </p>
- * @lucene.experimental
- */
-public final class SentenceTokenizer extends Tokenizer {
-
-  /**
-   * End-of-sentence punctuation: the ideographic full stop (U+3002), the
-   * fullwidth comma, exclamation mark, question mark and semicolon
-   * (U+FF0C, U+FF01, U+FF1F, U+FF1B), followed by their ASCII counterparts
-   * {@code , ! ? ;}.
-   * <p>
-   * The non-ASCII characters are written as Unicode escapes so the constant
-   * does not depend on the encoding the source file is read with.
-   * </p>
-   */
-  private static final String PUNCTUATION = "\u3002\uFF0C\uFF01\uFF1F\uFF1B,!?;";
-
-  /** Characters of the sentence currently being assembled. */
-  private final StringBuilder buffer = new StringBuilder();
-
-  /** Uncorrected offset of the first character of the current sentence. */
-  private int tokenStart = 0;
-
-  /** Uncorrected offset just past the last character consumed so far. */
-  private int tokenEnd = 0;
-
-  private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
-  private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
-  private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
-
-  /**
-   * Creates a tokenizer that reads from {@code reader}.
-   *
-   * @param reader the text to split into sentences
-   */
-  public SentenceTokenizer(Reader reader) {
-    super(reader);
-  }
-
-  /**
-   * Creates a tokenizer that reads from {@code reader}, passing
-   * {@code source} to the superclass constructor.
-   *
-   * @param source the attribute source handed to the superclass
-   * @param reader the text to split into sentences
-   */
-  public SentenceTokenizer(AttributeSource source, Reader reader) {
-    super(source, reader);
-  }
-
-  /**
-   * Creates a tokenizer that reads from {@code reader}, passing
-   * {@code factory} to the superclass constructor.
-   *
-   * @param factory the attribute factory handed to the superclass
-   * @param reader the text to split into sentences
-   */
-  public SentenceTokenizer(AttributeFactory factory, Reader reader) {
-    super(factory, reader);
-  }
-
-  /**
-   * Reads the next sentence from the input and publishes it through the term,
-   * offset and type attributes.
-   * <p>
-   * Characters from {@code Utility.SPACES} that precede the sentence are
-   * skipped and excluded from the token's offsets. Reading stops after a
-   * {@link #PUNCTUATION} character (appended to the token), after the second
-   * of two consecutive {@code Utility.SPACES} characters (the first is part of
-   * the token, the second is consumed but not appended), or at end of input.
-   * </p>
-   *
-   * @return {@code true} if a sentence was produced, {@code false} if no
-   *         characters were collected before the input ended
-   * @throws IOException if reading from the underlying input fails
-   */
-  @Override
-  public boolean incrementToken() throws IOException {
-    clearAttributes();
-    buffer.setLength(0);
-    // The next sentence begins where the previous one stopped consuming input.
-    tokenStart = tokenEnd;
-
-    boolean atBegin = true;
-    int ci = input.read();
-    // Test for the end-of-input sentinel before casting, so -1 is never
-    // treated as the character U+FFFF.
-    while (ci != -1) {
-      final char ch = (char) ci;
-
-      if (PUNCTUATION.indexOf(ch) != -1) {
-        // End of a sentence: the punctuation mark belongs to the token.
-        buffer.append(ch);
-        tokenEnd++;
-        break;
-      }
-
-      final boolean isSpace = Utility.SPACES.indexOf(ch) != -1;
-      if (atBegin && isSpace) {
-        // Leading space: skip it and move the token start past it.
-        tokenStart++;
-        tokenEnd++;
-        ci = input.read();
-        continue;
-      }
-
-      buffer.append(ch);
-      atBegin = false;
-      tokenEnd++;
-
-      ci = input.read();
-      // Two spaces in a row (such as CR, LF) also end the sentence. The second
-      // one has already been read, so it is counted in tokenEnd but not kept.
-      if (isSpace && ci != -1 && Utility.SPACES.indexOf((char) ci) != -1) {
-        tokenEnd++;
-        break;
-      }
-    }
-
-    if (buffer.length() == 0) {
-      return false;
-    }
-    termAtt.setEmpty().append(buffer);
-    offsetAtt.setOffset(correctOffset(tokenStart), correctOffset(tokenEnd));
-    typeAtt.setType("sentence");
-    return true;
-  }
-
-  /**
-   * Resets the superclass state and rewinds both offset counters to zero.
-   *
-   * @throws IOException if the superclass reset fails
-   */
-  @Override
-  public void reset() throws IOException {
-    super.reset();
-    tokenStart = tokenEnd = 0;
-  }
-
-  /**
-   * Switches to a new input and rewinds the offset counters.
-   *
-   * @param input the new text to split into sentences
-   * @throws IOException if the superclass reset fails
-   */
-  @Override
-  public void reset(Reader input) throws IOException {
-    super.reset(input);
-    reset();
-  }
-
-  /**
-   * Sets the final offset: start and end are both set to the corrected value
-   * of the number of characters consumed.
-   *
-   * @throws IOException declared by the overridden method
-   */
-  @Override
-  public void end() throws IOException {
-    // set final offset
-    final int finalOffset = correctOffset(tokenEnd);
-    offsetAtt.setOffset(finalOffset, finalOffset);
-  }
-}