1 package org.apache.lucene.analysis.standard;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.io.InputStream;
22 import java.io.InputStreamReader;
23 import java.io.Reader;
25 import org.apache.lucene.analysis.Tokenizer;
26 import org.apache.lucene.analysis.standard.std31.StandardTokenizerImpl31;
27 import org.apache.lucene.analysis.standard.std31.UAX29URLEmailTokenizerImpl31;
28 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
29 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
30 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
31 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
32 import org.apache.lucene.util.AttributeSource;
33 import org.apache.lucene.util.Version;
34 import org.apache.lucene.util.AttributeSource.AttributeFactory;
37 * This class implements Word Break rules from the Unicode Text Segmentation
38 * algorithm, as specified in
39 * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
40 * URLs and email addresses are also tokenized according to the relevant RFCs.
42 * Tokens produced are of the following types:
44 * <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
45 * <li>&lt;NUM&gt;: A number</li>
46 * <li>&lt;URL&gt;: A URL</li>
47 * <li>&lt;EMAIL&gt;: An email address</li>
48 * <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
49 * Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
50 * <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
51 * <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
54 * <p>You must specify the required {@link Version}
55 * compatibility when creating UAX29URLEmailTokenizer:
57 * <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
58 * from their combining characters. If you use a previous version number,
59 * you get the exact broken behavior for backwards compatibility.
63 public final class UAX29URLEmailTokenizer extends Tokenizer {
64 /** A private instance of the JFlex-constructed scanner.
 * Assigned once by each Version-taking constructor from
 * getScannerFor(matchVersion, input), and then used by
 * incrementToken(), end() and reset(Reader). */
65 private final StandardTokenizerInterface scanner;
// Integer token type ids. They are used as indexes into TOKEN_TYPES
// (e.g. TOKEN_TYPES[ALPHANUM]); incrementToken() indexes TOKEN_TYPES with the
// int returned by scanner.getNextToken() in the same way.
67 public static final int ALPHANUM = 0;
68 public static final int NUM = 1;
69 public static final int SOUTHEAST_ASIAN = 2;
70 public static final int IDEOGRAPHIC = 3;
71 public static final int HIRAGANA = 4;
72 public static final int KATAKANA = 5;
73 public static final int HANGUL = 6;
74 public static final int URL = 7;
75 public static final int EMAIL = 8;
77 /** String token types that correspond to token type int constants.
 * The entry at index i is the type string for the int constant whose value is i;
 * incrementToken() publishes these strings through the TypeAttribute. */
78 public static final String [] TOKEN_TYPES = new String [] {
// Indexes 0..6 (ALPHANUM..HANGUL) reuse the strings defined by StandardTokenizer,
// listed in the same order as this class's int constants.
79 StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM],
80 StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM],
81 StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN],
82 StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC],
83 StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA],
84 StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA],
85 StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL],
// Deprecated String aliases: each constant below is simply the TOKEN_TYPES
// entry selected by the int constant of the corresponding token type.
90 /** Alphanumeric sequences
91 * @deprecated use {@link #TOKEN_TYPES} instead */
93 public static final String WORD_TYPE = TOKEN_TYPES[ALPHANUM];
96 * @deprecated use {@link #TOKEN_TYPES} instead */
98 public static final String NUMERIC_TYPE = TOKEN_TYPES[NUM];
100 /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax
101 * @deprecated use {@link #TOKEN_TYPES} instead */
103 public static final String URL_TYPE = TOKEN_TYPES[URL];
106 * @deprecated use {@link #TOKEN_TYPES} instead */
108 public static final String EMAIL_TYPE = TOKEN_TYPES[EMAIL];
111 * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
112 * scripts (Thai, Lao, Myanmar, Khmer, etc.). Sequences of these are kept
113 together as a single token rather than broken up, because the logic
114 * required to break them at word boundaries is too complex for UAX#29.
116 * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
117 * @deprecated use {@link #TOKEN_TYPES} instead
120 public static final String SOUTH_EAST_ASIAN_TYPE = TOKEN_TYPES[SOUTHEAST_ASIAN]; // alias of TOKEN_TYPES[2]
122 /** @deprecated use {@link #TOKEN_TYPES} instead */
124 public static final String IDEOGRAPHIC_TYPE = TOKEN_TYPES[IDEOGRAPHIC]; // alias of TOKEN_TYPES[3]
126 /** @deprecated use {@link #TOKEN_TYPES} instead */
128 public static final String HIRAGANA_TYPE = TOKEN_TYPES[HIRAGANA]; // alias of TOKEN_TYPES[4]
130 /** @deprecated use {@link #TOKEN_TYPES} instead */
132 public static final String KATAKANA_TYPE = TOKEN_TYPES[KATAKANA]; // alias of TOKEN_TYPES[5]
134 /** @deprecated use {@link #TOKEN_TYPES} instead */
136 public static final String HANGUL_TYPE = TOKEN_TYPES[HANGUL]; // alias of TOKEN_TYPES[6]
// Longest token that incrementToken() will emit, compared against
// scanner.yylength(). Starts at StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH and
// can be changed through setMaxTokenLength(int).
138 private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
140 /** Set the max allowed token length. Any token longer
141 * than this is skipped.
 *
 * @param length the new maximum; stored as given, with no range check here */
142 public void setMaxTokenLength(int length) {
// Takes effect the next time incrementToken() compares scanner.yylength() to it.
143 this.maxTokenLength = length;
146 /** Returns the maximum token length currently in effect.
 *
 * @return the value last given to {@link #setMaxTokenLength}, or the initial
 *         StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH if it was never called
 * @see #setMaxTokenLength */
147 public int getMaxTokenLength() {
148 return maxTokenLength;
151 /** Creates a tokenizer over <code>input</code> with compatibility fixed to Version.LUCENE_31.
 *
 * @param input The input reader
 * @deprecated use {@link #UAX29URLEmailTokenizer(Version, Reader)} instead. */
153 public UAX29URLEmailTokenizer(Reader input) {
// Delegates to the Version-taking constructor with a hard-coded 3.1 version.
154 this(Version.LUCENE_31, input);
157 /** Creates a tokenizer over a byte stream with compatibility fixed to Version.LUCENE_31.
 *
 * @param input The input stream; it is wrapped in an InputStreamReader
 * @deprecated use {@link #UAX29URLEmailTokenizer(Version, Reader)} instead. */
159 public UAX29URLEmailTokenizer(InputStream input) {
// NOTE(review): InputStreamReader is built without an explicit charset, so the
// bytes are decoded with the platform default charset.
160 this(Version.LUCENE_31, new InputStreamReader(input));
163 /** Creates a tokenizer with a given AttributeSource and compatibility fixed to Version.LUCENE_31.
 *
 * @param source The attribute source passed on to the Version-taking constructor
 * @param input The input reader
 * @deprecated use {@link #UAX29URLEmailTokenizer(Version, AttributeSource, Reader)} instead. */
165 public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
// Delegates to the Version-taking constructor with a hard-coded 3.1 version.
166 this(Version.LUCENE_31, source, input);
169 /** Creates a tokenizer with a given AttributeFactory and compatibility fixed to Version.LUCENE_31.
 *
 * @param factory The attribute factory passed on to the Version-taking constructor
 * @param input The input reader
 * @deprecated use {@link #UAX29URLEmailTokenizer(Version, AttributeSource.AttributeFactory, Reader)} instead. */
171 public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
// Delegates to the Version-taking constructor with a hard-coded 3.1 version.
172 this(Version.LUCENE_31, factory, input);
176 * Creates a new instance of the UAX29URLEmailTokenizer. Attaches
177 * the <code>input</code> to the newly created JFlex scanner.
179 * @param input The input reader
181 public UAX29URLEmailTokenizer(Version matchVersion, Reader input) {
// matchVersion decides which scanner implementation is built; see getScannerFor().
183 this.scanner = getScannerFor(matchVersion, input);
187 * Creates a new UAX29URLEmailTokenizer with a given {@link AttributeSource}.
189 public UAX29URLEmailTokenizer(Version matchVersion, AttributeSource source, Reader input) {
// The attribute source and reader are handed to the Tokenizer superclass.
190 super(source, input);
// matchVersion decides which scanner implementation is built; see getScannerFor().
191 this.scanner = getScannerFor(matchVersion, input);
195 * Creates a new UAX29URLEmailTokenizer with a given {@link AttributeFactory}
197 public UAX29URLEmailTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
// The attribute factory and reader are handed to the Tokenizer superclass.
198 super(factory, input);
// matchVersion decides which scanner implementation is built; see getScannerFor().
199 this.scanner = getScannerFor(matchVersion, input);
// Builds the JFlex scanner matching the requested compatibility version:
// the current implementation for 3.4 and later, the retained 3.1
// implementation otherwise. Both are returned as StandardTokenizerInterface.
202 private static StandardTokenizerInterface getScannerFor(Version matchVersion, Reader input) {
// 3.4+: current scanner.
203 if (matchVersion.onOrAfter(Version.LUCENE_34)) {
204 return new UAX29URLEmailTokenizerImpl(input);
// Older versions: std31 scanner, kept for backwards compatibility.
206 return new UAX29URLEmailTokenizerImpl31(input);
210 // this tokenizer generates four attributes:
211 // term, offset, positionIncrement and type
// Each one is registered through addAttribute() and filled in by incrementToken().
212 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
213 private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
214 private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
215 private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
218 public final boolean incrementToken() throws IOException {
// Pull the next token from the scanner; the int returned is a token type id.
223 int tokenType = scanner.getNextToken();
// YYEOF is the scanner's sentinel value (presumably end of input - confirm
// against StandardTokenizerInterface).
225 if (tokenType == StandardTokenizerInterface.YYEOF) {
// Only tokens no longer than maxTokenLength are published to the attributes.
229 if (scanner.yylength() <= maxTokenLength) {
230 posIncrAtt.setPositionIncrement(posIncr);
// Copy the matched text into the term attribute.
231 scanner.getText(termAtt);
// Offsets span [start, start + term length) and go through correctOffset().
232 final int start = scanner.yychar();
233 offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
// The type id indexes TOKEN_TYPES to obtain the type string.
234 typeAtt.setType(TOKEN_TYPES[tokenType]);
237 // When we skip a too-long term, we still increment the
238 // position increment
244 public final void end() {
// Final offset = scanner.yychar() + scanner.yylength(), passed through
// correctOffset(); presumably the position just past the last scanned text.
246 int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
// Start and end offsets are both set to that final position.
247 offsetAtt.setOffset(finalOffset, finalOffset);
251 public void reset(Reader reader) throws IOException {
// Hand the new reader to the existing scanner instance via yyreset().
253 scanner.yyreset(reader);