1 package org.apache.lucene.analysis.standard;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
22 WARNING: if you change ClassicTokenizerImpl.jflex and need to regenerate
23 the tokenizer, only use the trunk version of JFlex 1.5 at the moment!
27 import java.io.Reader;
28 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
32 %class ClassicTokenizerImpl
33 %implements StandardTokenizerInterface
36 %function getNextToken
// Token type codes used as return values by the lexical rules of this grammar.
// Each constant (and the TOKEN_TYPES array) is an alias of the identically
// named member of StandardTokenizer, so the values are defined in one place.
42 public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
43 public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
44 public static final int ACRONYM = StandardTokenizer.ACRONYM;
45 public static final int COMPANY = StandardTokenizer.COMPANY;
46 public static final int EMAIL = StandardTokenizer.EMAIL;
47 public static final int HOST = StandardTokenizer.HOST;
48 public static final int NUM = StandardTokenizer.NUM;
49 public static final int CJ = StandardTokenizer.CJ;
51 * @deprecated this solves a bug where HOSTs that end with '.' are identified
55 public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
57 public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
59 public final int yychar()
65 * Fills CharTermAttribute with the current token text.
67 public final void getText(CharTermAttribute t) {
// Copies (zzMarkedPos - zzStartRead) chars of zzBuffer, beginning at index
// zzStartRead, into t. NOTE(review): the zz* fields are not declared in this
// grammar; presumably they are maintained by the generated scanner - confirm.
68 t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
// Code points U+0E00..U+0E59 (inside the Unicode Thai block); a separate macro
// so that ALPHANUM can accept them in addition to {LETTER} and digits.
73 THAI = [\u0E00-\u0E59]
75 // basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
// One or more characters, each being a {LETTER}, a {THAI} character or a [:digit:].
76 ALPHANUM = ({LETTER}|{THAI}|[:digit:])+
78 // internal apostrophes: O'Reilly, you're, O'Reilly's
79 // use a post-filter to remove possessives
// An {ALPHA} run followed by one or more ("'" {ALPHA}) groups, so every quote
// sits between two {ALPHA} runs; a leading or trailing quote is never included.
80 APOSTROPHE = {ALPHA} ("'" {ALPHA})+
82 // acronyms: U.S.A., I.B.M., etc.
83 // use a post-filter to remove dots
// Two or more single {LETTER}s, each immediately followed by a dot; the match
// therefore always ends with '.'.
84 ACRONYM = {LETTER} "." ({LETTER} ".")+
// Same shape as ACRONYM, but each segment is a whole {ALPHANUM} run rather than
// a single letter, so dotted names with a trailing '.' match here too.
// Text matched by ACRONYM can also match this pattern at the same length; the
// ACRONYM rule is listed earlier in the rules section, and JFlex resolves
// equal-length matches in favour of the earlier rule.
86 ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
88 // company names like AT&T and Excite@Home.
// Exactly two {ALPHA} runs joined by a single '&' or '@'.
89 COMPANY = {ALPHA} ("&"|"@") {ALPHA}
// Local part: {ALPHANUM} runs separated by '.', '-' or '_'.
// Then a single '@'.
// Domain part: at least two {ALPHANUM} runs separated by '.' or '-'
// ('_' is not accepted as a separator after the '@').
92 EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
// Two or more {ALPHANUM} runs separated by single dots; the pattern cannot end
// with a dot.
95 HOST = {ALPHANUM} ((".") {ALPHANUM})+
97 // floating point, serial, model numbers, ip addresses, etc.
98 // every other segment must have at least one digit
// The six alternatives together cover every {P}-separated sequence of two or
// more segments in which {ALPHANUM} and {HAS_DIGIT} segments strictly alternate:
//   alternatives 1-2: exactly two segments, starting with either kind
//   alternatives 3-4: odd segment counts >= 3, starting with either kind
//   alternatives 5-6: even segment counts >= 4, starting with either kind
99 NUM = ({ALPHANUM} {P} {HAS_DIGIT}
100 | {HAS_DIGIT} {P} {ALPHANUM}
101 | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
102 | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
103 | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
104 | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
// Single separator character permitted between NUM segments:
// underscore, hyphen, slash, dot or comma.
107 P = ("_"|"-"|"/"|"."|",")
109 // at least one digit
// A run of {LETTER} and [:digit:] characters with a mandatory [:digit:]
// somewhere in it; the optional runs on either side allow the digit to appear
// at any position.
110 HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
114 // From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
// Applied here with <a> = [:letter:] and <b> = {CJ}: LETTER is everything
// matched by [:letter:] that is not matched by {CJ}.
115 LETTER = !(![:letter:]|{CJ})
117 // Chinese and Japanese (but NOT Korean, which is included in [:letter:])
// Ranges, in the order listed in the character class (Unicode block names):
//   U+3100-312F Bopomofo                  U+3040-309F Hiragana
//   U+30A0-30FF Katakana                  U+31F0-31FF Katakana Phonetic Extensions
//   U+3300-337F CJK Compatibility         U+3400-4DBF CJK Unified Ideographs Ext. A
//   U+4E00-9FFF CJK Unified Ideographs    U+F900-FAFF CJK Compatibility Ideographs
//   U+FF65-FF9F halfwidth Katakana (part of Halfwidth and Fullwidth Forms)
118 CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
// Either a CR LF pair, or one of: space, CR, LF, tab, form feed.
120 WHITESPACE = \r\n | [ \r\n\t\f]
// Lexical rules: each action returns the token type constant that shares its
// name with the macro being matched. JFlex prefers the longest match and, when
// several rules match the same length, the rule listed first - so the order of
// the rules below is significant and should not be rearranged casually.
124 {ALPHANUM} { return ALPHANUM; }
125 {APOSTROPHE} { return APOSTROPHE; }
126 {ACRONYM} { return ACRONYM; }
127 {COMPANY} { return COMPANY; }
128 {EMAIL} { return EMAIL; }
129 {HOST} { return HOST; }
130 {NUM} { return NUM; }
132 {ACRONYM_DEP} { return ACRONYM_DEP; }
134 /** Ignore the rest */
// Any other single character, or {WHITESPACE}, has an empty action: nothing is
// returned, so no token is produced for it and scanning continues.
135 . | {WHITESPACE} { /* ignore */ }