--- /dev/null
+package org.apache.lucene.analysis.standard;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements. See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+/*
+
+WARNING: if you change ClassicTokenizerImpl.jflex and need to regenerate
+ the tokenizer, only use the trunk version of JFlex 1.5 at the moment!
+
+*/
+
+import java.io.Reader;
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+%%
+
+// Generator options (see the JFlex manual for the exact semantics):
+//   %class / %implements - the generated class is ClassicTokenizerImpl and
+//                          implements StandardTokenizerInterface
+//   %unicode 3.0         - character classes follow Unicode 3.0
+//   %integer / %function - the scanning method is "int getNextToken()"
+//   %pack                - packed encoding of the generated DFA tables
+//   %char                - maintain the yychar character counter
+%class ClassicTokenizerImpl
+%implements StandardTokenizerInterface
+%unicode 3.0
+%integer
+%function getNextToken
+%pack
+%char
+
+%{
+
+/*
+ * Token type codes returned by the lexical rules at the bottom of this file.
+ * Each one is an alias of the same-named constant in StandardTokenizer, so
+ * the generated scanner and StandardTokenizer share the same numeric values.
+ */
+public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
+public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
+public static final int ACRONYM = StandardTokenizer.ACRONYM;
+public static final int COMPANY = StandardTokenizer.COMPANY;
+public static final int EMAIL = StandardTokenizer.EMAIL;
+public static final int HOST = StandardTokenizer.HOST;
+public static final int NUM = StandardTokenizer.NUM;
+public static final int CJ = StandardTokenizer.CJ;
+/**
+ * @deprecated this solves a bug where HOSTs that end with '.' are identified
+ * as ACRONYMs.
+ */
+@Deprecated
+public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
+
+/**
+ * Alias of StandardTokenizer.TOKEN_TYPES.
+ * NOTE(review): presumably the type names indexed by the codes above --
+ * confirm against StandardTokenizer.
+ */
+public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
+
+/**
+ * Exposes the scanner's {@code yychar} counter, which is available because
+ * this grammar enables the {@code %char} option.
+ *
+ * @return the current value of {@code yychar}
+ */
+public final int yychar() {
+  return this.yychar;
+}
+
+/**
+ * Copies the text of the current match into the supplied term attribute.
+ *
+ * @param t attribute that receives the characters of the current token
+ */
+public final void getText(CharTermAttribute t) {
+  // The match occupies zzBuffer[zzStartRead .. zzMarkedPos).
+  final int tokenLength = zzMarkedPos - zzStartRead;
+  t.copyBuffer(zzBuffer, zzStartRead, tokenLength);
+}
+
+%}
+
+// Thai code point range; folded into ALPHANUM below
+THAI = [\u0E00-\u0E59]
+
+// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
+ALPHANUM = ({LETTER}|{THAI}|[:digit:])+
+
+// internal apostrophes: O'Reilly, you're, O'Reilly's
+// use a post-filter to remove possessives
+APOSTROPHE = {ALPHA} ("'" {ALPHA})+
+
+// acronyms: U.S.A., I.B.M., etc.
+// use a post-filter to remove dots
+ACRONYM = {LETTER} "." ({LETTER} ".")+
+
+// deprecated acronym form (see ACRONYM_DEP in the user code above): like
+// ACRONYM, but every dot-terminated segment is a whole ALPHANUM, so it also
+// matches a HOST that is followed by a trailing '.'
+ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
+
+// company names like AT&T and Excite@Home.
+COMPANY = {ALPHA} ("&"|"@") {ALPHA}
+
+// email addresses
+// local part: ALPHANUM segments joined by '.', '-' or '_'
+// domain: at least two ALPHANUM segments joined by '.' or '-'
+EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
+
+// hostname: two or more ALPHANUM segments separated by dots
+HOST = {ALPHANUM} ((".") {ALPHANUM})+
+
+// floating point, serial, model numbers, ip addresses, etc.
+// every other segment must have at least one digit
+// segments are separated by P; the six alternatives cover both possible
+// starting kinds (ALPHANUM or HAS_DIGIT) for two segments, for odd counts
+// of three or more, and for even counts of four or more
+NUM = ({ALPHANUM} {P} {HAS_DIGIT}
+ | {HAS_DIGIT} {P} {ALPHANUM}
+ | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
+ | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+ | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+ | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
+
+// punctuation
+P = ("_"|"-"|"/"|"."|",")
+
+// at least one digit
+HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
+
+// one or more letters (no digits)
+ALPHA = ({LETTER})+
+
+// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
+// i.e. LETTER is [:letter:] with the CJ characters removed
+LETTER = !(![:letter:]|{CJ})
+
+// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
+CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
+
+// a CRLF pair, or a single space, CR, LF, tab or form feed
+WHITESPACE = \r\n | [ \r\n\t\f]
+
+%%
+
+/*
+ * Each rule returns the token type code for the pattern it matched.
+ * Per the JFlex manual, the scanner takes the longest match and, when
+ * several rules match the same length, the rule listed first -- so the
+ * order below is significant (e.g. ACRONYM is listed before ACRONYM_DEP).
+ */
+{ALPHANUM} { return ALPHANUM; }
+{APOSTROPHE} { return APOSTROPHE; }
+{ACRONYM} { return ACRONYM; }
+{COMPANY} { return COMPANY; }
+{EMAIL} { return EMAIL; }
+{HOST} { return HOST; }
+{NUM} { return NUM; }
+{CJ} { return CJ; }
+{ACRONYM_DEP} { return ACRONYM_DEP; }
+
+/** Ignore the rest: any other character, or whitespace, returns no token type */
+. | {WHITESPACE} { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }