X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex

diff --git a/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex b/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
deleted file mode 100644
index 037b71a..0000000
--- a/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
+++ /dev/null
@@ -1,135 +0,0 @@
-package org.apache.lucene.analysis.standard;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-JFlex specification of the ClassicTokenizerImpl scanner: Java members, macro definitions, then the token rules.
-WARNING: if you change ClassicTokenizerImpl.jflex and need to regenerate
- the tokenizer, only use the trunk version of JFlex 1.5 at the moment!
-
-*/
-
-import java.io.Reader;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
-%%
-
-%class ClassicTokenizerImpl
-%implements StandardTokenizerInterface
-%unicode 3.0
-%integer
-%function getNextToken
-%pack
-%char
-
-%{
-
-public static final int ALPHANUM = StandardTokenizer.ALPHANUM;
-public static final int APOSTROPHE = StandardTokenizer.APOSTROPHE;
-public static final int ACRONYM = StandardTokenizer.ACRONYM;
-public static final int COMPANY = StandardTokenizer.COMPANY;
-public static final int EMAIL = StandardTokenizer.EMAIL;
-public static final int HOST = StandardTokenizer.HOST;
-public static final int NUM = StandardTokenizer.NUM;
-public static final int CJ = StandardTokenizer.CJ;
-/**
- * @deprecated this solves a bug where HOSTs that end with '.' are identified
- * as ACRONYMs.
- */
-@Deprecated
-public static final int ACRONYM_DEP = StandardTokenizer.ACRONYM_DEP;
-
-public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
-
-public final int yychar()
-{
-  return yychar;
-}
-
-/**
- * Fills the given CharTermAttribute with the current token text by passing zzBuffer, the offset zzStartRead and the length zzMarkedPos - zzStartRead to copyBuffer.
- */
-public final void getText(CharTermAttribute t) {
-  t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
-}
-
-%}
-
-THAI = [\u0E00-\u0E59]
-
-// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
-ALPHANUM = ({LETTER}|{THAI}|[:digit:])+
-
-// internal apostrophes: O'Reilly, you're, O'Reilly's
-// use a post-filter to remove possessives
-APOSTROPHE = {ALPHA} ("'" {ALPHA})+
-
-// acronyms: U.S.A., I.B.M., etc.
-// use a post-filter to remove dots
-ACRONYM = {LETTER} "." ({LETTER} ".")+
-
-ACRONYM_DEP = {ALPHANUM} "." ({ALPHANUM} ".")+
-
-// company names like AT&T and Excite@Home.
-COMPANY = {ALPHA} ("&"|"@") {ALPHA}
-
-// email addresses
-EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
-
-// hostname
-HOST = {ALPHANUM} ((".") {ALPHANUM})+
-
-// floating point, serial, model numbers, ip addresses, etc.
-// every other segment must have at least one digit
-NUM = ({ALPHANUM} {P} {HAS_DIGIT}
-     | {HAS_DIGIT} {P} {ALPHANUM}
-     | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
-     | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
-     | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
-     | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
-
-// punctuation
-P = ("_"|"-"|"/"|"."|",")
-
-// at least one digit
-HAS_DIGIT = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
-
-ALPHA = ({LETTER})+
-
-// From the JFlex manual: "the expression that matches everything of a not matched by b is !(!a|b)"; here a is [:letter:] and b is {CJ}
-LETTER = !(![:letter:]|{CJ})
-
-// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
-CJ = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
-
-WHITESPACE = \r\n | [ \r\n\t\f]
-
-%%
-
-{ALPHANUM} { return ALPHANUM; }
-{APOSTROPHE} { return APOSTROPHE; }
-{ACRONYM} { return ACRONYM; }
-{COMPANY} { return COMPANY; }
-{EMAIL} { return EMAIL; }
-{HOST} { return HOST; }
-{NUM} { return NUM; }
-{CJ} { return CJ; }
-{ACRONYM_DEP} { return ACRONYM_DEP; }
-
-/** Ignore the rest: the action below is empty, so any other single character or whitespace returns no token type */
-. | {WHITESPACE} { /* ignore */ }