pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.4.0 / lucene / src / java / org / apache / lucene / analysis / standard / ClassicTokenizerImpl.jflex
diff --git a/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex b/lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex
deleted file mode 100644 (file)
index 037b71a..0000000
+++ /dev/null
@@ -1,135 +0,0 @@
-package org.apache.lucene.analysis.standard;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements.  See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License.  You may obtain a copy of the License at
- *
- *     http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-/*
-
-WARNING: if you change ClassicTokenizerImpl.jflex and need to regenerate
-      the tokenizer, only use the trunk version of JFlex 1.5 at the moment!
-
-*/
-
-import java.io.Reader;
-import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
-
-%%
-
-%class ClassicTokenizerImpl
-%implements StandardTokenizerInterface
-%unicode 3.0
-%integer
-%function getNextToken
-%pack
-%char
-
-%{
-
-public static final int ALPHANUM          = StandardTokenizer.ALPHANUM;
-public static final int APOSTROPHE        = StandardTokenizer.APOSTROPHE;
-public static final int ACRONYM           = StandardTokenizer.ACRONYM;
-public static final int COMPANY           = StandardTokenizer.COMPANY;
-public static final int EMAIL             = StandardTokenizer.EMAIL;
-public static final int HOST              = StandardTokenizer.HOST;
-public static final int NUM               = StandardTokenizer.NUM;
-public static final int CJ                = StandardTokenizer.CJ;
-/**
- * @deprecated this solves a bug where HOSTs that end with '.' are identified
- *             as ACRONYMs.
- */
-@Deprecated
-public static final int ACRONYM_DEP       = StandardTokenizer.ACRONYM_DEP;
-
-public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
-
-public final int yychar()
-{
-    return yychar;
-}
-
-/**
- * Fills CharTermAttribute with the current token text.
- */
-public final void getText(CharTermAttribute t) {
-  t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
-}
-
-%}
-
-THAI       = [\u0E00-\u0E59]
-
-// basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
-ALPHANUM   = ({LETTER}|{THAI}|[:digit:])+
-
-// internal apostrophes: O'Reilly, you're, O'Reilly's
-// use a post-filter to remove possessives
-APOSTROPHE =  {ALPHA} ("'" {ALPHA})+
-
-// acronyms: U.S.A., I.B.M., etc.
-// use a post-filter to remove dots
-ACRONYM    =  {LETTER} "." ({LETTER} ".")+
-
-ACRONYM_DEP    = {ALPHANUM} "." ({ALPHANUM} ".")+
-
-// company names like AT&T and Excite@Home.
-COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}
-
-// email addresses
-EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
-
-// hostname
-HOST       =  {ALPHANUM} ((".") {ALPHANUM})+
-
-// floating point, serial, model numbers, ip addresses, etc.
-// every other segment must have at least one digit
-NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
-           | {HAS_DIGIT} {P} {ALPHANUM}
-           | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
-           | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
-           | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
-           | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
-
-// punctuation
-P               = ("_"|"-"|"/"|"."|",")
-
-// at least one digit
-HAS_DIGIT  = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
-
-ALPHA      = ({LETTER})+
-
-// From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
-LETTER     = !(![:letter:]|{CJ})
-
-// Chinese and Japanese (but NOT Korean, which is included in [:letter:])
-CJ         = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
-
-WHITESPACE = \r\n | [ \r\n\t\f]
-
-%%
-
-{ALPHANUM}                                                     { return ALPHANUM; }
-{APOSTROPHE}                                                   { return APOSTROPHE; }
-{ACRONYM}                                                      { return ACRONYM; }
-{COMPANY}                                                      { return COMPANY; }
-{EMAIL}                                                        { return EMAIL; }
-{HOST}                                                         { return HOST; }
-{NUM}                                                          { return NUM; }
-{CJ}                                                           { return CJ; }
-{ACRONYM_DEP}                                                  { return ACRONYM_DEP; }
-
-/** Ignore the rest */
-. | {WHITESPACE}                                               { /* ignore */ }