lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/ClassicTokenizerImpl.jflex

   1 package org.apache.lucene.analysis.standard;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 /*
  21
  22 WARNING: if you change ClassicTokenizerImpl.jflex and need to regenerate
  23       the tokenizer, only use the trunk version of JFlex 1.5 at the moment!
  24
  25 */
  26
  27 import java.io.Reader;
  28 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  29
  30 %%
  31
     // Scanner generation options. NOTE(review): the meanings given below follow
     // the JFlex manual; confirm against the JFlex version used to regenerate.
     // Name of the generated class and the interface it implements.
  32 %class ClassicTokenizerImpl
  33 %implements StandardTokenizerInterface
     // Predefined classes such as [:letter:] and [:digit:] use Unicode 3.0 data.
  34 %unicode 3.0
     // The scanning method returns int and is named getNextToken.
  35 %integer
  36 %function getNextToken
     // Table packing option for the generated scanner.
  37 %pack
     // Turns on character counting; see the yychar() accessor in the user code.
  38 %char
  39
  40 %{
  41
     // Token type ids returned by the lexical rules at the bottom of this spec.
     // Each one is initialised from the StandardTokenizer constant of the same
     // name; the numeric values themselves are not visible in this file.
  42 public static final int ALPHANUM          = StandardTokenizer.ALPHANUM;
  43 public static final int APOSTROPHE        = StandardTokenizer.APOSTROPHE;
  44 public static final int ACRONYM           = StandardTokenizer.ACRONYM;
  45 public static final int COMPANY           = StandardTokenizer.COMPANY;
  46 public static final int EMAIL             = StandardTokenizer.EMAIL;
  47 public static final int HOST              = StandardTokenizer.HOST;
  48 public static final int NUM               = StandardTokenizer.NUM;
  49 public static final int CJ                = StandardTokenizer.CJ;
  50 /**
  51  * @deprecated this solves a bug where HOSTs that end with '.' are identified
  52  *             as ACRONYMs.
  53  */
  54 @Deprecated
  55 public static final int ACRONYM_DEP       = StandardTokenizer.ACRONYM_DEP;
  56
     // Same array object as StandardTokenizer.TOKEN_TYPES (no copy is made).
     // Presumably the display names for the ids above - verify in StandardTokenizer.
  57 public static final String [] TOKEN_TYPES = StandardTokenizer.TOKEN_TYPES;
  58
     /**
      * Returns the current value of the scanner's yychar field.
      *
      * The field is not declared in this spec. NOTE(review): presumably it is
      * the counter JFlex generates for the %char option, which the JFlex manual
      * describes as the number of characters from the start of input to the
      * start of the current token - confirm against the generated class.
      */
  59 public final int yychar()
  60 {
  61     return yychar;
  62 }
  63
  64 /**
  65  * Fills CharTermAttribute with the current token text.
      *
      * Passes zzBuffer, the offset zzStartRead and the length
      * (zzMarkedPos - zzStartRead) to t.copyBuffer. These three fields are not
      * declared in this spec. NOTE(review): presumably they are internals of
      * the JFlex-generated scanner that delimit the most recent match - confirm
      * against the generated class.
      *
      * @param t attribute that receives the token's characters
  66  */
  67 public final void getText(CharTermAttribute t) {
  68   t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
  69 }
  70
  71 %}
  72
     // Macro definitions used by the lexical rules. Note that several macros are
     // referenced before the line that defines them (e.g. ALPHANUM uses LETTER).

     // code points U+0E00 through U+0E59
  73 THAI       = [\u0E00-\u0E59]
  74
  75 // basic word: a sequence of digits & letters (includes Thai to enable ThaiAnalyzer to function)
  76 ALPHANUM   = ({LETTER}|{THAI}|[:digit:])+
  77
  78 // internal apostrophes: O'Reilly, you're, O'Reilly's
  79 // use a post-filter to remove possessives
  80 APOSTROPHE =  {ALPHA} ("'" {ALPHA})+
  81
  82 // acronyms: U.S.A., I.B.M., etc.
  83 // use a post-filter to remove dots
     // each segment is exactly one LETTER followed by a dot; at least two segments
  84 ACRONYM    =  {LETTER} "." ({LETTER} ".")+
  85
     // same shape as ACRONYM, but each segment is a whole ALPHANUM run rather
     // than a single LETTER; reported with the deprecated ACRONYM_DEP type
  86 ACRONYM_DEP     = {ALPHANUM} "." ({ALPHANUM} ".")+
  87
  88 // company names like AT&T and Excite@Home.
  89 COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}
  90
  91 // email addresses
     // local part: ALPHANUM runs joined by ".", "-" or "_";
     // domain part: at least two ALPHANUM runs joined by "." or "-"
  92 EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
  93
  94 // hostname
     // two or more ALPHANUM runs joined by single dots, with no trailing dot
  95 HOST       =  {ALPHANUM} ((".") {ALPHANUM})+
  96
  97 // floating point, serial, model numbers, ip addresses, etc.
  98 // every other segment must have at least one digit
     // segments are separated by exactly one P character; the six alternatives
     // enumerate which segment (first or second) starts the HAS_DIGIT alternation
  99 NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
 100            | {HAS_DIGIT} {P} {ALPHANUM}
 101            | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
 102            | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
 103            | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
 104            | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
 105
 106 // punctuation
     // the single-character separators accepted between NUM segments
 107 P                = ("_"|"-"|"/"|"."|",")
 108
 109 // at least one digit
     // a run of LETTERs and digits containing one or more digits (THAI is not included)
 110 HAS_DIGIT  = ({LETTER}|[:digit:])* [:digit:] ({LETTER}|[:digit:])*
 111
     // one or more LETTERs, no digits
 112 ALPHA      = ({LETTER})+
 113
 114 // From the JFlex manual: "the expression that matches everything of <a> not matched by <b> is !(!<a>|<b>)"
     // here <a> is [:letter:] and <b> is CJ, i.e. LETTER is [:letter:] minus CJ
 115 LETTER     = !(![:letter:]|{CJ})
 116
 117 // Chinese and Japanese (but NOT Korean, which is included in [:letter:])
 118 CJ         = [\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
 119
     // a CR LF pair, or a single space, CR, LF, tab or form feed
 120 WHITESPACE = \r\n | [ \r\n\t\f]
 121
 122 %%
 123
     // Lexical rules: each action returns the token type id that shares the
     // name of the macro it matched.
     // NOTE(review): per the JFlex manual the longest match wins and, when two
     // rules match the same length, the one listed first is chosen - so the
     // order of the rules below is significant; do not reorder without checking.
 124 {ALPHANUM}                                                     { return ALPHANUM; }
 125 {APOSTROPHE}                                                   { return APOSTROPHE; }
 126 {ACRONYM}                                                      { return ACRONYM; }
 127 {COMPANY}                                                      { return COMPANY; }
 128 {EMAIL}                                                        { return EMAIL; }
 129 {HOST}                                                         { return HOST; }
 130 {NUM}                                                          { return NUM; }
 131 {CJ}                                                           { return CJ; }
 132 {ACRONYM_DEP}                                                  { return ACRONYM_DEP; }
 133
 134 /** Ignore the rest */
     // Any other single character matched by "." and any WHITESPACE: the action
     // contains no return statement, so no token type is produced for this match.
 135 . | {WHITESPACE}                                               { /* ignore */ }