lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/StandardTokenizerImpl.jflex

   1 package org.apache.lucene.analysis.standard;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  21
  22 /**
  23  * This class implements Word Break rules from the Unicode Text Segmentation
  24  * algorithm, as specified in
  25  * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
  26  * <p/>
  27  * Tokens produced are of the following types:
  28  * <ul>
  29  *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
  30  *   <li>&lt;NUM&gt;: A number</li>
  31  *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
  32  *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
  33  *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
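  34  *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
      *   <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
      *   <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>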
  35  * </ul>
  36  */
  37 %%
  38
  39 %unicode 6.0
  40 %integer
  41 %final
  42 %public
  43 %class StandardTokenizerImpl
  44 %implements StandardTokenizerInterface
  45 %function getNextToken
  46 %char
  47
  48 %include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
  49 ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
  50 Format =  ([\p{WB:Format}] | {FormatSupp})
  51 Numeric = ([\p{WB:Numeric}] | {NumericSupp})
  52 Extend =  ([\p{WB:Extend}] | {ExtendSupp})
  53 Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
  54 MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
  55 MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
  56 MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
  57 ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
  58 ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
  59 Han = ([\p{Script:Han}] | {HanSupp})
  60 Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
  61
  62 // Script=Hangul & Aletter
  63 HangulEx       = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
  64 // UAX#29 WB4. X (Extend | Format)* --> X
  65 //
  66 ALetterEx      = {ALetter}                     ({Format} | {Extend})*
  67 // TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
  68 NumericEx      = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
  69 KatakanaEx     = {Katakana}                    ({Format} | {Extend})*
  70 MidLetterEx    = ({MidLetter} | {MidNumLet})   ({Format} | {Extend})*
  71 MidNumericEx   = ({MidNum} | {MidNumLet})      ({Format} | {Extend})*
  72 ExtendNumLetEx = {ExtendNumLet}                ({Format} | {Extend})*
  73
  74 HanEx = {Han} ({Format} | {Extend})*
  75 HiraganaEx = {Hiragana} ({Format} | {Extend})*
  76
  77 %{
  78   /** Alphanumeric sequences */
  79   public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
  80
  81   /** Numbers */
  82   public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
  83
  84   /**
  85    * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
  86    * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept
  87    * together as as a single token rather than broken up, because the logic
  88    * required to break them at word boundaries is too complex for UAX#29.
  89    * <p>
  90    * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
  91    */
  92   public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
  93
  94   public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
  95
  96   public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
  97
  98   public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
  99
 100   public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
 101
 102   public final int yychar()
 103   {
 104     return yychar;
 105   }
 106
 107   /**
 108    * Fills CharTermAttribute with the current token text.
 109    */
 110   public final void getText(CharTermAttribute t) {
 111     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
 112   }
 113 %}
 114
 115 %%
 116
 117 // UAX#29 WB1.  sot     ÷
 118 //        WB2.          ÷      eot
 119 //
 120 <<EOF>> { return StandardTokenizerInterface.YYEOF; }
 121
 122 // UAX#29 WB8.   Numeric × Numeric
 123 //        WB11.  Numeric (MidNum | MidNumLet) × Numeric
 124 //        WB12.  Numeric × (MidNum | MidNumLet) Numeric
 125 //        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
 126 //        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 127 //
 128 {ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
 129                               | {MidNumericEx} {NumericEx}
 130                               | {NumericEx})*
 131 {ExtendNumLetEx}*
 132   { return NUMERIC_TYPE; }
 133
 134 // subset of the below for typing purposes only!
 135 {HangulEx}+
 136   { return HANGUL_TYPE; }
 137
 138 {KatakanaEx}+
 139   { return KATAKANA_TYPE; }
 140
 141 // UAX#29 WB5.   ALetter × ALetter
 142 //        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
 143 //        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
 144 //        WB9.   ALetter × Numeric
 145 //        WB10.  Numeric × ALetter
 146 //        WB13.  Katakana × Katakana
 147 //        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
 148 //        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 149 //
 150 {ExtendNumLetEx}*  ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
 151                    | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
 152                      | {ALetterEx}  ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx}  {ALetterEx} | {ALetterEx})* )+ )
 153 ({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
 154                    | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
 155                      | {ALetterEx}  ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx}  {ALetterEx} | {ALetterEx})* )+ ) )*
 156 {ExtendNumLetEx}*
 157   { return WORD_TYPE; }
 158
 159
 160 // From UAX #29:
 161 //
 162 //    [C]haracters with the Line_Break property values of Contingent_Break (CB),
 163 //    Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
 164 //    boundary property values based on criteria outside of the scope of this
 165 //    annex.  That means that satisfactory treatment of languages like Chinese
 166 //    or Thai requires special handling.
 167 //
 168 // In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
 169 // property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
 170 //
 171 // In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
 172 // character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
 173 // Lao, etc.) are kept together.  This grammar does the same below.
 174 //
 175 // See also the Unicode Line Breaking Algorithm:
 176 //
 177 //    http://www.unicode.org/reports/tr14/#SA
 178 //
 179 {ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
 180
 181 // UAX#29 WB14.  Any ÷ Any
 182 //
 183 {HanEx} { return IDEOGRAPHIC_TYPE; }
 184 {HiraganaEx} { return HIRAGANA_TYPE; }
 185
 186
 187 // UAX#29 WB3.   CR × LF
 188 //        WB3a.  (Newline | CR | LF) ÷
 189 //        WB3b.  ÷ (Newline | CR | LF)
 190 //        WB14.  Any ÷ Any
 191 //
 192 [^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }