lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/std31/StandardTokenizerImpl31.jflex

   1 package org.apache.lucene.analysis.standard.std31;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.standard.StandardTokenizer;
  21 import org.apache.lucene.analysis.standard.StandardTokenizerInterface;
  22 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  23
  24 /**
  25  * This class implements StandardTokenizer, except with a bug
  26  * (https://issues.apache.org/jira/browse/LUCENE-3358) where Han and Hiragana
  27  * characters would be split from combining characters.
  28  * @deprecated This class is only for exact backwards compatibility
  29  */
  30 @Deprecated
  31 %%
  32
     // ---- JFlex scanner options ----
     // %unicode 6.0    : resolve the \p{...} property classes used below against
     //                   Unicode 6.0 data.
     // %integer        : the scanning method returns int (the *_TYPE constants or YYEOF).
     // %final, %public : modifiers applied to the generated class.
     // %class          : name of the generated class.
     // %implements     : interface the generated class declares.
     // %function       : name of the generated scanning method.
     // %char           : enable character counting; yychar() in the user code
     //                   section exposes the counter.
     // (Directive meanings are taken from the JFlex manual -- re-check them if
     // the JFlex version used to regenerate the scanner changes.)
  33 %unicode 6.0
  34 %integer
  35 %final
  36 %public
  37 %class StandardTokenizerImpl31
  38 %implements StandardTokenizerInterface
  39 %function getNextToken
  40 %char
  41
     // Base character classes.  Every macro below is the union of a Unicode
     // property class and a like-named *Supp macro.  The *Supp macros are not
     // defined in this file; presumably they come from the %include'd
     // SUPPLEMENTARY.jflex-macro and cover supplementary (non-BMP) characters
     // -- TODO confirm against that file.
     // Property prefixes: WB = Word_Break (UAX#29), LB = Line_Break (UAX#14),
     // Script = Unicode script.
  42 %include src/java/org/apache/lucene/analysis/standard/std31/SUPPLEMENTARY.jflex-macro
  43 ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
  44 Format =  ([\p{WB:Format}] | {FormatSupp})
  45 Numeric = ([\p{WB:Numeric}] | {NumericSupp})
  46 Extend =  ([\p{WB:Extend}] | {ExtendSupp})
  47 Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
  48 MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
  49 MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
  50 MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
  51 ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
  52 ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
  53 Han = ([\p{Script:Han}] | {HanSupp})
  54 Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
  55
  56 // Script=Hangul & ALetter
     // !(!A | !B) is the De Morgan spelling of "A and B": a character that has
     // both Script=Hangul and Word_Break=ALetter.  Any Format/Extend characters
     // that follow it are absorbed into the same match.
  57 HangulEx       = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
  58 // UAX#29 WB4. X (Extend | Format)* --> X
  59 //
     // Each *Ex macro below is a base character followed by any run of
     // Format/Extend characters, so that run stays attached to its base
     // character and the rules can treat the pair as a single unit.
  60 ALetterEx      = {ALetter}                     ({Format} | {Extend})*
  61 // TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
  62 NumericEx      = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
  63 KatakanaEx     = {Katakana}                    ({Format} | {Extend})*
     // MidNumLet characters are accepted in both the letter "mid" position and
     // the numeric "mid" position.
  64 MidLetterEx    = ({MidLetter} | {MidNumLet})   ({Format} | {Extend})*
  65 MidNumericEx   = ({MidNum} | {MidNumLet})      ({Format} | {Extend})*
  66 ExtendNumLetEx = {ExtendNumLet}                ({Format} | {Extend})*
  67
  68
  69 %{
       // User code copied into the generated scanner class: the token-type
       // constants returned by the lexical rules, plus two small accessors.

  70   /** Alphanumeric sequences */
  71   public static final int WORD_TYPE = StandardTokenizer.ALPHANUM;
  72
  73   /** Numbers */
  74   public static final int NUMERIC_TYPE = StandardTokenizer.NUM;
  75
  76   /**
  77    * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
  78    * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept
  79    * together as a single token rather than broken up, because the logic
  80    * required to break them at word boundaries is too complex for UAX#29.
  81    * <p>
  82    * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
  83    */
  84   public static final int SOUTH_EAST_ASIAN_TYPE = StandardTokenizer.SOUTHEAST_ASIAN;
  85
       /** Token type returned by the rule that matches a single {Han} character. */
  86   public static final int IDEOGRAPHIC_TYPE = StandardTokenizer.IDEOGRAPHIC;
  87
       /** Token type returned by the rule that matches a single {Hiragana} character. */
  88   public static final int HIRAGANA_TYPE = StandardTokenizer.HIRAGANA;
  89
       /** Token type returned for a run made up only of {KatakanaEx} matches. */
  90   public static final int KATAKANA_TYPE = StandardTokenizer.KATAKANA;
  91
       /** Token type returned for a run made up only of {HangulEx} matches. */
  92   public static final int HANGUL_TYPE = StandardTokenizer.HANGUL;
  93
       /**
        * Returns the scanner's {@code yychar} counter.
        * <p>
        * The field is maintained by the generated scanner because the
        * {@code %char} option is set; per the JFlex manual it holds the number of
        * characters from the beginning of input to the start of the current match.
        *
        * @return the current value of {@code yychar}
        */
  94   public final int yychar()
  95   {
  96     return yychar;
  97   }
  98
  99   /**
 100    * Fills CharTermAttribute with the current token text.
        * <p>
        * Copies {@code zzMarkedPos - zzStartRead} chars out of the scanner buffer
        * {@code zzBuffer}, starting at offset {@code zzStartRead}.  These three
        * fields are not declared in this file; they belong to the generated scanner.
        *
        * @param t attribute that receives the characters of the current match
 101    */
 102   public final void getText(CharTermAttribute t) {
 103     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
 104   }
 105 %}
 106
 107 %%
 108
 109 // UAX#29 WB1.  sot     ÷
 110 //        WB2.          ÷      eot
 111 //
     // End of input: hand StandardTokenizerInterface.YYEOF back to the caller
     // instead of one of the *_TYPE token types.
 112 <<EOF>> { return StandardTokenizerInterface.YYEOF; }
 113
 114 // UAX#29 WB8.   Numeric × Numeric
 115 //        WB11.  Numeric (MidNum | MidNumLet) × Numeric
 116 //        WB12.  Numeric × (MidNum | MidNumLet) Numeric
 117 //        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
 118 //        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 119 //
     // Numeric token: one or more {NumericEx}, where consecutive ones may be
     // adjacent, separated by a single {MidNumericEx}, or separated by one or
     // more {ExtendNumLetEx}; {ExtendNumLetEx} may also lead or trail the token.
     //
     // Everything this pattern accepts appears to be accepted by the WORD_TYPE
     // rule further down as well.  JFlex picks the earliest rule among equally
     // long matches, so this rule has to stay above the word rule for such
     // input to be reported as NUMERIC_TYPE.
 120 {ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
 121                               | {MidNumericEx} {NumericEx}
 122                               | {NumericEx})*
 123 {ExtendNumLetEx}*
 124   { return NUMERIC_TYPE; }
 125
 126 // subset of the below for typing purposes only!
     // A run consisting solely of {HangulEx} matches, or solely of {KatakanaEx}
     // matches, is given its own token type here.  These two rules must stay
     // above the general word rule, which would otherwise claim the same input
     // as WORD_TYPE (JFlex prefers the earlier rule among equally long matches).
 127 {HangulEx}+
 128   { return HANGUL_TYPE; }
 129
 130 {KatakanaEx}+
 131   { return KATAKANA_TYPE; }
 132
 133 // UAX#29 WB5.   ALetter × ALetter
 134 //        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
 135 //        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
 136 //        WB9.   ALetter × Numeric
 137 //        WB10.  Numeric × ALetter
 138 //        WB13.  Katakana × Katakana
 139 //        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
 140 //        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 141 //
     // General word token.  Reading the pattern top to bottom:
     //   - optional leading {ExtendNumLetEx};
     //   - a first "group", which is EITHER a Katakana chain ({KatakanaEx}
     //     optionally joined by {ExtendNumLetEx}) OR one or more letter/numeric
     //     pieces.  Inside a numeric piece two {NumericEx} may be joined by one
     //     {MidNumericEx}; inside a letter piece two {ALetterEx} may be joined
     //     by one {MidLetterEx}; either kind may also be joined by {ExtendNumLetEx};
     //   - any number of further groups, each introduced by at least one
     //     {ExtendNumLetEx} (the only way a Katakana chain and a letter/numeric
     //     group end up in the same token);
     //   - optional trailing {ExtendNumLetEx}.
     // The group sub-pattern is intentionally written out twice (first group and
     // repeated group); keep the two copies identical when editing.
 142 {ExtendNumLetEx}*  ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
 143                    | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
 144                      | {ALetterEx}  ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx}  {ALetterEx} | {ALetterEx})* )+ )
 145 ({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
 146                    | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
 147                      | {ALetterEx}  ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx}  {ALetterEx} | {ALetterEx})* )+ ) )*
 148 {ExtendNumLetEx}*
 149   { return WORD_TYPE; }
 150
 151
 152 // From UAX #29:
 153 //
 154 //    [C]haracters with the Line_Break property values of Contingent_Break (CB),
 155 //    Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
 156 //    boundary property values based on criteria outside of the scope of this
 157 //    annex.  That means that satisfactory treatment of languages like Chinese
 158 //    or Thai requires special handling.
 159 //
 160 // In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
 161 // property: U+FFFC (  ) OBJECT REPLACEMENT CHARACTER.
 162 //
 163 // In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
 164 // character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
 165 // Lao, etc.) are kept together.  This grammar does the same below.
 166 //
 167 // See also the Unicode Line Breaking Algorithm:
 168 //
 169 //    http://www.unicode.org/reports/tr14/#SA
 170 //
     // One or more consecutive {ComplexContext} characters are returned as a
     // single SOUTH_EAST_ASIAN_TYPE token; no attempt is made to split them.
 171 {ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
 172
 173 // UAX#29 WB14.  Any ÷ Any
 174 //
     // Every single {Han} or {Hiragana} match becomes a token of its own.
     // NOTE(review): unlike the *Ex macros, these two patterns do not absorb
     // trailing {Format}/{Extend} characters.  That looks like the LUCENE-3358
     // behaviour described in the class comment, which this std31 scanner keeps
     // on purpose for backwards compatibility -- do not "fix" it here.
 175 {Han} { return IDEOGRAPHIC_TYPE; }
 176 {Hiragana} { return HIRAGANA_TYPE; }
 177
 178
 179 // UAX#29 WB3.   CR × LF
 180 //        WB3a.  (Newline | CR | LF) ÷
 181 //        WB3b.  ÷ (Newline | CR | LF)
 182 //        WB14.  Any ÷ Any
 183 //
     // Fallback: [^] matches any single character that none of the rules above
     // claimed.  The action contains no return statement, so no token is
     // reported for it and scanning moves on to the next input.  Keep this as
     // the last rule so it never shadows a real token rule of equal length.
 184 [^] { /* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }