lucene-java-3.5.0/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizerImpl.jflex

   1 package org.apache.lucene.analysis.standard;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  21
  22 /**
  23  * This class implements Word Break rules from the Unicode Text Segmentation
  24  * algorithm, as specified in
  25  * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>.
  26  * URLs and email addresses are also tokenized according to the relevant RFCs.
  27  * <p/>
  28  * Tokens produced are of the following types:
  29  * <ul>
  30  *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
  31  *   <li>&lt;NUM&gt;: A number</li>
  32  *   <li>&lt;URL&gt;: A URL</li>
  33  *   <li>&lt;EMAIL&gt;: An email address</li>
  34  *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
  35  *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
  36  *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
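  37  *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
      *   <li>&lt;KATAKANA&gt;: A sequence of katakana characters</li>
      *   <li>&lt;HANGUL&gt;: A sequence of Hangul characters</li>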
  38  * </ul>
  39  */
  40 %%
  41
     // JFlex scanner options.
     // Use Unicode 6.0 character data for the \p{...} property classes below.
  42 %unicode 6.0
     // The scanning method is declared to return int.
  43 %integer
     // The generated class is public, final, named UAX29URLEmailTokenizerImpl,
     // and implements StandardTokenizerInterface.
  44 %final
  45 %public
  46 %class UAX29URLEmailTokenizerImpl
  47 %implements StandardTokenizerInterface
     // The scanning method is named getNextToken.
  48 %function getNextToken
     // Turn on character counting; this maintains the yychar field that the
     // yychar() accessor in the user code below returns.
  49 %char
  50
     // Character-class macros.  Each one is the union of a Unicode property class
     // and a matching {...Supp} macro.  The {...Supp} macros are not defined in
     // this file; presumably they come from the included SUPPLEMENTARY.jflex-macro.
     // WB is the Word_Break property and LB is the Line_Break property.
  51 %include src/java/org/apache/lucene/analysis/standard/SUPPLEMENTARY.jflex-macro
  52 ALetter = ([\p{WB:ALetter}] | {ALetterSupp})
  53 Format =  ([\p{WB:Format}] | {FormatSupp})
  54 Numeric = ([\p{WB:Numeric}] | {NumericSupp})
  55 Extend =  ([\p{WB:Extend}] | {ExtendSupp})
  56 Katakana = ([\p{WB:Katakana}] | {KatakanaSupp})
  57 MidLetter = ([\p{WB:MidLetter}] | {MidLetterSupp})
  58 MidNum = ([\p{WB:MidNum}] | {MidNumSupp})
  59 MidNumLet = ([\p{WB:MidNumLet}] | {MidNumLetSupp})
  60 ExtendNumLet = ([\p{WB:ExtendNumLet}] | {ExtendNumLetSupp})
  61 ComplexContext = ([\p{LB:Complex_Context}] | {ComplexContextSupp})
  62 Han = ([\p{Script:Han}] | {HanSupp})
  63 Hiragana = ([\p{Script:Hiragana}] | {HiraganaSupp})
  64
  65 // Script=Hangul & ALetter
     // (the double negation !(!A|!B) expresses the intersection of A and B)
  66 HangulEx       = (!(!\p{Script:Hangul}|!\p{WB:ALetter})) ({Format} | {Extend})*
  67 // UAX#29 WB4. X (Extend | Format)* --> X
  68 //
     // Each *Ex macro below is its base class followed by any number of Format or
     // Extend characters, so those trailing characters stay attached to the base.
  69 ALetterEx      = {ALetter}                     ({Format} | {Extend})*
  70 // TODO: Convert hard-coded full-width numeric range to property intersection (something like [\p{Full-Width}&&\p{Numeric}]) once JFlex supports it
  71 NumericEx      = ({Numeric} | [\uFF10-\uFF19]) ({Format} | {Extend})*
  72 KatakanaEx     = {Katakana}                    ({Format} | {Extend})*
     // MidNumLet is accepted in both the letter and the numeric "mid" positions.
  73 MidLetterEx    = ({MidLetter} | {MidNumLet})   ({Format} | {Extend})*
  74 MidNumericEx   = ({MidNum} | {MidNumLet})      ({Format} | {Extend})*
  75 ExtendNumLetEx = {ExtendNumLet}                ({Format} | {Extend})*
  76
  77 HanEx = {Han} ({Format} | {Extend})*
  78 HiraganaEx = {Hiragana} ({Format} | {Extend})*
  79
  80 // URL and E-mail syntax specifications:
  81 //
  82 //     RFC-952:  DOD INTERNET HOST TABLE SPECIFICATION
  83 //     RFC-1035: DOMAIN NAMES - IMPLEMENTATION AND SPECIFICATION
  84 //     RFC-1123: Requirements for Internet Hosts - Application and Support
  85 //     RFC-1738: Uniform Resource Locators (URL)
  86 //     RFC-3986: Uniform Resource Identifier (URI): Generic Syntax
  87 //     RFC-5234: Augmented BNF for Syntax Specifications: ABNF
  88 //     RFC-5321: Simple Mail Transfer Protocol
  89 //     RFC-5322: Internet Message Format
  90
  91 %include src/java/org/apache/lucene/analysis/standard/ASCIITLD.jflex-macro
  92
     // A host-name label: starts and ends with an ASCII letter or digit; letters,
     // digits and hyphens are allowed in between.
  93 DomainLabel = [A-Za-z0-9] ([-A-Za-z0-9]* [A-Za-z0-9])?
     // The strict form must end with {ASCIITLD}, which is not defined in this file
     // (presumably it comes from the included ASCIITLD.jflex-macro).  The loose
     // form accepts any sequence of dot-separated labels.
  94 DomainNameStrict = {DomainLabel} ("." {DomainLabel})* {ASCIITLD}
  95 DomainNameLoose  = {DomainLabel} ("." {DomainLabel})*
  96
     // One IPv4 octet: decimal 0-255, optionally zero-padded to at most three digits.
  97 IPv4DecimalOctet = "0"{0,2} [0-9] | "0"? [1-9][0-9] | "1" [0-9][0-9] | "2" ([0-4][0-9] | "5" [0-5])
     // Four octets separated by dots.
  98 IPv4Address  = {IPv4DecimalOctet} ("." {IPv4DecimalOctet}){3}
     // One 16-bit IPv6 group: one to four hex digits.
  99 IPv6Hex16Bit = [0-9A-Fa-f]{1,4}
     // The last 32 bits of an IPv6 address: an embedded IPv4 address or two hex groups.
 100 IPv6LeastSignificant32Bits = {IPv4Address} | ({IPv6Hex16Bit} ":" {IPv6Hex16Bit})
     // The alternatives differ in how many hex groups may appear before and after
     // the "::" elision; the layout appears to follow the IPv6address production
     // of RFC-3986.
 101 IPv6Address =                                                  ({IPv6Hex16Bit} ":"){6} {IPv6LeastSignificant32Bits}
 102             |                                             "::" ({IPv6Hex16Bit} ":"){5} {IPv6LeastSignificant32Bits}
 103             |                            {IPv6Hex16Bit}?  "::" ({IPv6Hex16Bit} ":"){4} {IPv6LeastSignificant32Bits}
 104             | (({IPv6Hex16Bit} ":"){0,1} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){3} {IPv6LeastSignificant32Bits}
 105             | (({IPv6Hex16Bit} ":"){0,2} {IPv6Hex16Bit})? "::" ({IPv6Hex16Bit} ":"){2} {IPv6LeastSignificant32Bits}
 106             | (({IPv6Hex16Bit} ":"){0,3} {IPv6Hex16Bit})? "::"  {IPv6Hex16Bit} ":"     {IPv6LeastSignificant32Bits}
 107             | (({IPv6Hex16Bit} ":"){0,4} {IPv6Hex16Bit})? "::"                         {IPv6LeastSignificant32Bits}
 108             | (({IPv6Hex16Bit} ":"){0,5} {IPv6Hex16Bit})? "::"                         {IPv6Hex16Bit}
 109             | (({IPv6Hex16Bit} ":"){0,6} {IPv6Hex16Bit})? "::"
 110
     // Building blocks for URI components: unreserved characters, a percent sign
     // followed by two hex digits, and the sub-delimiter punctuation set.
 111 URIunreserved = [-._~A-Za-z0-9]
 112 URIpercentEncoded = "%" [0-9A-Fa-f]{2}
 113 URIsubDelims = [!$&'()*+,;=]
     // Login information: one segment, optionally ":" and a second segment, then "@".
     // Either segment may be empty.
 114 URIloginSegment = ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims})*
 115 URIlogin = {URIloginSegment} (":" {URIloginSegment})? "@"
     // Query ("?...") and fragment ("#...") share the same body character set.
 116 URIquery    = "?" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
 117 URIfragment = "#" ({URIunreserved} | {URIpercentEncoded} | {URIsubDelims} | [:@/?])*
     // Port: ":" followed by one to five decimal digits.
 118 URIport = ":" [0-9]{1,5}
     // Host: a bracketed IPv6 address, an IPv4 address, or a domain name.  The two
     // variants differ only in using the strict or the loose domain-name macro.
 119 URIhostStrict = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameStrict}
 120 URIhostLoose  = ("[" {IPv6Address} "]") | {IPv4Address} | {DomainNameLoose}
 121
     // The strict authority has no login part; the loose authority allows one.
 122 URIauthorityStrict =             {URIhostStrict} {URIport}?
 123 URIauthorityLoose  = {URIlogin}? {URIhostLoose}  {URIport}?
 124
     // HTTP path: zero or more "/"-prefixed segments.
 125 HTTPsegment = ({URIunreserved} | {URIpercentEncoded} | [;:@&=])*
 126 HTTPpath = ("/" {HTTPsegment})*
     // Case-insensitive "http" or "https" followed by "://".
 127 HTTPscheme = [hH][tT][tT][pP][sS]? "://"
     // With an explicit scheme the loose authority is accepted.
 128 HTTPurlFull = {HTTPscheme} {URIauthorityLoose}  {HTTPpath}? {URIquery}? {URIfragment}?
 129 // {HTTPurlNoScheme} excludes {URIlogin}, because it could otherwise accept e-mail addresses
 130 HTTPurlNoScheme =          {URIauthorityStrict} {HTTPpath}? {URIquery}? {URIfragment}?
 131 HTTPurl = {HTTPurlFull} | {HTTPurlNoScheme}
 132
     // FTP and FILE URLs require a path with at least one "/".
 133 FTPorFILEsegment = ({URIunreserved} | {URIpercentEncoded} | [?:@&=])*
 134 FTPorFILEpath = "/" {FTPorFILEsegment} ("/" {FTPorFILEsegment})*
     // Optional FTP type suffix: ";type=" (case-insensitive) followed by a, i or d.
 135 FTPtype = ";" [tT][yY][pP][eE] "=" [aAiIdD]
 136 FTPscheme = [fF][tT][pP] "://"
 137 FTPurl = {FTPscheme} {URIauthorityLoose} {FTPorFILEpath} {FTPtype}? {URIfragment}?
 138
     // In a FILE URL the host is optional, so "file:///path" is accepted.
 139 FILEscheme = [fF][iI][lL][eE] "://"
 140 FILEurl = {FILEscheme} {URIhostLoose}? {FTPorFILEpath} {URIfragment}?
 141
     // Any of the three supported URL forms.
 142 URL = {HTTPurl} | {FTPurl} | {FILEurl}
 143
     // Quoted local-part label: a double-quoted run of the listed ASCII characters,
     // in which a backslash may escape any character in U+0000-U+007F.
 144 EMAILquotedString = [\"] ([\u0001-\u0008\u000B\u000C\u000E-\u0021\u0023-\u005B\u005D-\u007E] | [\\] [\u0000-\u007F])* [\"]
     // Characters allowed in an unquoted atom (RFC-5322 "atext").  The hyphen is
     // listed first so that it is a literal: written as "+-/" inside the class it
     // forms the range U+002B-U+002F, which also admits "," and "." and lets text
     // such as "a,b@example.com" or "a..b@example.com" match as a single address.
 145 EMAILatomText = [-A-Za-z0-9!#$%&'*+/=?\^_`{|}~]
     // A label is either a run of atom characters or one quoted string.
 146 EMAILlabel = {EMAILatomText}+ | {EMAILquotedString}
     // The local part is one or more labels separated by single dots.
 147 EMAILlocalPart = {EMAILlabel} ("." {EMAILlabel})*
     // Text allowed inside a bracketed domain literal: the listed ASCII characters,
     // or a backslash followed by any character in U+0000-U+007F.
 148 EMAILdomainLiteralText = [\u0001-\u0008\u000B\u000C\u000E-\u005A\u005E-\u007F] | [\\] [\u0000-\u007F]
 149 // DFA minimization allows {IPv6Address} and {IPv4Address} to be included
 150 // in the {EMAILbracketedHost} definition without incurring any size penalties,
 151 // since {EMAILdomainLiteralText} recognizes all valid IP addresses.
 152 // The IP address regexes are included in {EMAILbracketedHost} simply as a
 153 // reminder that they are acceptable bracketed host forms.
 154 EMAILbracketedHost = "[" ({EMAILdomainLiteralText}* | {IPv4Address} | [iI][pP][vV] "6:" {IPv6Address}) "]"
     // An address is a local part, "@", and then either a strict domain name
     // (one ending in {ASCIITLD}) or a bracketed host.
 155 EMAIL = {EMAILlocalPart} "@" ({DomainNameStrict} | {EMAILbracketedHost})
 156
 157
 158 %{
 159   /** Alphanumeric sequences */
 160   public static final int WORD_TYPE = UAX29URLEmailTokenizer.ALPHANUM;
 161
 162   /** Numbers */
 163   public static final int NUMERIC_TYPE = UAX29URLEmailTokenizer.NUM;
 164
 165   /**
 166    * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
 167    * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept
 168    * together as a single token rather than broken up, because the logic
 169    * required to break them at word boundaries is too complex for UAX#29.
 170    * <p>
 171    * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
 172    */
 173   public static final int SOUTH_EAST_ASIAN_TYPE = UAX29URLEmailTokenizer.SOUTHEAST_ASIAN;
 174
       /** Token type returned by the HanEx rule */
 175   public static final int IDEOGRAPHIC_TYPE = UAX29URLEmailTokenizer.IDEOGRAPHIC;
 176
       /** Token type returned by the HiraganaEx rule */
 177   public static final int HIRAGANA_TYPE = UAX29URLEmailTokenizer.HIRAGANA;
 178
       /** Token type returned for a run of one or more KatakanaEx matches */
 179   public static final int KATAKANA_TYPE = UAX29URLEmailTokenizer.KATAKANA;
 180
       /** Token type returned for a run of one or more HangulEx matches */
 181   public static final int HANGUL_TYPE = UAX29URLEmailTokenizer.HANGUL;
 182
       /** Token type returned by the EMAIL rule */
 183   public static final int EMAIL_TYPE = UAX29URLEmailTokenizer.EMAIL;
 184
       /** Token type returned by the URL rule */
 185   public static final int URL_TYPE = UAX29URLEmailTokenizer.URL;
 186
       /**
        * Returns the scanner's yychar field, which is maintained because the
        * %char option is enabled for this scanner.
        */
 187   public final int yychar()
 188   {
 189     return yychar;
 190   }
 191
 192   /**
 193    * Fills CharTermAttribute with the current token text.
        * <p>
        * Copies (zzMarkedPos - zzStartRead) chars from zzBuffer, starting at
        * zzStartRead, into the given attribute.
        *
        * @param t the attribute that receives the token text
 194    */
 195   public final void getText(CharTermAttribute t) {
 196     t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
 197   }
 198 %}
 199
 200 %%
 201
 202 // UAX#29 WB1.  sot     ÷
 203 //        WB2.          ÷      eot
 204 //
     // At end of input, return the interface's YYEOF value.
 205 <<EOF>> { return StandardTokenizerInterface.YYEOF; }
 206
     // URL and e-mail rules come before the word-break rules.  JFlex picks the
     // longest match and, between equally long matches, the rule listed first,
     // so these two win any tie with the rules below.
 207 {URL}   { return URL_TYPE; }
 208 {EMAIL} { return EMAIL_TYPE; }
 209
 210 // UAX#29 WB8.   Numeric × Numeric
 211 //        WB11.  Numeric (MidNum | MidNumLet) × Numeric
 212 //        WB12.  Numeric × (MidNum | MidNumLet) Numeric
 213 //        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
 214 //        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 215 //
     // A token containing numerics but no letters: numerics optionally joined by
     // ExtendNumLet runs or by a single MidNum/MidNumLet, with optional leading
     // and trailing ExtendNumLet characters.
 216 {ExtendNumLetEx}* {NumericEx} ({ExtendNumLetEx}+ {NumericEx}
 217                               | {MidNumericEx} {NumericEx}
 218                               | {NumericEx})*
 219 {ExtendNumLetEx}*
 220   { return NUMERIC_TYPE; }
 221
 222 // subset of the below for typing purposes only!
     // The next two rules match subsets of the word rule that follows; they are
     // listed first so that all-Hangul and all-Katakana runs get their own types.
 223 {HangulEx}+
 224   { return HANGUL_TYPE; }
 225
 226 {KatakanaEx}+
 227   { return KATAKANA_TYPE; }
 228
 229 // UAX#29 WB5.   ALetter × ALetter
 230 //        WB6.   ALetter × (MidLetter | MidNumLet) ALetter
 231 //        WB7.   ALetter (MidLetter | MidNumLet) × ALetter
 232 //        WB9.   ALetter × Numeric
 233 //        WB10.  Numeric × ALetter
 234 //        WB13.  Katakana × Katakana
 235 //        WB13a. (ALetter | Numeric | Katakana | ExtendNumLet) × ExtendNumLet
 236 //        WB13b. ExtendNumLet × (ALetter | Numeric | Katakana)
 237 //
     // General word rule.  A "chunk" is either a Katakana run or a mix of
     // numeric and letter runs; chunks are joined by one or more ExtendNumLet
     // characters, with optional leading and trailing ExtendNumLet characters.
     // The chunk pattern is written out twice: once for the first chunk and once
     // inside the repetition for the chunks that follow.
 238 {ExtendNumLetEx}*  ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
 239                    | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
 240                      | {ALetterEx}  ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx}  {ALetterEx} | {ALetterEx})* )+ )
 241 ({ExtendNumLetEx}+ ( {KatakanaEx} ({ExtendNumLetEx}* {KatakanaEx})*
 242                    | ( {NumericEx}  ({ExtendNumLetEx}+ {NumericEx} | {MidNumericEx} {NumericEx} | {NumericEx})*
 243                      | {ALetterEx}  ({ExtendNumLetEx}+ {ALetterEx} | {MidLetterEx}  {ALetterEx} | {ALetterEx})* )+ ) )*
 244 {ExtendNumLetEx}*
 245   { return WORD_TYPE; }
 246
 247
 248 // From UAX #29:
 249 //
 250 //    [C]haracters with the Line_Break property values of Contingent_Break (CB),
 251 //    Complex_Context (SA/South East Asian), and XX (Unknown) are assigned word
 252 //    boundary property values based on criteria outside of the scope of this
 253 //    annex.  That means that satisfactory treatment of languages like Chinese
 254 //    or Thai requires special handling.
 255 //
 256 // In Unicode 6.0, only one character has the \p{Line_Break = Contingent_Break}
 257 // property: U+FFFC OBJECT REPLACEMENT CHARACTER.
 258 //
 259 // In the ICU implementation of UAX#29, \p{Line_Break = Complex_Context}
 260 // character sequences (from South East Asian scripts like Thai, Myanmar, Khmer,
 261 // Lao, etc.) are kept together.  This grammar does the same below.
 262 //
 263 // See also the Unicode Line Breaking Algorithm:
 264 //
 265 //    http://www.unicode.org/reports/tr14/#SA
 266 //
 267 {ComplexContext}+ { return SOUTH_EAST_ASIAN_TYPE; }
 268
 269 // UAX#29 WB14.  Any ÷ Any
 270 //
     // Han and Hiragana are matched one base character at a time (plus any
     // trailing Format/Extend characters), so each becomes its own token.
 271 {HanEx} { return IDEOGRAPHIC_TYPE; }
 272 {HiraganaEx} { return HIRAGANA_TYPE; }
 273
 274
 275 // UAX#29 WB3.   CR × LF
 276 //        WB3a.  (Newline | CR | LF) ÷
 277 //        WB3b.  ÷ (Newline | CR | LF)
 278 //        WB14.  Any ÷ Any
 279 //
     // Catch-all: [^] matches any single character.  The action returns nothing,
     // so no token is produced for a character that no rule above matched.
 280 [^] { /* Break so we don't hit fall-through warning: */ break;/* Not numeric, word, ideographic, hiragana, or SE Asian -- ignore it. */ }