add --shared
[pylucene.git] / lucene-java-3.4.0 / lucene / contrib / analyzers / common / src / java / org / apache / lucene / analysis / wikipedia / WikipediaTokenizerImpl.jflex
1 package org.apache.lucene.analysis.wikipedia;
2
3 /**
4  * Licensed to the Apache Software Foundation (ASF) under one or more
5  * contributor license agreements.  See the NOTICE file distributed with
6  * this work for additional information regarding copyright ownership.
7  * The ASF licenses this file to You under the Apache License, Version 2.0
8  * (the "License"); you may not use this file except in compliance with
9  * the License.  You may obtain a copy of the License at
10  *
11  *     http://www.apache.org/licenses/LICENSE-2.0
12  *
13  * Unless required by applicable law or agreed to in writing, software
14  * distributed under the License is distributed on an "AS IS" BASIS,
15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  * See the License for the specific language governing permissions and
17  * limitations under the License.
18  */
19
20 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
21
22 %%
23 // JFlex options: generated class name, Unicode 3.0 tables, int-returning scan method named getNextToken, packed tables, character counting (yychar)
24 %class WikipediaTokenizerImpl
25 %unicode 3.0
26 %integer
27 %function getNextToken
28 %pack
29 %char
30
31 %{
32 // Token type ids returned by the rule actions; each one aliases the matching *_ID constant of WikipediaTokenizer.
33 public static final int ALPHANUM          = WikipediaTokenizer.ALPHANUM_ID;
34 public static final int APOSTROPHE        = WikipediaTokenizer.APOSTROPHE_ID;
35 public static final int ACRONYM           = WikipediaTokenizer.ACRONYM_ID;
36 public static final int COMPANY           = WikipediaTokenizer.COMPANY_ID;
37 public static final int EMAIL             = WikipediaTokenizer.EMAIL_ID;
38 public static final int HOST              = WikipediaTokenizer.HOST_ID;
39 public static final int NUM               = WikipediaTokenizer.NUM_ID;
40 public static final int CJ                = WikipediaTokenizer.CJ_ID;
41 public static final int INTERNAL_LINK     = WikipediaTokenizer.INTERNAL_LINK_ID;
42 public static final int EXTERNAL_LINK     = WikipediaTokenizer.EXTERNAL_LINK_ID;
43 public static final int CITATION          = WikipediaTokenizer.CITATION_ID;
44 public static final int CATEGORY          = WikipediaTokenizer.CATEGORY_ID;
45 public static final int BOLD              = WikipediaTokenizer.BOLD_ID;
46 public static final int ITALICS           = WikipediaTokenizer.ITALICS_ID;
47 public static final int BOLD_ITALICS      = WikipediaTokenizer.BOLD_ITALICS_ID;
48 public static final int HEADING           = WikipediaTokenizer.HEADING_ID;
49 public static final int SUB_HEADING       = WikipediaTokenizer.SUB_HEADING_ID;
50 public static final int EXTERNAL_LINK_URL = WikipediaTokenizer.EXTERNAL_LINK_URL_ID;
51
52 // Mutable scanner state shared by the rule actions below.
53 private int currentTokType; // type returned for tokens matched inside the wiki markup states
54 private int numBalanced = 0; // incremented when '' opens a quote state from YYINITIAL; reset to 0 by the STRING rules or by the next '' seen in YYINITIAL
55 private int positionInc = 1; // value reported by getPositionIncrement(); the rules only ever assign 0 or 1
56 private int numLinkToks = 0; // {ALPHANUM} tokens returned so far inside the current external link
57 //Whenever a new Wiki reserved token (category, link, etc.) starts, this value is reset to 0; otherwise it is the number of tokens seen so far.
58 //This can be useful for detecting when a new reserved token is encountered,
59 //see https://issues.apache.org/jira/browse/LUCENE-1133
60 private int numWikiTokensSeen = 0;
61 // String array taken from WikipediaTokenizer -- presumably the labels for the type ids above; confirm against that class.
62 public static final String [] TOKEN_TYPES = WikipediaTokenizer.TOKEN_TYPES;
63
64 /**
65  * Reports how many tokens have been seen so far inside the current wiki construct (category, link, etc.).
66  * @return the current value of the wiki token counter; rule actions reset it to 0 when a new construct starts.
67  */
68 public final int getNumWikiTokensSeen() {
69   return this.numWikiTokensSeen;
70 }
71
72 /** Returns the scanner's yychar counter (character counting is switched on by the %char option). */
73 public final int yychar() {
74     return this.yychar;
75 }
76 /** Returns the position increment (0 or 1) most recently assigned by a rule action. */
77 public final int getPositionIncrement() {
78   return this.positionInc;
79 }
80
81 /** Copies the current match, i.e. zzBuffer from zzStartRead up to (not including) zzMarkedPos, into the given term attribute. */
82 final void getText(CharTermAttribute t) {
83   final int matchStart = zzStartRead;
84   final int matchLength = zzMarkedPos - matchStart;
85   t.copyBuffer(zzBuffer, matchStart, matchLength);
86 }
87 /** Appends the current match to the given buffer and returns the number of chars appended. */
88 final int setText(StringBuilder buffer) {
89   final int matchLength = zzMarkedPos - zzStartRead;
90   buffer.append(zzBuffer, zzStartRead, matchLength);
91   return matchLength;
92 }
93
94
95 %}
96
97 // basic word: one or more letters, digits or Korean characters
98 ALPHANUM   = ({LETTER}|{DIGIT}|{KOREAN})+
99
100 // internal apostrophes: O'Reilly, you're, O'Reilly's
101 // use a post-filter to remove possessives
102 APOSTROPHE =  {ALPHA} ("'" {ALPHA})+
103
104 // acronyms: U.S.A., I.B.M., etc. (every letter run, including the last one, is followed by a dot)
105 // use a post-filter to remove dots
106 ACRONYM    =  {ALPHA} "." ({ALPHA} ".")+
107
108 // company names like AT&T and Excite@Home: two letter runs joined by a single "&" or "@"
109 COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}
110
111 // email addresses: segments joined by "." "-" or "_", then "@", then a domain with at least one "." or "-" separator
112 EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
113
114 // hostname: two or more ALPHANUM segments joined by dots
115 HOST       =  {ALPHANUM} ((".") {ALPHANUM})+
116
117 // floating point, serial, model numbers, ip addresses, etc.: segments separated by a single {P} punctuation character
118 // every other segment must have at least one digit (segments alternate between {ALPHANUM} and {HAS_DIGIT})
119 NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
120            | {DIGIT}+ {P} {DIGIT}+
121            | {HAS_DIGIT} {P} {ALPHANUM}
122            | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
123            | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
124            | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
125            | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
126 // simple markup tags such as <br>, </div> or <a href="x">: optional "/", a name, then name="value" attributes (names and values are ALPHANUM)
127 TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"
128
129 // punctuation: separators between NUM segments (also accepted inside external link URLs)
130 P                = ("_"|"-"|"/"|"."|",")
131
132 // a run of letters and digits that contains at least one digit
133 HAS_DIGIT  =
134     ({LETTER}|{DIGIT})*
135     {DIGIT}
136     ({LETTER}|{DIGIT})*
137 // one or more letters
138 ALPHA      = ({LETTER})+
139
140 // ASCII letters, \u00c0-\u00ff without \u00d7 and \u00f7, everything in \u0100-\u1fff, and \uffa0-\uffdc
141 LETTER     = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
142 // ASCII 0-9 plus digit ranges of several other scripts (note: all the non-ASCII ranges also lie inside LETTER's \u0100-\u1fff)
143 DIGIT      = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
144 // \uac00-\ud7af plus \u1100-\u11ff (the latter also lies inside LETTER's \u0100-\u1fff)
145 KOREAN     = [\uac00-\ud7af\u1100-\u11ff]
146
147 // Chinese, Japanese (the first range, \u3040-\u318f, already covers the three ranges listed right after it)
148 CJ         = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
149 // a CRLF pair, or a single space, CR, LF, tab or form feed
150 WHITESPACE = \r\n | [ \r\n\t\f]
151
152 //Wikipedia markup delimiters: [[  ]]  [[Category: (or [[:Category:)  [  ''  <ref>  </ref>  and {{Infobox_ / {{infobox_
153 DOUBLE_BRACKET = "["{2}
154 DOUBLE_BRACKET_CLOSE = "]"{2}
155 DOUBLE_BRACKET_CAT = "["{2}":"?"Category:"
156 EXTERNAL_LINK = "["
157 TWO_SINGLE_QUOTES = "'"{2}
158 CITATION = "<ref>"
159 CITATION_CLOSE = "</ref>"
160 INFOBOX = {DOUBLE_BRACE}("I"|"i")nfobox_
161 //further delimiters: {{  }}  |  ==
162 DOUBLE_BRACE = "{"{2}
163 DOUBLE_BRACE_CLOSE = "}"{2}
164 PIPE = "|"
165 DOUBLE_EQUALS = "="{2}
166
167 // States for the contents of [[Category:...]], [[...]] and [...] links
168 %state CATEGORY_STATE
169 %state INTERNAL_LINK_STATE
170 %state EXTERNAL_LINK_STATE
171 // States entered after '', ''' and ''''' quotes, after ==, and inside {{...}} or <ref>...</ref>; STRING handles the rest of quoted text and === sub-headings
172 %state TWO_SINGLE_QUOTES_STATE
173 %state THREE_SINGLE_QUOTES_STATE
174 %state FIVE_SINGLE_QUOTES_STATE
175 %state DOUBLE_EQUALS_STATE
176 %state DOUBLE_BRACE_STATE
177 %state STRING
178
179 %%
180 // Plain-text tokens outside wiki markup: each rule sets positionInc to 1 and returns its own token type.
181 <YYINITIAL>{ALPHANUM}                                                     {positionInc = 1; return ALPHANUM; }
182 <YYINITIAL>{APOSTROPHE}                                                   {positionInc = 1; return APOSTROPHE; }
183 <YYINITIAL>{ACRONYM}                                                      {positionInc = 1; return ACRONYM; }
184 <YYINITIAL>{COMPANY}                                                      {positionInc = 1; return COMPANY; }
185 <YYINITIAL>{EMAIL}                                                        {positionInc = 1; return EMAIL; }
186 <YYINITIAL>{NUM}                                                          {positionInc = 1; return NUM; }
187 <YYINITIAL>{HOST}                                                         {positionInc = 1; return HOST; }
188 <YYINITIAL>{CJ}                                                           {positionInc = 1; return CJ; }
189
190 //wikipedia: markup openers met in plain text. Each resets numWikiTokensSeen and positionInc and returns no token itself.
191 <YYINITIAL>{
192   //Intended scheme: positionInc is 1 at the double bracket and then 0 for the first token inside the link, so that the link and its
193   //first token share a position while subsequent tokens within the link are incremented.
194   //NOTE(review): the INTERNAL_LINK_STATE actions never assign positionInc = 0; only EXTERNAL_LINK_STATE does -- confirm the intent.
195   {DOUBLE_BRACKET} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);}
196   {DOUBLE_BRACKET_CAT} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);}
197   {EXTERNAL_LINK} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);}
198   {TWO_SINGLE_QUOTES} {numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}}
199   {DOUBLE_EQUALS} {numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);}
200   {DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
201   {CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);}
202 //ignore any other single character, whitespace, and the {{Infobox_ / {{infobox_ prefix
203   . | {WHITESPACE} |{INFOBOX}                                               {numWikiTokensSeen = 0;  positionInc = 1; }
204 }
205
206 <INTERNAL_LINK_STATE>{
207 //Every {ALPHANUM} inside [[...]] is returned as currentTokType. NOTE(review): nothing here sets positionInc to 0 for the first token (cf. EXTERNAL_LINK_STATE) -- confirm that is intended.
208 //This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL
209   {ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
210   {DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL);}
211   //ignore everything else, but set positionInc back to 1
212   . | {WHITESPACE}                                               { positionInc = 1; }
213 }
214
215 <EXTERNAL_LINK_STATE>{
216 //The URL is returned with positionInc 1; the first {ALPHANUM} after it gets positionInc 0 (same position as the URL), later ones get 1 again
217   ("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
218   {ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
219   "]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);}
220   {WHITESPACE}                                               { positionInc = 1; }
221 }
222 //Inside [[Category:...]]: every {ALPHANUM} is returned as currentTokType (set to CATEGORY by the rules that enter this state) until ]] returns to YYINITIAL
223 <CATEGORY_STATE>{
224   {ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
225   {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);}
226   //ignore everything else, but set positionInc back to 1
227   . | {WHITESPACE}                                               { positionInc = 1; }
228 }
229 //italics: entered after ''. One more ' means bold, three more mean bold italics; otherwise the first {ALPHANUM} is returned as ITALICS and STRING takes over
230 <TWO_SINGLE_QUOTES_STATE>{
231   "'" {currentTokType = BOLD;  yybegin(THREE_SINGLE_QUOTES_STATE);}
232    "'''" {currentTokType = BOLD_ITALICS;  yybegin(FIVE_SINGLE_QUOTES_STATE);}
233    {ALPHANUM} {currentTokType = ITALICS; numWikiTokensSeen++;  yybegin(STRING); return currentTokType;/*italics*/}
234    //we can have links inside, let those override
235    {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
236    {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
237    {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
238    //NOTE(review): YYINITIAL enters EXTERNAL_LINK_STATE with EXTERNAL_LINK_URL, but EXTERNAL_LINK is used here, so a URL in such a link is typed EXTERNAL_LINK -- confirm intended
239    //ignore
240   . | {WHITESPACE}                                               { /* ignore */ }
241 }
242 //bold: entered from TWO_SINGLE_QUOTES_STATE with currentTokType already set to BOLD; the first {ALPHANUM} is returned and STRING takes over
243 <THREE_SINGLE_QUOTES_STATE>{
244   {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
245   //we can have links inside, let those override
246    {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);}
247    {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
248    {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
249    //NOTE(review): YYINITIAL uses EXTERNAL_LINK_URL when opening an external link; EXTERNAL_LINK is used here -- confirm intended
250    //ignore
251   . | {WHITESPACE}                                               { /* ignore */ }
252
253 }
254 //bold italics: entered from TWO_SINGLE_QUOTES_STATE with currentTokType already set to BOLD_ITALICS; the first {ALPHANUM} is returned and STRING takes over
255 <FIVE_SINGLE_QUOTES_STATE>{
256   {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
257   //we can have links inside, let those override
258    {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0;  yybegin(INTERNAL_LINK_STATE);}
259    {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);}
260    {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);}
261    //NOTE(review): YYINITIAL uses EXTERNAL_LINK_URL when opening an external link; EXTERNAL_LINK is used here -- confirm intended
262    //ignore
263   . | {WHITESPACE}                                               { /* ignore */ }
264 }
265 //After ==: a third = switches to SUB_HEADING and hands over to STRING (which ends at ===); otherwise each {ALPHANUM} is returned as HEADING until the closing ==
266 <DOUBLE_EQUALS_STATE>{
267  "=" {currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);}
268  {ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
269  {DOUBLE_EQUALS} {yybegin(YYINITIAL);}
270   //ignore everything else
271   . | {WHITESPACE}                                               { /* ignore */ }
272 }
273 //Inside {{...}} or <ref>...</ref>: each {ALPHANUM} is returned as currentTokType. NOTE(review): numWikiTokensSeen is reset to 0 here where the other states increment it -- confirm intended.
274 <DOUBLE_BRACE_STATE>{
275   {ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;}
276   {DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);}
277   {CITATION_CLOSE} {yybegin(YYINITIAL);}
278    //ignore everything else
279   . | {WHITESPACE}                                               { /* ignore */ }
280 }
281 //Rest of quoted text or of a sub-heading: {ALPHANUM} and | are returned as currentTokType until a closing ''''' / ''' / '' / === returns to YYINITIAL
282 <STRING> {
283   "'''''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/}
284   "'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/}
285   "''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/}
286   "===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/}
287   {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/}
288   //we can have links inside, let those override
289    {DOUBLE_BRACKET} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);}
290    {DOUBLE_BRACKET_CAT} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);}
291    {EXTERNAL_LINK} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);}
292   //NOTE(review): YYINITIAL uses EXTERNAL_LINK_URL when opening an external link; EXTERNAL_LINK is used above -- confirm intended
293   //a pipe character is returned as a token of the current type (numWikiTokensSeen is left unchanged)
294   {PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
295
296   .|{WHITESPACE}                                              { /* ignore STRING */ }
297 }
298
299
300
301
302 /*
303 {INTERNAL_LINK}                                                { return currentTokType; }
304
305 {CITATION}                                                { return currentTokType; }
306 {CATEGORY}                                                { return currentTokType; }
307
308 {BOLD}                                                { return currentTokType; }
309 {ITALICS}                                                { return currentTokType; }
310 {BOLD_ITALICS}                                                { return currentTokType; }
311 {HEADING}                                                { return currentTokType; }
312 {SUB_HEADING}                                                { return currentTokType; }
313
314 */
315 //end wikipedia
316
317 /** Ignore the rest: this rule has no state list, so it also applies inside the states declared with %state; it skips simple tags, whitespace and any other single character */
318 . | {WHITESPACE}|{TAGS}                                                { /* ignore */ }
319
320
321 //INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}
322 //EXTERNAL_LINK = "["http://"{HOST}.*?"]"
323 //CITATION = "{"{2}({ALPHANUM}+{WHITESPACE}*)+"}"{2}
324 //CATEGORY = "["{2}"Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
325 //CATEGORY_COLON = "["{2}":Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
326 //BOLD = '''({ALPHANUM}+{WHITESPACE}*)+'''
327 //ITALICS = ''({ALPHANUM}+{WHITESPACE}*)+''
328 //BOLD_ITALICS = '''''({ALPHANUM}+{WHITESPACE}*)+'''''
329 //HEADING = "="{2}({ALPHANUM}+{WHITESPACE}*)+"="{2}
330 //SUB_HEADING ="="{3}({ALPHANUM}+{WHITESPACE}*)+"="{3}