pylucene 3.5.0-3
[pylucene.git] / lucene-java-3.5.0 / lucene / contrib / analyzers / common / src / java / org / apache / lucene / analysis / wikipedia / WikipediaTokenizerImpl.jflex
diff --git a/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex b/lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex
new file mode 100644 (file)
index 0000000..4846143
--- /dev/null
@@ -0,0 +1,330 @@
+package org.apache.lucene.analysis.wikipedia;
+
+/**
+ * Licensed to the Apache Software Foundation (ASF) under one or more
+ * contributor license agreements.  See the NOTICE file distributed with
+ * this work for additional information regarding copyright ownership.
+ * The ASF licenses this file to You under the Apache License, Version 2.0
+ * (the "License"); you may not use this file except in compliance with
+ * the License.  You may obtain a copy of the License at
+ *
+ *     http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
+
+%%
+
+%class WikipediaTokenizerImpl
+%unicode 3.0
+%integer
+%function getNextToken
+%pack
+%char
+
+%{
+
+public static final int ALPHANUM          = WikipediaTokenizer.ALPHANUM_ID;
+public static final int APOSTROPHE        = WikipediaTokenizer.APOSTROPHE_ID;
+public static final int ACRONYM           = WikipediaTokenizer.ACRONYM_ID;
+public static final int COMPANY           = WikipediaTokenizer.COMPANY_ID;
+public static final int EMAIL             = WikipediaTokenizer.EMAIL_ID;
+public static final int HOST              = WikipediaTokenizer.HOST_ID;
+public static final int NUM               = WikipediaTokenizer.NUM_ID;
+public static final int CJ                = WikipediaTokenizer.CJ_ID;
+public static final int INTERNAL_LINK     = WikipediaTokenizer.INTERNAL_LINK_ID;
+public static final int EXTERNAL_LINK     = WikipediaTokenizer.EXTERNAL_LINK_ID;
+public static final int CITATION          = WikipediaTokenizer.CITATION_ID;
+public static final int CATEGORY          = WikipediaTokenizer.CATEGORY_ID;
+public static final int BOLD              = WikipediaTokenizer.BOLD_ID;
+public static final int ITALICS           = WikipediaTokenizer.ITALICS_ID;
+public static final int BOLD_ITALICS      = WikipediaTokenizer.BOLD_ITALICS_ID;
+public static final int HEADING           = WikipediaTokenizer.HEADING_ID;
+public static final int SUB_HEADING       = WikipediaTokenizer.SUB_HEADING_ID;
+public static final int EXTERNAL_LINK_URL = WikipediaTokenizer.EXTERNAL_LINK_URL_ID;
+
+
+private int currentTokType;
+private int numBalanced = 0;
+private int positionInc = 1;
+private int numLinkToks = 0;
+//Anytime we start a new on a Wiki reserved token (category, link, etc.) this value will be 0, otherwise it will be the number of tokens seen
+//this can be useful for detecting when a new reserved token is encountered
+//see https://issues.apache.org/jira/browse/LUCENE-1133
+private int numWikiTokensSeen = 0;
+
+public static final String [] TOKEN_TYPES = WikipediaTokenizer.TOKEN_TYPES;
+
+/**
+Returns the number of tokens seen inside a category or link, etc.
+@return the number of tokens seen inside the context of wiki syntax.
+**/
+public final int getNumWikiTokensSeen(){
+  return numWikiTokensSeen;
+}
+
+public final int yychar()
+{
+    return yychar;
+}
+
+public final int getPositionIncrement(){
+  return positionInc;
+}
+
+/**
+ * Fills Lucene token with the current token text.
+ */
+final void getText(CharTermAttribute t) {
+  t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead);
+}
+
+final int setText(StringBuilder buffer){
+  int length = zzMarkedPos - zzStartRead;
+  buffer.append(zzBuffer, zzStartRead, length);
+  return length;
+}
+
+
+%}
+
+// basic word: a sequence of digits & letters
+ALPHANUM   = ({LETTER}|{DIGIT}|{KOREAN})+
+
+// internal apostrophes: O'Reilly, you're, O'Reilly's
+// use a post-filter to remove possesives
+APOSTROPHE =  {ALPHA} ("'" {ALPHA})+
+
+// acronyms: U.S.A., I.B.M., etc.
+// use a post-filter to remove dots
+ACRONYM    =  {ALPHA} "." ({ALPHA} ".")+
+
+// company names like AT&T and Excite@Home.
+COMPANY    =  {ALPHA} ("&"|"@") {ALPHA}
+
+// email addresses
+EMAIL      =  {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+
+
+// hostname
+HOST       =  {ALPHANUM} ((".") {ALPHANUM})+
+
+// floating point, serial, model numbers, ip addresses, etc.
+// every other segment must have at least one digit
+NUM        = ({ALPHANUM} {P} {HAS_DIGIT}
+           | {DIGIT}+ {P} {DIGIT}+
+           | {HAS_DIGIT} {P} {ALPHANUM}
+           | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+
+           | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+           | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+
+           | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+)
+
+TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">"
+
+// punctuation
+P               = ("_"|"-"|"/"|"."|",")
+
+// at least one digit
+HAS_DIGIT  =
+    ({LETTER}|{DIGIT})*
+    {DIGIT}
+    ({LETTER}|{DIGIT})*
+
+ALPHA      = ({LETTER})+
+
+
+LETTER     = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc]
+
+DIGIT      = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049]
+
+KOREAN     = [\uac00-\ud7af\u1100-\u11ff]
+
+// Chinese, Japanese
+CJ         = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f]
+
+WHITESPACE = \r\n | [ \r\n\t\f]
+
+//Wikipedia
+DOUBLE_BRACKET = "["{2}
+DOUBLE_BRACKET_CLOSE = "]"{2}
+DOUBLE_BRACKET_CAT = "["{2}":"?"Category:"
+EXTERNAL_LINK = "["
+TWO_SINGLE_QUOTES = "'"{2}
+CITATION = "<ref>"
+CITATION_CLOSE = "</ref>"
+INFOBOX = {DOUBLE_BRACE}("I"|"i")nfobox_
+
+DOUBLE_BRACE = "{"{2}
+DOUBLE_BRACE_CLOSE = "}"{2}
+PIPE = "|"
+DOUBLE_EQUALS = "="{2}
+
+
+%state CATEGORY_STATE
+%state INTERNAL_LINK_STATE
+%state EXTERNAL_LINK_STATE
+
+%state TWO_SINGLE_QUOTES_STATE
+%state THREE_SINGLE_QUOTES_STATE
+%state FIVE_SINGLE_QUOTES_STATE
+%state DOUBLE_EQUALS_STATE
+%state DOUBLE_BRACE_STATE
+%state STRING
+
+%%
+
+<YYINITIAL>{ALPHANUM}                                                     {positionInc = 1; return ALPHANUM; }
+<YYINITIAL>{APOSTROPHE}                                                   {positionInc = 1; return APOSTROPHE; }
+<YYINITIAL>{ACRONYM}                                                      {positionInc = 1; return ACRONYM; }
+<YYINITIAL>{COMPANY}                                                      {positionInc = 1; return COMPANY; }
+<YYINITIAL>{EMAIL}                                                        {positionInc = 1; return EMAIL; }
+<YYINITIAL>{NUM}                                                          {positionInc = 1; return NUM; }
+<YYINITIAL>{HOST}                                                         {positionInc = 1; return HOST; }
+<YYINITIAL>{CJ}                                                           {positionInc = 1; return CJ; }
+
+//wikipedia
+<YYINITIAL>{
+  //First {ALPHANUM} is always the link, set positioninc to 1 for double bracket, but then inside the internal link state
+  //set it to 0 for the next token, such that the link and the first token are in the same position, but then subsequent
+  //tokens within the link are incremented
+  {DOUBLE_BRACKET} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;}
+  {DOUBLE_BRACKET_CAT} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);/* Break so we don't hit fall-through warning: */ break;}
+  {EXTERNAL_LINK} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);/* Break so we don't hit fall-through warning: */ break;}
+  {TWO_SINGLE_QUOTES} {numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}/* Break so we don't hit fall-through warning: */ break;}
+  {DOUBLE_EQUALS} {numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);/* Break so we don't hit fall-through warning: */ break;}
+  {DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
+  {CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);/* Break so we don't hit fall-through warning: */ break;}
+//ignore
+  . | {WHITESPACE} |{INFOBOX}                                               {numWikiTokensSeen = 0;  positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
+}
+
+<INTERNAL_LINK_STATE>{
+//First {ALPHANUM} is always the link, set position to 0 for these
+//This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL
+  {ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;}
+  {DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
+  //ignore
+  . | {WHITESPACE}                                               { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
+}
+
+<EXTERNAL_LINK_STATE>{
+//increment the link token, but then don't increment the tokens after that which are still in the link
+  ("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;}
+  {ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;}
+  "]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
+  {WHITESPACE}                                               { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
+}
+
+<CATEGORY_STATE>{
+  {ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;}
+  {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);/* Break so we don't hit fall-through warning: */ break;}
+  //ignore
+  . | {WHITESPACE}                                               { positionInc = 1; /* Break so we don't hit fall-through warning: */ break;}
+}
+//italics
+<TWO_SINGLE_QUOTES_STATE>{
+  "'" {currentTokType = BOLD;  yybegin(THREE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;}
+   "'''" {currentTokType = BOLD_ITALICS;  yybegin(FIVE_SINGLE_QUOTES_STATE); /* Break so we don't hit fall-through warning: */ break;}
+   {ALPHANUM} {currentTokType = ITALICS; numWikiTokensSeen++;  yybegin(STRING); return currentTokType;/*italics*/}
+   //we can have links inside, let those override
+   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
+   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
+   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
+
+   //ignore
+  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+}
+//bold
+<THREE_SINGLE_QUOTES_STATE>{
+  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
+  //we can have links inside, let those override
+   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
+   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
+   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
+
+   //ignore
+  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+
+}
+//bold italics
+<FIVE_SINGLE_QUOTES_STATE>{
+  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;}
+  //we can have links inside, let those override
+   {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0;  yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
+   {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
+   {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
+
+   //ignore
+  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+}
+
+<DOUBLE_EQUALS_STATE>{
+ "=" {currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING); /* Break so we don't hit fall-through warning: */ break;}
+ {ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;}
+ {DOUBLE_EQUALS} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
+  //ignore
+  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+}
+
+<DOUBLE_BRACE_STATE>{
+  {ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;}
+  {DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
+  {CITATION_CLOSE} {yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;}
+   //ignore
+  . | {WHITESPACE}                                               { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+}
+
+<STRING> {
+  "'''''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold italics*/}
+  "'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end bold*/}
+  "''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end italics*/}
+  "===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL); /* Break so we don't hit fall-through warning: */ break;/*end sub header*/}
+  {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/}
+  //we can have links inside, let those override
+   {DOUBLE_BRACKET} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
+   {DOUBLE_BRACKET_CAT} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE); /* Break so we don't hit fall-through warning: */ break;}
+   {EXTERNAL_LINK} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE); /* Break so we don't hit fall-through warning: */ break;}
+
+
+  {PIPE} {yybegin(STRING); return currentTokType;/*pipe*/}
+
+  .|{WHITESPACE}                                              { /* Break so we don't hit fall-through warning: */ break;/* ignore STRING */ }
+}
+
+
+
+
+/*
+{INTERNAL_LINK}                                                { return curentTokType; }
+
+{CITATION}                                                { return currentTokType; }
+{CATEGORY}                                                { return currentTokType; }
+
+{BOLD}                                                { return currentTokType; }
+{ITALICS}                                                { return currentTokType; }
+{BOLD_ITALICS}                                                { return currentTokType; }
+{HEADING}                                                { return currentTokType; }
+{SUB_HEADING}                                                { return currentTokType; }
+
+*/
+//end wikipedia
+
+/** Ignore the rest */
+. | {WHITESPACE}|{TAGS}                                                { /* Break so we don't hit fall-through warning: */ break;/* ignore */ }
+
+
+//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2}
+//EXTERNAL_LINK = "["http://"{HOST}.*?"]"
+//CITATION = "{"{2}({ALPHANUM}+{WHITESPACE}*)+"}"{2}
+//CATEGORY = "["{2}"Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
+//CATEGORY_COLON = "["{2}":Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2}
+//BOLD = '''({ALPHANUM}+{WHITESPACE}*)+'''
+//ITALICS = ''({ALPHANUM}+{WHITESPACE}*)+''
+//BOLD_ITALICS = '''''({ALPHANUM}+{WHITESPACE}*)+'''''
+//HEADING = "="{2}({ALPHANUM}+{WHITESPACE}*)+"="{2}
+//SUB_HEADING ="="{3}({ALPHANUM}+{WHITESPACE}*)+"="{3}