X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex diff --git a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex b/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex deleted file mode 100644 index 477c55b..0000000 --- a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizerImpl.jflex +++ /dev/null @@ -1,330 +0,0 @@ -package org.apache.lucene.analysis.wikipedia; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. 
- */ - -import org.apache.lucene.analysis.tokenattributes.CharTermAttribute; - -%% - -%class WikipediaTokenizerImpl -%unicode 3.0 -%integer -%function getNextToken -%pack -%char - -%{ - -/** Token type ids used by the rule actions; each one mirrors the matching *_ID constant of WikipediaTokenizer. */ -public static final int ALPHANUM = WikipediaTokenizer.ALPHANUM_ID; -public static final int APOSTROPHE = WikipediaTokenizer.APOSTROPHE_ID; -public static final int ACRONYM = WikipediaTokenizer.ACRONYM_ID; -public static final int COMPANY = WikipediaTokenizer.COMPANY_ID; -public static final int EMAIL = WikipediaTokenizer.EMAIL_ID; -public static final int HOST = WikipediaTokenizer.HOST_ID; -public static final int NUM = WikipediaTokenizer.NUM_ID; -public static final int CJ = WikipediaTokenizer.CJ_ID; -public static final int INTERNAL_LINK = WikipediaTokenizer.INTERNAL_LINK_ID; -public static final int EXTERNAL_LINK = WikipediaTokenizer.EXTERNAL_LINK_ID; -public static final int CITATION = WikipediaTokenizer.CITATION_ID; -public static final int CATEGORY = WikipediaTokenizer.CATEGORY_ID; -public static final int BOLD = WikipediaTokenizer.BOLD_ID; -public static final int ITALICS = WikipediaTokenizer.ITALICS_ID; -public static final int BOLD_ITALICS = WikipediaTokenizer.BOLD_ITALICS_ID; -public static final int HEADING = WikipediaTokenizer.HEADING_ID; -public static final int SUB_HEADING = WikipediaTokenizer.SUB_HEADING_ID; -public static final int EXTERNAL_LINK_URL = WikipediaTokenizer.EXTERNAL_LINK_URL_ID; - - -/** Token type returned for words matched inside the wiki construct currently being scanned. */ -private int currentTokType; -/** Incremented when an opening '' is matched in YYINITIAL while it is 0; reset to 0 by the closing-markup rules. */ -private int numBalanced = 0; -/** Value reported by getPositionIncrement(); the rule actions set it to 0 or 1. */ -private int positionInc = 1; -/** Number of link-text tokens returned so far inside the current external link; reset to 0 when the link closes. */ -private int numLinkToks = 0; -//Any time we start on a new Wiki reserved token (category, link, etc.) 
this value will be 0, otherwise it will be the number of tokens seen -//this can be useful for detecting when a new reserved token is encountered -//see https://issues.apache.org/jira/browse/LUCENE-1133 -private int numWikiTokensSeen = 0; - -/** Token type names, shared with WikipediaTokenizer. */ -public static final String [] TOKEN_TYPES = WikipediaTokenizer.TOKEN_TYPES; - -/** -Returns the number of tokens seen inside a category or link, etc. -@return the number of tokens seen inside the context of wiki syntax. -**/ -public final int getNumWikiTokensSeen(){ - return numWikiTokensSeen; -} - -/** - * Returns the scanner's yychar counter, which JFlex maintains because the %char option is set. - * @return the current value of yychar - */ -public final int yychar() -{ - return yychar; -} - -/** - * Returns the position increment stored by the most recently executed rule action. - * @return the current value of positionInc - */ -public final int getPositionIncrement(){ - return positionInc; -} - -/** - * Fills Lucene token with the current token text. - * @param t attribute that receives a copy of the matched characters - */ -final void getText(CharTermAttribute t) { - t.copyBuffer(zzBuffer, zzStartRead, zzMarkedPos-zzStartRead); -} - -/** - * Appends the current token text to the given buffer (despite the name, nothing already in the buffer is replaced). - * @param buffer receives the matched characters - * @return the number of characters appended - */ -final int setText(StringBuilder buffer){ - int length = zzMarkedPos - zzStartRead; - buffer.append(zzBuffer, zzStartRead, length); - return length; -} - - -%} - -// basic word: a sequence of digits & letters -ALPHANUM = ({LETTER}|{DIGIT}|{KOREAN})+ - -// internal apostrophes: O'Reilly, you're, O'Reilly's -// use a post-filter to remove possessives -APOSTROPHE = {ALPHA} ("'" {ALPHA})+ - -// acronyms: U.S.A., I.B.M., etc. -// use a post-filter to remove dots -ACRONYM = {ALPHA} "." ({ALPHA} ".")+ - -// company names like AT&T and Excite@Home. -COMPANY = {ALPHA} ("&"|"@") {ALPHA} - -// email addresses -EMAIL = {ALPHANUM} (("."|"-"|"_") {ALPHANUM})* "@" {ALPHANUM} (("."|"-") {ALPHANUM})+ - -// hostname -HOST = {ALPHANUM} ((".") {ALPHANUM})+ - -// floating point, serial, model numbers, ip addresses, etc. 
-// every other segment must have at least one digit -NUM = ({ALPHANUM} {P} {HAS_DIGIT} - | {DIGIT}+ {P} {DIGIT}+ - | {HAS_DIGIT} {P} {ALPHANUM} - | {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+ - | {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+ - | {ALPHANUM} {P} {HAS_DIGIT} ({P} {ALPHANUM} {P} {HAS_DIGIT})+ - | {HAS_DIGIT} {P} {ALPHANUM} ({P} {HAS_DIGIT} {P} {ALPHANUM})+) - -// markup tags, optionally closing and optionally carrying name="value" attributes -TAGS = "<"\/?{ALPHANUM}({WHITESPACE}*{ALPHANUM}=\"{ALPHANUM}\")*">" - -// punctuation -P = ("_"|"-"|"/"|"."|",") - -// at least one digit -HAS_DIGIT = - ({LETTER}|{DIGIT})* - {DIGIT} - ({LETTER}|{DIGIT})* - -ALPHA = ({LETTER})+ - - -LETTER = [\u0041-\u005a\u0061-\u007a\u00c0-\u00d6\u00d8-\u00f6\u00f8-\u00ff\u0100-\u1fff\uffa0-\uffdc] - -DIGIT = [\u0030-\u0039\u0660-\u0669\u06f0-\u06f9\u0966-\u096f\u09e6-\u09ef\u0a66-\u0a6f\u0ae6-\u0aef\u0b66-\u0b6f\u0be7-\u0bef\u0c66-\u0c6f\u0ce6-\u0cef\u0d66-\u0d6f\u0e50-\u0e59\u0ed0-\u0ed9\u1040-\u1049] - -KOREAN = [\uac00-\ud7af\u1100-\u11ff] - -// Chinese, Japanese -CJ = [\u3040-\u318f\u3100-\u312f\u3040-\u309F\u30A0-\u30FF\u31F0-\u31FF\u3300-\u337f\u3400-\u4dbf\u4e00-\u9fff\uf900-\ufaff\uff65-\uff9f] - -WHITESPACE = \r\n | [ \r\n\t\f] - -//Wikipedia -DOUBLE_BRACKET = "["{2} -DOUBLE_BRACKET_CLOSE = "]"{2} -DOUBLE_BRACKET_CAT = "["{2}":"?"Category:" -EXTERNAL_LINK = "[" -TWO_SINGLE_QUOTES = "'"{2} -// citation markup: the opening and closing ref tags (these literals must not be empty strings) -CITATION = "<ref>" -CITATION_CLOSE = "</ref>" -INFOBOX = {DOUBLE_BRACE}("I"|"i")nfobox_ - -DOUBLE_BRACE = "{"{2} -DOUBLE_BRACE_CLOSE = "}"{2} -PIPE = "|" -DOUBLE_EQUALS = "="{2} - - -%state CATEGORY_STATE -%state INTERNAL_LINK_STATE -%state EXTERNAL_LINK_STATE - -%state TWO_SINGLE_QUOTES_STATE -%state THREE_SINGLE_QUOTES_STATE -%state FIVE_SINGLE_QUOTES_STATE -%state DOUBLE_EQUALS_STATE -%state DOUBLE_BRACE_STATE -%state STRING - -%% - -// plain-text tokens: restricted to YYINITIAL, otherwise these earlier rules would win every equal-length tie against the per-state {ALPHANUM} rules below -<YYINITIAL>{ALPHANUM} {positionInc = 1; return ALPHANUM; } -<YYINITIAL>{APOSTROPHE} {positionInc = 1; return APOSTROPHE; } -<YYINITIAL>{ACRONYM} {positionInc = 1; return ACRONYM; } -<YYINITIAL>{COMPANY} {positionInc = 1; return COMPANY; } -<YYINITIAL>{EMAIL} {positionInc = 1; return EMAIL; } 
-<YYINITIAL>{NUM} {positionInc = 1; return NUM; } -<YYINITIAL>{HOST} {positionInc = 1; return HOST; } -<YYINITIAL>{CJ} {positionInc = 1; return CJ; } - -//wikipedia -<YYINITIAL>{ - //First {ALPHANUM} is always the link, set positioninc to 1 for double bracket, but then inside the internal link state - //set it to 0 for the next token, such that the link and the first token are in the same position, but then subsequent - //tokens within the link are incremented - {DOUBLE_BRACKET} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = INTERNAL_LINK; yybegin(INTERNAL_LINK_STATE);} - {DOUBLE_BRACKET_CAT} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CATEGORY; yybegin(CATEGORY_STATE);} - {EXTERNAL_LINK} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = EXTERNAL_LINK_URL; yybegin(EXTERNAL_LINK_STATE);} - {TWO_SINGLE_QUOTES} {numWikiTokensSeen = 0; positionInc = 1; if (numBalanced == 0){numBalanced++;yybegin(TWO_SINGLE_QUOTES_STATE);} else{numBalanced = 0;}} - {DOUBLE_EQUALS} {numWikiTokensSeen = 0; positionInc = 1; yybegin(DOUBLE_EQUALS_STATE);} - {DOUBLE_BRACE} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);} - {CITATION} {numWikiTokensSeen = 0; positionInc = 1; currentTokType = CITATION; yybegin(DOUBLE_BRACE_STATE);} -//ignore - . | {WHITESPACE} |{INFOBOX} {numWikiTokensSeen = 0; positionInc = 1; } -} - -<INTERNAL_LINK_STATE>{ -//First {ALPHANUM} is always the link, set position to 0 for these -//This is slightly different from EXTERNAL_LINK_STATE because that one has an explicit grammar for capturing the URL - {ALPHANUM} {yybegin(INTERNAL_LINK_STATE); numWikiTokensSeen++; return currentTokType;} - {DOUBLE_BRACKET_CLOSE} {numLinkToks = 0; yybegin(YYINITIAL);} - //ignore - . 
| {WHITESPACE} { positionInc = 1; } -} - -<EXTERNAL_LINK_STATE>{ -//increment the link token, but then don't increment the tokens after that which are still in the link - ("http://"|"https://"){HOST}("/"?({ALPHANUM}|{P}|\?|"&"|"="|"#")*)* {positionInc = 1; numWikiTokensSeen++; yybegin(EXTERNAL_LINK_STATE); return currentTokType;} - {ALPHANUM} {if (numLinkToks == 0){positionInc = 0;} else{positionInc = 1;} numWikiTokensSeen++; currentTokType = EXTERNAL_LINK; yybegin(EXTERNAL_LINK_STATE); numLinkToks++; return currentTokType;} - "]" {numLinkToks = 0; positionInc = 0; yybegin(YYINITIAL);} - {WHITESPACE} { positionInc = 1; } -} - -//category links: every word is returned with the CATEGORY type until the closing brackets -<CATEGORY_STATE>{ - {ALPHANUM} {yybegin(CATEGORY_STATE); numWikiTokensSeen++; return currentTokType;} - {DOUBLE_BRACKET_CLOSE} {yybegin(YYINITIAL);} - //ignore - . | {WHITESPACE} { positionInc = 1; } -} -//italics -<TWO_SINGLE_QUOTES_STATE>{ - "'" {currentTokType = BOLD; yybegin(THREE_SINGLE_QUOTES_STATE);} - "'''" {currentTokType = BOLD_ITALICS; yybegin(FIVE_SINGLE_QUOTES_STATE);} - {ALPHANUM} {currentTokType = ITALICS; numWikiTokensSeen++; yybegin(STRING); return currentTokType;/*italics*/} - //we can have links inside, let those override - {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);} - {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);} - {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);} - - //ignore - . | {WHITESPACE} { /* ignore */ } -} -//bold -<THREE_SINGLE_QUOTES_STATE>{ - {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;} - //we can have links inside, let those override - {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);} - {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);} - {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);} - - //ignore - . 
| {WHITESPACE} { /* ignore */ } - -} -//bold italics -<FIVE_SINGLE_QUOTES_STATE>{ - {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;} - //we can have links inside, let those override - {DOUBLE_BRACKET} {currentTokType = INTERNAL_LINK; numWikiTokensSeen = 0; yybegin(INTERNAL_LINK_STATE);} - {DOUBLE_BRACKET_CAT} {currentTokType = CATEGORY; numWikiTokensSeen = 0; yybegin(CATEGORY_STATE);} - {EXTERNAL_LINK} {currentTokType = EXTERNAL_LINK; numWikiTokensSeen = 0; yybegin(EXTERNAL_LINK_STATE);} - - //ignore - . | {WHITESPACE} { /* ignore */ } -} - -//headings: a third "=" switches to a sub heading, whose text is then read in STRING -<DOUBLE_EQUALS_STATE>{ - "=" {currentTokType = SUB_HEADING; numWikiTokensSeen = 0; yybegin(STRING);} - {ALPHANUM} {currentTokType = HEADING; yybegin(DOUBLE_EQUALS_STATE); numWikiTokensSeen++; return currentTokType;} - {DOUBLE_EQUALS} {yybegin(YYINITIAL);} - //ignore - . | {WHITESPACE} { /* ignore */ } -} - -//citations: entered from both {DOUBLE_BRACE} and {CITATION}, left on either closing form -<DOUBLE_BRACE_STATE>{ - //NOTE(review): the other states increment numWikiTokensSeen when they return a token, this rule resets it to 0; confirm that is intended - {ALPHANUM} {yybegin(DOUBLE_BRACE_STATE); numWikiTokensSeen = 0; return currentTokType;} - {DOUBLE_BRACE_CLOSE} {yybegin(YYINITIAL);} - {CITATION_CLOSE} {yybegin(YYINITIAL);} - //ignore - . 
| {WHITESPACE} { /* ignore */ } -} - -//text following italic, bold, bold-italic or sub-heading markup; the closing markup returns to YYINITIAL -<STRING> { - "'''''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end bold italics*/} - "'''" {numBalanced = 0;currentTokType = ALPHANUM;yybegin(YYINITIAL);/*end bold*/} - "''" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end italics*/} - "===" {numBalanced = 0;currentTokType = ALPHANUM; yybegin(YYINITIAL);/*end sub header*/} - {ALPHANUM} {yybegin(STRING); numWikiTokensSeen++; return currentTokType;/* STRING ALPHANUM*/} - //we can have links inside, let those override - {DOUBLE_BRACKET} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = INTERNAL_LINK;yybegin(INTERNAL_LINK_STATE);} - {DOUBLE_BRACKET_CAT} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = CATEGORY;yybegin(CATEGORY_STATE);} - {EXTERNAL_LINK} {numBalanced = 0; numWikiTokensSeen = 0; currentTokType = EXTERNAL_LINK;yybegin(EXTERNAL_LINK_STATE);} - - - {PIPE} {yybegin(STRING); return currentTokType;/*pipe*/} - - .|{WHITESPACE} { /* ignore STRING */ } -} - - - - -/* -{INTERNAL_LINK} { return currentTokType; } - -{CITATION} { return currentTokType; } -{CATEGORY} { return currentTokType; } - -{BOLD} { return currentTokType; } -{ITALICS} { return currentTokType; } -{BOLD_ITALICS} { return currentTokType; } -{HEADING} { return currentTokType; } -{SUB_HEADING} { return currentTokType; } - -*/ -//end wikipedia - -/** Ignore the rest; deliberately has no state list so it is also the fallback for states without their own catch-all rule, such as EXTERNAL_LINK_STATE */ -. | {WHITESPACE}|{TAGS} { /* ignore */ } - - -//INTERNAL_LINK = "["{2}({ALPHANUM}+{WHITESPACE}*)+"]"{2} -//EXTERNAL_LINK = "["http://"{HOST}.*?"]" -//CITATION = "{"{2}({ALPHANUM}+{WHITESPACE}*)+"}"{2} -//CATEGORY = "["{2}"Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2} -//CATEGORY_COLON = "["{2}":Category:"({ALPHANUM}+{WHITESPACE}*)+"]"{2} -//BOLD = '''({ALPHANUM}+{WHITESPACE}*)+''' -//ITALICS = ''({ALPHANUM}+{WHITESPACE}*)+'' -//BOLD_ITALICS = '''''({ALPHANUM}+{WHITESPACE}*)+''''' -//HEADING = "="{2}({ALPHANUM}+{WHITESPACE}*)+"="{2} -//SUB_HEADING ="="{3}({ALPHANUM}+{WHITESPACE}*)+"="{3}