lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/wikipedia/WikipediaTokenizer.java

   1 /**
   2  * Licensed to the Apache Software Foundation (ASF) under one or more
   3  * contributor license agreements.  See the NOTICE file distributed with
   4  * this work for additional information regarding copyright ownership.
   5  * The ASF licenses this file to You under the Apache License, Version 2.0
   6  * (the "License"); you may not use this file except in compliance with
   7  * the License.  You may obtain a copy of the License at
   8  *
   9  *     http://www.apache.org/licenses/LICENSE-2.0
  10  *
  11  * Unless required by applicable law or agreed to in writing, software
  12  * distributed under the License is distributed on an "AS IS" BASIS,
  13  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  14  * See the License for the specific language governing permissions and
  15  * limitations under the License.
  16  */
  17
  18 package org.apache.lucene.analysis.wikipedia;
  19
  20 import org.apache.lucene.analysis.Tokenizer;
  21 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  22 import org.apache.lucene.analysis.tokenattributes.FlagsAttribute;
  23 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  24 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  25 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  26 import org.apache.lucene.util.AttributeSource;
  27
  28 import java.io.IOException;
  29 import java.io.Reader;
  30 import java.util.*;
  31
  32
  33 /**
  34  * Extension of StandardTokenizer that is aware of Wikipedia syntax.  It is based off of the
  35  * Wikipedia tutorial available at http://en.wikipedia.org/wiki/Wikipedia:Tutorial, but it may not be complete.
  36  * <p/>
  37  * <p/>
  38  * @lucene.experimental
  39  */
  40 public final class WikipediaTokenizer extends Tokenizer {
  41   public static final String INTERNAL_LINK = "il";
  42   public static final String EXTERNAL_LINK = "el";
  43   //The URL part of the link, i.e. the first token
  44   public static final String EXTERNAL_LINK_URL = "elu";
  45   public static final String CITATION = "ci";
  46   public static final String CATEGORY = "c";
  47   public static final String BOLD = "b";
  48   public static final String ITALICS = "i";
  49   public static final String BOLD_ITALICS = "bi";
  50   public static final String HEADING = "h";
  51   public static final String SUB_HEADING = "sh";
  52
  53   public static final int ALPHANUM_ID          = 0;
  54   public static final int APOSTROPHE_ID        = 1;
  55   public static final int ACRONYM_ID           = 2;
  56   public static final int COMPANY_ID           = 3;
  57   public static final int EMAIL_ID             = 4;
  58   public static final int HOST_ID              = 5;
  59   public static final int NUM_ID               = 6;
  60   public static final int CJ_ID                = 7;
  61   public static final int INTERNAL_LINK_ID     = 8;
  62   public static final int EXTERNAL_LINK_ID     = 9;
  63   public static final int CITATION_ID          = 10;
  64   public static final int CATEGORY_ID          = 11;
  65   public static final int BOLD_ID              = 12;
  66   public static final int ITALICS_ID           = 13;
  67   public static final int BOLD_ITALICS_ID      = 14;
  68   public static final int HEADING_ID           = 15;
  69   public static final int SUB_HEADING_ID       = 16;
  70   public static final int EXTERNAL_LINK_URL_ID = 17;
  71
  72   /** String token types that correspond to token type int constants */
  73   public static final String [] TOKEN_TYPES = new String [] {
  74     "<ALPHANUM>",
  75     "<APOSTROPHE>",
  76     "<ACRONYM>",
  77     "<COMPANY>",
  78     "<EMAIL>",
  79     "<HOST>",
  80     "<NUM>",
  81     "<CJ>",
  82     INTERNAL_LINK,
  83     EXTERNAL_LINK,
  84     CITATION,
  85     CATEGORY,
  86     BOLD,
  87     ITALICS,
  88     BOLD_ITALICS,
  89     HEADING,
  90     SUB_HEADING,
  91     EXTERNAL_LINK_URL
  92   };
  93
  94   /**
  95    * Only output tokens
  96    */
  97   public static final int TOKENS_ONLY = 0;
  98   /**
  99    * Only output untokenized tokens, which are tokens that would normally be split into several tokens
 100    */
 101   public static final int UNTOKENIZED_ONLY = 1;
 102   /**
 103    * Output the both the untokenized token and the splits
 104    */
 105   public static final int BOTH = 2;
 106   /**
 107    * This flag is used to indicate that the produced "Token" would, if {@link #TOKENS_ONLY} was used, produce multiple tokens.
 108    */
 109   public static final int UNTOKENIZED_TOKEN_FLAG = 1;
 110   /**
 111    * A private instance of the JFlex-constructed scanner
 112    */
 113   private final WikipediaTokenizerImpl scanner;
 114
 115   private int tokenOutput = TOKENS_ONLY;
 116   private Set<String> untokenizedTypes = Collections.emptySet();
 117   private Iterator<AttributeSource.State> tokens = null;
 118
 119   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 120   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 121   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 122   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 123   private final FlagsAttribute flagsAtt = addAttribute(FlagsAttribute.class);
 124
 125   /**
 126    * Creates a new instance of the {@link WikipediaTokenizer}. Attaches the
 127    * <code>input</code> to a newly created JFlex scanner.
 128    *
 129    * @param input The Input Reader
 130    */
 131   public WikipediaTokenizer(Reader input) {
 132     this(input, TOKENS_ONLY, Collections.<String>emptySet());
 133   }
 134
 135   /**
 136    * Creates a new instance of the {@link org.apache.lucene.analysis.wikipedia.WikipediaTokenizer}.  Attaches the
 137    * <code>input</code> to a the newly created JFlex scanner.
 138    *
 139    * @param input The input
 140    * @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
 141    * @param untokenizedTypes
 142    */
 143   public WikipediaTokenizer(Reader input, int tokenOutput, Set<String> untokenizedTypes) {
 144     super(input);
 145     this.scanner = new WikipediaTokenizerImpl(input);
 146     init(tokenOutput, untokenizedTypes);
 147   }
 148
 149   /**
 150    * Creates a new instance of the {@link org.apache.lucene.analysis.wikipedia.WikipediaTokenizer}.  Attaches the
 151    * <code>input</code> to a the newly created JFlex scanner. Uses the given {@link org.apache.lucene.util.AttributeSource.AttributeFactory}.
 152    *
 153    * @param input The input
 154    * @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
 155    * @param untokenizedTypes
 156    */
 157   public WikipediaTokenizer(AttributeFactory factory, Reader input, int tokenOutput, Set<String> untokenizedTypes) {
 158     super(factory, input);
 159     this.scanner = new WikipediaTokenizerImpl(input);
 160     init(tokenOutput, untokenizedTypes);
 161   }
 162
 163   /**
 164    * Creates a new instance of the {@link org.apache.lucene.analysis.wikipedia.WikipediaTokenizer}.  Attaches the
 165    * <code>input</code> to a the newly created JFlex scanner. Uses the given {@link AttributeSource}.
 166    *
 167    * @param input The input
 168    * @param tokenOutput One of {@link #TOKENS_ONLY}, {@link #UNTOKENIZED_ONLY}, {@link #BOTH}
 169    * @param untokenizedTypes
 170    */
 171   public WikipediaTokenizer(AttributeSource source, Reader input, int tokenOutput, Set<String> untokenizedTypes) {
 172     super(source, input);
 173     this.scanner = new WikipediaTokenizerImpl(input);
 174     init(tokenOutput, untokenizedTypes);
 175   }
 176
 177   private void init(int tokenOutput, Set<String> untokenizedTypes) {
 178     this.tokenOutput = tokenOutput;
 179     this.untokenizedTypes = untokenizedTypes;
 180   }
 181
 182   /*
 183   * (non-Javadoc)
 184   *
 185   * @see org.apache.lucene.analysis.TokenStream#next()
 186   */
 187   @Override
 188   public final boolean incrementToken() throws IOException {
 189     if (tokens != null && tokens.hasNext()){
 190       AttributeSource.State state = tokens.next();
 191       restoreState(state);
 192       return true;
 193     }
 194     clearAttributes();
 195     int tokenType = scanner.getNextToken();
 196
 197     if (tokenType == WikipediaTokenizerImpl.YYEOF) {
 198       return false;
 199     }
 200     String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType];
 201     if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){
 202       setupToken();
 203     } else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){
 204       collapseTokens(tokenType);
 205
 206     }
 207     else if (tokenOutput == BOTH){
 208       //collapse into a single token, add it to tokens AND output the individual tokens
 209       //output the untokenized Token first
 210       collapseAndSaveTokens(tokenType, type);
 211     }
 212     posIncrAtt.setPositionIncrement(scanner.getPositionIncrement());
 213     typeAtt.setType(type);
 214     return true;
 215   }
 216
 217   private void collapseAndSaveTokens(int tokenType, String type) throws IOException {
 218     //collapse
 219     StringBuilder buffer = new StringBuilder(32);
 220     int numAdded = scanner.setText(buffer);
 221     //TODO: how to know how much whitespace to add
 222     int theStart = scanner.yychar();
 223     int lastPos = theStart + numAdded;
 224     int tmpTokType;
 225     int numSeen = 0;
 226     List<AttributeSource.State> tmp = new ArrayList<AttributeSource.State>();
 227     setupSavedToken(0, type);
 228     tmp.add(captureState());
 229     //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
 230     while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
 231       int currPos = scanner.yychar();
 232       //append whitespace
 233       for (int i = 0; i < (currPos - lastPos); i++){
 234         buffer.append(' ');
 235       }
 236       numAdded = scanner.setText(buffer);
 237       setupSavedToken(scanner.getPositionIncrement(), type);
 238       tmp.add(captureState());
 239       numSeen++;
 240       lastPos = currPos + numAdded;
 241     }
 242     //trim the buffer
 243     // TODO: this is inefficient
 244     String s = buffer.toString().trim();
 245     termAtt.setEmpty().append(s);
 246     offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
 247     flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
 248     //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
 249     if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
 250       scanner.yypushback(scanner.yylength());
 251     }
 252     tokens = tmp.iterator();
 253   }
 254
 255   private void setupSavedToken(int positionInc, String type){
 256     setupToken();
 257     posIncrAtt.setPositionIncrement(positionInc);
 258     typeAtt.setType(type);
 259   }
 260
 261   private void collapseTokens(int tokenType) throws IOException {
 262     //collapse
 263     StringBuilder buffer = new StringBuilder(32);
 264     int numAdded = scanner.setText(buffer);
 265     //TODO: how to know how much whitespace to add
 266     int theStart = scanner.yychar();
 267     int lastPos = theStart + numAdded;
 268     int tmpTokType;
 269     int numSeen = 0;
 270     //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type
 271     while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){
 272       int currPos = scanner.yychar();
 273       //append whitespace
 274       for (int i = 0; i < (currPos - lastPos); i++){
 275         buffer.append(' ');
 276       }
 277       numAdded = scanner.setText(buffer);
 278       numSeen++;
 279       lastPos = currPos + numAdded;
 280     }
 281     //trim the buffer
 282     // TODO: this is inefficient
 283     String s = buffer.toString().trim();
 284     termAtt.setEmpty().append(s);
 285     offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length()));
 286     flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG);
 287     //The way the loop is written, we will have proceeded to the next token.  We need to pushback the scanner to lastPos
 288     if (tmpTokType != WikipediaTokenizerImpl.YYEOF){
 289       scanner.yypushback(scanner.yylength());
 290     } else {
 291       tokens = null;
 292     }
 293   }
 294
 295   private void setupToken() {
 296     scanner.getText(termAtt);
 297     final int start = scanner.yychar();
 298     offsetAtt.setOffset(correctOffset(start), correctOffset(start + termAtt.length()));
 299   }
 300
 301   /*
 302   * (non-Javadoc)
 303   *
 304   * @see org.apache.lucene.analysis.TokenStream#reset()
 305   */
 306   @Override
 307   public void reset() throws IOException {
 308     super.reset();
 309     scanner.yyreset(input);
 310   }
 311
 312   @Override
 313   public void reset(Reader reader) throws IOException {
 314     super.reset(reader);
 315     reset();
 316   }
 317
 318   @Override
 319   public void end() throws IOException {
 320     // set final offset
 321     final int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
 322     this.offsetAtt.setOffset(finalOffset, finalOffset);
 323   }
 324 }