lucene-java-3.4.0/lucene/src/java/org/apache/lucene/analysis/standard/UAX29URLEmailTokenizer.java

   1 package org.apache.lucene.analysis.standard;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.IOException;
  21 import java.io.InputStream;
  22 import java.io.InputStreamReader;
  23 import java.io.Reader;
  24
  25 import org.apache.lucene.analysis.Tokenizer;
  26 import org.apache.lucene.analysis.standard.std31.StandardTokenizerImpl31;
  27 import org.apache.lucene.analysis.standard.std31.UAX29URLEmailTokenizerImpl31;
  28 import org.apache.lucene.analysis.tokenattributes.OffsetAttribute;
  29 import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;
  30 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
  31 import org.apache.lucene.analysis.tokenattributes.TypeAttribute;
  32 import org.apache.lucene.util.AttributeSource;
  33 import org.apache.lucene.util.Version;
  34 import org.apache.lucene.util.AttributeSource.AttributeFactory;
  35
  36 /**
  37  * This class implements Word Break rules from the Unicode Text Segmentation
  38  * algorithm, as specified in
  39  * <a href="http://unicode.org/reports/tr29/">Unicode Standard Annex #29</a>
  40  * URLs and email addresses are also tokenized according to the relevant RFCs.
  41  * <p/>
  42  * Tokens produced are of the following types:
  43  * <ul>
  44  *   <li>&lt;ALPHANUM&gt;: A sequence of alphabetic and numeric characters</li>
  45  *   <li>&lt;NUM&gt;: A number</li>
  46  *   <li>&lt;URL&gt;: A URL</li>
  47  *   <li>&lt;EMAIL&gt;: An email address</li>
  48  *   <li>&lt;SOUTHEAST_ASIAN&gt;: A sequence of characters from South and Southeast
  49  *       Asian languages, including Thai, Lao, Myanmar, and Khmer</li>
  50  *   <li>&lt;IDEOGRAPHIC&gt;: A single CJKV ideographic character</li>
  51  *   <li>&lt;HIRAGANA&gt;: A single hiragana character</li>
  52  * </ul>
  53  * <a name="version"/>
  54  * <p>You must specify the required {@link Version}
  55  * compatibility when creating UAX29URLEmailTokenizer:
  56  * <ul>
  57  *   <li> As of 3.4, Hiragana and Han characters are no longer wrongly split
  58  *   from their combining characters. If you use a previous version number,
  59  *   you get the exact broken behavior for backwards compatibility.
  60  * </ul>
  61  */
  62
  63 public final class UAX29URLEmailTokenizer extends Tokenizer {
  64   /** A private instance of the JFlex-constructed scanner */
  65   private final StandardTokenizerInterface scanner;
  66
  67   public static final int ALPHANUM          = 0;
  68   public static final int NUM               = 1;
  69   public static final int SOUTHEAST_ASIAN   = 2;
  70   public static final int IDEOGRAPHIC       = 3;
  71   public static final int HIRAGANA          = 4;
  72   public static final int KATAKANA          = 5;
  73   public static final int HANGUL            = 6;
  74   public static final int URL               = 7;
  75   public static final int EMAIL             = 8;
  76
  77   /** String token types that correspond to token type int constants */
  78   public static final String [] TOKEN_TYPES = new String [] {
  79     StandardTokenizer.TOKEN_TYPES[StandardTokenizer.ALPHANUM],
  80     StandardTokenizer.TOKEN_TYPES[StandardTokenizer.NUM],
  81     StandardTokenizer.TOKEN_TYPES[StandardTokenizer.SOUTHEAST_ASIAN],
  82     StandardTokenizer.TOKEN_TYPES[StandardTokenizer.IDEOGRAPHIC],
  83     StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HIRAGANA],
  84     StandardTokenizer.TOKEN_TYPES[StandardTokenizer.KATAKANA],
  85     StandardTokenizer.TOKEN_TYPES[StandardTokenizer.HANGUL],
  86     "<URL>",
  87     "<EMAIL>",
  88   };
  89
  90   /** Alphanumeric sequences
  91    * @deprecated use {@link #TOKEN_TYPES} instead */
  92   @Deprecated
  93   public static final String WORD_TYPE = TOKEN_TYPES[ALPHANUM];
  94
  95   /** Numbers
  96    * @deprecated use {@link #TOKEN_TYPES} instead */
  97   @Deprecated
  98   public static final String NUMERIC_TYPE = TOKEN_TYPES[NUM];
  99
 100   /** URLs with scheme: HTTP(S), FTP, or FILE; no-scheme URLs match HTTP syntax
 101    * @deprecated use {@link #TOKEN_TYPES} instead */
 102   @Deprecated
 103   public static final String URL_TYPE = TOKEN_TYPES[URL];
 104
 105   /** E-mail addresses
 106    * @deprecated use {@link #TOKEN_TYPES} instead */
 107   @Deprecated
 108   public static final String EMAIL_TYPE = TOKEN_TYPES[EMAIL];
 109
 110   /**
 111    * Chars in class \p{Line_Break = Complex_Context} are from South East Asian
 112    * scripts (Thai, Lao, Myanmar, Khmer, etc.).  Sequences of these are kept
 113    * together as as a single token rather than broken up, because the logic
 114    * required to break them at word boundaries is too complex for UAX#29.
 115    * <p>
 116    * See Unicode Line Breaking Algorithm: http://www.unicode.org/reports/tr14/#SA
 117    * @deprecated use {@link #TOKEN_TYPES} instead
 118    */
 119   @Deprecated
 120   public static final String SOUTH_EAST_ASIAN_TYPE = TOKEN_TYPES[SOUTHEAST_ASIAN];
 121
 122   /** @deprecated use {@link #TOKEN_TYPES} instead */
 123   @Deprecated
 124   public static final String IDEOGRAPHIC_TYPE = TOKEN_TYPES[IDEOGRAPHIC];
 125
 126   /** @deprecated use {@link #TOKEN_TYPES} instead */
 127   @Deprecated
 128   public static final String HIRAGANA_TYPE = TOKEN_TYPES[HIRAGANA];
 129
 130   /** @deprecated use {@link #TOKEN_TYPES} instead */
 131   @Deprecated
 132   public static final String KATAKANA_TYPE = TOKEN_TYPES[KATAKANA];
 133
 134   /** @deprecated use {@link #TOKEN_TYPES} instead */
 135   @Deprecated
 136   public static final String HANGUL_TYPE = TOKEN_TYPES[HANGUL];
 137
 138   private int maxTokenLength = StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH;
 139
 140   /** Set the max allowed token length.  Any token longer
 141    *  than this is skipped. */
 142   public void setMaxTokenLength(int length) {
 143     this.maxTokenLength = length;
 144   }
 145
 146   /** @see #setMaxTokenLength */
 147   public int getMaxTokenLength() {
 148     return maxTokenLength;
 149   }
 150
 151   /** @deprecated use {@link #UAX29URLEmailTokenizer(Version, Reader)} instead. */
 152   @Deprecated
 153   public UAX29URLEmailTokenizer(Reader input) {
 154     this(Version.LUCENE_31, input);
 155   }
 156
 157   /** @deprecated use {@link #UAX29URLEmailTokenizer(Version, Reader)} instead. */
 158   @Deprecated
 159   public UAX29URLEmailTokenizer(InputStream input) {
 160     this(Version.LUCENE_31, new InputStreamReader(input));
 161   }
 162
 163   /** @deprecated use {@link #UAX29URLEmailTokenizer(Version, AttributeSource, Reader)} instead. */
 164   @Deprecated
 165   public UAX29URLEmailTokenizer(AttributeSource source, Reader input) {
 166     this(Version.LUCENE_31, source, input);
 167   }
 168
 169   /** @deprecated use {@link #UAX29URLEmailTokenizer(Version, AttributeSource.AttributeFactory, Reader)} instead. */
 170   @Deprecated
 171   public UAX29URLEmailTokenizer(AttributeFactory factory, Reader input) {
 172     this(Version.LUCENE_31, factory, input);
 173   }
 174
 175   /**
 176    * Creates a new instance of the UAX29URLEmailTokenizer.  Attaches
 177    * the <code>input</code> to the newly created JFlex scanner.
 178    *
 179    * @param input The input reader
 180    */
 181   public UAX29URLEmailTokenizer(Version matchVersion, Reader input) {
 182     super(input);
 183     this.scanner = getScannerFor(matchVersion, input);
 184   }
 185
 186   /**
 187    * Creates a new UAX29URLEmailTokenizer with a given {@link AttributeSource}.
 188    */
 189   public UAX29URLEmailTokenizer(Version matchVersion, AttributeSource source, Reader input) {
 190     super(source, input);
 191     this.scanner = getScannerFor(matchVersion, input);
 192   }
 193
 194   /**
 195    * Creates a new UAX29URLEmailTokenizer with a given {@link AttributeFactory}
 196    */
 197   public UAX29URLEmailTokenizer(Version matchVersion, AttributeFactory factory, Reader input) {
 198     super(factory, input);
 199     this.scanner = getScannerFor(matchVersion, input);
 200   }
 201
 202   private static StandardTokenizerInterface getScannerFor(Version matchVersion, Reader input) {
 203     if (matchVersion.onOrAfter(Version.LUCENE_34)) {
 204       return new UAX29URLEmailTokenizerImpl(input);
 205     } else {
 206       return new UAX29URLEmailTokenizerImpl31(input);
 207     }
 208   }
 209
 210   // this tokenizer generates three attributes:
 211   // term offset, positionIncrement and type
 212   private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
 213   private final OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
 214   private final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
 215   private final TypeAttribute typeAtt = addAttribute(TypeAttribute.class);
 216
 217   @Override
 218   public final boolean incrementToken() throws IOException {
 219     clearAttributes();
 220     int posIncr = 1;
 221
 222     while(true) {
 223       int tokenType = scanner.getNextToken();
 224
 225       if (tokenType == StandardTokenizerInterface.YYEOF) {
 226         return false;
 227       }
 228
 229       if (scanner.yylength() <= maxTokenLength) {
 230         posIncrAtt.setPositionIncrement(posIncr);
 231         scanner.getText(termAtt);
 232         final int start = scanner.yychar();
 233         offsetAtt.setOffset(correctOffset(start), correctOffset(start+termAtt.length()));
 234         typeAtt.setType(TOKEN_TYPES[tokenType]);
 235         return true;
 236       } else
 237         // When we skip a too-long term, we still increment the
 238         // position increment
 239         posIncr++;
 240     }
 241   }
 242
 243   @Override
 244   public final void end() {
 245     // set final offset
 246     int finalOffset = correctOffset(scanner.yychar() + scanner.yylength());
 247     offsetAtt.setOffset(finalOffset, finalOffset);
 248   }
 249
 250   @Override
 251   public void reset(Reader reader) throws IOException {
 252     super.reset(reader);
 253     scanner.yyreset(reader);
 254   }
 255 }