1 package org.apache.lucene.analysis;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.Reader;
21 import java.io.IOException;
22 import java.io.Closeable;
23 import java.lang.reflect.Modifier;
25 import org.apache.lucene.util.CloseableThreadLocal;
26 import org.apache.lucene.store.AlreadyClosedException;
28 import org.apache.lucene.document.Fieldable;
/** An Analyzer builds TokenStreams, which analyze text.  It thus represents a
 * policy for extracting index terms from text.
 * <p>
 * Typical implementations first build a Tokenizer, which breaks the stream of
 * characters from the Reader into raw Tokens.  One or more TokenFilters may
 * then be applied to the output of the Tokenizer.
 * <p>The {@code Analyzer}-API in Lucene is based on the decorator pattern.
 * Therefore all non-abstract subclasses must be final or their {@link #tokenStream}
 * and {@link #reusableTokenStream} implementations must be final! This is checked
 * when Java assertions are enabled.
 */
41 public abstract class Analyzer implements Closeable {
43 protected Analyzer() {
48 private boolean assertFinal() {
50 final Class<?> clazz = getClass();
51 if (!clazz.desiredAssertionStatus())
53 assert clazz.isAnonymousClass() ||
54 (clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0 ||
56 Modifier.isFinal(clazz.getMethod("tokenStream", String.class, Reader.class).getModifiers()) &&
57 Modifier.isFinal(clazz.getMethod("reusableTokenStream", String.class, Reader.class).getModifiers())
59 "Analyzer implementation classes or at least their tokenStream() and reusableTokenStream() implementations must be final";
61 } catch (NoSuchMethodException nsme) {
66 /** Creates a TokenStream which tokenizes all the text in the provided
67 * Reader. Must be able to handle null field name for
68 * backward compatibility.
70 public abstract TokenStream tokenStream(String fieldName, Reader reader);
72 /** Creates a TokenStream that is allowed to be re-used
73 * from the previous time that the same thread called
74 * this method. Callers that do not need to use more
75 * than one TokenStream at the same time from this
76 * analyzer should use this method for better
79 public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
80 return tokenStream(fieldName, reader);
83 private CloseableThreadLocal<Object> tokenStreams = new CloseableThreadLocal<Object>();
85 /** Used by Analyzers that implement reusableTokenStream
86 * to retrieve previously saved TokenStreams for re-use
87 * by the same thread. */
88 protected Object getPreviousTokenStream() {
90 return tokenStreams.get();
91 } catch (NullPointerException npe) {
92 if (tokenStreams == null) {
93 throw new AlreadyClosedException("this Analyzer is closed");
100 /** Used by Analyzers that implement reusableTokenStream
101 * to save a TokenStream for later re-use by the same
103 protected void setPreviousTokenStream(Object obj) {
105 tokenStreams.set(obj);
106 } catch (NullPointerException npe) {
107 if (tokenStreams == null) {
108 throw new AlreadyClosedException("this Analyzer is closed");
116 * Invoked before indexing a Fieldable instance if
117 * terms have already been added to that field. This allows custom
118 * analyzers to place an automatic position increment gap between
119 * Fieldable instances using the same field name. The default value
120 * position increment gap is 0. With a 0 position increment gap and
121 * the typical default token position increment of 1, all terms in a field,
122 * including across Fieldable instances, are in successive positions, allowing
123 * exact PhraseQuery matches, for instance, across Fieldable instance boundaries.
125 * @param fieldName Fieldable name being indexed.
126 * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
128 public int getPositionIncrementGap(String fieldName) {
133 * Just like {@link #getPositionIncrementGap}, except for
134 * Token offsets instead. By default this returns 1 for
135 * tokenized fields and, as if the fields were joined
136 * with an extra space character, and 0 for un-tokenized
137 * fields. This method is only called if the field
138 * produced at least one token for indexing.
140 * @param field the field just indexed
141 * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
143 public int getOffsetGap(Fieldable field) {
144 if (field.isTokenized())
150 /** Frees persistent resources used by this Analyzer */
151 public void close() {
152 tokenStreams.close();