1 package org.apache.lucene.analysis;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.Reader;
21 import java.io.IOException;
22 import java.io.Closeable;
23 import java.lang.reflect.Modifier;
25 import org.apache.lucene.util.CloseableThreadLocal;
26 import org.apache.lucene.store.AlreadyClosedException;
28 import org.apache.lucene.document.Fieldable;
/** An Analyzer builds TokenStreams, which analyze text.  It thus represents a
 *  policy for extracting index terms from text.
 *  <p>
 *  Typical implementations first build a Tokenizer, which breaks the stream of
 *  characters from the Reader into raw Tokens.  One or more TokenFilters may
 *  then be applied to the output of the Tokenizer.
 *  <p>The {@code Analyzer}-API in Lucene is based on the decorator pattern.
 *  Therefore all non-abstract subclasses must be final or their {@link #tokenStream}
 *  and {@link #reusableTokenStream} implementations must be final! This is checked
 *  when Java assertions are enabled.
 */
41 public abstract class Analyzer implements Closeable {
43 protected Analyzer() {
48 private boolean assertFinal() {
50 final Class<?> clazz = getClass();
51 assert clazz.isAnonymousClass() ||
52 (clazz.getModifiers() & (Modifier.FINAL | Modifier.PRIVATE)) != 0 ||
54 Modifier.isFinal(clazz.getMethod("tokenStream", String.class, Reader.class).getModifiers()) &&
55 Modifier.isFinal(clazz.getMethod("reusableTokenStream", String.class, Reader.class).getModifiers())
57 "Analyzer implementation classes or at least their tokenStream() and reusableTokenStream() implementations must be final";
59 } catch (NoSuchMethodException nsme) {
  /** Creates a TokenStream which tokenizes all the text in the provided
   * Reader.  Must be able to handle null field name for
   * backward compatibility.
   *
   * @param fieldName name of the field the text belongs to (may be null)
   * @param reader source of the text to tokenize
   * @return a new TokenStream over the reader's contents
   */
  public abstract TokenStream tokenStream(String fieldName, Reader reader);
70 /** Creates a TokenStream that is allowed to be re-used
71 * from the previous time that the same thread called
72 * this method. Callers that do not need to use more
73 * than one TokenStream at the same time from this
74 * analyzer should use this method for better
77 public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
78 return tokenStream(fieldName, reader);
  // Per-thread storage for saved TokenStreams (see get/setPreviousTokenStream).
  // NOTE(review): the null checks in get/setPreviousTokenStream imply this
  // field becomes null once the Analyzer is closed — confirm against close().
  private CloseableThreadLocal<Object> tokenStreams = new CloseableThreadLocal<Object>();
83 /** Used by Analyzers that implement reusableTokenStream
84 * to retrieve previously saved TokenStreams for re-use
85 * by the same thread. */
86 protected Object getPreviousTokenStream() {
88 return tokenStreams.get();
89 } catch (NullPointerException npe) {
90 if (tokenStreams == null) {
91 throw new AlreadyClosedException("this Analyzer is closed");
98 /** Used by Analyzers that implement reusableTokenStream
99 * to save a TokenStream for later re-use by the same
101 protected void setPreviousTokenStream(Object obj) {
103 tokenStreams.set(obj);
104 } catch (NullPointerException npe) {
105 if (tokenStreams == null) {
106 throw new AlreadyClosedException("this Analyzer is closed");
114 * Invoked before indexing a Fieldable instance if
115 * terms have already been added to that field. This allows custom
116 * analyzers to place an automatic position increment gap between
117 * Fieldable instances using the same field name. The default value
118 * position increment gap is 0. With a 0 position increment gap and
119 * the typical default token position increment of 1, all terms in a field,
120 * including across Fieldable instances, are in successive positions, allowing
121 * exact PhraseQuery matches, for instance, across Fieldable instance boundaries.
123 * @param fieldName Fieldable name being indexed.
124 * @return position increment gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
126 public int getPositionIncrementGap(String fieldName) {
131 * Just like {@link #getPositionIncrementGap}, except for
132 * Token offsets instead. By default this returns 1 for
133 * tokenized fields and, as if the fields were joined
134 * with an extra space character, and 0 for un-tokenized
135 * fields. This method is only called if the field
136 * produced at least one token for indexing.
138 * @param field the field just indexed
139 * @return offset gap, added to the next token emitted from {@link #tokenStream(String,Reader)}
141 public int getOffsetGap(Fieldable field) {
142 if (field.isTokenized())
148 /** Frees persistent resources used by this Analyzer */
149 public void close() {
150 tokenStreams.close();