1 package org.apache.lucene.analysis;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.io.IOException;
21 import java.lang.ref.WeakReference;
22 import java.util.Iterator;
23 import java.util.LinkedList;
24 import java.util.List;
26 import org.apache.lucene.util.AttributeImpl;
27 import org.apache.lucene.util.AttributeSource;
30 * This TokenFilter provides the ability to set aside attribute states
31 * that have already been analyzed. This is useful in situations where multiple fields share
32 * many common analysis steps and then go their separate ways.
34 * It is also useful for doing things like entity extraction or proper noun analysis as
35 * part of the analysis workflow and saving off those tokens for use in another field.
38 TeeSinkTokenFilter source1 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader1));
39 TeeSinkTokenFilter.SinkTokenStream sink1 = source1.newSinkTokenStream();
40 TeeSinkTokenFilter.SinkTokenStream sink2 = source1.newSinkTokenStream();
42 TeeSinkTokenFilter source2 = new TeeSinkTokenFilter(new WhitespaceTokenizer(reader2));
43 source2.addSinkTokenStream(sink1);
44 source2.addSinkTokenStream(sink2);
46 TokenStream final1 = new LowerCaseFilter(source1);
47 TokenStream final2 = source2;
48 TokenStream final3 = new EntityDetect(sink1);
49 TokenStream final4 = new URLDetect(sink2);
51 d.add(new Field("f1", final1));
52 d.add(new Field("f2", final2));
53 d.add(new Field("f3", final3));
54 d.add(new Field("f4", final4));
56 * In this example, <code>sink1</code> and <code>sink2</code> will both get tokens from both
57 * <code>reader1</code> and <code>reader2</code> after whitespace tokenizer
58 * and now we can further wrap any of these in extra analysis, and more "sources" can be inserted if desired.
* It is important that tees are consumed before sinks (in the above example, the tee field names must
* sort before the sink field names). If you are not sure which stream is consumed first, you can simply
* add another sink and then pass all tokens to the sinks at once using {@link #consumeAllTokens}.
* This TokenFilter is exhausted after that call. For example, change the code above to:
66 TokenStream final1 = new LowerCaseFilter(source1.newSinkTokenStream());
67 TokenStream final2 = source2.newSinkTokenStream();
68 sink1.consumeAllTokens();
69 sink2.consumeAllTokens();
72 * In this case, the fields can be added in any order, because the sources are not used anymore and all sinks are ready.
73 * <p>Note, the EntityDetect and URLDetect TokenStreams are for the example and do not currently exist in Lucene.
public final class TeeSinkTokenFilter extends TokenFilter {
// Sinks are held via WeakReference so a sink the caller no longer references
// can be garbage collected instead of being fed states forever.
private final List<WeakReference<SinkTokenStream>> sinks = new LinkedList<WeakReference<SinkTokenStream>>();
/**
 * Instantiates a new TeeSinkTokenFilter that forwards all tokens of the given
 * input stream while offering each consumed state to the registered sinks.
 *
 * @param input the token stream to wrap
 */
public TeeSinkTokenFilter(TokenStream input) {
  super(input);
}
86 * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream.
88 public SinkTokenStream newSinkTokenStream() {
89 return newSinkTokenStream(ACCEPT_ALL_FILTER);
93 * Returns a new {@link SinkTokenStream} that receives all tokens consumed by this stream
94 * that pass the supplied filter.
97 public SinkTokenStream newSinkTokenStream(SinkFilter filter) {
98 SinkTokenStream sink = new SinkTokenStream(this.cloneAttributes(), filter);
99 this.sinks.add(new WeakReference<SinkTokenStream>(sink));
104 * Adds a {@link SinkTokenStream} created by another <code>TeeSinkTokenFilter</code>
105 * to this one. The supplied stream will also receive all consumed tokens.
106 * This method can be used to pass tokens from two different tees to one sink.
108 public void addSinkTokenStream(final SinkTokenStream sink) {
109 // check that sink has correct factory
110 if (!this.getAttributeFactory().equals(sink.getAttributeFactory())) {
111 throw new IllegalArgumentException("The supplied sink is not compatible to this tee");
113 // add eventually missing attribute impls to the existing sink
114 for (Iterator<AttributeImpl> it = this.cloneAttributes().getAttributeImplsIterator(); it.hasNext(); ) {
115 sink.addAttributeImpl(it.next());
117 this.sinks.add(new WeakReference<SinkTokenStream>(sink));
121 * <code>TeeSinkTokenFilter</code> passes all tokens to the added sinks
122 * when itself is consumed. To be sure, that all tokens from the input
123 * stream are passed to the sinks, you can call this methods.
124 * This instance is exhausted after this, but all sinks are instant available.
126 public void consumeAllTokens() throws IOException {
127 while (incrementToken()) {}
131 public boolean incrementToken() throws IOException {
132 if (input.incrementToken()) {
133 // capture state lazily - maybe no SinkFilter accepts this state
134 AttributeSource.State state = null;
135 for (WeakReference<SinkTokenStream> ref : sinks) {
136 final SinkTokenStream sink = ref.get();
138 if (sink.accept(this)) {
140 state = this.captureState();
142 sink.addState(state);
153 public final void end() throws IOException {
155 AttributeSource.State finalState = captureState();
156 for (WeakReference<SinkTokenStream> ref : sinks) {
157 final SinkTokenStream sink = ref.get();
159 sink.setFinalState(finalState);
165 * A filter that decides which {@link AttributeSource} states to store in the sink.
167 public static abstract class SinkFilter {
169 * Returns true, iff the current state of the passed-in {@link AttributeSource} shall be stored
172 public abstract boolean accept(AttributeSource source);
175 * Called by {@link SinkTokenStream#reset()}. This method does nothing by default
176 * and can optionally be overridden.
178 public void reset() throws IOException {
179 // nothing to do; can be overridden
183 public static final class SinkTokenStream extends TokenStream {
184 private final List<AttributeSource.State> cachedStates = new LinkedList<AttributeSource.State>();
185 private AttributeSource.State finalState;
186 private Iterator<AttributeSource.State> it = null;
187 private SinkFilter filter;
189 private SinkTokenStream(AttributeSource source, SinkFilter filter) {
191 this.filter = filter;
194 private boolean accept(AttributeSource source) {
195 return filter.accept(source);
198 private void addState(AttributeSource.State state) {
200 throw new IllegalStateException("The tee must be consumed before sinks are consumed.");
202 cachedStates.add(state);
205 private void setFinalState(AttributeSource.State finalState) {
206 this.finalState = finalState;
210 public final boolean incrementToken() throws IOException {
211 // lazy init the iterator
213 it = cachedStates.iterator();
220 AttributeSource.State state = it.next();
226 public final void end() throws IOException {
227 if (finalState != null) {
228 restoreState(finalState);
233 public final void reset() {
234 it = cachedStates.iterator();
238 private static final SinkFilter ACCEPT_ALL_FILTER = new SinkFilter() {
240 public boolean accept(AttributeSource source) {