/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
18 package org.apache.lucene.analysis.reverse;
20 import org.apache.lucene.analysis.TokenFilter;
21 import org.apache.lucene.analysis.TokenStream;
22 import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
23 import org.apache.lucene.util.Version;
25 import java.io.IOException;
28 * Reverse token string, for example "country" => "yrtnuoc".
30 * If <code>marker</code> is supplied, then tokens will be also prepended by
31 * that character. For example, with a marker of \u0001, "country" =>
32 * "\u0001yrtnuoc". This is useful when implementing efficient leading
36 * <p>You must specify the required {@link Version}
37 * compatibility when creating ReverseStringFilter, or when using any of
40 * <li> As of 3.1, supplementary characters are handled correctly
43 public final class ReverseStringFilter extends TokenFilter {
45 private final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
46 private final char marker;
47 private final Version matchVersion;
48 private static final char NOMARKER = '\uFFFF';
51 * Example marker character: U+0001 (START OF HEADING)
53 public static final char START_OF_HEADING_MARKER = '\u0001';
56 * Example marker character: U+001F (INFORMATION SEPARATOR ONE)
58 public static final char INFORMATION_SEPARATOR_MARKER = '\u001F';
61 * Example marker character: U+EC00 (PRIVATE USE AREA: EC00)
63 public static final char PUA_EC00_MARKER = '\uEC00';
66 * Example marker character: U+200F (RIGHT-TO-LEFT MARK)
68 public static final char RTL_DIRECTION_MARKER = '\u200F';
/**
 * Create a new ReverseStringFilter that reverses all tokens in the
 * supplied {@link TokenStream}.
 * <p>
 * The reversed tokens will not be marked.
 * </p>
 *
 * @param in {@link TokenStream} to filter
 * @deprecated use {@link #ReverseStringFilter(Version, TokenStream)}
 *             instead. This constructor will be removed in Lucene 4.0
 */
@Deprecated
public ReverseStringFilter(TokenStream in) {
  // NOMARKER is the sentinel that disables marking.
  this(in, NOMARKER);
}
/**
 * Create a new ReverseStringFilter that reverses and marks all tokens in the
 * supplied {@link TokenStream}.
 * <p>
 * The reversed tokens will be prepended (marked) by the <code>marker</code>
 * character.
 * </p>
 *
 * @param in {@link TokenStream} to filter
 * @param marker A character used to mark reversed tokens
 * @deprecated use {@link #ReverseStringFilter(Version, TokenStream, char)}
 *             instead. This constructor will be removed in Lucene 4.0
 */
@Deprecated
public ReverseStringFilter(TokenStream in, char marker) {
  // Legacy callers get the 3.0 reversal behaviour.
  this(Version.LUCENE_30, in, marker);
}
/**
 * Create a new ReverseStringFilter that reverses all tokens in the
 * supplied {@link TokenStream}.
 * <p>
 * The reversed tokens will not be marked.
 * </p>
 *
 * @param matchVersion See <a href="#version">above</a>
 * @param in {@link TokenStream} to filter
 */
public ReverseStringFilter(Version matchVersion, TokenStream in) {
  // NOMARKER is the sentinel that disables marking.
  this(matchVersion, in, NOMARKER);
}
/**
 * Create a new ReverseStringFilter that reverses and marks all tokens in the
 * supplied {@link TokenStream}.
 * <p>
 * The reversed tokens will be prepended (marked) by the <code>marker</code>
 * character.
 * </p>
 *
 * @param matchVersion See <a href="#version">above</a>
 * @param in {@link TokenStream} to filter
 * @param marker A character used to mark reversed tokens
 */
public ReverseStringFilter(Version matchVersion, TokenStream in, char marker) {
  // The wrapped stream must be handed to the superclass; incrementToken()
  // pulls tokens from it through the inherited 'input' reference.
  super(in);
  this.matchVersion = matchVersion;
  this.marker = marker;
}
137 public boolean incrementToken() throws IOException {
138 if (input.incrementToken()) {
139 int len = termAtt.length();
140 if (marker != NOMARKER) {
142 termAtt.resizeBuffer(len);
143 termAtt.buffer()[len - 1] = marker;
145 reverse( matchVersion, termAtt.buffer(), 0, len );
146 termAtt.setLength(len);
154 * Reverses the given input string
156 * @param input the string to reverse
157 * @return the given input string in reversed order
158 * @deprecated use {@link #reverse(Version, String)} instead. This method
159 * will be removed in Lucene 4.0
162 public static String reverse( final String input ){
163 return reverse(Version.LUCENE_30, input);
167 * Reverses the given input string
169 * @param matchVersion See <a href="#version">above</a>
170 * @param input the string to reverse
171 * @return the given input string in reversed order
173 public static String reverse( Version matchVersion, final String input ){
174 final char[] charInput = input.toCharArray();
175 reverse( matchVersion, charInput, 0, charInput.length );
176 return new String( charInput );
180 * Reverses the given input buffer in-place
181 * @param buffer the input char array to reverse
182 * @deprecated use {@link #reverse(Version, char[])} instead. This
183 * method will be removed in Lucene 4.0
186 public static void reverse( final char[] buffer ){
187 reverse( buffer, 0, buffer.length );
191 * Reverses the given input buffer in-place
192 * @param matchVersion See <a href="#version">above</a>
193 * @param buffer the input char array to reverse
195 public static void reverse(Version matchVersion, final char[] buffer) {
196 reverse(matchVersion, buffer, 0, buffer.length);
200 * Partially reverses the given input buffer in-place from offset 0
201 * up to the given length.
202 * @param buffer the input char array to reverse
203 * @param len the length in the buffer up to where the
204 * buffer should be reversed
205 * @deprecated use {@link #reverse(Version, char[], int)} instead. This
206 * method will be removed in Lucene 4.0
209 public static void reverse( final char[] buffer, final int len ){
210 reverse( buffer, 0, len );
214 * Partially reverses the given input buffer in-place from offset 0
215 * up to the given length.
216 * @param matchVersion See <a href="#version">above</a>
217 * @param buffer the input char array to reverse
218 * @param len the length in the buffer up to where the
219 * buffer should be reversed
221 public static void reverse(Version matchVersion, final char[] buffer,
223 reverse( matchVersion, buffer, 0, len );
227 * Partially reverses the given input buffer in-place from the given offset
228 * up to the given length.
229 * @param buffer the input char array to reverse
230 * @param start the offset from where to reverse the buffer
231 * @param len the length in the buffer up to where the
232 * buffer should be reversed
233 * @deprecated use {@link #reverse(Version, char[], int, int)} instead. This
234 * method will be removed in Lucene 4.0
237 public static void reverse(char[] buffer, int start, int len ) {
238 reverseUnicode3(buffer, start, len);
242 * @deprecated Remove this when support for 3.0 indexes is no longer needed.
245 private static void reverseUnicode3( char[] buffer, int start, int len ){
246 if( len <= 1 ) return;
248 for( int i = start; i < ( start + num ); i++ ){
250 buffer[i] = buffer[start * 2 + len - i - 1];
251 buffer[start * 2 + len - i - 1] = c;
256 * Partially reverses the given input buffer in-place from the given offset
257 * up to the given length.
258 * @param matchVersion See <a href="#version">above</a>
259 * @param buffer the input char array to reverse
260 * @param start the offset from where to reverse the buffer
261 * @param len the length in the buffer up to where the
262 * buffer should be reversed
264 public static void reverse(Version matchVersion, final char[] buffer,
265 final int start, final int len) {
266 if (!matchVersion.onOrAfter(Version.LUCENE_31)) {
267 reverseUnicode3(buffer, start, len);
270 /* modified version of Apache Harmony AbstractStringBuilder reverse0() */
273 int end = (start + len) - 1;
274 char frontHigh = buffer[start];
275 char endLow = buffer[end];
276 boolean allowFrontSur = true, allowEndSur = true;
277 final int mid = start + (len >> 1);
278 for (int i = start; i < mid; ++i, --end) {
279 final char frontLow = buffer[i + 1];
280 final char endHigh = buffer[end - 1];
281 final boolean surAtFront = allowFrontSur
282 && Character.isSurrogatePair(frontHigh, frontLow);
283 if (surAtFront && (len < 3)) {
284 // nothing to do since surAtFront is allowed and 1 char left
287 final boolean surAtEnd = allowEndSur
288 && Character.isSurrogatePair(endHigh, endLow);
289 allowFrontSur = allowEndSur = true;
290 if (surAtFront == surAtEnd) {
293 buffer[end] = frontLow;
294 buffer[--end] = frontHigh;
296 buffer[++i] = endLow;
297 frontHigh = buffer[i + 1];
298 endLow = buffer[end - 1];
300 // neither surrogates
301 buffer[end] = frontHigh;
303 frontHigh = frontLow;
308 // surrogate only at the front
309 buffer[end] = frontLow;
312 allowFrontSur = false;
314 // surrogate only at the end
315 buffer[end] = frontHigh;
317 frontHigh = frontLow;
322 if ((len & 0x01) == 1 && !(allowFrontSur && allowEndSur)) {
323 // only if odd length
324 buffer[end] = allowFrontSur ? endLow : frontHigh;