1 package org.apache.lucene.queryParser.standard.parser;
4 * Licensed to the Apache Software Foundation (ASF) under one or more
5 * contributor license agreements. See the NOTICE file distributed with
6 * this work for additional information regarding copyright ownership.
7 * The ASF licenses this file to You under the Apache License, Version 2.0
8 * (the "License"); you may not use this file except in compliance with
9 * the License. You may obtain a copy of the License at
11 * http://www.apache.org/licenses/LICENSE-2.0
13 * Unless required by applicable law or agreed to in writing, software
14 * distributed under the License is distributed on an "AS IS" BASIS,
15 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 * See the License for the specific language governing permissions and
17 * limitations under the License.
20 import java.util.Locale;
22 import org.apache.lucene.messages.MessageImpl;
23 import org.apache.lucene.queryParser.core.messages.QueryParserMessages;
24 import org.apache.lucene.queryParser.core.parser.EscapeQuerySyntax;
25 import org.apache.lucene.queryParser.core.util.UnescapedCharSequence;
29 public class EscapeQuerySyntaxImpl implements EscapeQuerySyntax {
31 private static final char[] wildcardChars = { '*', '?' };
33 private static final String[] escapableTermExtraFirstChars = { "+", "-", "@" };
35 private static final String[] escapableTermChars = { "\"", "<", ">", "=",
36 "!", "(", ")", "^", "[", "{", ":", "]", "}", "~" };
38 // TODO: check what to do with these "*", "?", "\\"
39 private static final String[] escapableQuotedChars = { "\"" };
40 private static final String[] escapableWhiteChars = { " ", "\t", "\n", "\r",
41 "\f", "\b", "\u3000" };
42 private static final String[] escapableWordTokens = { "AND", "OR", "NOT",
43 "TO", "WITHIN", "SENTENCE", "PARAGRAPH", "INORDER" };
45 private static final CharSequence escapeChar(CharSequence str, Locale locale) {
46 if (str == null || str.length() == 0)
49 CharSequence buffer = str;
51 // regular escapable Char for terms
52 for (int i = 0; i < escapableTermChars.length; i++) {
53 buffer = replaceIgnoreCase(buffer, escapableTermChars[i].toLowerCase(),
57 // First Character of a term as more escaping chars
58 for (int i = 0; i < escapableTermExtraFirstChars.length; i++) {
59 if (buffer.charAt(0) == escapableTermExtraFirstChars[i].charAt(0)) {
60 buffer = "\\" + buffer.charAt(0)
61 + buffer.subSequence(1, buffer.length());
69 private final CharSequence escapeQuoted(CharSequence str, Locale locale) {
70 if (str == null || str.length() == 0)
73 CharSequence buffer = str;
75 for (int i = 0; i < escapableQuotedChars.length; i++) {
76 buffer = replaceIgnoreCase(buffer, escapableTermChars[i].toLowerCase(),
82 private static final CharSequence escapeTerm(CharSequence term, Locale locale) {
86 // Escape single Chars
87 term = escapeChar(term, locale);
88 term = escapeWhiteChar(term, locale);
90 // Escape Parser Words
91 for (int i = 0; i < escapableWordTokens.length; i++) {
92 if (escapableWordTokens[i].equalsIgnoreCase(term.toString()))
99 * replace with ignore case
102 * string to get replaced
104 * the old character sequence in lowercase
106 * the new character to prefix sequence1 in return string.
107 * @return the new String
109 private static CharSequence replaceIgnoreCase(CharSequence string,
110 CharSequence sequence1, CharSequence escapeChar, Locale locale) {
111 if (escapeChar == null || sequence1 == null || string == null)
112 throw new NullPointerException();
115 int count = string.length();
116 int sequence1Length = sequence1.length();
117 if (sequence1Length == 0) {
118 StringBuilder result = new StringBuilder((count + 1)
119 * escapeChar.length());
120 result.append(escapeChar);
121 for (int i = 0; i < count; i++) {
122 result.append(string.charAt(i));
123 result.append(escapeChar);
125 return result.toString();
129 StringBuilder result = new StringBuilder();
130 char first = sequence1.charAt(0);
131 int start = 0, copyStart = 0, firstIndex;
132 while (start < count) {
133 if ((firstIndex = string.toString().toLowerCase(locale).indexOf(first,
136 boolean found = true;
137 if (sequence1.length() > 1) {
138 if (firstIndex + sequence1Length > count)
140 for (int i = 1; i < sequence1Length; i++) {
141 if (string.toString().toLowerCase(locale).charAt(firstIndex + i) != sequence1
149 result.append(string.toString().substring(copyStart, firstIndex));
150 result.append(escapeChar);
151 result.append(string.toString().substring(firstIndex,
152 firstIndex + sequence1Length));
153 copyStart = start = firstIndex + sequence1Length;
155 start = firstIndex + 1;
158 if (result.length() == 0 && copyStart == 0)
160 result.append(string.toString().substring(copyStart));
161 return result.toString();
165 * escape all tokens that are part of the parser syntax on a given string
168 * string to get replaced
170 * locale to be used when performing string compares
171 * @return the new String
173 private static final CharSequence escapeWhiteChar(CharSequence str,
175 if (str == null || str.length() == 0)
178 CharSequence buffer = str;
180 for (int i = 0; i < escapableWhiteChars.length; i++) {
181 buffer = replaceIgnoreCase(buffer, escapableWhiteChars[i].toLowerCase(),
187 public CharSequence escape(CharSequence text, Locale locale, Type type) {
188 if (text == null || text.length() == 0)
191 // escape wildcards and the escape char (this has to be perform before
193 // since we need to preserve the UnescapedCharSequence and escape the
194 // original escape chars
195 if (text instanceof UnescapedCharSequence) {
196 text = ((UnescapedCharSequence) text).toStringEscaped(wildcardChars);
198 text = new UnescapedCharSequence(text).toStringEscaped(wildcardChars);
201 if (type == Type.STRING) {
202 return escapeQuoted(text, locale);
204 return escapeTerm(text, locale);
209 * Returns a String where the escape char has been removed, or kept only once
210 * if there was a double escape.
212 * Supports escaped unicode characters, e. g. translates <code>A</code> to
216 public static UnescapedCharSequence discardEscapeChar(CharSequence input)
217 throws ParseException {
218 // Create char array to hold unescaped char sequence
219 char[] output = new char[input.length()];
220 boolean[] wasEscaped = new boolean[input.length()];
222 // The length of the output can be less than the input
223 // due to discarded escape chars. This variable holds
224 // the actual length of the output
227 // We remember whether the last processed character was
228 // an escape character
229 boolean lastCharWasEscapeChar = false;
231 // The multiplier the current unicode digit must be multiplied with.
232 // E. g. the first digit must be multiplied with 16^3, the second with
234 int codePointMultiplier = 0;
236 // Used to calculate the codepoint of the escaped unicode character
239 for (int i = 0; i < input.length(); i++) {
240 char curChar = input.charAt(i);
241 if (codePointMultiplier > 0) {
242 codePoint += hexToInt(curChar) * codePointMultiplier;
243 codePointMultiplier >>>= 4;
244 if (codePointMultiplier == 0) {
245 output[length++] = (char) codePoint;
248 } else if (lastCharWasEscapeChar) {
249 if (curChar == 'u') {
250 // found an escaped unicode character
251 codePointMultiplier = 16 * 16 * 16;
253 // this character was escaped
254 output[length] = curChar;
255 wasEscaped[length] = true;
258 lastCharWasEscapeChar = false;
260 if (curChar == '\\') {
261 lastCharWasEscapeChar = true;
263 output[length] = curChar;
269 if (codePointMultiplier > 0) {
270 throw new ParseException(new MessageImpl(
271 QueryParserMessages.INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION));
274 if (lastCharWasEscapeChar) {
275 throw new ParseException(new MessageImpl(
276 QueryParserMessages.INVALID_SYNTAX_ESCAPE_CHARACTER));
279 return new UnescapedCharSequence(output, wasEscaped, 0, length);
282 /** Returns the numeric value of the hexadecimal character */
283 private static final int hexToInt(char c) throws ParseException {
284 if ('0' <= c && c <= '9') {
286 } else if ('a' <= c && c <= 'f') {
288 } else if ('A' <= c && c <= 'F') {
291 throw new ParseException(new MessageImpl(
292 QueryParserMessages.INVALID_SYNTAX_ESCAPE_NONE_HEX_UNICODE, c));