+++ /dev/null
-package org.apache.lucene.queryParser.standard.parser;
-
-/**
- * Licensed to the Apache Software Foundation (ASF) under one or more
- * contributor license agreements. See the NOTICE file distributed with
- * this work for additional information regarding copyright ownership.
- * The ASF licenses this file to You under the Apache License, Version 2.0
- * (the "License"); you may not use this file except in compliance with
- * the License. You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.util.Locale;
-
-import org.apache.lucene.messages.MessageImpl;
-import org.apache.lucene.queryParser.core.messages.QueryParserMessages;
-import org.apache.lucene.queryParser.core.parser.EscapeQuerySyntax;
-import org.apache.lucene.queryParser.core.util.UnescapedCharSequence;
-
-/**
- */
-public class EscapeQuerySyntaxImpl implements EscapeQuerySyntax {
-
- private static final char[] wildcardChars = { '*', '?' };
-
- private static final String[] escapableTermExtraFirstChars = { "+", "-", "@" };
-
- private static final String[] escapableTermChars = { "\"", "<", ">", "=",
- "!", "(", ")", "^", "[", "{", ":", "]", "}", "~" };
-
- // TODO: check what to do with these "*", "?", "\\"
- private static final String[] escapableQuotedChars = { "\"" };
- private static final String[] escapableWhiteChars = { " ", "\t", "\n", "\r",
- "\f", "\b", "\u3000" };
- private static final String[] escapableWordTokens = { "AND", "OR", "NOT",
- "TO", "WITHIN", "SENTENCE", "PARAGRAPH", "INORDER" };
-
- private static final CharSequence escapeChar(CharSequence str, Locale locale) {
- if (str == null || str.length() == 0)
- return str;
-
- CharSequence buffer = str;
-
- // regular escapable Char for terms
- for (int i = 0; i < escapableTermChars.length; i++) {
- buffer = replaceIgnoreCase(buffer, escapableTermChars[i].toLowerCase(),
- "\\", locale);
- }
-
- // First Character of a term as more escaping chars
- for (int i = 0; i < escapableTermExtraFirstChars.length; i++) {
- if (buffer.charAt(0) == escapableTermExtraFirstChars[i].charAt(0)) {
- buffer = "\\" + buffer.charAt(0)
- + buffer.subSequence(1, buffer.length());
- break;
- }
- }
-
- return buffer;
- }
-
- private final CharSequence escapeQuoted(CharSequence str, Locale locale) {
- if (str == null || str.length() == 0)
- return str;
-
- CharSequence buffer = str;
-
- for (int i = 0; i < escapableQuotedChars.length; i++) {
- buffer = replaceIgnoreCase(buffer, escapableTermChars[i].toLowerCase(),
- "\\", locale);
- }
- return buffer;
- }
-
- private static final CharSequence escapeTerm(CharSequence term, Locale locale) {
- if (term == null)
- return term;
-
- // Escape single Chars
- term = escapeChar(term, locale);
- term = escapeWhiteChar(term, locale);
-
- // Escape Parser Words
- for (int i = 0; i < escapableWordTokens.length; i++) {
- if (escapableWordTokens[i].equalsIgnoreCase(term.toString()))
- return "\\" + term;
- }
- return term;
- }
-
- /**
- * replace with ignore case
- *
- * @param string
- * string to get replaced
- * @param sequence1
- * the old character sequence in lowercase
- * @param escapeChar
- * the new character to prefix sequence1 in return string.
- * @return the new String
- */
- private static CharSequence replaceIgnoreCase(CharSequence string,
- CharSequence sequence1, CharSequence escapeChar, Locale locale) {
- if (escapeChar == null || sequence1 == null || string == null)
- throw new NullPointerException();
-
- // empty string case
- int count = string.length();
- int sequence1Length = sequence1.length();
- if (sequence1Length == 0) {
- StringBuilder result = new StringBuilder((count + 1)
- * escapeChar.length());
- result.append(escapeChar);
- for (int i = 0; i < count; i++) {
- result.append(string.charAt(i));
- result.append(escapeChar);
- }
- return result.toString();
- }
-
- // normal case
- StringBuilder result = new StringBuilder();
- char first = sequence1.charAt(0);
- int start = 0, copyStart = 0, firstIndex;
- while (start < count) {
- if ((firstIndex = string.toString().toLowerCase(locale).indexOf(first,
- start)) == -1)
- break;
- boolean found = true;
- if (sequence1.length() > 1) {
- if (firstIndex + sequence1Length > count)
- break;
- for (int i = 1; i < sequence1Length; i++) {
- if (string.toString().toLowerCase(locale).charAt(firstIndex + i) != sequence1
- .charAt(i)) {
- found = false;
- break;
- }
- }
- }
- if (found) {
- result.append(string.toString().substring(copyStart, firstIndex));
- result.append(escapeChar);
- result.append(string.toString().substring(firstIndex,
- firstIndex + sequence1Length));
- copyStart = start = firstIndex + sequence1Length;
- } else {
- start = firstIndex + 1;
- }
- }
- if (result.length() == 0 && copyStart == 0)
- return string;
- result.append(string.toString().substring(copyStart));
- return result.toString();
- }
-
- /**
- * escape all tokens that are part of the parser syntax on a given string
- *
- * @param str
- * string to get replaced
- * @param locale
- * locale to be used when performing string compares
- * @return the new String
- */
- private static final CharSequence escapeWhiteChar(CharSequence str,
- Locale locale) {
- if (str == null || str.length() == 0)
- return str;
-
- CharSequence buffer = str;
-
- for (int i = 0; i < escapableWhiteChars.length; i++) {
- buffer = replaceIgnoreCase(buffer, escapableWhiteChars[i].toLowerCase(),
- "\\", locale);
- }
- return buffer;
- }
-
- public CharSequence escape(CharSequence text, Locale locale, Type type) {
- if (text == null || text.length() == 0)
- return text;
-
- // escape wildcards and the escape char (this has to be perform before
- // anything else)
- // since we need to preserve the UnescapedCharSequence and escape the
- // original escape chars
- if (text instanceof UnescapedCharSequence) {
- text = ((UnescapedCharSequence) text).toStringEscaped(wildcardChars);
- } else {
- text = new UnescapedCharSequence(text).toStringEscaped(wildcardChars);
- }
-
- if (type == Type.STRING) {
- return escapeQuoted(text, locale);
- } else {
- return escapeTerm(text, locale);
- }
- }
-
- /**
- * Returns a String where the escape char has been removed, or kept only once
- * if there was a double escape.
- *
- * Supports escaped unicode characters, e. g. translates <code>A</code> to
- * <code>A</code>.
- *
- */
- public static UnescapedCharSequence discardEscapeChar(CharSequence input)
- throws ParseException {
- // Create char array to hold unescaped char sequence
- char[] output = new char[input.length()];
- boolean[] wasEscaped = new boolean[input.length()];
-
- // The length of the output can be less than the input
- // due to discarded escape chars. This variable holds
- // the actual length of the output
- int length = 0;
-
- // We remember whether the last processed character was
- // an escape character
- boolean lastCharWasEscapeChar = false;
-
- // The multiplier the current unicode digit must be multiplied with.
- // E. g. the first digit must be multiplied with 16^3, the second with
- // 16^2...
- int codePointMultiplier = 0;
-
- // Used to calculate the codepoint of the escaped unicode character
- int codePoint = 0;
-
- for (int i = 0; i < input.length(); i++) {
- char curChar = input.charAt(i);
- if (codePointMultiplier > 0) {
- codePoint += hexToInt(curChar) * codePointMultiplier;
- codePointMultiplier >>>= 4;
- if (codePointMultiplier == 0) {
- output[length++] = (char) codePoint;
- codePoint = 0;
- }
- } else if (lastCharWasEscapeChar) {
- if (curChar == 'u') {
- // found an escaped unicode character
- codePointMultiplier = 16 * 16 * 16;
- } else {
- // this character was escaped
- output[length] = curChar;
- wasEscaped[length] = true;
- length++;
- }
- lastCharWasEscapeChar = false;
- } else {
- if (curChar == '\\') {
- lastCharWasEscapeChar = true;
- } else {
- output[length] = curChar;
- length++;
- }
- }
- }
-
- if (codePointMultiplier > 0) {
- throw new ParseException(new MessageImpl(
- QueryParserMessages.INVALID_SYNTAX_ESCAPE_UNICODE_TRUNCATION));
- }
-
- if (lastCharWasEscapeChar) {
- throw new ParseException(new MessageImpl(
- QueryParserMessages.INVALID_SYNTAX_ESCAPE_CHARACTER));
- }
-
- return new UnescapedCharSequence(output, wasEscaped, 0, length);
- }
-
- /** Returns the numeric value of the hexadecimal character */
- private static final int hexToInt(char c) throws ParseException {
- if ('0' <= c && c <= '9') {
- return c - '0';
- } else if ('a' <= c && c <= 'f') {
- return c - 'a' + 10;
- } else if ('A' <= c && c <= 'F') {
- return c - 'A' + 10;
- } else {
- throw new ParseException(new MessageImpl(
- QueryParserMessages.INVALID_SYNTAX_ESCAPE_NONE_HEX_UNICODE, c));
- }
- }
-
-}