X-Git-Url: https://git.mdrn.pl/pylucene.git/blobdiff_plain/a2e61f0c04805cfcb8706176758d1283c7e3a55c..aaeed5504b982cf3545252ab528713250aa33eed:/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java?ds=sidebyside diff --git a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java b/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java deleted file mode 100644 index 7750114..0000000 --- a/lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java +++ /dev/null @@ -1,179 +0,0 @@ -package org.apache.lucene.analysis.synonym; - -/** - * Licensed to the Apache Software Foundation (ASF) under one or more - * contributor license agreements. See the NOTICE file distributed with - * this work for additional information regarding copyright ownership. - * The ASF licenses this file to You under the Apache License, Version 2.0 - * (the "License"); you may not use this file except in compliance with - * the License. You may obtain a copy of the License at - * - * http://www.apache.org/licenses/LICENSE-2.0 - * - * Unless required by applicable law or agreed to in writing, software - * distributed under the License is distributed on an "AS IS" BASIS, - * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - * See the License for the specific language governing permissions and - * limitations under the License. - */ - -import java.io.BufferedReader; -import java.io.IOException; -import java.io.LineNumberReader; -import java.io.Reader; -import java.text.ParseException; -import java.util.ArrayList; - -import org.apache.lucene.analysis.Analyzer; -import org.apache.lucene.util.CharsRef; - -/** - * Parser for the Solr synonyms format. - *
    - *
  1. Blank lines and lines starting with '#' are comments. - *
  2. Explicit mappings match any token sequence on the LHS of "=>" - * and replace with all alternatives on the RHS. These types of mappings - * ignore the expand parameter in the constructor. - * Example: - *
    i-pod, i pod => ipod
    - *
  3. Equivalent synonyms may be separated with commas and give - * no explicit mapping. In this case the mapping behavior will - * be taken from the expand parameter in the constructor. This allows - * the same synonym file to be used in different synonym handling strategies. - * Example: - *
    ipod, i-pod, i pod
    - * - *
  4. Multiple synonym mapping entries are merged. - * Example: - *
    - * foo => foo bar
    - * foo => baz

    - * is equivalent to

    - * foo => foo bar, baz - *
    - *
- * @lucene.experimental - */ -public class SolrSynonymParser extends SynonymMap.Builder { - private final boolean expand; - private final Analyzer analyzer; - - public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) { - super(dedup); - this.expand = expand; - this.analyzer = analyzer; - } - - public void add(Reader in) throws IOException, ParseException { - LineNumberReader br = new LineNumberReader(in); - try { - addInternal(br); - } catch (IllegalArgumentException e) { - ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0); - ex.initCause(e); - throw ex; - } finally { - br.close(); - } - } - - private void addInternal(BufferedReader in) throws IOException { - String line = null; - while ((line = in.readLine()) != null) { - if (line.length() == 0 || line.charAt(0) == '#') { - continue; // ignore empty lines and comments - } - - CharsRef inputs[]; - CharsRef outputs[]; - - // TODO: we could process this more efficiently. - String sides[] = split(line, "=>"); - if (sides.length > 1) { // explicit mapping - if (sides.length != 2) { - throw new IllegalArgumentException("more than one explicit mapping specified on the same line"); - } - String inputStrings[] = split(sides[0], ","); - inputs = new CharsRef[inputStrings.length]; - for (int i = 0; i < inputs.length; i++) { - inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef()); - } - - String outputStrings[] = split(sides[1], ","); - outputs = new CharsRef[outputStrings.length]; - for (int i = 0; i < outputs.length; i++) { - outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef()); - } - } else { - String inputStrings[] = split(line, ","); - inputs = new CharsRef[inputStrings.length]; - for (int i = 0; i < inputs.length; i++) { - inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef()); - } - if (expand) { - outputs = inputs; - } else { - outputs = new CharsRef[1]; - outputs[0] = inputs[0]; - } - } - - // currently we include the term itself in the map, - // and use includeOrig = false always. - // this is how the existing filter does it, but its actually a bug, - // especially if combined with ignoreCase = true - for (int i = 0; i < inputs.length; i++) { - for (int j = 0; j < outputs.length; j++) { - add(inputs[i], outputs[j], false); - } - } - } - } - - private static String[] split(String s, String separator) { - ArrayList list = new ArrayList(2); - StringBuilder sb = new StringBuilder(); - int pos=0, end=s.length(); - while (pos < end) { - if (s.startsWith(separator,pos)) { - if (sb.length() > 0) { - list.add(sb.toString()); - sb=new StringBuilder(); - } - pos+=separator.length(); - continue; - } - - char ch = s.charAt(pos++); - if (ch=='\\') { - sb.append(ch); - if (pos>=end) break; // ERROR, or let it go? - ch = s.charAt(pos++); - } - - sb.append(ch); - } - - if (sb.length() > 0) { - list.add(sb.toString()); - } - - return list.toArray(new String[list.size()]); - } - - private String unescape(String s) { - if (s.indexOf("\\") >= 0) { - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < s.length(); i++) { - char ch = s.charAt(i); - if (ch == '\\' && i < s.length() - 1) { - sb.append(s.charAt(++i)); - } else { - sb.append(ch); - } - } - return sb.toString(); - } - return s; - } -}