1 package org.apache.lucene.analysis.synonym;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
20 import java.io.BufferedReader;
21 import java.io.IOException;
22 import java.io.LineNumberReader;
23 import java.io.Reader;
24 import java.text.ParseException;
25 import java.util.ArrayList;
27 import org.apache.lucene.analysis.Analyzer;
28 import org.apache.lucene.util.CharsRef;
/**
 * Parser for the Solr synonyms format.
 * <ol>
 *   <li> Blank lines and lines starting with '#' are comments.
 *   <li> Explicit mappings match any token sequence on the LHS of "=>"
 *        and replace with all alternatives on the RHS.  These types of mappings
 *        ignore the expand parameter in the constructor.
 *        Example:
 *        <blockquote>i-pod, i pod => ipod</blockquote>
 *   <li> Equivalent synonyms may be separated with commas and give
 *        no explicit mapping.  In this case the mapping behavior will
 *        be taken from the expand parameter in the constructor.  This allows
 *        the same synonym file to be used in different synonym handling strategies.
 *        Example:
 *        <blockquote>ipod, i-pod, i pod</blockquote>
 *
 *   <li> Multiple synonym mapping entries are merged.
 *        Example:
 *        <blockquote>
 *         foo => foo bar<br>
 *         foo => baz<br><br>
 *         is equivalent to<br><br>
 *         foo => foo bar, baz
 *        </blockquote>
 * </ol>
 * @lucene.experimental
 */
57 public class SolrSynonymParser extends SynonymMap.Builder {
58 private final boolean expand;
59 private final Analyzer analyzer;
/**
 * Creates a parser for synonym rules written in the Solr synonyms format.
 *
 * @param dedup handed unchanged to the {@link SynonymMap.Builder} constructor
 * @param expand controls rules that have no explicit {@code =>} mapping: when {@code true}
 *        every synonym in the rule maps to every synonym in the rule, when {@code false}
 *        every synonym maps to the first one only
 * @param analyzer used to analyze each synonym before it is added to the map
 */
public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
  super(dedup);
  this.expand = expand;
  this.analyzer = analyzer;
}
67 public void add(Reader in) throws IOException, ParseException {
68 LineNumberReader br = new LineNumberReader(in);
71 } catch (IllegalArgumentException e) {
72 ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
80 private void addInternal(BufferedReader in) throws IOException {
82 while ((line = in.readLine()) != null) {
83 if (line.length() == 0 || line.charAt(0) == '#') {
84 continue; // ignore empty lines and comments
90 // TODO: we could process this more efficiently.
91 String sides[] = split(line, "=>");
92 if (sides.length > 1) { // explicit mapping
93 if (sides.length != 2) {
94 throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
96 String inputStrings[] = split(sides[0], ",");
97 inputs = new CharsRef[inputStrings.length];
98 for (int i = 0; i < inputs.length; i++) {
99 inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
102 String outputStrings[] = split(sides[1], ",");
103 outputs = new CharsRef[outputStrings.length];
104 for (int i = 0; i < outputs.length; i++) {
105 outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
108 String inputStrings[] = split(line, ",");
109 inputs = new CharsRef[inputStrings.length];
110 for (int i = 0; i < inputs.length; i++) {
111 inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
116 outputs = new CharsRef[1];
117 outputs[0] = inputs[0];
121 // currently we include the term itself in the map,
122 // and use includeOrig = false always.
123 // this is how the existing filter does it, but its actually a bug,
124 // especially if combined with ignoreCase = true
125 for (int i = 0; i < inputs.length; i++) {
126 for (int j = 0; j < outputs.length; j++) {
127 add(inputs[i], outputs[j], false);
133 private static String[] split(String s, String separator) {
134 ArrayList<String> list = new ArrayList<String>(2);
135 StringBuilder sb = new StringBuilder();
136 int pos=0, end=s.length();
138 if (s.startsWith(separator,pos)) {
139 if (sb.length() > 0) {
140 list.add(sb.toString());
141 sb=new StringBuilder();
143 pos+=separator.length();
147 char ch = s.charAt(pos++);
150 if (pos>=end) break; // ERROR, or let it go?
151 ch = s.charAt(pos++);
157 if (sb.length() > 0) {
158 list.add(sb.toString());
161 return list.toArray(new String[list.size()]);
164 private String unescape(String s) {
165 if (s.indexOf("\\") >= 0) {
166 StringBuilder sb = new StringBuilder();
167 for (int i = 0; i < s.length(); i++) {
168 char ch = s.charAt(i);
169 if (ch == '\\' && i < s.length() - 1) {
170 sb.append(s.charAt(++i));
175 return sb.toString();