lucene-java-3.4.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/synonym/SolrSynonymParser.java

   1 package org.apache.lucene.analysis.synonym;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import java.io.BufferedReader;
  21 import java.io.IOException;
  22 import java.io.LineNumberReader;
  23 import java.io.Reader;
  24 import java.text.ParseException;
  25 import java.util.ArrayList;
  26
  27 import org.apache.lucene.analysis.Analyzer;
  28 import org.apache.lucene.util.CharsRef;
  29
  30 /**
  31  * Parser for the Solr synonyms format.
  32  * <ol>
  33  *   <li> Blank lines and lines starting with '#' are comments.
  34  *   <li> Explicit mappings match any token sequence on the LHS of "=>"
  35  *        and replace with all alternatives on the RHS.  These types of mappings
  36  *        ignore the expand parameter in the constructor.
  37  *        Example:
  38  *        <blockquote>i-pod, i pod => ipod</blockquote>
  39  *   <li> Equivalent synonyms may be separated with commas and give
  40  *        no explicit mapping.  In this case the mapping behavior will
  41  *        be taken from the expand parameter in the constructor.  This allows
  42  *        the same synonym file to be used in different synonym handling strategies.
  43  *        Example:
  44  *        <blockquote>ipod, i-pod, i pod</blockquote>
  45  *
  46  *   <li> Multiple synonym mapping entries are merged.
  47  *        Example:
  48  *        <blockquote>
  49  *         foo => foo bar<br>
  50  *         foo => baz<br><br>
  51  *         is equivalent to<br><br>
  52  *         foo => foo bar, baz
  53  *        </blockquote>
  54  *  </ol>
  55  * @lucene.experimental
  56  */
  57 public class SolrSynonymParser extends SynonymMap.Builder {
  58   private final boolean expand;
  59   private final Analyzer analyzer;
  60
  61   public SolrSynonymParser(boolean dedup, boolean expand, Analyzer analyzer) {
  62     super(dedup);
  63     this.expand = expand;
  64     this.analyzer = analyzer;
  65   }
  66
  67   public void add(Reader in) throws IOException, ParseException {
  68     LineNumberReader br = new LineNumberReader(in);
  69     try {
  70       addInternal(br);
  71     } catch (IllegalArgumentException e) {
  72       ParseException ex = new ParseException("Invalid synonym rule at line " + br.getLineNumber(), 0);
  73       ex.initCause(e);
  74       throw ex;
  75     } finally {
  76       br.close();
  77     }
  78   }
  79
  80   private void addInternal(BufferedReader in) throws IOException {
  81     String line = null;
  82     while ((line = in.readLine()) != null) {
  83       if (line.length() == 0 || line.charAt(0) == '#') {
  84         continue; // ignore empty lines and comments
  85       }
  86
  87       CharsRef inputs[];
  88       CharsRef outputs[];
  89
  90       // TODO: we could process this more efficiently.
  91       String sides[] = split(line, "=>");
  92       if (sides.length > 1) { // explicit mapping
  93         if (sides.length != 2) {
  94           throw new IllegalArgumentException("more than one explicit mapping specified on the same line");
  95         }
  96         String inputStrings[] = split(sides[0], ",");
  97         inputs = new CharsRef[inputStrings.length];
  98         for (int i = 0; i < inputs.length; i++) {
  99           inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
 100         }
 101
 102         String outputStrings[] = split(sides[1], ",");
 103         outputs = new CharsRef[outputStrings.length];
 104         for (int i = 0; i < outputs.length; i++) {
 105           outputs[i] = analyze(analyzer, unescape(outputStrings[i]).trim(), new CharsRef());
 106         }
 107       } else {
 108         String inputStrings[] = split(line, ",");
 109         inputs = new CharsRef[inputStrings.length];
 110         for (int i = 0; i < inputs.length; i++) {
 111           inputs[i] = analyze(analyzer, unescape(inputStrings[i]).trim(), new CharsRef());
 112         }
 113         if (expand) {
 114           outputs = inputs;
 115         } else {
 116           outputs = new CharsRef[1];
 117           outputs[0] = inputs[0];
 118         }
 119       }
 120
 121       // currently we include the term itself in the map,
 122       // and use includeOrig = false always.
 123       // this is how the existing filter does it, but its actually a bug,
 124       // especially if combined with ignoreCase = true
 125       for (int i = 0; i < inputs.length; i++) {
 126         for (int j = 0; j < outputs.length; j++) {
 127           add(inputs[i], outputs[j], false);
 128         }
 129       }
 130     }
 131   }
 132
 133   private static String[] split(String s, String separator) {
 134     ArrayList<String> list = new ArrayList<String>(2);
 135     StringBuilder sb = new StringBuilder();
 136     int pos=0, end=s.length();
 137     while (pos < end) {
 138       if (s.startsWith(separator,pos)) {
 139         if (sb.length() > 0) {
 140           list.add(sb.toString());
 141           sb=new StringBuilder();
 142         }
 143         pos+=separator.length();
 144         continue;
 145       }
 146
 147       char ch = s.charAt(pos++);
 148       if (ch=='\\') {
 149         sb.append(ch);
 150         if (pos>=end) break;  // ERROR, or let it go?
 151         ch = s.charAt(pos++);
 152       }
 153
 154       sb.append(ch);
 155     }
 156
 157     if (sb.length() > 0) {
 158       list.add(sb.toString());
 159     }
 160
 161     return list.toArray(new String[list.size()]);
 162   }
 163
 164   private String unescape(String s) {
 165     if (s.indexOf("\\") >= 0) {
 166       StringBuilder sb = new StringBuilder();
 167       for (int i = 0; i < s.length(); i++) {
 168         char ch = s.charAt(i);
 169         if (ch == '\\' && i < s.length() - 1) {
 170           sb.append(s.charAt(++i));
 171         } else {
 172           sb.append(ch);
 173         }
 174       }
 175       return sb.toString();
 176     }
 177     return s;
 178   }
 179 }