lucene-java-3.5.0/lucene/contrib/analyzers/common/src/java/org/apache/lucene/analysis/hunspell/HunspellDictionary.java

   1 package org.apache.lucene.analysis.hunspell;
   2
   3 /**
   4  * Licensed to the Apache Software Foundation (ASF) under one or more
   5  * contributor license agreements.  See the NOTICE file distributed with
   6  * this work for additional information regarding copyright ownership.
   7  * The ASF licenses this file to You under the Apache License, Version 2.0
   8  * (the "License"); you may not use this file except in compliance with
   9  * the License.  You may obtain a copy of the License at
  10  *
  11  *     http://www.apache.org/licenses/LICENSE-2.0
  12  *
  13  * Unless required by applicable law or agreed to in writing, software
  14  * distributed under the License is distributed on an "AS IS" BASIS,
  15  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  16  * See the License for the specific language governing permissions and
  17  * limitations under the License.
  18  */
  19
  20 import org.apache.lucene.analysis.CharArrayMap;
  21 import org.apache.lucene.util.Version;
  22
  23 import java.io.*;
  24 import java.nio.charset.Charset;
  25 import java.nio.charset.CharsetDecoder;
  26 import java.text.ParseException;
  27 import java.util.ArrayList;
  28 import java.util.Arrays;
  29 import java.util.List;
  30 import java.util.Locale;
  31
  32 public class HunspellDictionary {
  33
  /** Shared entry used for dictionary words that are listed without any flags. */
  static final HunspellWord NOFLAGS = new HunspellWord();

  // Keywords that start the affix-file lines this class reacts to
  private static final String PREFIX_KEY = "PFX";
  private static final String SUFFIX_KEY = "SFX";
  private static final String FLAG_KEY = "FLAG";

  // Flag type names that may follow the FLAG keyword; each one selects a FlagParsingStrategy
  private static final String NUM_FLAG_TYPE = "num";
  private static final String UTF8_FLAG_TYPE = "UTF-8";
  private static final String LONG_FLAG_TYPE = "long";

  // String.format templates that turn an affix condition into a regex:
  // the condition followed by any characters (prefix) or preceded by any characters (suffix)
  private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
  private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";

  // Case handling used by the constructor that takes no ignoreCase argument
  private static final boolean IGNORE_CASE_DEFAULT = false;

  // Dictionary entries keyed by word; populated by readDictionaryFile
  private CharArrayMap<List<HunspellWord>> words;
  // Affix rules keyed by the text they append; populated by readAffixFile
  private CharArrayMap<List<HunspellAffix>> prefixes;
  private CharArrayMap<List<HunspellAffix>> suffixes;

  private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
  private boolean ignoreCase = IGNORE_CASE_DEFAULT;

  // Lucene version passed to every CharArrayMap created here and returned by getVersion()
  private final Version version;
  57
  /**
   * Builds a HunspellDictionary from one affix stream and one dictionary stream, using the default case handling
   * ({@code IGNORE_CASE_DEFAULT}, i.e. case sensitive matching).
   *
   * @param affix InputStream for reading the hunspell affix file
   * @param dictionary InputStream for reading the hunspell dictionary file
   * @param version Lucene Version
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
    // Goes through the single-stream overload, which wraps the stream in a one-element list
    this(affix, dictionary, version, IGNORE_CASE_DEFAULT);
  }
  71
  /**
   * Builds a HunspellDictionary from one affix stream and one dictionary stream with an explicit case handling
   * choice.  The single dictionary stream is wrapped in a one-element list and handed to the multi-dictionary
   * constructor.
   *
   * @param affix InputStream for reading the hunspell affix file
   * @param dictionary InputStream for reading the hunspell dictionary file
   * @param version Lucene Version
   * @param ignoreCase If true, dictionary matching will be case insensitive
   * @throws IOException Can be thrown while reading from the InputStreams
   * @throws ParseException Can be thrown if the content of the files does not meet expected formats
   */
  public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
    this(affix, Arrays.<InputStream>asList(dictionary), version, ignoreCase);
  }
  86
  87   /**
  88    * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
  89    * and dictionary files
  90    *
  91    * @param affix InputStream for reading the hunspell affix file
  92    * @param dictionaries InputStreams for reading the hunspell dictionary file
  93    * @param version Lucene Version
  94    * @param ignoreCase If true, dictionary matching will be case insensitive
  95    * @throws IOException Can be thrown while reading from the InputStreams
  96    * @throws ParseException Can be thrown if the content of the files does not meet expected formats
  97    */
  98   public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
  99     this.version = version;
 100     this.ignoreCase = ignoreCase;
 101     String encoding = getDictionaryEncoding(affix);
 102     CharsetDecoder decoder = getJavaEncoding(encoding);
 103     readAffixFile(affix, decoder);
 104     words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
 105     for (InputStream dictionary : dictionaries) {
 106       readDictionaryFile(dictionary, decoder);
 107     }
 108   }
 109
 110   /**
 111    * Looks up HunspellWords that match the String created from the given char array, offset and length
 112    *
 113    * @param word Char array to generate the String from
 114    * @param offset Offset in the char array that the String starts at
 115    * @param length Length from the offset that the String is
 116    * @return List of HunspellWords that match the generated String, or {@code null} if none are found
 117    */
 118   public List<HunspellWord> lookupWord(char word[], int offset, int length) {
 119     return words.get(word, offset, length);
 120   }
 121
 122   /**
 123    * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
 124    *
 125    * @param word Char array to generate the String from
 126    * @param offset Offset in the char array that the String starts at
 127    * @param length Length from the offset that the String is
 128    * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
 129    */
 130   public List<HunspellAffix> lookupPrefix(char word[], int offset, int length) {
 131     return prefixes.get(word, offset, length);
 132   }
 133
 134   /**
 135    * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
 136    *
 137    * @param word Char array to generate the String from
 138    * @param offset Offset in the char array that the String starts at
 139    * @param length Length from the offset that the String is
 140    * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
 141    */
 142   public List<HunspellAffix> lookupSuffix(char word[], int offset, int length) {
 143     return suffixes.get(word, offset, length);
 144   }
 145
 146   /**
 147    * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
 148    *
 149    * @param affixStream InputStream to read the content of the affix file from
 150    * @param decoder CharsetDecoder to decode the content of the file
 151    * @throws IOException Can be thrown while reading from the InputStream
 152    */
 153   private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
 154     prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
 155     suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
 156
 157     BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
 158     String line = null;
 159     while ((line = reader.readLine()) != null) {
 160       if (line.startsWith(PREFIX_KEY)) {
 161         parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
 162       } else if (line.startsWith(SUFFIX_KEY)) {
 163         parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
 164       } else if (line.startsWith(FLAG_KEY)) {
 165         // Assume that the FLAG line comes before any prefix or suffixes
 166         // Store the strategy so it can be used when parsing the dic file
 167         flagParsingStrategy = getFlagParsingStrategy(line);
 168       }
 169     }
 170     reader.close();
 171   }
 172
 173   /**
 174    * Parses a specific affix rule putting the result into the provided affix map
 175    *
 176    * @param affixes Map where the result of the parsing will be put
 177    * @param header Header line of the affix rule
 178    * @param reader BufferedReader to read the content of the rule from
 179    * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
 180    *                         pattern
 181    * @throws IOException Can be thrown while reading the rule
 182    */
 183   private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
 184                           String header,
 185                           BufferedReader reader,
 186                           String conditionPattern) throws IOException {
 187     String args[] = header.split("\\s+");
 188
 189     boolean crossProduct = args[2].equals("Y");
 190
 191     int numLines = Integer.parseInt(args[3]);
 192     for (int i = 0; i < numLines; i++) {
 193       String line = reader.readLine();
 194       String ruleArgs[] = line.split("\\s+");
 195
 196       HunspellAffix affix = new HunspellAffix();
 197
 198       affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
 199       affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);
 200
 201       String affixArg = ruleArgs[3];
 202
 203       int flagSep = affixArg.lastIndexOf('/');
 204       if (flagSep != -1) {
 205         char appendFlags[] = flagParsingStrategy.parseFlags(affixArg.substring(flagSep + 1));
 206         Arrays.sort(appendFlags);
 207         affix.setAppendFlags(appendFlags);
 208         affix.setAppend(affixArg.substring(0, flagSep));
 209       } else {
 210         affix.setAppend(affixArg);
 211       }
 212
 213       String condition = ruleArgs[4];
 214       affix.setCondition(condition, String.format(conditionPattern, condition));
 215       affix.setCrossProduct(crossProduct);
 216
 217       List<HunspellAffix> list = affixes.get(affix.getAppend());
 218       if (list == null) {
 219         list = new ArrayList<HunspellAffix>();
 220         affixes.put(affix.getAppend(), list);
 221       }
 222
 223       list.add(affix);
 224     }
 225   }
 226
 227   /**
 228    * Parses the encoding specificed in the affix file readable through the provided InputStream
 229    *
 230    * @param affix InputStream for reading the affix file
 231    * @return Encoding specified in the affix file
 232    * @throws IOException Can be thrown while reading from the InputStream
 233    * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}
 234    */
 235   private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
 236     final StringBuilder encoding = new StringBuilder();
 237     for (;;) {
 238       encoding.setLength(0);
 239       int ch;
 240       while ((ch = affix.read()) >= 0) {
 241         if (ch == '\n') {
 242           break;
 243         }
 244         if (ch != '\r') {
 245           encoding.append((char)ch);
 246         }
 247       }
 248       if (
 249           encoding.length() == 0 || encoding.charAt(0) == '#' ||
 250           // this test only at the end as ineffective but would allow lines only containing spaces:
 251           encoding.toString().trim().length() == 0
 252       ) {
 253         if (ch < 0) {
 254           throw new ParseException("Unexpected end of affix file.", 0);
 255         }
 256         continue;
 257       }
 258       if ("SET ".equals(encoding.substring(0, 4))) {
 259         // cleanup the encoding string, too (whitespace)
 260         return encoding.substring(4).trim();
 261       }
 262       throw new ParseException("The first non-comment line in the affix file must "+
 263           "be a 'SET charset', was: '" + encoding +"'", 0);
 264     }
 265   }
 266
 267   /**
 268    * Retrieves the CharsetDecoder for the given encoding.  Note, This isn't perfect as I think ISCII-DEVANAGARI and
 269    * MICROSOFT-CP1251 etc are allowed...
 270    *
 271    * @param encoding Encoding to retrieve the CharsetDecoder for
 272    * @return CharSetDecoder for the given encoding
 273    */
 274   private CharsetDecoder getJavaEncoding(String encoding) {
 275     Charset charset = Charset.forName(encoding);
 276     return charset.newDecoder();
 277   }
 278
 279   /**
 280    * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definiton line taken from the affix file
 281    *
 282    * @param flagLine Line containing the flag information
 283    * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definiton
 284    */
 285   private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
 286     String flagType = flagLine.substring(5);
 287
 288     if (NUM_FLAG_TYPE.equals(flagType)) {
 289       return new NumFlagParsingStrategy();
 290     } else if (UTF8_FLAG_TYPE.equals(flagType)) {
 291       return new SimpleFlagParsingStrategy();
 292     } else if (LONG_FLAG_TYPE.equals(flagType)) {
 293       return new DoubleASCIIFlagParsingStrategy();
 294     }
 295
 296     throw new IllegalArgumentException("Unknown flag type: " + flagType);
 297   }
 298
 299   /**
 300    * Reads the dictionary file through the provided InputStream, building up the words map
 301    *
 302    * @param dictionary InputStream to read the dictionary file through
 303    * @param decoder CharsetDecoder used to decode the contents of the file
 304    * @throws IOException Can be thrown while reading from the file
 305    */
 306   private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException {
 307     BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
 308     // TODO: don't create millions of strings.
 309     String line = reader.readLine(); // first line is number of entries
 310     int numEntries = Integer.parseInt(line);
 311
 312     // TODO: the flags themselves can be double-chars (long) or also numeric
 313     // either way the trick is to encode them as char... but they must be parsed differently
 314     while ((line = reader.readLine()) != null) {
 315       String entry;
 316       HunspellWord wordForm;
 317
 318       int flagSep = line.lastIndexOf('/');
 319       if (flagSep == -1) {
 320         wordForm = NOFLAGS;
 321         entry = line;
 322       } else {
 323         // note, there can be comments (morph description) after a flag.
 324         // we should really look for any whitespace
 325         int end = line.indexOf('\t', flagSep);
 326         if (end == -1)
 327           end = line.length();
 328
 329
 330         wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
 331         Arrays.sort(wordForm.getFlags());
 332         entry = line.substring(0, flagSep);
 333         if(ignoreCase) {
 334           entry = entry.toLowerCase(Locale.ENGLISH);
 335         }
 336       }
 337
 338       List<HunspellWord> entries = words.get(entry);
 339       if (entries == null) {
 340         entries = new ArrayList<HunspellWord>();
 341         words.put(entry, entries);
 342       }
 343       entries.add(wordForm);
 344     }
 345   }
 346
 347   public Version getVersion() {
 348     return version;
 349   }
 350
 351   /**
 352    * Abstraction of the process of parsing flags taken from the affix and dic files
 353    */
 354   private static abstract class FlagParsingStrategy {
 355
 356     /**
 357      * Parses the given String into a single flag
 358      *
 359      * @param rawFlag String to parse into a flag
 360      * @return Parsed flag
 361      */
 362     char parseFlag(String rawFlag) {
 363       return parseFlags(rawFlag)[0];
 364     }
 365
 366     /**
 367      * Parses the given String into multiple flags
 368      *
 369      * @param rawFlags String to parse into flags
 370      * @return Parsed flags
 371      */
 372     abstract char[] parseFlags(String rawFlags);
 373   }
 374
 375   /**
 376    * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags.
 377    * Can be used with both the ASCII and UTF-8 flag types.
 378    */
 379   private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
 380     /**
 381      * {@inheritDoc}
 382      */
 383     public char[] parseFlags(String rawFlags) {
 384       return rawFlags.toCharArray();
 385     }
 386   }
 387
 388   /**
 389    * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form.  In the case
 390    * of multiple flags, each number is separated by a comma.
 391    */
 392   private static class NumFlagParsingStrategy extends FlagParsingStrategy {
 393     /**
 394      * {@inheritDoc}
 395      */
 396     public char[] parseFlags(String rawFlags) {
 397       String[] rawFlagParts = rawFlags.trim().split(",");
 398       char[] flags = new char[rawFlagParts.length];
 399
 400       for (int i = 0; i < rawFlagParts.length; i++) {
 401         // note, removing the trailing X/leading I for nepali... what is the rule here?!
 402         flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", ""));
 403       }
 404
 405       return flags;
 406     }
 407   }
 408
 409   /**
 410    * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
 411    * must be combined into a single character.
 412    *
 413    * TODO (rmuir) test
 414    */
 415   private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {
 416
 417     /**
 418      * {@inheritDoc}
 419      */
 420     public char[] parseFlags(String rawFlags) {
 421       if (rawFlags.length() == 0) {
 422         return new char[0];
 423       }
 424
 425       StringBuilder builder = new StringBuilder();
 426       for (int i = 0; i < rawFlags.length(); i+=2) {
 427         char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));
 428         builder.append(cookedFlag);
 429       }
 430
 431       char flags[] = new char[builder.length()];
 432       builder.getChars(0, builder.length(), flags, 0);
 433       return flags;
 434     }
 435   }
 436
 437   public boolean isIgnoreCase() {
 438     return ignoreCase;
 439   }
 440 }