1 package org.apache.lucene.analysis.hunspell;
/*
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements. See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License. You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import org.apache.lucene.analysis.CharArrayMap;
import org.apache.lucene.util.Version;

import java.io.BufferedReader;
import java.io.IOException;
import java.io.InputStream;
import java.io.InputStreamReader;
import java.nio.charset.Charset;
import java.nio.charset.CharsetDecoder;
import java.text.ParseException;
import java.util.ArrayList;
import java.util.Arrays;
import java.util.List;
import java.util.Locale;
/**
 * Holds the words, prefixes and suffixes read from a hunspell affix file and one or more
 * hunspell dictionary files, and offers lookups over them by character range.
 */
public class HunspellDictionary {
// Shared HunspellWord instance; presumably used for dictionary entries without flags (verify against readDictionaryFile).
static final HunspellWord NOFLAGS = new HunspellWord();
// Line prefixes recognised while scanning the affix file.
private static final String PREFIX_KEY = "PFX";
private static final String SUFFIX_KEY = "SFX";
private static final String FLAG_KEY = "FLAG";
// Flag type names accepted on the FLAG line of the affix file.
private static final String NUM_FLAG_TYPE = "num";
private static final String UTF8_FLAG_TYPE = "UTF-8";
private static final String LONG_FLAG_TYPE = "long";
// String.format patterns that turn an affix rule condition into a regex:
// prefixes anchor the condition at the start, suffixes at the end.
private static final String PREFIX_CONDITION_REGEX_PATTERN = "%s.*";
private static final String SUFFIX_CONDITION_REGEX_PATTERN = ".*%s";
// Case handling used by the constructor that takes no ignoreCase argument.
private static final boolean IGNORE_CASE_DEFAULT = false;
// Dictionary entries keyed by word; several word forms can share one key.
private CharArrayMap<List<HunspellWord>> words;
// Affix rules keyed by the text they append.
private CharArrayMap<List<HunspellAffix>> prefixes;
private CharArrayMap<List<HunspellAffix>> suffixes;
private FlagParsingStrategy flagParsingStrategy = new SimpleFlagParsingStrategy(); // Default flag parsing strategy
// Passed to every CharArrayMap created by this class.
private boolean ignoreCase = IGNORE_CASE_DEFAULT;
// Lucene version handed to every CharArrayMap created by this class.
private final Version version;
/**
 * Builds a dictionary from one affix stream and one dictionary stream, using the default
 * case handling ({@code IGNORE_CASE_DEFAULT}).
 *
 * @param affix InputStream for reading the hunspell affix file
 * @param dictionary InputStream for reading the hunspell dictionary file
 * @param version Lucene Version
 * @throws IOException Can be thrown while reading from the InputStreams
 * @throws ParseException Can be thrown if the content of the files does not meet expected formats
 */
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version) throws IOException, ParseException {
  // The single-stream overload wraps the dictionary stream into a list before loading.
  this(affix, dictionary, version, IGNORE_CASE_DEFAULT);
}
/**
 * Builds a dictionary from one affix stream and one dictionary stream with an explicit
 * case handling mode.
 *
 * @param affix InputStream for reading the hunspell affix file
 * @param dictionary InputStream for reading the hunspell dictionary file
 * @param version Lucene Version
 * @param ignoreCase If true, dictionary matching will be case insensitive
 * @throws IOException Can be thrown while reading from the InputStreams
 * @throws ParseException Can be thrown if the content of the files does not meet expected formats
 */
public HunspellDictionary(InputStream affix, InputStream dictionary, Version version, boolean ignoreCase) throws IOException, ParseException {
  // A one-element list lets the multi-dictionary constructor do all the work.
  this(affix, Arrays.<InputStream>asList(dictionary), version, ignoreCase);
}
88 * Creates a new HunspellDictionary containing the information read from the provided InputStreams to hunspell affix
89 * and dictionary files
91 * @param affix InputStream for reading the hunspell affix file
92 * @param dictionaries InputStreams for reading the hunspell dictionary file
93 * @param version Lucene Version
94 * @param ignoreCase If true, dictionary matching will be case insensitive
95 * @throws IOException Can be thrown while reading from the InputStreams
96 * @throws ParseException Can be thrown if the content of the files does not meet expected formats
98 public HunspellDictionary(InputStream affix, List<InputStream> dictionaries, Version version, boolean ignoreCase) throws IOException, ParseException {
99 this.version = version;
100 this.ignoreCase = ignoreCase;
101 String encoding = getDictionaryEncoding(affix);
102 CharsetDecoder decoder = getJavaEncoding(encoding);
103 readAffixFile(affix, decoder);
104 words = new CharArrayMap<List<HunspellWord>>(version, 65535 /* guess */, this.ignoreCase);
105 for (InputStream dictionary : dictionaries) {
106 readDictionaryFile(dictionary, decoder);
111 * Looks up HunspellWords that match the String created from the given char array, offset and length
113 * @param word Char array to generate the String from
114 * @param offset Offset in the char array that the String starts at
115 * @param length Length from the offset that the String is
116 * @return List of HunspellWords that match the generated String, or {@code null} if none are found
118 public List<HunspellWord> lookupWord(char word[], int offset, int length) {
119 return words.get(word, offset, length);
123 * Looks up HunspellAffix prefixes that have an append that matches the String created from the given char array, offset and length
125 * @param word Char array to generate the String from
126 * @param offset Offset in the char array that the String starts at
127 * @param length Length from the offset that the String is
128 * @return List of HunspellAffix prefixes with an append that matches the String, or {@code null} if none are found
130 public List<HunspellAffix> lookupPrefix(char word[], int offset, int length) {
131 return prefixes.get(word, offset, length);
135 * Looks up HunspellAffix suffixes that have an append that matches the String created from the given char array, offset and length
137 * @param word Char array to generate the String from
138 * @param offset Offset in the char array that the String starts at
139 * @param length Length from the offset that the String is
140 * @return List of HunspellAffix suffixes with an append that matches the String, or {@code null} if none are found
142 public List<HunspellAffix> lookupSuffix(char word[], int offset, int length) {
143 return suffixes.get(word, offset, length);
147 * Reads the affix file through the provided InputStream, building up the prefix and suffix maps
149 * @param affixStream InputStream to read the content of the affix file from
150 * @param decoder CharsetDecoder to decode the content of the file
151 * @throws IOException Can be thrown while reading from the InputStream
153 private void readAffixFile(InputStream affixStream, CharsetDecoder decoder) throws IOException {
154 prefixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
155 suffixes = new CharArrayMap<List<HunspellAffix>>(version, 8, ignoreCase);
157 BufferedReader reader = new BufferedReader(new InputStreamReader(affixStream, decoder));
159 while ((line = reader.readLine()) != null) {
160 if (line.startsWith(PREFIX_KEY)) {
161 parseAffix(prefixes, line, reader, PREFIX_CONDITION_REGEX_PATTERN);
162 } else if (line.startsWith(SUFFIX_KEY)) {
163 parseAffix(suffixes, line, reader, SUFFIX_CONDITION_REGEX_PATTERN);
164 } else if (line.startsWith(FLAG_KEY)) {
165 // Assume that the FLAG line comes before any prefix or suffixes
166 // Store the strategy so it can be used when parsing the dic file
167 flagParsingStrategy = getFlagParsingStrategy(line);
174 * Parses a specific affix rule putting the result into the provided affix map
176 * @param affixes Map where the result of the parsing will be put
177 * @param header Header line of the affix rule
178 * @param reader BufferedReader to read the content of the rule from
179 * @param conditionPattern {@link String#format(String, Object...)} pattern to be used to generate the condition regex
181 * @throws IOException Can be thrown while reading the rule
183 private void parseAffix(CharArrayMap<List<HunspellAffix>> affixes,
185 BufferedReader reader,
186 String conditionPattern) throws IOException {
187 String args[] = header.split("\\s+");
189 boolean crossProduct = args[2].equals("Y");
191 int numLines = Integer.parseInt(args[3]);
192 for (int i = 0; i < numLines; i++) {
193 String line = reader.readLine();
194 String ruleArgs[] = line.split("\\s+");
196 HunspellAffix affix = new HunspellAffix();
198 affix.setFlag(flagParsingStrategy.parseFlag(ruleArgs[1]));
199 affix.setStrip(ruleArgs[2].equals("0") ? "" : ruleArgs[2]);
201 String affixArg = ruleArgs[3];
203 int flagSep = affixArg.lastIndexOf('/');
205 char appendFlags[] = flagParsingStrategy.parseFlags(affixArg.substring(flagSep + 1));
206 Arrays.sort(appendFlags);
207 affix.setAppendFlags(appendFlags);
208 affix.setAppend(affixArg.substring(0, flagSep));
210 affix.setAppend(affixArg);
213 String condition = ruleArgs[4];
214 affix.setCondition(condition, String.format(conditionPattern, condition));
215 affix.setCrossProduct(crossProduct);
217 List<HunspellAffix> list = affixes.get(affix.getAppend());
219 list = new ArrayList<HunspellAffix>();
220 affixes.put(affix.getAppend(), list);
228 * Parses the encoding specificed in the affix file readable through the provided InputStream
230 * @param affix InputStream for reading the affix file
231 * @return Encoding specified in the affix file
232 * @throws IOException Can be thrown while reading from the InputStream
233 * @throws ParseException Thrown if the first non-empty non-comment line read from the file does not adhere to the format {@code SET <encoding>}
235 private String getDictionaryEncoding(InputStream affix) throws IOException, ParseException {
236 final StringBuilder encoding = new StringBuilder();
238 encoding.setLength(0);
240 while ((ch = affix.read()) >= 0) {
245 encoding.append((char)ch);
249 encoding.length() == 0 || encoding.charAt(0) == '#' ||
250 // this test only at the end as ineffective but would allow lines only containing spaces:
251 encoding.toString().trim().length() == 0
254 throw new ParseException("Unexpected end of affix file.", 0);
258 if ("SET ".equals(encoding.substring(0, 4))) {
259 // cleanup the encoding string, too (whitespace)
260 return encoding.substring(4).trim();
262 throw new ParseException("The first non-comment line in the affix file must "+
263 "be a 'SET charset', was: '" + encoding +"'", 0);
268 * Retrieves the CharsetDecoder for the given encoding. Note, This isn't perfect as I think ISCII-DEVANAGARI and
269 * MICROSOFT-CP1251 etc are allowed...
271 * @param encoding Encoding to retrieve the CharsetDecoder for
272 * @return CharSetDecoder for the given encoding
274 private CharsetDecoder getJavaEncoding(String encoding) {
275 Charset charset = Charset.forName(encoding);
276 return charset.newDecoder();
280 * Determines the appropriate {@link FlagParsingStrategy} based on the FLAG definiton line taken from the affix file
282 * @param flagLine Line containing the flag information
283 * @return FlagParsingStrategy that handles parsing flags in the way specified in the FLAG definiton
285 private FlagParsingStrategy getFlagParsingStrategy(String flagLine) {
286 String flagType = flagLine.substring(5);
288 if (NUM_FLAG_TYPE.equals(flagType)) {
289 return new NumFlagParsingStrategy();
290 } else if (UTF8_FLAG_TYPE.equals(flagType)) {
291 return new SimpleFlagParsingStrategy();
292 } else if (LONG_FLAG_TYPE.equals(flagType)) {
293 return new DoubleASCIIFlagParsingStrategy();
296 throw new IllegalArgumentException("Unknown flag type: " + flagType);
300 * Reads the dictionary file through the provided InputStream, building up the words map
302 * @param dictionary InputStream to read the dictionary file through
303 * @param decoder CharsetDecoder used to decode the contents of the file
304 * @throws IOException Can be thrown while reading from the file
306 private void readDictionaryFile(InputStream dictionary, CharsetDecoder decoder) throws IOException {
307 BufferedReader reader = new BufferedReader(new InputStreamReader(dictionary, decoder));
308 // TODO: don't create millions of strings.
309 String line = reader.readLine(); // first line is number of entries
310 int numEntries = Integer.parseInt(line);
312 // TODO: the flags themselves can be double-chars (long) or also numeric
313 // either way the trick is to encode them as char... but they must be parsed differently
314 while ((line = reader.readLine()) != null) {
316 HunspellWord wordForm;
318 int flagSep = line.lastIndexOf('/');
323 // note, there can be comments (morph description) after a flag.
324 // we should really look for any whitespace
325 int end = line.indexOf('\t', flagSep);
330 wordForm = new HunspellWord(flagParsingStrategy.parseFlags(line.substring(flagSep + 1, end)));
331 Arrays.sort(wordForm.getFlags());
332 entry = line.substring(0, flagSep);
334 entry = entry.toLowerCase(Locale.ENGLISH);
338 List<HunspellWord> entries = words.get(entry);
339 if (entries == null) {
340 entries = new ArrayList<HunspellWord>();
341 words.put(entry, entries);
343 entries.add(wordForm);
347 public Version getVersion() {
352 * Abstraction of the process of parsing flags taken from the affix and dic files
354 private static abstract class FlagParsingStrategy {
357 * Parses the given String into a single flag
359 * @param rawFlag String to parse into a flag
360 * @return Parsed flag
362 char parseFlag(String rawFlag) {
363 return parseFlags(rawFlag)[0];
367 * Parses the given String into multiple flags
369 * @param rawFlags String to parse into flags
370 * @return Parsed flags
372 abstract char[] parseFlags(String rawFlags);
376 * Simple implementation of {@link FlagParsingStrategy} that treats the chars in each String as a individual flags.
377 * Can be used with both the ASCII and UTF-8 flag types.
379 private static class SimpleFlagParsingStrategy extends FlagParsingStrategy {
383 public char[] parseFlags(String rawFlags) {
384 return rawFlags.toCharArray();
389 * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded in its numerical form. In the case
390 * of multiple flags, each number is separated by a comma.
392 private static class NumFlagParsingStrategy extends FlagParsingStrategy {
396 public char[] parseFlags(String rawFlags) {
397 String[] rawFlagParts = rawFlags.trim().split(",");
398 char[] flags = new char[rawFlagParts.length];
400 for (int i = 0; i < rawFlagParts.length; i++) {
401 // note, removing the trailing X/leading I for nepali... what is the rule here?!
402 flags[i] = (char) Integer.parseInt(rawFlagParts[i].replaceAll("[^0-9]", ""));
410 * Implementation of {@link FlagParsingStrategy} that assumes each flag is encoded as two ASCII characters whose codes
411 * must be combined into a single character.
415 private static class DoubleASCIIFlagParsingStrategy extends FlagParsingStrategy {
420 public char[] parseFlags(String rawFlags) {
421 if (rawFlags.length() == 0) {
425 StringBuilder builder = new StringBuilder();
426 for (int i = 0; i < rawFlags.length(); i+=2) {
427 char cookedFlag = (char) ((int) rawFlags.charAt(i) + (int) rawFlags.charAt(i + 1));
428 builder.append(cookedFlag);
431 char flags[] = new char[builder.length()];
432 builder.getChars(0, builder.length(), flags, 0);
437 public boolean isIgnoreCase() {