2 * Licensed to the Apache Software Foundation (ASF) under one or more
3 * contributor license agreements. See the NOTICE file distributed with
4 * this work for additional information regarding copyright ownership.
5 * The ASF licenses this file to You under the Apache License, Version 2.0
6 * (the "License"); you may not use this file except in compliance with
7 * the License. You may obtain a copy of the License at
9 * http://www.apache.org/licenses/LICENSE-2.0
11 * Unless required by applicable law or agreed to in writing, software
12 * distributed under the License is distributed on an "AS IS" BASIS,
13 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 * See the License for the specific language governing permissions and
15 * limitations under the License.
18 package org.apache.lucene.analysis.compound.hyphenation;
21 import org.xml.sax.XMLReader;
22 import org.xml.sax.InputSource;
23 import org.xml.sax.SAXException;
24 import org.xml.sax.SAXParseException;
25 import org.xml.sax.helpers.DefaultHandler;
26 import org.xml.sax.Attributes;
30 import java.io.FileNotFoundException;
31 import java.io.IOException;
32 import java.net.MalformedURLException;
33 import java.util.ArrayList;
35 import javax.xml.parsers.SAXParserFactory;
38 * A SAX document handler to read and parse hyphenation patterns from an XML
41 * This class has been taken from the Apache FOP project (http://xmlgraphics.apache.org/fop/). It has been slightly modified.
// SAX handler for hyphenation pattern files. It extends DefaultHandler so it can be
// registered with the XMLReader as content handler, error handler and entity resolver,
// and it implements PatternConsumer itself with methods that print what they receive.
// NOTE(review): this copy of the file carries listing line numbers at the start of each
// line and has lost its short lines; the declarations of the other fields used below
// (parser, token, currElement, errMsg, hyphenChar) are not visible here.
43 public class PatternParser extends DefaultHandler implements PatternConsumer {
// Receiver of the parsed data: the SAX callbacks call addClass, addException and
// addPattern on it.
49 PatternConsumer consumer;
// Parts of the exception entry being read; startElement adds String fragments and
// Hyphen objects to it.
53 ArrayList<Object> exception;
// Values assigned to currElement to record which kind of element is being read.
59 static final int ELEM_CLASSES = 1;
61 static final int ELEM_EXCEPTIONS = 2;
63 static final int ELEM_PATTERNS = 3;
65 static final int ELEM_HYPHEN = 4;
/**
 * Creates a pattern parser backed by a freshly created SAX reader.
 *
 * <p>This object is registered with the reader as its content handler, error
 * handler and entity resolver, the token buffer starts out empty and the
 * hyphen character defaults to {@code '-'}. No consumer is assigned by this
 * constructor.
 *
 * @throws HyphenationException declared for callers; see {@link #createParser()}
 *         for how reader creation failures are reported
 */
public PatternParser() throws HyphenationException {
  token = new StringBuilder();
  parser = createParser();
  // One object plays all three SAX roles, so the reader calls back into this class only.
  parser.setContentHandler(this);
  parser.setErrorHandler(this);
  parser.setEntityResolver(this);
  hyphenChar = '-'; // default
}
// Constructor that stores the given consumer in the consumer field.
// NOTE(review): listing line 78 (between the signature and the assignment) and the
// closing brace are absent from this copy. Line 78 presumably delegates to the
// no-argument constructor so that the reader is set up -- confirm against the
// upstream file before relying on this.
77 public PatternParser(PatternConsumer consumer) throws HyphenationException {
79 this.consumer = consumer;
82 public void setConsumer(PatternConsumer consumer) {
83 this.consumer = consumer;
87 * Parses a hyphenation pattern file.
89 * @param filename the filename
90 * @throws HyphenationException In case of an exception while parsing
92 public void parse(String filename) throws HyphenationException {
93 parse(new InputSource(filename));
97 * Parses a hyphenation pattern file.
99 * @param file the pattern file
100 * @throws HyphenationException In case of an exception while parsing
102 public void parse(File file) throws HyphenationException {
104 InputSource src = new InputSource(file.toURL().toExternalForm());
106 } catch (MalformedURLException e) {
107 throw new HyphenationException("Error converting the File '" + file
108 + "' to a URL: " + e.getMessage());
113 * Parses a hyphenation pattern file.
115 * @param source the InputSource for the file
116 * @throws HyphenationException In case of an exception while parsing
118 public void parse(InputSource source) throws HyphenationException {
120 parser.parse(source);
121 } catch (FileNotFoundException fnfe) {
122 throw new HyphenationException("File not found: " + fnfe.getMessage());
123 } catch (IOException ioe) {
124 throw new HyphenationException(ioe.getMessage());
125 } catch (SAXException e) {
126 throw new HyphenationException(errMsg);
131 * Creates a SAX parser using JAXP
133 * @return the created SAX parser
135 static XMLReader createParser() {
137 SAXParserFactory factory = SAXParserFactory.newInstance();
138 factory.setNamespaceAware(true);
139 return factory.newSAXParser().getXMLReader();
140 } catch (Exception e) {
141 throw new RuntimeException("Couldn't create XMLReader: " + e.getMessage());
// Takes text from the front of chars and returns a word built from the token field,
// removing from the buffer the characters it has dealt with.
// NOTE(review): this copy is missing many short lines of the method (the declarations of
// i and word, the else/break branches, the "if (space)" tests, the returns and every
// closing brace). The comments below describe only what the visible lines do; restore
// the method from the upstream file before relying on it.
145 protected String readToken(StringBuffer chars) {
147 boolean space = false;
// First scan: walk the buffer from the start, testing each character for whitespace.
149 for (i = 0; i < chars.length(); i++) {
150 if (Character.isWhitespace(chars.charAt(i))) {
157 // chars.delete(0,i);
// Hand-written form of the commented-out delete(0, i): move the characters from
// index i onwards to the front of the buffer...
158 for (int countr = i; countr < chars.length(); countr++) {
159 chars.setCharAt(countr - i, chars.charAt(countr));
// ...then shorten the buffer by the i characters that were dropped.
161 chars.setLength(chars.length() - i);
// Text already collected in token (from an earlier call) becomes the word.
162 if (token.length() > 0) {
163 word = token.toString();
// Second scan over what is left in the buffer, again testing for whitespace.
169 for (i = 0; i < chars.length(); i++) {
170 if (Character.isWhitespace(chars.charAt(i))) {
// Move the first i characters of the buffer into token...
175 token.append(chars.toString().substring(0, i));
176 // chars.delete(0,i);
// ...and remove them from the buffer with the same shift-and-truncate steps as above.
177 for (int countr = i; countr < chars.length(); countr++) {
178 chars.setCharAt(countr - i, chars.charAt(countr));
180 chars.setLength(chars.length() - i);
182 word = token.toString();
190 protected static String getPattern(String word) {
191 StringBuilder pat = new StringBuilder();
192 int len = word.length();
193 for (int i = 0; i < len; i++) {
194 if (!Character.isDigit(word.charAt(i))) {
195 pat.append(word.charAt(i));
198 return pat.toString();
// Builds a new list (res) from the items of an exception entry, splitting String items
// at the hyphenChar field.
// NOTE(review): several short lines are missing from this copy (what is appended to buf,
// the else branch, the assignment into h, the handling of non-String items, the return
// and the closing braces), so only the visible steps are described.
201 protected ArrayList<Object> normalizeException(ArrayList<?> ex) {
202 ArrayList<Object> res = new ArrayList<Object>();
203 for (int i = 0; i < ex.size(); i++) {
204 Object item = ex.get(i);
// String items are examined character by character.
205 if (item instanceof String) {
206 String str = (String) item;
207 StringBuilder buf = new StringBuilder();
208 for (int j = 0; j < str.length(); j++) {
209 char c = str.charAt(j);
// Characters are compared with the configured hyphen character.
210 if (c != hyphenChar) {
// The text gathered in buf is added to the result as one String...
213 res.add(buf.toString());
// ...followed by a Hyphen built from a one-character string, with null for its other two
// constructor arguments.
215 char[] h = new char[1];
217 // we use here hyphenChar which is not necessarily
218 // the one to be printed
219 res.add(new Hyphen(new String(h), null, null));
// Whatever is still in buf after the character loop is added as a final String.
222 if (buf.length() > 0) {
223 res.add(buf.toString());
232 protected String getExceptionWord(ArrayList<?> ex) {
233 StringBuilder res = new StringBuilder();
234 for (int i = 0; i < ex.size(); i++) {
235 Object item = ex.get(i);
236 if (item instanceof String) {
237 res.append((String) item);
239 if (((Hyphen) item).noBreak != null) {
240 res.append(((Hyphen) item).noBreak);
244 return res.toString();
// Builds a string (il) from a pattern by walking its characters and testing each one for
// being a digit; the result is returned as a String.
// NOTE(review): the statements inside the digit test and its alternative branch are
// missing from this copy, so what exactly is appended to il cannot be read off here.
247 protected static String getInterletterValues(String pat) {
248 StringBuilder il = new StringBuilder();
249 String word = pat + "a"; // add dummy letter to serve as sentinel
250 int len = word.length();
// The loop runs over the pattern plus the sentinel letter added above.
251 for (int i = 0; i < len; i++) {
252 char c = word.charAt(i);
253 if (Character.isDigit(c)) {
260 return il.toString();
264 // EntityResolver methods
// Answers the reader's request for an external entity. When the request refers to the
// hyphenation DTD, the copy stored as the resource "hyphenation.dtd" next to this class
// is returned instead of the external one.
// NOTE(review): the "if (" / ") {" lines around the condition and whatever the method
// returns when the condition is false are missing from this copy.
267 public InputSource resolveEntity(String publicId, String systemId) {
268 // supply the internal hyphenation.dtd if possible
// Case-insensitive match of "hyphenation.dtd" as a whole word anywhere in the system id
// (the '.' is unescaped in the pattern, so it matches any character)...
270 (systemId != null && systemId.matches("(?i).*\\bhyphenation.dtd\\b.*")) ||
// ...or a public id that is exactly "hyphenation-info".
271 ("hyphenation-info".equals(publicId))
273 // System.out.println(this.getClass().getResource("hyphenation.dtd").toExternalForm());
// getResource returns null when the resource is not found, in which case this line fails
// with a NullPointerException.
274 return new InputSource(this.getClass().getResource("hyphenation.dtd").toExternalForm());
280 // ContentHandler methods
284 * @see org.xml.sax.ContentHandler#startElement(java.lang.String,
285 * java.lang.String, java.lang.String, org.xml.sax.Attributes)
// Called at each opening tag; chooses what to do from the element's local name.
// NOTE(review): the last line of the parameter list (the one declaring attrs), the
// closing braces and any statements after the final branch are missing from this copy.
288 public void startElement(String uri, String local, String raw,
// <hyphen-char value="x">: a one-character value replaces the hyphen character.
290 if (local.equals("hyphen-char")) {
291 String h = attrs.getValue("value");
292 if (h != null && h.length() == 1) {
293 hyphenChar = h.charAt(0);
// <classes>, <patterns> and <exceptions> only record which section is being read;
// <exceptions> also starts a fresh exception list.
295 } else if (local.equals("classes")) {
296 currElement = ELEM_CLASSES;
297 } else if (local.equals("patterns")) {
298 currElement = ELEM_PATTERNS;
299 } else if (local.equals("exceptions")) {
300 currElement = ELEM_EXCEPTIONS;
301 exception = new ArrayList<Object>();
// <hyphen pre=".." no=".." post="..">: text collected so far is stored as a String item,
// then a Hyphen built from the three attributes is added.
302 } else if (local.equals("hyphen")) {
303 if (token.length() > 0) {
304 exception.add(token.toString());
306 exception.add(new Hyphen(attrs.getValue("pre"), attrs.getValue("no"),
307 attrs.getValue("post")));
308 currElement = ELEM_HYPHEN;
314 * @see org.xml.sax.ContentHandler#endElement(java.lang.String,
315 * java.lang.String, java.lang.String)
// Called at each closing tag; hands text still held in token to the consumer according
// to the current section.
// NOTE(review): some case labels, break statements, other short statements and the
// closing braces are missing from this copy, so the branches below are incomplete.
318 @SuppressWarnings("unchecked")
319 public void endElement(String uri, String local, String raw) {
// Only act when there is pending text.
321 if (token.length() > 0) {
322 String word = token.toString();
323 switch (currElement) {
// (case label missing here; presumably ELEM_CLASSES -- confirm)
325 consumer.addClass(word);
327 case ELEM_EXCEPTIONS:
// The collected items are normalized, then the consumer gets the plain word and a clone
// of the list instead of the parser's own list.
329 exception = normalizeException(exception);
330 consumer.addException(getExceptionWord(exception),
331 (ArrayList) exception.clone());
// (case label missing here; presumably ELEM_PATTERNS -- confirm)
334 consumer.addPattern(getPattern(word), getInterletterValues(word));
340 if (currElement != ELEM_HYPHEN) {
// Leaving a hyphen element switches the state back to the exceptions section.
344 if (currElement == ELEM_HYPHEN) {
345 currElement = ELEM_EXCEPTIONS;
353 * @see org.xml.sax.ContentHandler#characters(char[], int, int)
// Called with a run of character data; copies it into a buffer and passes each word that
// readToken returns to the consumer according to the current section.
// NOTE(review): some case labels, break statements, other short statements and the
// closing braces are missing from this copy.
355 @SuppressWarnings("unchecked")
357 public void characters(char ch[], int start, int length) {
// Work on a private copy of the reported range, since readToken modifies its argument.
358 StringBuffer chars = new StringBuffer(length);
359 chars.append(ch, start, length);
360 String word = readToken(chars);
// Keep going until readToken returns null.
361 while (word != null) {
362 // System.out.println("\"" + word + "\"");
363 switch (currElement) {
// (case label missing here; presumably ELEM_CLASSES -- confirm)
365 consumer.addClass(word);
367 case ELEM_EXCEPTIONS:
// The collected items are normalized, then the consumer gets the plain word and a clone
// of the list.
369 exception = normalizeException(exception);
370 consumer.addException(getExceptionWord(exception),
371 (ArrayList) exception.clone());
// (case label missing here; presumably ELEM_PATTERNS -- confirm)
375 consumer.addPattern(getPattern(word), getInterletterValues(word));
378 word = readToken(chars);
384 * Returns a string of the location.
// Builds a text from the system id, line number and column number carried by a
// SAXParseException.
// NOTE(review): a few short lines are missing from this copy (the test on index, any
// separators appended between the parts, closing braces).
386 private String getLocationString(SAXParseException ex) {
387 StringBuilder str = new StringBuilder();
389 String systemId = ex.getSystemId();
// The system id is optional; when present, the part after the last '/' is taken.
390 if (systemId != null) {
391 int index = systemId.lastIndexOf('/');
393 systemId = systemId.substring(index + 1);
395 str.append(systemId);
398 str.append(ex.getLineNumber());
400 str.append(ex.getColumnNumber());
402 return str.toString();
404 } // getLocationString(SAXParseException):String
406 // PatternConsumer implementation for testing purposes
407 public void addClass(String c) {
408 System.out.println("class: " + c);
411 public void addException(String w, ArrayList<Object> e) {
412 System.out.println("exception: " + w + " : " + e.toString());
415 public void addPattern(String p, String v) {
416 System.out.println("pattern: " + p + " : " + v);
419 public static void main(String[] args) throws Exception {
420 if (args.length > 0) {
421 PatternParser pp = new PatternParser();