1 package org.apache.lucene.analysis.standard;
/*
 * Copyright 2001-2005 The Apache Software Foundation.
 *
 * Licensed under the Apache License, Version 2.0 (the "License");
 * you may not use this file except in compliance with the License.
 * You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */
import java.io.BufferedReader;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.InputStreamReader;
import java.io.OutputStreamWriter;
import java.io.Writer;
import java.net.URL;
import java.net.URLConnection;
import java.text.DateFormat;
import java.util.Date;
import java.util.Locale;
import java.util.SortedSet;
import java.util.TimeZone;
import java.util.TreeSet;
import java.util.regex.Matcher;
import java.util.regex.Pattern;
/**
 * Generates a file containing JFlex macros to accept valid ASCII TLDs
 * (top level domains), for inclusion in JFlex grammars that can accept
 * domain names.
 * <p>
 * The IANA Root Zone Database is queried via HTTP from URL cmdline arg #0, the
 * response is parsed, and the results are written out to a file containing
 * a JFlex macro that will accept all valid ASCII-only TLDs, including punycode
 * forms of internationalized TLDs (output file cmdline arg #1).
 */
public class GenerateJflexTLDMacros {

  /**
   * Command-line entry point.
   * <p>
   * Expects exactly two arguments: the URL of the root zone file and the path
   * of the JFlex macro file to write. With any other argument count, or when
   * the first argument is <code>--help</code> or <code>-help</code>, a usage
   * message is printed to standard error and the JVM exits with status 1.
   *
   * @param args <code>{ZoneFileURL, JFlexOutputFile}</code>
   * @throws Exception if the URL is malformed, or the download or the write fails
   */
  public static void main(String... args) throws Exception {
    if (args.length != 2 || args[0].equals("--help") || args[0].equals("-help")) {
      System.err.println("Cmd line params:");
      // The leading space keeps the class name and the first parameter apart.
      System.err.println("\tjava " + GenerateJflexTLDMacros.class.getName()
                         + " <ZoneFileURL> <JFlexOutputFile>");
      System.exit(1);
    }
    new GenerateJflexTLDMacros(args[0], args[1]).execute();
  }

  /** Platform line separator, used for every line break in the generated file. */
  private static final String NL = System.getProperty("line.separator");

  /** License header written at the top of the generated file, followed by a blank line. */
  private static final String APACHE_LICENSE
    = "/*" + NL
    + " * Copyright 2001-2005 The Apache Software Foundation." + NL
    + " *" + NL
    + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
    + " * you may not use this file except in compliance with the License." + NL
    + " * You may obtain a copy of the License at" + NL
    + " *" + NL
    + " * http://www.apache.org/licenses/LICENSE-2.0" + NL
    + " *" + NL
    + " * Unless required by applicable law or agreed to in writing, software" + NL
    + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
    + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
    + " * See the License for the specific language governing permissions and" + NL
    + " * limitations under the License." + NL
    + " */" + NL
    + NL;

  /** Matches a whole zone-file line of the form <code>tld. NS server</code>; group 1 is the TLD. */
  private static final Pattern TLD_PATTERN_1
    = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");

  /** Matches a whole zone-file line of the form <code>tld. ttl IN NS server</code>; group 1 is the TLD. */
  private static final Pattern TLD_PATTERN_2
    = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");

  /** Location the zone file is read from. */
  private final URL tldFileURL;

  /** Last-modified time reported for the zone file; stays at -1 until the file has been fetched. */
  private long tldFileLastModified = -1L;

  /** File the JFlex macro is written to. */
  private final File outputFile;

  /**
   * Creates a generator that reads the zone file from <code>tldFileURL</code>
   * and writes the macro to <code>outputFile</code>. Nothing is downloaded or
   * written until {@link #execute()} is called.
   *
   * @param tldFileURL URL of the root zone file
   * @param outputFile path of the file to write
   * @throws Exception if <code>tldFileURL</code> cannot be parsed as a URL
   *         (the underlying failure is a <code>java.net.MalformedURLException</code>)
   */
  public GenerateJflexTLDMacros(String tldFileURL, String outputFile)
    throws Exception {
    this.tldFileURL = new URL(tldFileURL);
    this.outputFile = new File(outputFile);
  }

  /**
   * Downloads the IANA Root Zone Database, extracts the ASCII TLDs, then
   * writes a JFlex macro accepting any of them case-insensitively out to
   * the specified output file. A one-line summary is printed to standard
   * error once the file has been written.
   *
   * @throws IOException if there is a problem either downloading the database
   *         or writing out the output file.
   */
  public void execute() throws IOException {
    final SortedSet<String> TLDs = getIANARootZoneDatabase();
    writeOutput(TLDs);
    System.err.println("Wrote " + TLDs.size() + " top level domains to '"
                       + outputFile + "'.");
  }

  /**
   * Downloads the IANA Root Zone Database and collects the TLD of every line
   * that matches one of the two NS-record patterns. Also records the
   * connection's last-modified time for use in the generated file's header.
   *
   * @return downcased sorted set of ASCII TLDs
   * @throws java.io.IOException if there is a problem downloading the database
   */
  private SortedSet<String> getIANARootZoneDatabase() throws IOException {
    final SortedSet<String> TLDs = new TreeSet<String>();
    final URLConnection connection = tldFileURL.openConnection();
    // Ask for a fresh copy rather than a cached one.
    connection.setUseCaches(false);
    connection.addRequestProperty("Cache-Control", "no-cache");
    connection.connect();
    tldFileLastModified = connection.getLastModified();
    final BufferedReader reader = new BufferedReader
      (new InputStreamReader(connection.getInputStream(), "US-ASCII"));
    try {
      String line;
      while (null != (line = reader.readLine())) {
        final String tld = extractTLD(line);
        if (tld != null) {
          TLDs.add(tld); // the set removes repeats of the same TLD
        }
      }
    } finally {
      reader.close(); // release the connection's stream even if reading fails
    }
    return TLDs;
  }

  /**
   * Extracts the TLD from a single zone-file line.
   *
   * @param line one line of the zone file
   * @return the TLD lowercased with {@link Locale#US}, or <code>null</code> if
   *         the line matches neither NS-record pattern
   */
  private static String extractTLD(String line) {
    Matcher matcher = TLD_PATTERN_1.matcher(line);
    if (!matcher.matches()) {
      matcher = TLD_PATTERN_2.matcher(line);
      if (!matcher.matches()) {
        return null;
      }
    }
    return matcher.group(1).toLowerCase(Locale.US);
  }

  /**
   * Writes a file containing a JFlex macro that will accept any of the given
   * TLDs case-insensitively. The file is UTF-8 encoded and starts with the
   * license header and comment lines naming the source URL, the zone file's
   * last-modified time (only when one was reported), the generation time
   * and this class. Both times are formatted in UTC.
   *
   * @param ASCIITLDs The downcased sorted set of top level domains to accept
   * @throws IOException if there is an error writing the output file
   */
  private void writeOutput(SortedSet<String> ASCIITLDs) throws IOException {
    final DateFormat dateFormat = DateFormat.getDateTimeInstance
      (DateFormat.FULL, DateFormat.FULL, Locale.US);
    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
    final Writer writer = new OutputStreamWriter
      (new FileOutputStream(outputFile), "UTF-8");
    try {
      writer.write(APACHE_LICENSE);
      writer.write("// Generated from IANA Root Zone Database <");
      writer.write(tldFileURL.toString());
      writer.write(">");
      writer.write(NL);
      if (tldFileLastModified > 0L) {
        writer.write("// file version from ");
        writer.write(dateFormat.format(new Date(tldFileLastModified)));
        writer.write(NL);
      }
      writer.write("// generated on ");
      writer.write(dateFormat.format(new Date()));
      writer.write(NL);
      writer.write("// by ");
      writer.write(this.getClass().getName());
      writer.write(NL);
      writer.write(NL);
      writer.write("ASCIITLD = \".\" (");
      writer.write(NL);
      // One alternative per line; every alternative after the first is
      // introduced by the alternation operator '|'.
      boolean isFirst = true;
      for (String ASCIITLD : ASCIITLDs) {
        writer.write("\t");
        if (isFirst) {
          isFirst = false;
          writer.write("  ");
        } else {
          writer.write("| ");
        }
        writer.write(getCaseInsensitiveRegex(ASCIITLD));
        writer.write(NL);
      }
      writer.write("\t) \".\"? // Accept trailing root (empty) domain");
      writer.write(NL);
      writer.write(NL);
    } finally {
      writer.close(); // flushes, and releases the file even if a write fails
    }
  }

  /**
   * Returns a regex that will accept the given ASCII TLD case-insensitively.
   * Digits and hyphens are copied through unchanged; every other character
   * <code>c</code> becomes the two-member character class
   * <code>[c</code><i>C</i><code>]</code>, where <i>C</i> is
   * <code>Character.toUpperCase(c)</code>. The input is therefore expected to
   * be lowercase already.
   *
   * @param ASCIITLD The ASCII TLD to generate a regex for
   * @return a regex that will accept the given ASCII TLD case-insensitively
   */
  private String getCaseInsensitiveRegex(String ASCIITLD) {
    StringBuilder builder = new StringBuilder();
    for (int pos = 0 ; pos < ASCIITLD.length() ; ++pos) {
      char ch = ASCIITLD.charAt(pos);
      if (Character.isDigit(ch) || ch == '-') {
        builder.append(ch);
      } else {
        builder.append("[").append(ch).append(Character.toUpperCase(ch)).append("]");
      }
    }
    return builder.toString();
  }
}