--- /dev/null
+package org.apache.lucene.analysis.standard;
+
+/*
+ * Copyright 2001-2005 The Apache Software Foundation.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+import java.io.BufferedReader;
+import java.io.File;
+import java.io.FileOutputStream;
+import java.io.IOException;
+import java.io.InputStreamReader;
+import java.io.OutputStreamWriter;
+import java.io.Writer;
+import java.net.URL;
+import java.net.URLConnection;
+import java.text.DateFormat;
+import java.util.Date;
+import java.util.Locale;
+import java.util.SortedSet;
+import java.util.TimeZone;
+import java.util.TreeSet;
+import java.util.regex.Matcher;
+import java.util.regex.Pattern;
+
+/**
+ * Generates a file containing JFlex macros to accept valid ASCII TLDs
+ * (top level domains), for inclusion in JFlex grammars that can accept
+ * domain names.
+ * <p>
+ * The IANA Root Zone Database is fetched from the URL given as cmdline arg #0,
+ * the response is parsed, and the results are written out to a file containing
+ * a JFlex macro that will accept all valid ASCII-only TLDs, including punycode
+ * forms of internationalized TLDs (output file cmdline arg #1).
+ */
+public class GenerateJflexTLDMacros {
+
+  /**
+   * Command line entry point.
+   *
+   * @param args exactly two arguments: the root zone file URL and the path of
+   *             the JFlex output file. Any other argument count, or a first
+   *             argument of {@code --help} / {@code -help}, prints usage to
+   *             stderr and exits with status 1.
+   * @throws Exception if the URL is malformed, or if downloading or writing fails
+   */
+  public static void main(String... args) throws Exception {
+    // The length test comes first, so args[0] is only read when it exists.
+    if (args.length != 2 || args[0].equals("--help") || args[0].equals("-help")) {
+      System.err.println("Cmd line params:");
+      // Leading space keeps the class name and the first parameter apart.
+      System.err.println("\tjava " + GenerateJflexTLDMacros.class.getName()
+                         + " <ZoneFileURL> <JFlexOutputFile>");
+      System.exit(1);
+    }
+    new GenerateJflexTLDMacros(args[0], args[1]).execute();
+  }
+
+  /** Platform line separator used for every line of the generated file. */
+  private static final String NL = System.getProperty("line.separator");
+
+  /** License header written at the top of the generated file. */
+  private static final String APACHE_LICENSE
+    = "/*" + NL
+    + " * Copyright 2001-2005 The Apache Software Foundation." + NL
+    + " *" + NL
+    + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
+    + " * you may not use this file except in compliance with the License." + NL
+    + " * You may obtain a copy of the License at" + NL
+    + " *" + NL
+    + " * http://www.apache.org/licenses/LICENSE-2.0" + NL
+    + " *" + NL
+    + " * Unless required by applicable law or agreed to in writing, software" + NL
+    + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
+    + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
+    + " * See the License for the specific language governing permissions and" + NL
+    + " * limitations under the License." + NL
+    + " */" + NL + NL;
+
+  /** Matches NS records without TTL and class, e.g. {@code com. NS a.example.} */
+  private static final Pattern TLD_PATTERN_1
+    = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");
+  /** Matches NS records with TTL and class, e.g. {@code com. 172800 IN NS a.example.} */
+  private static final Pattern TLD_PATTERN_2
+    = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");
+  private final URL tldFileURL;
+  /** Last-modified time reported by the connection; stays -1 until a download happened. */
+  private long tldFileLastModified = -1L;
+  private final File outputFile;
+
+  /**
+   * Creates a generator for the given zone file location and output path.
+   *
+   * @param tldFileURL URL of the root zone file to download
+   * @param outputFile path of the JFlex macro file to write
+   * @throws Exception if {@code tldFileURL} cannot be parsed as a URL
+   */
+  public GenerateJflexTLDMacros(String tldFileURL, String outputFile)
+    throws Exception {
+    this.tldFileURL = new URL(tldFileURL);
+    this.outputFile = new File(outputFile);
+  }
+
+  /**
+   * Downloads the IANA Root Zone Database, extracts the ASCII TLDs, then
+   * writes a JFlex macro accepting any of them case-insensitively out to
+   * the specified output file.
+   *
+   * @throws IOException if there is a problem either downloading the database
+   *  or writing out the output file, or if the downloaded data contains no
+   *  recognizable TLD records (the output file is left untouched in that case).
+   */
+  public void execute() throws IOException {
+    final SortedSet<String> TLDs = getIANARootZoneDatabase();
+    if (TLDs.isEmpty()) {
+      // An empty alternation is not a usable macro, and writing it would
+      // clobber a previously generated file; fail before opening the output.
+      throw new IOException("No top level domains found in <" + tldFileURL
+                            + ">; not writing '" + outputFile + "'.");
+    }
+    writeOutput(TLDs);
+    System.err.println("Wrote " + TLDs.size() + " top level domains to '"
+                       + outputFile + "'.");
+  }
+
+  /**
+   * Downloads the IANA Root Zone Database and collects the owner name of every
+   * line that matches one of the two NS record patterns. Also records the
+   * connection's last-modified time for use in the generated file header.
+   *
+   * @return downcased sorted set of ASCII TLDs
+   * @throws java.io.IOException if there is a problem downloading the database
+   */
+  private SortedSet<String> getIANARootZoneDatabase() throws IOException {
+    final SortedSet<String> TLDs = new TreeSet<String>();
+    final URLConnection connection = tldFileURL.openConnection();
+    // Ask for a fresh copy rather than a cached one.
+    connection.setUseCaches(false);
+    connection.addRequestProperty("Cache-Control", "no-cache");
+    connection.connect();
+    tldFileLastModified = connection.getLastModified();
+    BufferedReader reader = new BufferedReader
+      (new InputStreamReader(connection.getInputStream(), "US-ASCII"));
+    try {
+      String line;
+      while (null != (line = reader.readLine())) {
+        Matcher matcher = TLD_PATTERN_1.matcher(line);
+        if (!matcher.matches()) {
+          matcher = TLD_PATTERN_2.matcher(line);
+        }
+        if (matcher.matches()) {
+          // The set de-duplicates TLDs that have several NS records.
+          TLDs.add(matcher.group(1).toLowerCase(Locale.US));
+        }
+      }
+    } finally {
+      reader.close();
+    }
+    return TLDs;
+  }
+
+  /**
+   * Writes a file containing a JFlex macro that will accept any of the given
+   * TLDs case-insensitively. The file starts with the license header and
+   * comment lines naming the source URL, the source's last-modified time (when
+   * known), the generation time, and this class.
+   *
+   * @param ASCIITLDs The downcased sorted set of top level domains to accept
+   * @throws IOException if there is an error writing the output file
+   */
+  private void writeOutput(SortedSet<String> ASCIITLDs) throws IOException {
+    final DateFormat dateFormat = DateFormat.getDateTimeInstance
+      (DateFormat.FULL, DateFormat.FULL, Locale.US);
+    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
+    final Writer writer = new OutputStreamWriter
+      (new FileOutputStream(outputFile), "UTF-8");
+    try {
+      writer.write(APACHE_LICENSE);
+      writer.write("// Generated from IANA Root Zone Database <");
+      writer.write(tldFileURL.toString());
+      writer.write(">");
+      writer.write(NL);
+      // Only emitted when the connection reported a last-modified time.
+      if (tldFileLastModified > 0L) {
+        writer.write("// file version from ");
+        writer.write(dateFormat.format(new Date(tldFileLastModified)));
+        writer.write(NL);
+      }
+      writer.write("// generated on ");
+      writer.write(dateFormat.format(new Date()));
+      writer.write(NL);
+      writer.write("// by ");
+      writer.write(this.getClass().getName());
+      writer.write(NL);
+      writer.write(NL);
+      writer.write("ASCIITLD = \".\" (");
+      writer.write(NL);
+      boolean isFirst = true;
+      for (String ASCIITLD : ASCIITLDs) {
+        writer.write("\t");
+        // Every alternative after the first is introduced by the '|' operator.
+        if (isFirst) {
+          isFirst = false;
+          writer.write(" ");
+        } else {
+          writer.write("| ");
+        }
+        writer.write(getCaseInsensitiveRegex(ASCIITLD));
+        writer.write(NL);
+      }
+      writer.write("\t) \".\"? // Accept trailing root (empty) domain");
+      writer.write(NL);
+      writer.write(NL);
+    } finally {
+      writer.close();
+    }
+  }
+
+  /**
+   * Returns a regex that will accept the given ASCII TLD case-insensitively.
+   * Digits and hyphens are copied through unchanged; every other character
+   * {@code c} becomes the two-member character class {@code [cC]}.
+   *
+   * @param ASCIITLD The ASCII TLD to generate a regex for
+   * @return a regex that will accept the given ASCII TLD case-insensitively
+   */
+  private String getCaseInsensitiveRegex(String ASCIITLD) {
+    StringBuilder builder = new StringBuilder();
+    for (int pos = 0 ; pos < ASCIITLD.length() ; ++pos) {
+      char ch = ASCIITLD.charAt(pos);
+      if (Character.isDigit(ch) || ch == '-') {
+        builder.append(ch);
+      } else {
+        builder.append("[").append(ch).append(Character.toUpperCase(ch)).append("]");
+      }
+    }
+    return builder.toString();
+  }
+}