+++ /dev/null
-package org.apache.lucene.analysis.standard;
-
-/*
- * Copyright 2001-2005 The Apache Software Foundation.
- *
- * Licensed under the Apache License, Version 2.0 (the "License");
- * you may not use this file except in compliance with the License.
- * You may obtain a copy of the License at
- *
- * http://www.apache.org/licenses/LICENSE-2.0
- *
- * Unless required by applicable law or agreed to in writing, software
- * distributed under the License is distributed on an "AS IS" BASIS,
- * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- * See the License for the specific language governing permissions and
- * limitations under the License.
- */
-
-import java.io.BufferedReader;
-import java.io.File;
-import java.io.FileOutputStream;
-import java.io.IOException;
-import java.io.InputStreamReader;
-import java.io.OutputStreamWriter;
-import java.io.Writer;
-import java.net.URL;
-import java.net.URLConnection;
-import java.text.DateFormat;
-import java.util.Date;
-import java.util.Locale;
-import java.util.SortedSet;
-import java.util.TimeZone;
-import java.util.TreeSet;
-import java.util.regex.Matcher;
-import java.util.regex.Pattern;
-
/**
 * Generates a file containing JFlex macros to accept valid ASCII TLDs
 * (top level domains), for inclusion in JFlex grammars that can accept
 * domain names.
 * <p>
 * The IANA Root Zone Database is fetched from the URL given as cmdline arg #0,
 * the response is parsed, and the results are written out to a file containing
 * a JFlex macro that will accept all valid ASCII-only TLDs, including punycode
 * forms of internationalized TLDs (output file cmdline arg #1).
 */
public class GenerateJflexTLDMacros {

  /** Platform line separator, used for every line written to the output file. */
  private static final String NL = System.getProperty("line.separator");

  /** License header emitted at the top of the generated file. */
  private static final String APACHE_LICENSE
      = "/*" + NL
      + " * Copyright 2001-2005 The Apache Software Foundation." + NL
      + " *" + NL
      + " * Licensed under the Apache License, Version 2.0 (the \"License\");" + NL
      + " * you may not use this file except in compliance with the License." + NL
      + " * You may obtain a copy of the License at" + NL
      + " *" + NL
      + " * http://www.apache.org/licenses/LICENSE-2.0" + NL
      + " *" + NL
      + " * Unless required by applicable law or agreed to in writing, software" + NL
      + " * distributed under the License is distributed on an \"AS IS\" BASIS," + NL
      + " * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied." + NL
      + " * See the License for the specific language governing permissions and" + NL
      + " * limitations under the License." + NL
      + " */" + NL + NL;

  /** Matches lines of the form {@code <tld>. NS <rest>}; group 1 is the TLD. */
  private static final Pattern TLD_PATTERN_1
      = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+NS\\s+.*");

  /** Matches lines of the form {@code <tld>. <digits> IN NS <rest>}; group 1 is the TLD. */
  private static final Pattern TLD_PATTERN_2
      = Pattern.compile("([-A-Za-z0-9]+)\\.\\s+\\d+\\s+IN\\s+NS\\s+.*");

  private final URL tldFileURL;

  /** Last-modified time reported for the downloaded file; -1 until a download happened. */
  private long tldFileLastModified = -1L;

  private final File outputFile;

  /**
   * Command line entry point. Prints a usage message and exits with status 1
   * unless exactly two arguments are given and the first is not a help switch.
   *
   * @param args {@code <ZoneFileURL> <JFlexOutputFile>}
   * @throws Exception if the URL is malformed, the download fails, or the
   *         output file cannot be written
   */
  public static void main(String... args) throws Exception {
    if (args.length != 2 || args[0].equals("--help") || args[0].equals("-help")) {
      System.err.println(usage());
      System.exit(1);
    }
    new GenerateJflexTLDMacros(args[0], args[1]).execute();
  }

  /** Builds the two-line usage message printed by {@link #main(String...)}. */
  private static String usage() {
    // The leading space before <ZoneFileURL> keeps the class name and the
    // first parameter from running together in the printed command line.
    return "Cmd line params:" + NL
        + "\tjava " + GenerateJflexTLDMacros.class.getName()
        + " <ZoneFileURL> <JFlexOutputFile>";
  }

  /**
   * Creates a generator for the given source and destination.
   *
   * @param tldFileURL URL to download the root zone file from
   * @param outputFile path of the JFlex macro file to write
   * @throws Exception if {@code tldFileURL} is not a valid URL
   */
  public GenerateJflexTLDMacros(String tldFileURL, String outputFile)
      throws Exception {
    this.tldFileURL = new URL(tldFileURL);
    this.outputFile = new File(outputFile);
  }

  /**
   * Downloads the IANA Root Zone Database, extracts the ASCII TLDs, then
   * writes a JFlex macro accepting any of them case-insensitively out to
   * the specified output file.
   *
   * @throws IOException if there is a problem either downloading the database
   *         or writing out the output file, or if the downloaded data contains
   *         no recognizable TLD (in which case the output file is left untouched)
   */
  public void execute() throws IOException {
    final SortedSet<String> TLDs = getIANARootZoneDatabase();
    if (TLDs.isEmpty()) {
      // Nothing matched: writing now would replace the output file with a
      // macro that has an empty alternation, so fail before touching it.
      throw new IOException("No top level domains found in <" + tldFileURL
                            + ">; not writing '" + outputFile + "'.");
    }
    writeOutput(TLDs);
    System.err.println("Wrote " + TLDs.size() + " top level domains to '"
                       + outputFile + "'.");
  }

  /**
   * Downloads the IANA Root Zone Database and records the last-modified time
   * reported by the connection.
   *
   * @return downcased sorted set of ASCII TLDs
   * @throws java.io.IOException if there is a problem downloading the database
   */
  private SortedSet<String> getIANARootZoneDatabase() throws IOException {
    final SortedSet<String> TLDs = new TreeSet<String>();
    final URLConnection connection = tldFileURL.openConnection();
    connection.setUseCaches(false);
    connection.addRequestProperty("Cache-Control", "no-cache");
    connection.connect();
    tldFileLastModified = connection.getLastModified();
    final BufferedReader reader = new BufferedReader
        (new InputStreamReader(connection.getInputStream(), "US-ASCII"));
    try {
      String line;
      while (null != (line = reader.readLine())) {
        final String tld = extractTLD(line);
        if (tld != null) {
          TLDs.add(tld);
        }
      }
    } finally {
      reader.close();
    }
    return TLDs;
  }

  /**
   * Extracts the TLD from a single line of the zone file.
   *
   * @param line one line of the downloaded file
   * @return the downcased TLD if the whole line matches either NS record
   *         pattern, otherwise {@code null}
   */
  private static String extractTLD(String line) {
    Matcher matcher = TLD_PATTERN_1.matcher(line);
    if (!matcher.matches()) {
      matcher = TLD_PATTERN_2.matcher(line);
      if (!matcher.matches()) {
        return null;
      }
    }
    return matcher.group(1).toLowerCase(Locale.US);
  }

  /**
   * Writes a file containing a JFlex macro that will accept any of the given
   * TLDs case-insensitively.
   *
   * @param ASCIITLDs The downcased sorted set of top level domains to accept
   * @throws IOException if there is an error writing the output file
   */
  private void writeOutput(SortedSet<String> ASCIITLDs) throws IOException {
    final DateFormat dateFormat = DateFormat.getDateTimeInstance
        (DateFormat.FULL, DateFormat.FULL, Locale.US);
    dateFormat.setTimeZone(TimeZone.getTimeZone("UTC"));
    final Writer writer = new OutputStreamWriter
        (new FileOutputStream(outputFile), "UTF-8");
    try {
      writer.write(APACHE_LICENSE);
      writer.write("// Generated from IANA Root Zone Database <");
      writer.write(tldFileURL.toString());
      writer.write(">");
      writer.write(NL);
      // Only mention the file version when the connection reported one.
      if (tldFileLastModified > 0L) {
        writer.write("// file version from ");
        writer.write(dateFormat.format(new Date(tldFileLastModified)));
        writer.write(NL);
      }
      writer.write("// generated on ");
      writer.write(dateFormat.format(new Date()));
      writer.write(NL);
      writer.write("// by ");
      writer.write(this.getClass().getName());
      writer.write(NL);
      writer.write(NL);
      writer.write("ASCIITLD = \".\" (");
      writer.write(NL);
      boolean isFirst = true;
      for (String ASCIITLD : ASCIITLDs) {
        writer.write("\t");
        // Every alternative after the first is introduced by the '|' operator;
        // the first gets two spaces so that all alternatives line up.
        if (isFirst) {
          isFirst = false;
          writer.write("  ");
        } else {
          writer.write("| ");
        }
        writer.write(getCaseInsensitiveRegex(ASCIITLD));
        writer.write(NL);
      }
      writer.write("\t) \".\"? // Accept trailing root (empty) domain");
      writer.write(NL);
      writer.write(NL);
    } finally {
      writer.close();
    }
  }

  /**
   * Returns a regex that will accept the given ASCII TLD case-insensitively.
   * Digits and hyphens are emitted literally; every other character
   * {@code c} becomes the character class {@code [cC]}.
   *
   * @param ASCIITLD The ASCII TLD to generate a regex for
   * @return a regex that will accept the given ASCII TLD case-insensitively
   */
  private static String getCaseInsensitiveRegex(String ASCIITLD) {
    final StringBuilder builder = new StringBuilder();
    for (int pos = 0 ; pos < ASCIITLD.length() ; ++pos) {
      final char ch = ASCIITLD.charAt(pos);
      if (Character.isDigit(ch) || ch == '-') {
        builder.append(ch);
      } else {
        builder.append("[").append(ch).append(Character.toUpperCase(ch)).append("]");
      }
    }
    return builder.toString();
  }
}