lucene-java-3.5.0/lucene/src/test/org/apache/lucene/analysis/generateJavaUnicodeWordBreakTest.pl

   1 #!/usr/bin/perl
   2
   3 # Licensed to the Apache Software Foundation (ASF) under one or more
   4 # contributor license agreements.  See the NOTICE file distributed with
   5 # this work for additional information regarding copyright ownership.
   6 # The ASF licenses this file to You under the Apache License, Version 2.0
   7 # (the "License"); you may not use this file except in compliance with
   8 # the License.  You may obtain a copy of the License at
   9 #
  10 #     http://www.apache.org/licenses/LICENSE-2.0
  11 #
  12 # Unless required by applicable law or agreed to in writing, software
  13 # distributed under the License is distributed on an "AS IS" BASIS,
  14 # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  15 # See the License for the specific language governing permissions and
  16 # limitations under the License.
  17
  18 use warnings;
  19 use strict;
  20 use File::Spec;
  21 use Getopt::Long;
  22 use LWP::UserAgent;
  23
  24 my ($volume, $directory, $script_name) = File::Spec->splitpath($0);
  25
  26 my $version = '';
  27 unless (GetOptions("version=s" => \$version) && $version =~ /\d+\.\d+\.\d+/) {
  28   print STDERR "Usage: $script_name -v <version>\n";
  29   print STDERR "\tversion must be of the form X.Y.Z, e.g. 5.2.0\n"
  30       if ($version);
  31   exit 1;
  32 }
  33 my $url_prefix = "http://www.unicode.org/Public/${version}/ucd";
  34 my $scripts_url = "${url_prefix}/Scripts.txt";
  35 my $line_break_url = "${url_prefix}/LineBreak.txt";
  36 my $word_break_url = "${url_prefix}/auxiliary/WordBreakProperty.txt";
  37 my $word_break_test_url = "${url_prefix}/auxiliary/WordBreakTest.txt";
  38 my $underscore_version = $version;
  39 $underscore_version =~ s/\./_/g;
  40 my $class_name = "WordBreakTestUnicode_${underscore_version}";
  41 my $output_filename = "${class_name}.java";
  42 my $header =<<"__HEADER__";
  43 package org.apache.lucene.analysis;
  44
  45 /**
  46  * Licensed to the Apache Software Foundation (ASF) under one or more
  47  * contributor license agreements.  See the NOTICE file distributed with
  48  * this work for additional information regarding copyright ownership.
  49  * The ASF licenses this file to You under the Apache License, Version 2.0
  50  * (the "License"); you may not use this file except in compliance with
  51  * the License.  You may obtain a copy of the License at
  52  *
  53  *     http://www.apache.org/licenses/LICENSE-2.0
  54  *
  55  * Unless required by applicable law or agreed to in writing, software
  56  * distributed under the License is distributed on an "AS IS" BASIS,
  57  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  58  * See the License for the specific language governing permissions and
  59  * limitations under the License.
  60  */
  61
  62 import org.junit.Ignore;
  63
  64 /**
  65  * This class was automatically generated by ${script_name}
  66  * from: ${url_prefix}/auxiliary/WordBreakTest.txt
  67  *
  68  * WordBreakTest.txt indicates the points in the provided character sequences
  69  * at which conforming implementations must and must not break words.  This
  70  * class tests for expected token extraction from each of the test sequences
  71  * in WordBreakTest.txt, where the expected tokens are those character
  72  * sequences bounded by word breaks and containing at least one character
  73  * from one of the following character sets:
  74  *
  75  *    \\p{Script = Han}                (From $scripts_url)
  76  *    \\p{Script = Hiragana}
  77  *    \\p{LineBreak = Complex_Context} (From $line_break_url)
  78  *    \\p{WordBreak = ALetter}         (From $word_break_url)
  79  *    \\p{WordBreak = Katakana}
  80  *    \\p{WordBreak = Numeric}         (Excludes full-width Arabic digits)
  81  *    [\\uFF10-\\uFF19]                 (Full-width Arabic digits)
  82  */
  83 \@Ignore
  84 public class ${class_name} extends BaseTokenStreamTestCase {
  85
  86   public void test(Analyzer analyzer) throws Exception {
  87 __HEADER__
  88
  89 my $codepoints = [];
  90 map { $codepoints->[$_] = 1 } (0xFF10..0xFF19);
  91 # Complex_Context is an alias for 'SA', which is used in LineBreak.txt
  92 # Using lowercase versions of property value names to allow for case-
  93 # insensitive comparison with the names in the Unicode data files.
  94 parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
  95 parse_Unicode_data_file($scripts_url, $codepoints,
  96                         {'han' => 1, 'hiragana' => 1});
  97 parse_Unicode_data_file($word_break_url, $codepoints,
  98                         {'aletter' => 1, 'katakana' => 1, 'numeric' => 1});
  99 my @tests = split /\r?\n/, get_URL_content($word_break_test_url);
 100
 101 my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
 102 open OUT, ">$output_path"
 103   || die "Error opening '$output_path' for writing: $!";
 104
 105 print STDERR "Writing '$output_path'...";
 106
 107 print OUT $header;
 108
 109 for my $line (@tests) {
 110   next if ($line =~ /^\s*\#/);
 111   # ÷ 0001 × 0300 ÷  #  ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
 112   my ($sequence) = $line =~ /^(.*?)\s*\#/;
 113   print OUT "    // $line\n";
 114   $sequence =~ s/\s*÷\s*$//; # Trim trailing break character
 115   my $test_string = $sequence;
 116   $test_string =~ s/\s*÷\s*/\\u/g;
 117   $test_string =~ s/\s*×\s*/\\u/g;
 118   $test_string =~ s/\\u000A/\\n/g;
 119   $test_string =~ s/\\u000D/\\r/g;
 120   $sequence =~ s/^\s*÷\s*//; # Trim leading break character
 121   my @tokens = ();
 122   for my $candidate (split /\s*÷\s*/, $sequence) {
 123     my @chars = ();
 124     my $has_wanted_char = 0;
 125     while ($candidate =~ /([0-9A-F]+)/gi) {
 126       push @chars, $1;
 127       unless ($has_wanted_char) {
 128         $has_wanted_char = 1 if (defined($codepoints->[hex($1)]));
 129       }
 130     }
 131     if ($has_wanted_char) {
 132       push @tokens, '"'.join('', map { "\\u$_" } @chars).'"';
 133     }
 134   }
 135   print OUT "    assertAnalyzesTo(analyzer, \"${test_string}\",\n";
 136   print OUT "                     new String[] { ";
 137   print OUT join(", ", @tokens), " });\n\n";
 138 }
 139
 140 print OUT "  }\n}\n";
 141 close OUT;
 142 print STDERR "done.\n";
 143
 144
 145 # sub parse_Unicode_data_file
 146 #
 147 # Downloads and parses the specified Unicode data file, parses it, and
 148 # extracts code points assigned any of the given property values, defining
 149 # the corresponding array position in the passed-in target array.
 150 #
 151 # Takes in the following parameters:
 152 #
 153 #  - URL of the Unicode data file to download and parse
 154 #  - Reference to target array
 155 #  - Reference to hash of property values to get code points for
 156 #
 157 sub parse_Unicode_data_file {
 158   my $url = shift;
 159   my $target = shift;
 160   my $wanted_property_values = shift;
 161   my $content = get_URL_content($url);
 162   print STDERR "Parsing '$url'...";
 163   my @lines = split /\r?\n/, $content;
 164   for (@lines) {
 165     s/\s*#.*//;         # Strip trailing comments
 166     s/\s+$//;           # Strip trailing space
 167     next unless (/\S/); # Skip empty lines
 168     my ($start, $end, $property_value);
 169     if (/^([0-9A-F]{4,5})\s*;\s*(.+)/i) {
 170       # 00AA       ; LATIN
 171       $start = $end = hex $1;
 172       $property_value = lc $2; # Property value names are case-insensitive
 173     } elsif (/^([0-9A-F]{4,5})..([0-9A-F]{4,5})\s*;\s*(.+)/i) {
 174       # 0AE6..0AEF ; Gujarati
 175       $start = hex $1;
 176       $end = hex $2;
 177       $property_value = lc $3; # Property value names are case-insensitive
 178     } else {
 179       next;
 180     }
 181     if (defined($wanted_property_values->{$property_value})) {
 182       for my $code_point ($start..$end) {
 183         $target->[$code_point] = 1;
 184       }
 185     }
 186   }
 187   print STDERR "done.\n";
 188 }
 189
 190 # sub get_URL_content
 191 #
 192 # Retrieves and returns the content of the given URL.
 193 #
 194 sub get_URL_content {
 195   my $url = shift;
 196   print STDERR "Retrieving '$url'...";
 197   my $user_agent = LWP::UserAgent->new;
 198   my $request = HTTP::Request->new(GET => $url);
 199   my $response = $user_agent->request($request);
 200   unless ($response->is_success) {
 201     print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
 202     exit 1;
 203   }
 204   print STDERR "done.\n";
 205   return $response->content;
 206 }