#!/usr/bin/perl

# Licensed to the Apache Software Foundation (ASF) under one or more
# contributor license agreements.  See the NOTICE file distributed with
# this work for additional information regarding copyright ownership.
# The ASF licenses this file to You under the Apache License, Version 2.0
# (the "License"); you may not use this file except in compliance with
# the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Generates WordBreakTestUnicode_X_Y_Z.java, next to this script, from the
# Unicode Character Database files published for the requested Unicode
# version.
#
# NOTE: "use utf8" is deliberately not enabled.  The division sign and
# multiplication sign literals used in the patterns below are compared
# byte-for-byte against the raw, undecoded download returned by
# get_URL_content().

use warnings;
use strict;
use File::Spec;
use Getopt::Long;
use LWP::UserAgent;

my ($volume, $directory, $script_name) = File::Spec->splitpath($0);

# The version must be exactly X.Y.Z; anchoring the pattern rejects values
# such as "x5.2.0y" that would otherwise end up in URLs and in the name of
# the generated Java class.
my $version = '';
unless (GetOptions("version=s" => \$version)
        && $version =~ /^\d+\.\d+\.\d+$/) {
  print STDERR "Usage: $script_name -v <version>\n";
  print STDERR "\tversion must be of the form X.Y.Z, e.g. 5.2.0\n"
    if ($version);
  exit 1;
}
my $url_prefix = "http://www.unicode.org/Public/${version}/ucd";
my $scripts_url = "${url_prefix}/Scripts.txt";
my $line_break_url = "${url_prefix}/LineBreak.txt";
my $word_break_url = "${url_prefix}/auxiliary/WordBreakProperty.txt";
my $word_break_test_url = "${url_prefix}/auxiliary/WordBreakTest.txt";
my $underscore_version = $version;
$underscore_version =~ s/\./_/g;
my $class_name = "WordBreakTestUnicode_${underscore_version}";
my $output_filename = "${class_name}.java";
my $header =<<"__HEADER__";
package org.apache.lucene.analysis;

/**
 * Licensed to the Apache Software Foundation (ASF) under one or more
 * contributor license agreements.  See the NOTICE file distributed with
 * this work for additional information regarding copyright ownership.
 * The ASF licenses this file to You under the Apache License, Version 2.0
 * (the "License"); you may not use this file except in compliance with
 * the License.  You may obtain a copy of the License at
 *
 *     http://www.apache.org/licenses/LICENSE-2.0
 *
 * Unless required by applicable law or agreed to in writing, software
 * distributed under the License is distributed on an "AS IS" BASIS,
 * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 * See the License for the specific language governing permissions and
 * limitations under the License.
 */

import org.junit.Ignore;

/**
 * This class was automatically generated by ${script_name}
 * from: ${url_prefix}/auxiliary/WordBreakTest.txt
 *
 * WordBreakTest.txt indicates the points in the provided character sequences
 * at which conforming implementations must and must not break words.  This
 * class tests for expected token extraction from each of the test sequences
 * in WordBreakTest.txt, where the expected tokens are those character
 * sequences bounded by word breaks and containing at least one character
 * from one of the following character sets:
 *
 *    \\p{Script = Han}                (From $scripts_url)
 *    \\p{Script = Hiragana}
 *    \\p{LineBreak = Complex_Context} (From $line_break_url)
 *    \\p{WordBreak = ALetter}         (From $word_break_url)
 *    \\p{WordBreak = Katakana}
 *    \\p{WordBreak = Numeric}         (Excludes full-width Arabic digits)
 *    [\\uFF10-\\uFF19]                (Full-width Arabic digits)
 */
\@Ignore
public class ${class_name} extends BaseTokenStreamTestCase {

  public void test(Analyzer analyzer) throws Exception {
__HEADER__

# $codepoints is a sparse array indexed by code point; a defined element
# marks a character that makes a word-break-delimited segment a token.
my $codepoints = [];
$codepoints->[$_] = 1 for (0xFF10..0xFF19);
# Complex_Context is an alias for 'SA', which is used in LineBreak.txt
# Using lowercase versions of property value names to allow for case-
# insensitive comparison with the names in the Unicode data files.
parse_Unicode_data_file($line_break_url, $codepoints, {'sa' => 1});
parse_Unicode_data_file($scripts_url, $codepoints,
                        {'han' => 1, 'hiragana' => 1});
parse_Unicode_data_file($word_break_url, $codepoints,
                        {'aletter' => 1, 'katakana' => 1, 'numeric' => 1});
my @tests = split /\r?\n/, get_URL_content($word_break_test_url);

my $output_path = File::Spec->catpath($volume, $directory, $output_filename);
# Low-precedence "or" so that a failed open is actually reported; with "||"
# the die would attach to the (always true) file name string instead.
open my $out, '>', $output_path
  or die "Error opening '$output_path' for writing: $!";

print STDERR "Writing '$output_path'...";

print $out $header;

for my $line (@tests) {
  next if ($line =~ /^\s*\#/);
  # ÷ 0001 × 0300 ÷ # ÷ [0.2] <START OF HEADING> (Other) × [4.0] COMBINING GRAVE ACCENT (Extend_FE) ÷ [0.3]
  # Everything before the trailing comment (or the whole line when there is
  # no comment) is the break/no-break annotated code point sequence.
  my ($sequence) = $line =~ /^(.*?)\s*(?:\#|$)/;
  next unless (defined($sequence) && $sequence =~ /\S/);  # Skip blank lines
  print $out "    // $line\n";
  $sequence =~ s/\s*÷\s*$//; # Trim trailing break character
  # The analyzer input is simply every code point of the sequence, in order.
  my $test_string
    = join('', map { java_string_escape($_) } $sequence =~ /([0-9A-F]+)/gi);
  $sequence =~ s/^\s*÷\s*//; # Trim leading break character
  my @tokens = ();
  for my $candidate (split /\s*÷\s*/, $sequence) {
    my @chars = $candidate =~ /([0-9A-F]+)/gi;
    # A segment is an expected token only if it contains a wanted character.
    next unless (grep { defined($codepoints->[hex($_)]) } @chars);
    push @tokens, '"'.join('', map { java_string_escape($_) } @chars).'"';
  }
  print $out "    assertAnalyzesTo(analyzer, \"${test_string}\",\n";
  print $out "                     new String[] { ";
  print $out join(", ", @tokens), " });\n\n";
}

print $out "  }\n}\n";
# Buffered write errors only surface at close time.
close $out
  or die "Error closing '$output_path': $!";
print STDERR "done.\n";


# sub java_string_escape
#
# Returns the text that represents one code point inside a Java string
# literal.
#
# Takes in the following parameter:
#
#  - Hexadecimal code point, as written in the Unicode data files
#
# Java translates \uXXXX escapes before it tokenizes the source, so line
# terminators, the double quote and the backslash must be written as
# ordinary escape sequences, and code points above U+FFFF must be written
# as a UTF-16 surrogate pair.  All other code points are emitted as
# \u followed by the hexadecimal digits exactly as given.
#
sub java_string_escape {
  my ($hex) = @_;
  my $code_point = hex($hex);
  return '\\n'  if ($code_point == 0x0A);
  return '\\r'  if ($code_point == 0x0D);
  return '\\"'  if ($code_point == 0x22);
  return '\\\\' if ($code_point == 0x5C);
  if ($code_point > 0xFFFF) {
    my $offset = $code_point - 0x10000;
    return sprintf('\\u%04X\\u%04X',
                   0xD800 + ($offset >> 10), 0xDC00 + ($offset & 0x3FF));
  }
  return "\\u$hex";
}


# sub parse_Unicode_data_file
#
# Downloads the specified Unicode data file, parses it, and extracts
# code points assigned any of the given property values, defining
# the corresponding array position in the passed-in target array.
#
# Takes in the following parameters:
#
#  - URL of the Unicode data file to download and parse
#  - Reference to target array
#  - Reference to hash of property values to get code points for
#
# Property values read from the file are lowercased before being looked up,
# so the keys of the hash must be lowercase.  Comments, blank lines and
# lines that are neither "XXXX ; value" nor "XXXX..YYYY ; value" are
# ignored.  Code points may have four to six hexadecimal digits.
#
sub parse_Unicode_data_file {
  my ($url, $target, $wanted_property_values) = @_;
  my $content = get_URL_content($url);
  print STDERR "Parsing '$url'...";
  for my $line (split /\r?\n/, $content) {
    $line =~ s/\s*#.*//;              # Strip trailing comments
    $line =~ s/\s+$//;                # Strip trailing space
    next unless ($line =~ /\S/);      # Skip empty lines
    # 00AA       ; LATIN
    # 0AE6..0AEF ; Gujarati
    # The range separator is matched literally and the end of the range is
    # optional, so both line forms are handled by one pattern.
    next unless ($line
                 =~ /^([0-9A-F]{4,6})(?:\.\.([0-9A-F]{4,6}))?\s*;\s*(.+)/i);
    my $start = hex $1;
    my $end = defined($2) ? hex $2 : $start;
    my $property_value = lc $3;  # Property value names are case-insensitive
    next unless (defined($wanted_property_values->{$property_value}));
    for my $code_point ($start..$end) {
      $target->[$code_point] = 1;
    }
  }
  print STDERR "done.\n";
}

# sub get_URL_content
#
# Retrieves and returns the content of the given URL.
#
# Takes in the following parameter:
#
#  - URL to retrieve
#
# The response body is returned as received, without character decoding.
# If the request is not successful, the status line is printed to STDERR
# and the script exits with status 1.
#
sub get_URL_content {
  my ($url) = @_;
  print STDERR "Retrieving '$url'...";
  my $response = LWP::UserAgent->new->get($url);
  unless ($response->is_success) {
    print STDERR "Failed to download '$url':\n\t",$response->status_line,"\n";
    exit 1;
  }
  print STDERR "done.\n";
  return $response->content;
}