librarian/font-optimizer/Font/Subsetter/create-data.pl

   1 use strict;
   2 use warnings;
   3
   4 use Unicode::Normalize;
   5
   6 print <<EOF;
   7 package Font::Subsetter::NormalizationData;
   8 use strict;
   9 use warnings;
  10 our \@data = (
  11 EOF
  12
  13 # Output is:
  14 #  [x, a,b,c...],
  15 # where the codepoint x is the NFC normalization of the string a,b,c
  16
  17 my @data;
  18
  19 open my $f, '/usr/lib/perl5/5.8.8/unicore/UnicodeData.txt' or die $!;
  20 while (<$f>) {
  21     my @c = split /;/, $_;
  22     # Find characters which canonically decompose (without any
  23     # compatibility tag "<foo>")
  24     next unless $c[5] and $c[5] !~ /^</;
  25
  26     {
  27         # Print the character and its maximally-decomposed codepoints,
  28         # if they re-compose into it
  29         my @x = split //, Unicode::Normalize::NFD(chr hex $c[0]);
  30         push @data, [hex $c[0], map ord, @x]
  31             if Unicode::Normalize::NFC(join '', @x) eq chr hex $c[0];
  32     }
  33
  34     # Try to find all other strings that can become this character under NFC:
  35     # If the maximal decomposition is "abc", we might want the strings
  36     # "NFC(ab) c", "NFC(ac) b", etc, so attempt all permutations of abc
  37     # and then try all groupings to apply NFC to
  38
  39     my @norm = split //, Unicode::Normalize::NFD(chr hex $c[0]);
  40     if (@norm == 3) {
  41         my ($a, $b, $c) = @norm;
  42         for my $cs (permut([$a, $b, $c], [])) { # all permutations
  43             for my $cs2 ('ab c', 'a bc') { # all groupings
  44                 my @x = map Unicode::Normalize::NFC($_), map { s/(.)/$cs->[ord($1)-ord('a')]/eg; $_ } split / /, $cs2;
  45                 # If NFC didn't collapse everything into single characters, this string is not interesting
  46                 next if grep length != 1, @x;
  47                 # If the string doesn't NFC into the desired character, it's not interesting
  48                 next unless Unicode::Normalize::NFC(join '', @x) eq chr hex $c[0];
  49                 # This string is good
  50                 push @data, [hex $c[0], map ord, @x];
  51             }
  52         }
  53     } elsif (@norm == 4) {
  54         my ($a, $b, $c, $d) = @norm;
  55         for my $cs (permut([$a, $b, $c, $d], [])) {
  56             for my $cs2 ('ab c d', 'a bc d', 'a b cd', 'ab cd', 'abc d', 'a bcd') {
  57                 my @x = map Unicode::Normalize::NFC($_), map { s/(.)/$cs->[ord($1)-ord('a')]/eg; $_ } split / /, $cs2;
  58                 next if grep length != 1, @x;
  59                 next unless Unicode::Normalize::NFC(join '', @x) eq chr hex $c[0];
  60                 push @data, [hex $c[0], map ord, @x];
  61             }
  62         }
  63     } elsif (@norm > 4) {
  64         die "\@norm too big";
  65     }
  66 }
  67
  68 print uniq(map "[".join(',', @$_)."],\n", @data);
  69
  70 print <<EOF;
  71 );
  72
  73 1;
  74 EOF
  75
  76 sub permut {
  77     my @r;
  78     my @head = @{ $_[0] };
  79     my @tail = @{ $_[1] };
  80     unless (@head) {
  81         push @r, \@tail;
  82     } else {
  83         for my $i (0 .. $#head) {
  84             my @newhead = @head;
  85             my @newtail = @tail;
  86             unshift(@newtail, splice(@newhead, $i, 1));
  87             push @r, permut([@newhead], [@newtail]);
  88         }
  89     }
  90     return @r;
  91 }
  92
  93 sub uniq {
  94     my @u;
  95     my %u;
  96     for (@_) {
  97         push @u, $_ unless $u{$_}++;
  98     }
  99     @u;
 100 }