diff options
Diffstat (limited to 'contrib/idn/idnkit-1.0-src/util/generate_normalize_data.pl')
-rwxr-xr-x | contrib/idn/idnkit-1.0-src/util/generate_normalize_data.pl | 586 |
1 files changed, 586 insertions, 0 deletions
diff --git a/contrib/idn/idnkit-1.0-src/util/generate_normalize_data.pl b/contrib/idn/idnkit-1.0-src/util/generate_normalize_data.pl new file mode 100755 index 0000000..fe81648 --- /dev/null +++ b/contrib/idn/idnkit-1.0-src/util/generate_normalize_data.pl @@ -0,0 +1,586 @@ +#! /usr/local/bin/perl -w +# $Id: generate_normalize_data.pl,v 1.1.1.1 2003/06/04 00:27:55 marka Exp $ +# +# Copyright (c) 2000,2001 Japan Network Information Center. +# All rights reserved. +# +# By using this file, you agree to the terms and conditions set forth bellow. +# +# LICENSE TERMS AND CONDITIONS +# +# The following License Terms and Conditions apply, unless a different +# license is obtained from Japan Network Information Center ("JPNIC"), +# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda, +# Chiyoda-ku, Tokyo 101-0047, Japan. +# +# 1. Use, Modification and Redistribution (including distribution of any +# modified or derived work) in source and/or binary forms is permitted +# under this License Terms and Conditions. +# +# 2. Redistribution of source code must retain the copyright notices as they +# appear in each source code file, this License Terms and Conditions. +# +# 3. Redistribution in binary form must reproduce the Copyright Notice, +# this License Terms and Conditions, in the documentation and/or other +# materials provided with the distribution. For the purposes of binary +# distribution the "Copyright Notice" refers to the following language: +# "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved." +# +# 4. The name of JPNIC may not be used to endorse or promote products +# derived from this Software without specific prior written approval of +# JPNIC. +# +# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC +# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT +# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A +# PARTICULAR PURPOSE ARE DISCLAIMED. 
#    IN NO EVENT SHALL JPNIC BE LIABLE
#    FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
#    CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
#    SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
#    BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
#    WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
#    OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
#    ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
#

#
# Generate lib/unicodedata.c from UnicodeData.txt,
# CompositionExclusions-1.txt, SpecialCasing.txt and CaseFolding.txt,
# all of them available from ftp://ftp.unicode.org/Public/UNIDATA/.
#

use strict;
use lib qw(.);

use Getopt::Long;
use UCD;
use SparseMap;

use constant UCS_MAX => 0x110000;
use constant END_BIT => 0x80000000;    # OR'ed into the last element of a sequence

# Flag OR'ed into a decomposition offset for compatibility decompositions.
my $DECOMP_COMPAT_BIT = 0x8000;

# Flags emitted in front of each case mapping sequence (see casemap_data).
my $CASEMAP_FINAL_BIT    = 0x1;
my $CASEMAP_NONFINAL_BIT = 0x2;
my $CASEMAP_LAST_BIT     = 0x10;

# Values stored in the casemap context table.
my $LETTER_BIT  = 1;
my $NSPMARK_BIT = 2;

# Defuse the RCS keyword ("$Id: ...$" becomes "$-Id: ...-$") so that the
# copy embedded in the generated file is not expanded again.
(my $myid = '$Id: generate_normalize_data.pl,v 1.1.1.1 2003/06/04 00:27:55 marka Exp $') =~ s/\$([^\$]+)\$/\$-$1-\$/;

my @default_bits = (9, 7, 5);
#my @default_bits = (7, 7, 7);
my @canon_class_bits = @default_bits;
my @decomp_bits      = @default_bits;
my @comp_bits        = @default_bits;
my @folding_bits     = @default_bits;
my @casemap_bits     = @default_bits;
my @casemap_ctx_bits = @default_bits;

my $prefix          = '';
my $dir             = '.';
my $unicodedatafile = 'UnicodeData.txt';
my $exclusionfile   = 'CompositionExclusions.txt';
my $specialcasefile = 'SpecialCasing.txt';
my $casefoldingfile = 'CaseFolding.txt';
my $verbose;

GetOptions('dir|d=s'         => \$dir,
           'unicodedata|u=s' => \$unicodedatafile,
           'exclude|e=s'     => \$exclusionfile,
           'specialcase|s=s' => \$specialcasefile,
           'casefold|c=s'    => \$casefoldingfile,
           'prefix|p=s'      => \$prefix,
           'verbose|v'       => \$verbose,
) or usage();

# Data file names that are not absolute are taken relative to $dir.
foreach my $r (\$unicodedatafile, \$exclusionfile,
               \$specialcasefile, \$casefoldingfile) {
    $$r = "$dir/$$r" unless $$r =~ m|^/|;
}

my %exclusions;
my %lower_special;
my %upper_special;

my @decomp_data;
my @comp_data;
my @toupper_data;
my @tolower_data;
my @folding_data;

#
# Create Mapping/Bitmap objects.
#

# Every table is built with the same parameters; only the bit split differs.
my $new_map = sub {
    my @bits = @_;
    return SparseMap::Int->new(BITS    => [@bits],
                               MAX     => UCS_MAX,
                               MAPALL  => 1,
                               DEFAULT => 0);
};

# canonical class
my $canon_class = $new_map->(@canon_class_bits);

# canonical/compatibility decomposition
my $decomp = $new_map->(@decomp_bits);

# canonical composition
my $comp = $new_map->(@comp_bits);

# uppercase/lowercase
my $upper = $new_map->(@casemap_bits);
my $lower = $new_map->(@casemap_bits);

# final/nonfinal context
my $casemap_ctx = $new_map->(@casemap_ctx_bits);

# casefolding
my $folding = $new_map->(@folding_bits);

#
# Read datafiles.
#

# SpecialCasing and the exclusion list must be loaded before UnicodeData,
# because read_unicodedata_file() consults %exclusions, %upper_special and
# %lower_special while it scans each character.
read_exclusion_file();
read_specialcasing_file();
read_unicodedata_file();
read_casefolding_file();

print_header();
print_canon_class();
print_composition();
print_decomposition();
print_casemap();
print_casemap_context();
print_casefolding();

exit;

#
# usage -- print the command line synopsis to stderr and exit with status 1.
#
# The defaults shown in brackets mirror the initial values of the
# corresponding option variables above.
#
sub usage {
    print STDERR <<"END";
Usage: $0 [options..]
  options:
    -d DIR  directory where Unicode Character Data files resides [./]
    -u FILE name of the UnicodeData file [UnicodeData.txt]
    -e FILE name of the CompositionExclusion file [CompositionExclusions.txt]
    -s FILE name of the SpecialCasing file [SpecialCasing.txt]
    -c FILE name of the CaseFolding file [CaseFolding.txt]
    -p STR  prefix for the names of the generated C tables []
    -v      print statistics of each table to stderr
END
    exit 1;
}

#
# read_exclusion_file -- read CompositionExclusions-1.txt.
#
# Fills %exclusions with every code point listed in the composition
# exclusion file.  Dies if the file cannot be opened.
#
sub read_exclusion_file {
    # Three-argument open: the file name is never interpreted as a mode
    # or a pipe.
    open my $fh, '<', $exclusionfile
        or die "cannot open $exclusionfile: $!\n";
    while (my $line = UCD::CompositionExclusions::getline($fh)) {
        my %data = UCD::CompositionExclusions::parseline($line);
        $exclusions{$data{CODE}} = 1;
    }
    close $fh;
}

#
# read_specialcasing_file -- read SpecialCasing.txt
#
# Fills %lower_special and %upper_special.  Each value is a pair
# [mapping sequence (array ref), condition string].  Only unconditional
# entries and entries whose condition starts with FINAL or NON_FINAL are
# kept, and only when the mapping differs from the character itself.
#
sub read_specialcasing_file {
    open my $fh, '<', $specialcasefile
        or die "cannot open $specialcasefile: $!\n";
    while (my $line = UCD::SpecialCasing::getline($fh)) {
        my %data = UCD::SpecialCasing::parseline($line);
        my $code      = $data{CODE};
        my $lower_seq = $data{LOWER};
        my $upper_seq = $data{UPPER};
        my $cond      = $data{CONDITION} || '';

        next unless $cond eq '' or $cond =~ /^(NON_)?FINAL/;

        # A mapping is "special" when it is longer than one character or
        # maps the character to something other than itself.
        if (@$lower_seq > 1 or $lower_seq->[0] != $code) {
            $lower_special{$code} = [$lower_seq, $cond];
        }
        if (@$upper_seq > 1 or $upper_seq->[0] != $code) {
            $upper_special{$code} = [$upper_seq, $cond];
        }
    }
    close $fh;
}

#
# _add_casemap -- register the case mapping of one character.
#
#   $map     table object that receives (code, offset into @$seq)
#   $seq     array ref of the sequence table being built
#   $code    code point being processed
#   $special [sequence, condition] pair from SpecialCasing, or undef
#   $simple  simple mapping from UnicodeData, or undef
#
# Does nothing when the character has neither kind of mapping.  The
# special mapping, if any, is emitted before the simple one.
#
sub _add_casemap {
    my ($map, $seq, $code, $special, $simple) = @_;

    return unless defined $special or defined $simple;

    $map->add($code, scalar @$seq);

    my @casedata;
    push @casedata, $special if defined $special;
    push @casedata, $simple  if defined $simple;
    push @$seq, casemap_data(@casedata);
}

#
# read_unicodedata_file -- read UnicodeData.txt
#
# Populates the canonical class, case mapping, decomposition, casemap
# context and composition tables, together with the sequence arrays
# @toupper_data, @tolower_data, @decomp_data and @comp_data.
# %exclusions, %upper_special and %lower_special must already be loaded.
# Dies if the file cannot be opened or if the decomposition table grows
# into the bit reserved for the compatibility flag.
#
sub read_unicodedata_file {
    open my $fh, '<', $unicodedatafile
        or die "cannot open $unicodedatafile: $!\n";

    # Slot 0 of every sequence table is a dummy, so real offsets start
    # at 1.
    @decomp_data  = (0);
    @toupper_data = (0);
    @tolower_data = (0);

    my @comp_cand;     # canonical composition candidates
    my %nonstarter;    # code points with a non-zero combining class

    while (my $line = UCD::UnicodeData::getline($fh)) {
        my %data = UCD::UnicodeData::parseline($line);
        my $code = $data{CODE};

        # combining class
        if ($data{CLASS} > 0) {
            $nonstarter{$code} = 1;
            $canon_class->add($code, $data{CLASS});
        }

        # uppercasing / lowercasing
        _add_casemap($upper, \@toupper_data, $code,
                     $upper_special{$code}, $data{UPPER});
        _add_casemap($lower, \@tolower_data, $code,
                     $lower_special{$code}, $data{LOWER});

        # composition/decomposition
        if ($data{DECOMP}) {
            my ($tag, @decomp) = @{$data{DECOMP}};
            my $offset = @decomp_data;

            # An offset that already has the flag bit set could not be
            # told apart from a compatibility decomposition.
            die "decomposition table too large: offset $offset collides "
                . "with DECOMP_COMPAT ($DECOMP_COMPAT_BIT)\n"
                if $offset & $DECOMP_COMPAT_BIT;

            # composition
            if ($tag eq '' and @decomp > 1 and not exists $exclusions{$code}) {
                # canonical composition candidate
                push @comp_cand, [$code, @decomp];
            }

            # decomposition
            if ($tag ne '') {
                # compatibility decomposition
                $offset |= $DECOMP_COMPAT_BIT;
            }
            $decomp->add($code, $offset);
            push @decomp_data, @decomp;
            $decomp_data[-1] |= END_BIT;
        }

        # final/nonfinal context
        if ($data{CATEGORY} =~ /L[ult]/) {
            $casemap_ctx->add($code, $LETTER_BIT);
        }
        elsif ($data{CATEGORY} eq 'Mn') {
            $casemap_ctx->add($code, $NSPMARK_BIT);
        }
    }
    close $fh;

    # Eliminate composition candidates whose decomposition starts with
    # a non-starter.
    @comp_cand = grep { not exists $nonstarter{$_->[1]} } @comp_cand;

    # Group the candidates by their first character.  For each group the
    # composition table stores (start offset | group size << 16).
    @comp_data = ([0, 0, 0]);
    my $last_code   = -1;
    my $last_offset = @comp_data;

    # Record the group collected so far; a no-op while no group has been
    # started, which also covers an empty candidate list.
    my $flush = sub {
        return if $last_code == -1;
        $comp->add($last_code,
                   ($last_offset | ((@comp_data - $last_offset) << 16)));
    };

    for my $r (sort { $a->[1] <=> $b->[1] || $a->[2] <=> $b->[2] } @comp_cand) {
        if ($r->[1] != $last_code) {
            $flush->();
            $last_code   = $r->[1];
            $last_offset = @comp_data;
        }
        push @comp_data, $r;
    }
    $flush->();
}

#
# casemap_data -- flatten case mapping entries into a sequence.
#
# Each argument is either a plain code point (simple mapping) or a pair
# [sequence (array ref), condition string] (special mapping).  For every
# entry the result holds one flag word followed by the mapped code
# points, the last of which has END_BIT set.  The flag word carries the
# FINAL/NON_FINAL condition bit, and CASEMAP_LAST_BIT on the last entry.
# Dies on any other non-empty condition.
#
sub casemap_data {
    my @data = @_;
    my @result = ();
    while (@data > 0) {
        my $r = shift @data;
        my $flag = 0;
        if (ref $r) {
            my $cond = $r->[1];
            if ($cond eq 'FINAL') {
                $flag |= $CASEMAP_FINAL_BIT;
            }
            elsif ($cond eq 'NON_FINAL') {
                $flag |= $CASEMAP_NONFINAL_BIT;
            }
            elsif ($cond ne '') {
                die "unknown condition \"", $cond, "\"\n";
            }
        }
        $flag |= $CASEMAP_LAST_BIT if @data == 0;
        push @result, $flag;
        push @result, (ref $r) ? @{$r->[0]} : $r;
        $result[-1] |= END_BIT;
    }
    @result;
}

#
# read_casefolding_file -- read CaseFolding.txt
#
# Adds (code, offset into @folding_data) to the folding table and
# appends each mapping to @folding_data, marking its last element with
# END_BIT.  Dies if the file cannot be opened.
#
sub read_casefolding_file {
    open my $fh, '<', $casefoldingfile
        or die "cannot open $casefoldingfile: $!\n";

    # dummy.
    @folding_data = (0);

    while (my $line = UCD::CaseFolding::getline($fh)) {
        my %data = UCD::CaseFolding::parseline($line);

        $folding->add($data{CODE}, scalar(@folding_data));
        push @folding_data, @{$data{MAP}};
        $folding_data[-1] |= END_BIT;
    }
    close $fh;
}

#
# print_header -- print the leading comment of the generated file
#
sub print_header {
    print <<"END";
/* \$Id\$ */
/* $myid */
/*
 * Do not edit this file!
 * This file is generated from UnicodeData.txt, CompositionExclusions-1.txt,
 * SpecialCasing.txt and CaseFolding.txt.
 */

END
}

#
# print_canon_class -- generate data for canonical class
#
sub print_canon_class {
    $canon_class->fix();
    print STDERR "** canon_class\n", $canon_class->stat() if $verbose;

    print <<"END";

/*
 * Canonical Class
 */

END
    print_bits("CANON_CLASS", @canon_class_bits);
    print "\n";
    print $canon_class->cprog(NAME => "${prefix}canon_class");
}

#
# print_composition -- generate data for canonical composition
#
# Besides the lookup table, this prints the composition sequence array.
# Each element is the pair { second character, composed character },
# two elements per output line.
#
sub print_composition {
    $comp->fix();
    print STDERR "** composition\n", $comp->stat() if $verbose;

    print <<"END";

/*
 * Canonical Composition
 */

END
    print_bits("CANON_COMPOSE", @comp_bits);
    print "\n";
    print $comp->cprog(NAME => "${prefix}compose");
    print <<"END";

static const struct composition ${prefix}compose_seq[] = {
END
    my $i = 0;
    foreach my $r (@comp_data) {
        if ($i % 2 == 0) {
            print "\n" if $i != 0;
            print "\t";
        }
        printf "{ 0x%08x, 0x%08x }, ", $r->[2], $r->[0];
        $i++;
    }
    print "\n};\n\n";
}

#
# print_decomposition -- generate data for canonical/compatibility
#	decomposition
#
sub print_decomposition {
    $decomp->fix();
    print STDERR "** decomposition\n", $decomp->stat() if $verbose;

    print <<"END";

/*
 * Canonical/Compatibility Decomposition
 */

END
    print_bits("DECOMP", @decomp_bits);
    print "#define DECOMP_COMPAT\t$DECOMP_COMPAT_BIT\n\n";

    print $decomp->cprog(NAME => "${prefix}decompose");

    print "static const unsigned long ${prefix}decompose_seq[] = {\n";
    print_ulseq(@decomp_data);
    print "};\n\n";
}

#
# print_casemap -- generate data for case mapping
#
sub print_casemap {
    $upper->fix();
    $lower->fix();
    print STDERR "** upper mapping\n", $upper->stat() if $verbose;
    print STDERR "** lower mapping\n", $lower->stat() if $verbose;

    print <<"END";

/*
 * Lowercase <-> Uppercase mapping
 */

/*
 * Flags for special case mapping.
 */
#define CMF_FINAL $CASEMAP_FINAL_BIT
#define CMF_NONFINAL $CASEMAP_NONFINAL_BIT
#define CMF_LAST $CASEMAP_LAST_BIT
#define CMF_CTXDEP (CMF_FINAL|CMF_NONFINAL)

END
    print_bits("CASEMAP", @casemap_bits);
    print "\n";
    print $upper->cprog(NAME => "${prefix}toupper");
    print $lower->cprog(NAME => "${prefix}tolower");

    print "static const unsigned long ${prefix}toupper_seq[] = {\n";
    print_ulseq(@toupper_data);
    print "};\n\n";

    print "static const unsigned long ${prefix}tolower_seq[] = {\n";
    print_ulseq(@tolower_data);
    print "};\n\n";
}

#
# print_casefolding -- generate data for case folding
#
sub print_casefolding {
    $folding->fix();
    print STDERR "** case folding\n", $folding->stat() if $verbose;

    print <<"END";

/*
 * Case Folding
 */

END
    print_bits("CASE_FOLDING", @folding_bits);
    print "\n";
    print $folding->cprog(NAME => "${prefix}case_folding");

    print "static const unsigned long ${prefix}case_folding_seq[] = {\n";
    print_ulseq(@folding_data);
    print "};\n\n";
}

#
# print_casemap_context -- generate data for determining context
#	(final/non-final)
#
sub print_casemap_context {
    $casemap_ctx->fix();
    print STDERR "** casemap context\n", $casemap_ctx->stat() if $verbose;

    print <<"END";

/*
 * Cased characters and non-spacing marks (for casemap context)
 */

END

    print_bits("CASEMAP_CTX", @casemap_ctx_bits);
    print <<"END";

#define CTX_CASED $LETTER_BIT
#define CTX_NSM $NSPMARK_BIT

END
    print $casemap_ctx->cprog(NAME => "${prefix}casemap_ctx");
}

#
# sprint_composition_hash -- format a list of three-element array refs
#	as C initializers "{0x...., 0x...., 0x....}, ", two per line,
#	and return the resulting string.
#
sub sprint_composition_hash {
    my $i = 0;
    my $s = '';
    foreach my $r (@_) {
        if ($i % 2 == 0) {
            $s .= "\n" if $i != 0;
            $s .= "\t";
        }
        $s .= sprintf "{0x%04x, 0x%04x, 0x%04x}, ", @{$r};
        $i++;
    }
    $s;
}

#
# print_bits -- print "#define <PREFIX>_BITS_<n>" for each given bit
#	width, numbering them from 0.
#
sub print_bits {
    my $prefix = shift;    # macro name prefix; shadows the file-level $prefix
    my $i = 0;
    foreach my $bit (@_) {
        print "#define ${prefix}_BITS_$i\t$bit\n";
        $i++;
    }
}

#
# print_ulseq -- print the arguments as 8-digit hexadecimal constants,
#	four per line, each line starting with a tab.
#
sub print_ulseq {
    my $i = 0;
    foreach my $v (@_) {
        if ($i % 4 == 0) {
            print "\n" if $i != 0;
            print "\t";
        }
        printf "0x%08x, ", $v;
        $i++;
    }
    print "\n";
}