summaryrefslogtreecommitdiffstats
path: root/contrib/idn/idnkit-1.0-src/util/generate_normalize_data.pl
diff options
context:
space:
mode:
Diffstat (limited to 'contrib/idn/idnkit-1.0-src/util/generate_normalize_data.pl')
-rwxr-xr-xcontrib/idn/idnkit-1.0-src/util/generate_normalize_data.pl586
1 files changed, 586 insertions, 0 deletions
diff --git a/contrib/idn/idnkit-1.0-src/util/generate_normalize_data.pl b/contrib/idn/idnkit-1.0-src/util/generate_normalize_data.pl
new file mode 100755
index 0000000..fe81648
--- /dev/null
+++ b/contrib/idn/idnkit-1.0-src/util/generate_normalize_data.pl
@@ -0,0 +1,586 @@
+#! /usr/local/bin/perl -w
+# $Id: generate_normalize_data.pl,v 1.1.1.1 2003/06/04 00:27:55 marka Exp $
+#
+# Copyright (c) 2000,2001 Japan Network Information Center.
+# All rights reserved.
+#
+# By using this file, you agree to the terms and conditions set forth bellow.
+#
+# LICENSE TERMS AND CONDITIONS
+#
+# The following License Terms and Conditions apply, unless a different
+# license is obtained from Japan Network Information Center ("JPNIC"),
+# a Japanese association, Kokusai-Kougyou-Kanda Bldg 6F, 2-3-4 Uchi-Kanda,
+# Chiyoda-ku, Tokyo 101-0047, Japan.
+#
+# 1. Use, Modification and Redistribution (including distribution of any
+# modified or derived work) in source and/or binary forms is permitted
+# under this License Terms and Conditions.
+#
+# 2. Redistribution of source code must retain the copyright notices as they
+# appear in each source code file, this License Terms and Conditions.
+#
+# 3. Redistribution in binary form must reproduce the Copyright Notice,
+# this License Terms and Conditions, in the documentation and/or other
+# materials provided with the distribution. For the purposes of binary
+# distribution the "Copyright Notice" refers to the following language:
+# "Copyright (c) 2000-2002 Japan Network Information Center. All rights reserved."
+#
+# 4. The name of JPNIC may not be used to endorse or promote products
+# derived from this Software without specific prior written approval of
+# JPNIC.
+#
+# 5. Disclaimer/Limitation of Liability: THIS SOFTWARE IS PROVIDED BY JPNIC
+# "AS IS" AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
+# LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A
+# PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL JPNIC BE LIABLE
+# FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+# CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+# SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR
+# BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY,
+# WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR
+# OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF
+# ADVISED OF THE POSSIBILITY OF SUCH DAMAGES.
+#
+
+#
+# Generate the contents of lib/unicodedata.c on standard output from
+# UnicodeData.txt, CompositionExclusions.txt, SpecialCasing.txt and
+# CaseFolding.txt, all available from ftp://ftp.unicode.org/Public/UNIDATA/.
+#
+
+use strict;
+use lib qw(.);
+
+use Getopt::Long;
+use UCD;
+use SparseMap;
+
+use constant UCS_MAX => 0x110000;   # MAX argument of every SparseMap below
+use constant END_BIT => 0x80000000; # ORed into the last word of each sequence
+
+my $DECOMP_COMPAT_BIT = 0x8000;     # ORed into the map value of a tagged decomposition
+
+my $CASEMAP_FINAL_BIT = 0x1;        # flag word: condition is FINAL
+my $CASEMAP_NONFINAL_BIT = 0x2;     # flag word: condition is NON_FINAL
+my $CASEMAP_LAST_BIT = 0x10;        # flag word: last mapping of the code point
+
+my $LETTER_BIT = 1;                 # context map value, category matches L[ult]
+my $NSPMARK_BIT = 2;                # context map value, category is Mn
+# $myid: this script's Id keyword with its '$' delimiters rewritten as '$-' and '-$'; printed by print_header().
+(my $myid = '$Id: generate_normalize_data.pl,v 1.1.1.1 2003/06/04 00:27:55 marka Exp $') =~ s/\$([^\$]+)\$/\$-$1-\$/;
+
+my @default_bits = (9, 7, 5);       # BITS argument for SparseMap::Int->new (presumably per-level index widths -- confirm in SparseMap.pm)
+#my @default_bits = (7, 7, 7);
+my @canon_class_bits = @default_bits;
+my @decomp_bits = @default_bits;
+my @comp_bits = @default_bits;
+my @folding_bits = @default_bits;
+my @casemap_bits = @default_bits;
+my @casemap_ctx_bits = @default_bits;
+
+my ($prefix, $dir, $verbose) = ('', '.');
+my $unicodedatafile = 'UnicodeData.txt';
+my $exclusionfile = 'CompositionExclusions.txt';
+my $specialcasefile = 'SpecialCasing.txt';
+my $casefoldingfile = 'CaseFolding.txt';
+
+GetOptions(
+    'dir|d=s'         => \$dir,
+    'unicodedata|u=s' => \$unicodedatafile,
+    'exclude|e=s'     => \$exclusionfile,
+    'specialcase|s=s' => \$specialcasefile,
+    'casefold|c=s'    => \$casefoldingfile,
+    'prefix|p=s'      => \$prefix,
+    'verbose|v'       => \$verbose,
+) or usage();
+
+# Names not starting with '/' are looked up under $dir.
+for my $name_ref (\$unicodedatafile, \$exclusionfile,
+                  \$specialcasefile, \$casefoldingfile) {
+    $$name_ref = "$dir/$$name_ref" if $$name_ref !~ m|^/|;
+}
+
+# Code points listed in the exclusion file; they are kept out of the
+# canonical composition candidates.
+my %exclusions;
+
+# Special case mappings: code point => [mapping, condition].
+my %lower_special;
+my %upper_special;
+
+# Flat word sequences; the maps created below store offsets into them.
+my @decomp_data;
+my @comp_data;
+my @toupper_data;
+my @tolower_data;
+my @folding_data;
+
+#
+# Create Mapping/Bitmap objects.
+#
+
+# Every map is built with the same arguments apart from its BITS list,
+# so the constructor call is written once.
+my $new_map = sub {
+    my @bits = @_;
+    SparseMap::Int->new(BITS    => [@bits],
+                        MAX     => UCS_MAX,
+                        MAPALL  => 1,
+                        DEFAULT => 0);
+};
+
+# canonical class: code point => combining class, stored only when
+# the class is greater than zero
+my $canon_class = $new_map->(@canon_class_bits);
+
+# canonical/compatibility decomposition: code point => offset into
+# @decomp_data, with $DECOMP_COMPAT_BIT set for a tagged decomposition
+my $decomp = $new_map->(@decomp_bits);
+
+# canonical composition: first character of a decomposition =>
+# offset into @comp_data | (number of entries << 16)
+my $comp = $new_map->(@comp_bits);
+
+# uppercase/lowercase: code point => offset into @toupper_data or
+# @tolower_data
+my $upper = $new_map->(@casemap_bits);
+my $lower = $new_map->(@casemap_bits);
+
+# final/nonfinal context: code point => $LETTER_BIT (category matches
+# L[ult]) or $NSPMARK_BIT (category Mn)
+my $casemap_ctx = $new_map->(@casemap_ctx_bits);
+
+# casefolding: code point => offset of its folded sequence in
+# @folding_data
+my $folding = $new_map->(@folding_bits);
+
+# Read datafiles.  The exclusion and special-casing tables come first
+# because read_unicodedata_file() consults %exclusions, %upper_special
+# and %lower_special.  The print_* subs then emit the generated text.
+
+read_exclusion_file();
+read_specialcasing_file();
+read_unicodedata_file();
+read_casefolding_file();
+
+print_header();
+print_canon_class();
+print_composition();
+print_decomposition();
+print_casemap();
+print_casemap_context();
+print_casefolding();
+
+exit;
+sub usage {    # print the option summary on STDERR, then exit with status 1
+    print STDERR <<"END";
+Usage: $0 [options..]
+ options:
+ -d DIR directory where Unicode Character Data files reside [./]
+ -u FILE name of the UnicodeData file [UnicodeData.txt]
+ -e FILE name of the CompositionExclusion file [CompositionExclusions.txt]
+ -s FILE name of the SpecialCasing file [SpecialCasing.txt]
+ -c FILE name of the CaseFolding file [CaseFolding.txt]
+ -p STR prefix for the names of the generated C tables []
+ -v print each map's stat() report on standard error
+END
+    exit 1;
+}
+# read_exclusion_file -- read the composition exclusion list from
+# $exclusionfile and record every listed code point as a key of
+# %exclusions.  The file is opened read-only; dies if that fails.
+sub read_exclusion_file {
+    open my $fh, '<', $exclusionfile or die "cannot open $exclusionfile: $!\n";
+    while (my $line = UCD::CompositionExclusions::getline($fh)) {
+        my %data = UCD::CompositionExclusions::parseline($line);
+        $exclusions{$data{CODE}} = 1;
+    }
+    close $fh;
+}
+
+# read_specialcasing_file -- read SpecialCasing.txt ($specialcasefile).
+# Entries with no condition, or a condition starting with FINAL or
+# NON_FINAL, are stored in %lower_special / %upper_special as
+# [mapping, condition] unless the mapping is the code point itself.
+sub read_specialcasing_file {
+    open my $fh, '<', $specialcasefile
+        or die "cannot open $specialcasefile: $!\n";
+    # Destination of a qualifying mapping, per parseline() field.
+    my %special_for = (LOWER => \%lower_special, UPPER => \%upper_special);
+    while (my $line = UCD::SpecialCasing::getline($fh)) {
+        my %data = UCD::SpecialCasing::parseline($line);
+        my $code = $data{CODE};
+        my $cond = $data{CONDITION} || '';
+
+        next unless $cond eq '' or $cond =~ /^(NON_)?FINAL/;
+
+        foreach my $case (qw(LOWER UPPER)) {
+            my $mapping = $data{$case};
+            # A single-element mapping onto itself says nothing new.
+            next unless @$mapping > 1 or $mapping->[0] != $code;
+            $special_for{$case}{$code} = [$mapping, $cond];
+        }
+    }
+    close $fh;
+}
+
+# read_unicodedata_file -- read UnicodeData.txt ($unicodedatafile) into
+# the canonical class, case mapping, decomposition, composition and
+# context tables.  Run the exclusion and special-casing readers first.
+# Dies if the file cannot be opened or a decomposition offset is too big.
+sub read_unicodedata_file {
+    open my $fh, '<', $unicodedatafile
+        or die "cannot open $unicodedatafile: $!\n";
+
+    # Index 0 holds a dummy word so that no real entry gets offset 0,
+    # which is the DEFAULT value of the maps.
+    @decomp_data = (0);
+    @toupper_data = (0);
+    @tolower_data = (0);
+
+    my @comp_cand;      # canonical composition candidates
+    my %nonstarter;     # code points whose combining class is > 0
+
+    while (my $line = UCD::UnicodeData::getline($fh)) {
+        my %data = UCD::UnicodeData::parseline($line);
+        my $code = $data{CODE};
+
+        # combining class
+        if ($data{CLASS} > 0) {
+            $nonstarter{$code} = 1;
+            $canon_class->add($code, $data{CLASS});
+        }
+
+        # uppercasing: a special-casing entry, when present, is emitted
+        # ahead of the simple mapping taken from this file.
+        if (exists $upper_special{$code} or defined $data{UPPER}) {
+            my @casedata;
+            $upper->add($code, scalar @toupper_data);
+            push @casedata, $upper_special{$code} if exists $upper_special{$code};
+            push @casedata, $data{UPPER} if defined $data{UPPER};
+            push @toupper_data, casemap_data(@casedata);
+        }
+
+        # lowercasing, laid out the same way
+        if (exists $lower_special{$code} or defined $data{LOWER}) {
+            my @casedata;
+            $lower->add($code, scalar @tolower_data);
+            push @casedata, $lower_special{$code} if exists $lower_special{$code};
+            push @casedata, $data{LOWER} if defined $data{LOWER};
+            push @tolower_data, casemap_data(@casedata);
+        }
+
+        # composition/decomposition
+        if ($data{DECOMP}) {
+            my ($tag, @decomp) = @{$data{DECOMP}};
+            my $offset = @decomp_data;
+
+            # The map value carries the offset and the compatibility flag
+            # together, so the offset itself must not reach the flag bit.
+            die "decomposition offset $offset collides with DECOMP_COMPAT\n"
+                if $offset & $DECOMP_COMPAT_BIT;
+
+            if ($tag eq '' and @decomp > 1 and not exists $exclusions{$code}) {
+                # canonical composition candidate
+                push @comp_cand, [$code, @decomp];
+            }
+
+            # decomposition
+            if ($tag ne '') {
+                # compatibility decomposition
+                $offset |= $DECOMP_COMPAT_BIT;
+            }
+            $decomp->add($code, $offset);
+            push @decomp_data, @decomp;
+            $decomp_data[-1] |= END_BIT;
+        }
+
+        # final/nonfinal context
+        if ($data{CATEGORY} =~ /L[ult]/) {
+            $casemap_ctx->add($code, $LETTER_BIT);
+        } elsif ($data{CATEGORY} eq 'Mn') {
+            $casemap_ctx->add($code, $NSPMARK_BIT);
+        }
+    }
+    close $fh;
+
+    # Eliminate composition candidates whose decomposition starts with
+    # a non-starter.
+    @comp_cand = grep {not exists $nonstarter{$_->[1]}} @comp_cand;
+
+    # Sort so that candidates sharing a first character are adjacent; each
+    # run is registered as (offset into @comp_data) | (run length << 16).
+    @comp_data = ([0, 0, 0]);
+    my $last_code = -1;
+    my $last_offset = @comp_data;
+    for my $r (sort {$a->[1] <=> $b->[1] || $a->[2] <=> $b->[2]} @comp_cand) {
+        if ($r->[1] != $last_code) {
+            $comp->add($last_code,
+                       ($last_offset | ((@comp_data - $last_offset)<<16)))
+                unless $last_code == -1;
+            $last_code = $r->[1];
+            $last_offset = @comp_data;
+        }
+        push @comp_data, $r;
+    }
+    # Register the final run; with no candidates there is nothing to add.
+    $comp->add($last_code,
+               ($last_offset | ((@comp_data - $last_offset)<<16)))
+        unless $last_code == -1;
+}
+
+# casemap_data ENTRY... -- flatten case mapping entries into words.
+# An ENTRY is either a plain code point or a [mapping, condition]
+# pair.  Each becomes a flag word followed by its mapping; the last
+# ENTRY's flag word also has $CASEMAP_LAST_BIT set.
+sub casemap_data {
+    my %flag_for = (''        => 0,
+                    FINAL     => $CASEMAP_FINAL_BIT,
+                    NON_FINAL => $CASEMAP_NONFINAL_BIT);
+    my @words;
+    foreach my $idx (0 .. $#_) {
+        my $entry = $_[$idx];
+        my ($mapping, $cond) = ref($entry) ? @$entry : ([$entry], '');
+        exists $flag_for{$cond}
+            or die "unknown condition \"", $cond, "\"\n";
+        my $flag = $flag_for{$cond};
+        $flag |= $CASEMAP_LAST_BIT if $idx == $#_;
+        # END_BIT closes the run on whichever word was pushed last.
+        push @words, $flag, @$mapping;
+        $words[-1] |= END_BIT;
+    }
+    @words;
+}
+
+# read_casefolding_file -- read CaseFolding.txt ($casefoldingfile).
+# Maps every code point to the offset of its folded sequence in
+# @folding_data; the last word of a sequence carries END_BIT.
+sub read_casefolding_file {
+    open my $fh, '<', $casefoldingfile or die "cannot open $casefoldingfile: $!\n";
+
+    # dummy.
+    @folding_data = (0);
+
+    while (my $line = UCD::CaseFolding::getline($fh)) {
+        my %data = UCD::CaseFolding::parseline($line);
+
+        $folding->add($data{CODE}, scalar(@folding_data));
+        push @folding_data, @{$data{MAP}};
+        $folding_data[-1] |= END_BIT;
+    }
+    close $fh;
+}
+
+# print_header -- print the opening lines of the generated file: an
+# Id keyword, this script's own id ($myid) and a do-not-edit notice.
+sub print_header {
+    print "/* \$Id\$ */\n",
+          "/* $myid */\n",
+          "/*\n",
+          " * Do not edit this file!\n",
+          " * This file is generated from UnicodeData.txt, CompositionExclusions-1.txt,\n",
+          " * SpecialCasing.txt and CaseFolding.txt.\n",
+          " */\n",
+          "\n";
+}
+
+# print_canon_class -- print the canonical class section: a banner
+# comment, the CANON_CLASS_BITS_* defines and the map's cprog() output
+# (NAME "${prefix}canon_class").  With -v, stat() goes to STDERR.
+sub print_canon_class {
+    $canon_class->fix();
+    print STDERR "** canon_class\n", $canon_class->stat() if $verbose;
+
+    print <<"END";
+
+/*
+ * Canonical Class
+ */
+
+END
+    print_bits("CANON_CLASS", @canon_class_bits);
+    print "\n";
+    print $canon_class->cprog(NAME => "${prefix}canon_class");
+}
+
+# print_composition -- print the canonical composition section: a
+# banner comment, the CANON_COMPOSE_BITS_* defines, the cprog() output
+# of the map, and the "${prefix}compose_seq" array.  Each @comp_data
+# entry [composite, first, second] is written as { second, composite }.
+sub print_composition {
+    $comp->fix();
+    print STDERR "** composition\n", $comp->stat() if $verbose;
+
+    print <<"END";
+
+/*
+ * Canonical Composition
+ */
+
+END
+    print_bits("CANON_COMPOSE", @comp_bits);
+    print "\n";
+    print $comp->cprog(NAME => "${prefix}compose");
+    print <<"END";
+
+static const struct composition ${prefix}compose_seq[] = {
+END
+
+    for my $idx (0 .. $#comp_data) {
+        my $entry = $comp_data[$idx];
+        my $row_start = $idx % 2 == 0;   # two pairs per output line
+        print "\n" if $row_start and $idx > 0;
+        print "\t" if $row_start;
+        printf "{ 0x%08x, 0x%08x }, ", $entry->[2], $entry->[0];
+    }
+    print "\n};\n\n";
+}
+
+# print_decomposition -- print the decomposition section: a banner
+# comment, the DECOMP_BITS_* and DECOMP_COMPAT defines, the cprog()
+# output of the map, and @decomp_data as the array
+# "${prefix}decompose_seq".
+sub print_decomposition {
+    $decomp->fix();
+    if ($verbose) {
+        print STDERR "** decomposition\n", $decomp->stat();
+    }
+
+    print "\n",
+          "/*\n",
+          " * Canonical/Compatibility Decomposition\n",
+          " */\n",
+          "\n";
+    print_bits("DECOMP", @decomp_bits);
+    print "#define DECOMP_COMPAT\t$DECOMP_COMPAT_BIT\n\n";
+
+    print $decomp->cprog(NAME => "${prefix}decompose");
+
+    print "static const unsigned long ${prefix}decompose_seq[] = {\n";
+    print_ulseq(@decomp_data);
+    print "};\n\n";
+}
+
+# print_casemap -- print the case mapping section: the CMF_* flag
+# and CASEMAP_BITS_* defines, the cprog() output of the upper and
+# lower maps, and @toupper_data / @tolower_data as C arrays.
+sub print_casemap {
+    $upper->fix();
+    $lower->fix();
+    if ($verbose) {
+        print STDERR "** upper mapping\n", $upper->stat();
+        print STDERR "** lower mapping\n", $lower->stat();
+    }
+    print <<"END";
+
+/*
+ * Lowercase <-> Uppercase mapping
+ */
+
+/*
+ * Flags for special case mapping.
+ */
+#define CMF_FINAL $CASEMAP_FINAL_BIT
+#define CMF_NONFINAL $CASEMAP_NONFINAL_BIT
+#define CMF_LAST $CASEMAP_LAST_BIT
+#define CMF_CTXDEP (CMF_FINAL|CMF_NONFINAL)
+
+END
+    print_bits("CASEMAP", @casemap_bits);
+    print "\n";
+    print $upper->cprog(NAME => "${prefix}toupper");
+    print $lower->cprog(NAME => "${prefix}tolower");
+
+    print "static const unsigned long ${prefix}toupper_seq[] = {\n";
+    print_ulseq(@toupper_data);
+    print "};\n\n";
+    print "static const unsigned long ${prefix}tolower_seq[] = {\n";
+    print_ulseq(@tolower_data);
+    print "};\n\n";
+}
+
+# print_casefolding -- print the case folding section: a banner
+# comment, the CASE_FOLDING_BITS_* defines, the cprog() output of the
+# map, and @folding_data as the array "${prefix}case_folding_seq".
+sub print_casefolding {
+    $folding->fix();
+    if ($verbose) {
+        print STDERR "** case folding\n", $folding->stat();
+    }
+
+    print "\n",
+          "/*\n",
+          " * Case Folding\n",
+          " */\n",
+          "\n";
+    print_bits("CASE_FOLDING", @folding_bits);
+    print "\n";
+    print $folding->cprog(NAME => "${prefix}case_folding");
+
+    print "static const unsigned long ${prefix}case_folding_seq[] = {\n";
+    print_ulseq(@folding_data);
+    print "};\n\n";
+}
+
+# print_casemap_context -- generate the data for determining the
+# final/non-final context: a banner comment, the CASEMAP_CTX_BITS_*
+# defines, the CTX_CASED and CTX_NSM defines, and the cprog() output
+# of the context map.
+sub print_casemap_context {
+    $casemap_ctx->fix();
+    if ($verbose) {
+        print STDERR "** casemap context\n", $casemap_ctx->stat();
+    }
+
+    print "\n",
+          "/*\n",
+          " * Cased characters and non-spacing marks (for casemap context)\n",
+          " */\n",
+          "\n";
+
+    print_bits("CASEMAP_CTX", @casemap_ctx_bits);
+    # The two values a context map entry can take, framed by one
+    # empty line on either side.
+    print "\n",
+          "#define CTX_CASED $LETTER_BIT\n",
+          "#define CTX_NSM $NSPMARK_BIT\n",
+          "\n";
+    print $casemap_ctx->cprog(NAME => "${prefix}casemap_ctx");
+}
+
+# sprint_composition_hash TRIPLE... -- return a string holding every
+# TRIPLE (an array ref) formatted as "{0x%04x, 0x%04x, 0x%04x}, ",
+# two per tab-indented line, without a trailing newline.
+# Nothing in this file calls it.
+sub sprint_composition_hash {
+    my @items = map { sprintf "{0x%04x, 0x%04x, 0x%04x}, ", @{$_} } @_;
+    my @rows;
+    # Take the items pairwise; each pair forms one row.
+    while (@items) {
+        push @rows, "\t" . join('', splice(@items, 0, 2));
+    }
+    return join("\n", @rows);
+}
+
+# print_bits NAME, WIDTH... -- print "#define NAME_BITS_<n>\t<WIDTH>"
+# for each WIDTH, numbering the lines from 0 in argument order.
+sub print_bits {
+    my ($name, @widths) = @_;
+    for my $n (0 .. $#widths) {
+        print "#define ${name}_BITS_$n\t$widths[$n]\n";
+    }
+}
+
+# print_ulseq VALUE... -- print the VALUEs as C initializer items
+# ("0x%08x, "), four per tab-indented line, then a final newline.
+sub print_ulseq {
+    for my $pos (0 .. $#_) {
+        my $row_start = $pos % 4 == 0;
+        # Every row but the first must terminate the previous one.
+        print "\n" if $row_start and $pos > 0;
+        print "\t" if $row_start;
+        printf "0x%08x, ", $_[$pos];
+    }
+    print "\n";
+}