summaryrefslogtreecommitdiffstats
path: root/utils/uniqkanji.pl
blob: fe8391290a1c54f6862ee0a7adb83d58e4451153 (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
#!/usr/bin/perl

# Main script body.
#
# Reads every input line (files named on the command line, or STDIN when
# none are given), records each distinct double-byte character through
# entry_kanji(), and finally writes the distinct characters to STDOUT, one
# per line, ordered by numeric code.  Progress and the summary go to STDERR
# so that STDOUT carries only the character list.

print STDERR "Scanning message and help files and surveys which multibyte chars are used...\n";

# entry_kanji() only ever increments $count, so start it at zero; otherwise
# input without any multibyte character would print an empty count.
$count = 0;

while (defined(my $input_line = <>)) {
    entry_kanji($input_line);
}

print STDERR "\n$count unique chars\n";

# NOTE(review): these settings are assigned here but not read anywhere in
# the visible script; kept in case something external relies on them.
$fillzero = 0;
$charset = 0;
$jismode = $bigfivemode = 0;

print STDERR "Spit out the list of characters being used in the input.\n";

# Keys are the 16-bit codes (lead byte * 256 + trail byte) stored by
# entry_kanji(); compare them numerically rather than as strings.
foreach my $code (sort { $a <=> $b } keys(%usedkanji))
{
   # Split the code back into its two bytes and emit them verbatim.
   print chr(int($code / 256)) . chr($code % 256) . "\n";
}


# entry_kanji($line)
#
# Scans $line one byte at a time.  A byte in the range 0xa0-0xff is taken
# as the lead byte of a double-byte character and is combined with the
# byte that follows it into a 16-bit code (lead * 256 + trail).  Each code
# seen for the first time is recorded in the global %usedkanji, echoed to
# STDERR as four hex digits, and counted in the global $count.
#
# A lead byte that has no usable trail byte (it is the last byte of the
# string, or is directly followed by a newline) is ignored.  The previous
# implementation consumed characters with s/^(.)// and read $1 afterwards;
# since "." does not match a newline and a failed match leaves $1 holding
# the previous capture, a line ending in a double-byte character made the
# stale capture look like another lead byte and a bogus code was recorded.
sub entry_kanji
{
    my ($line) = @_;
    my $len = length($line);
    my $pos = 0;

    while ($pos < $len) {
        my $lead = ord(substr($line, $pos, 1));
        $pos++;

        # Anything outside 0xa0-0xff is a single-byte character: skip it.
        next unless $lead >= 0xa0 && $lead <= 0xff;

        # Dangling lead byte at the very end of the string.
        last if $pos >= $len;

        # A newline never acts as a trail byte; leave it to be processed
        # as an ordinary single-byte character on the next pass.
        my $trail = substr($line, $pos, 1);
        next if $trail eq "\n";
        $pos++;

        my $kchar = $lead * 256 + ord($trail);
        next if $usedkanji{$kchar};

        $usedkanji{$kchar} = 1;
        printf(STDERR "%04x ", $kchar);
        $count++;
    }

    return;
}