#!/usr/bin/perl # # Produce a codepage matching table. For each 8-bit character, list # a primary and an alternate match (the latter used for case-insensitive # matching.) # # Usage: # cptable.pl UnicodeData console-cp.txt filesystem-cp.txt output.cp # # Note: for the format of the UnicodeData file, see: # http://www.unicode.org/Public/UNIDATA/UCD.html # ($ucd, $cpco, $cpfs, $cpout) = @ARGV; if (!defined($cpout)) { die "Usage: $0 UnicodeData console-cp.txt fs-cp.txt output.cp\n"; } %ucase = (); %lcase = (); %tcase = (); %decomp = (); open(UCD, '<', $ucd) or die "$0: could not open unicode data: $ucd: $!\n"; while (defined($line = <UCD>)) { chomp $line; @f = split(/;/, $line); $n = hex $f[0]; $ucase{$n} = ($f[12] ne '') ? hex $f[12] : $n; $lcase{$n} = ($f[13] ne '') ? hex $f[13] : $n; $tcase{$n} = ($f[14] ne '') ? hex $f[14] : $n; if ($f[5] =~ /^[0-9A-F\s]+$/) { # This character has a canonical decomposition. # The regular expression rejects angle brackets, so other # decompositions aren't permitted. $decomp{$n} = []; foreach my $dch (split(' ', $f[5])) { push(@{$decomp{$n}}, hex $dch); } } } close(UCD); # # Filesystem and console codepages. The filesystem codepage is used # for FAT shortnames, whereas the console codepage is whatever is used # on the screen and keyboard. # @xtab = (undef) x 256; %tabx = (); open(CPFS, '<', $cpfs) or die "$0: could not open fs codepage: $cpfs: $!\n"; while (defined($line = <CPFS>)) { $line =~ s/\s*(\#.*|)$//; @f = split(/\s+/, $line); next if (scalar @f != 2); next if (hex $f[0] > 255); $xtab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode $tabx{hex $f[1]} = hex $f[0]; # Unicode -> Codepage } close(CPFS); @ytab = (undef) x 256; %taby = (); open(CPCO, '<', $cpco) or die "$0: could not open console codepage: $cpco: $!\n"; while (defined($line = <CPCO>)) { $line =~ s/\s*(\#.*|)$//; @f = split(/\s+/, $line); next if (scalar @f != 2); next if (hex $f[0] > 255); $ytab[hex $f[0]] = hex $f[1]; # Codepage -> Unicode $taby{hex $f[1]} = hex $f[0]; # Unicode -> Codepage } close(CPCO); open(CPOUT, '>', $cpout) or die "$0: could not open output file: $cpout: $!\n"; # # Magic number, in anticipation of being able to load these # files dynamically... # print CPOUT pack("VV", 0x58a8b3d4, 0x51d21eb1); # Header fields available for future use... print CPOUT pack("VVVVVV", 0, 0, 0, 0, 0, 0); # # Self (shortname) uppercase table. # This depends both on the console codepage and the filesystem codepage; # the logical transcoding operation is: # # $tabx{$ucase{$ytab[$i]}} # # ... where @ytab is console codepage -> Unicode and # %tabx is Unicode -> filesystem codepage. # @uctab = (undef) x 256; for ($i = 0; $i < 256; $i++) { $uuc = $ucase{$ytab[$i]}; # Unicode upper case if (defined($tabx{$uuc})) { # Straight-forward conversion $u = $tabx{$uuc}; } elsif (defined($tabx{${$decomp{$uuc}}[0]})) { # Upper case equivalent stripped of accents $u = $tabx{${$decomp{$uuc}}[0]}; } else { # No equivalent at all found. Assume it is a lower-case-only # character, like greek alpha in CP437. $u = $i; } $uctab[$i] = $u; print CPOUT pack("C", $u); } # # Self (shortname) lowercase table. # This depends both on the console codepage and the filesystem codepage; # the logical transcoding operation is: # # $taby{$lcase{$xtab[$i]}} # # ... where @ytab is console codepage -> Unicode and # %tabx is Unicode -> filesystem codepage. # @lctab = (undef) x 256; for ($i = 0; $i < 256; $i++) { $llc = $lcase{$xtab[$i]}; # Unicode lower case if (defined($l = $taby{$llc}) && $uctab[$l] == $i) { # Straight-forward conversion } elsif (defined($l = $tabx{${$decomp{$llc}}[0]}) && $uctab[$l] == $i) { # Lower case equivalent stripped of accents } else { # No equivalent at all found. Find *anything* that matches the # bijection criterion... for ($l = 0; $l < 256; $l++) { last if ($uctab[$l] == $i); } $l = $i if ($l == 256); # If nothing, we're screwed anyway... } $lctab[$i] = $l; print CPOUT pack("C", $l); } # # Unicode (longname) matching table. # This only depends on the console codepage. # $pp0 = ''; $pp1 = ''; for ($i = 0; $i < 256; $i++) { if (!defined($ytab[$i])) { $p0 = $p1 = 0xffff; } else { $p0 = $ytab[$i]; if ($ucase{$p0} != $p0) { $p1 = $ucase{$p0}; } elsif ($lcase{$p0} != $p0) { $p1 = $lcase{$p0}; } elsif ($tcase{$p0} != $p0) { $p1 = $tcase{$p0}; } else { $p1 = $p0; } } # Only the BMP is supported... $p0 = 0xffff if ($p0 > 0xffff); $p1 = 0xffff if ($p1 > 0xffff); $pp0 .= pack("v", $p0); $pp1 .= pack("v", $p1); } print CPOUT $pp0, $pp1; close (CPOUT);