#!/usr/bin/perl
#
# Generate a subset of the UnicodeData.txt file, available from
# ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
#
# Usage:
#   gensubset.pl [subset files] < UnicodeData.txt > MiniUCD.txt
#
# Each subset file is scanned line by line.  After "#" comments and
# trailing blanks are removed, a line that consists of exactly two
# whitespace-separated fields contributes its second field, read as a
# hexadecimal number, to the set of code points to keep.  All other
# lines are ignored.
#
# Standard input is read completely into memory before processing, so
# it may be a pipe as well as a regular file.
#

use strict;
use warnings;

# Indices of the three case-mapping fields (uppercase, lowercase,
# titlecase) in a semicolon-separated UnicodeData.txt record.
use constant CASE_FIELDS => (12, 13, 14);

# Mark as needed all the characters mentioned in the given subset files.
#   $need  - hash ref, code point (number) => true if needed; updated
#   @files - names of the subset files to read
# Dies with the file name and OS error if a file cannot be opened.
sub _mark_subset_files {
    my ($need, @files) = @_;

    for my $file (@files) {
        open(my $fh, '<', $file)
            or die "$0: cannot open $file: $!\n";
        while (defined(my $line = <$fh>)) {
            $line =~ s/\s*(\#.*|)$//;    # Remove comments and final blanks
            my @f = split(/\s+/, $line);
            next if (scalar @f != 2);
            $need->{hex $f[1]}++;
        }
        close($fh);
    }
    return;
}

# Also mark as needed any case variants of the already-needed characters.
# (Note: this doesn't necessarily provide the full transitive closure,
# but we shouldn't need it.)
#   $need  - hash ref, code point => true if needed; updated in place
#   $lines - array ref of raw UnicodeData.txt lines (left unmodified)
sub _mark_case_variants {
    my ($need, $lines) = @_;

    for my $line (@$lines) {
        # Work on a copy with the line terminator removed.  Without this
        # the last field of a record with an empty titlecase mapping is
        # "\n", which is not the empty string and hex()es to 0, so
        # U+0000 would be marked as needed by mistake.
        (my $record = $line) =~ s/\r?\n\z//;

        # Limit -1 keeps trailing empty fields so indices stay stable.
        my @f = split(/;/, $record, -1);
        next unless @f && $f[0] =~ /^[0-9a-f]+$/i;
        next unless $need->{hex $f[0]};

        for my $idx (CASE_FIELDS) {
            my $variant = $f[$idx];
            # Only a well-formed hex field names a case variant.
            next unless defined $variant && $variant =~ /\A[0-9a-f]+\z/i;
            $need->{hex $variant}++;
        }
    }
    return;
}

# Build the subset: one output record per needed code point.
#   $need  - hash ref, code point => true if needed
#   $lines - array ref of raw UnicodeData.txt lines
# Returns the list of output records ("%04X;" followed by the rest of
# the input line, including its original line terminator).
sub _subset_records {
    my ($need, $lines) = @_;
    my @out;

    for my $line (@$lines) {
        my ($v, $rest) = split(/;/, $line, 2);
        next unless defined $v;

        my ($first, $last);
        if ($v =~ /^([0-9a-f]+)\-([0-9a-f]+)$/i) {
            # This isn't actually the format... fix that if it ever matters
            ($first, $last) = (hex $1, hex $2);
        } elsif ($v =~ /^([0-9a-f]+)$/i) {
            $first = $last = hex $1;
        } else {
            next;
        }

        $rest = '' unless defined $rest;    # line had no ';' at all
        for my $cp ($first .. $last) {
            push @out, sprintf("%04X;%s", $cp, $rest) if ($need->{$cp});
        }
    }
    return @out;
}

# Script entry point: subset files come from @ARGV, UnicodeData.txt from
# standard input, and the subset goes to standard output.
sub main {
    my %need_these = ();

    _mark_subset_files(\%need_these, @ARGV);

    # Both passes run over this in-memory copy, so there is no need to
    # seek() on STDIN (which fails silently when STDIN is a pipe).
    my @ucd_lines = <STDIN>;

    _mark_case_variants(\%need_these, \@ucd_lines);

    # Finally, write out the subset
    print _subset_records(\%need_these, \@ucd_lines);
    return;
}

main() unless caller;

1;