2003-12-13 19:27:53 +01:00
|
|
|
#!/usr/bin/perl
|
|
|
|
|
|
|
|
# Print stats about an input stream that's assumed to be a dictionary.
# Counts and percentages of each letter, as well as total numbers of
# words. This is not part of the dictionary build process. I use it
# for creating info.txt files for new languages and debugging the
# creation of dictionaries from new wordlists.
#
# Something like this might form the basis for choosing counts and
# values for tiles without using the conventions established by
# Scrabble players. This isn't enough, though: the frequency of
# letter pairs and triples -- how often letters appear together -- is
# a better indicator than just letter count.
|
|
|
|
|
|
|
|
use strict;
|
|
|
|
|
|
|
|
# Tallies filled in while the input is read and consumed by the reports
# printed afterwards.
my @wordSizeCounts;     # index: word length; value: words of that length
my %letterCounts;       # key: a single letter; value: times it was seen

# Start the totals at zero so that empty input (or input made only of
# blank lines) reports "0" instead of an empty string.
my $wordCount   = 0;
my $letterCount = 0;

my $enc = "utf8"; # this could be a cmdline arg....

# When an encoding is configured, decode STDIN and encode STDOUT with it.
# A failed binmode (e.g. an unknown encoding name) would otherwise leave
# the handle in byte mode and quietly skew every count, so stop instead.
if ( $enc ) {
    binmode( STDOUT, ":encoding($enc)" )
        or die "$0: cannot set encoding '$enc' on STDOUT: $!\n";
    binmode( STDIN, ":encoding($enc)" )
        or die "$0: cannot set encoding '$enc' on STDIN: $!\n";
}
|
2003-12-13 19:27:53 +01:00
|
|
|
|
|
|
|
# Read one word per line from every input and accumulate the tallies.
#
# Inputs are the files named on the command line, or STDIN when none are
# given (a "-" argument also means STDIN). Each named file is opened
# explicitly so the configured encoding can be applied to it: the layer
# set on STDIN elsewhere in this script does not reach files that the
# magic <> operator would open through ARGV, which made multi-byte
# letters get counted byte by byte.
my @inputs = @ARGV ? @ARGV : ('-');

foreach my $input (@inputs) {
    my $fh;
    if ( $input eq '-' ) {
        # STDIN already carries whatever encoding layer was configured
        # at startup; adding another here would decode it twice.
        $fh = \*STDIN;
    }
    else {
        # Like <>, complain about an unreadable file and move on.
        if ( !open( $fh, '<', $input ) ) {
            warn "$0: can't open $input: $!\n";
            next;
        }
        if ( $enc ) {
            binmode( $fh, ":encoding($enc)" )
                or die "$0: cannot set encoding '$enc' on $input: $!\n";
        }
    }

    while ( defined( my $word = <$fh> ) ) {
        chomp $word;

        ++$wordSizeCounts[ length $word ];
        ++$wordCount;

        # / */ splits the word into single characters and discards any
        # spaces between them.
        foreach my $letter ( split( / */, $word ) ) {
            my $ii = ord($letter);
            # special-case the bogus chars we add for "specials":
            # code points below 4 are accepted, while anything from 4
            # through 32 (control characters, space) is not a letter.
            die "$0: this is a letter?: $ii" if $ii >= 4 && $ii <= 32;
            ++$letterCounts{$letter};
            ++$letterCount;
        }
    }

    close($fh) unless $input eq '-';
}
|
|
|
|
|
|
|
|
# Overall totals gathered while reading the input; the second line is
# followed by a blank line to set it apart from the reports below.
printf "Number of words: %s\n",     $wordCount;
printf "Number of letters: %s\n\n", $letterCount;
|
|
|
|
|
|
|
|
|
|
|
|
# Report how many words there are of each length, together with that
# length's share of the total word count. Lengths that never occurred
# are skipped, and length 0 (empty lines) is never listed.
print "**** word sizes ****\n";
print "SIZE COUNT PERCENT\n";

# Walk every length actually recorded instead of stopping at a fixed cap,
# so unusually long entries still show up in the report.
for my $size ( 1 .. $#wordSizeCounts ) {
    my $count = $wordSizeCounts[$size];
    next unless $count;    # no words of this length (undef or 0)

    # $wordCount cannot be zero here: at least $count words were read.
    my $pct = ( 100.00 * $count ) / $wordCount;
    printf "%2d %5d %.2f\n", $size, $count, $pct;
}
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# Per-letter report: one numbered row per distinct letter, in sorted
# (string) order, showing the letter itself, its code point in decimal
# and hex, its percentage of all letters, and its raw count.
print "\n\n**** Letter counts ****\n";
print " ASCII ORD HEX PCT (of $letterCount)\n";

my @letters = sort keys %letterCounts;
for my $idx ( 0 .. $#letters ) {
    my $letter = $letters[$idx];
    my $code   = ord($letter);
    my $tally  = $letterCounts{$letter};
    my $share  = ( 100.00 * $tally ) / $letterCount;

    # Rows are numbered from 1.
    printf( "%2d: %3s %3d %x %5.2f (%d)\n",
            $idx + 1, $letter, $code, $code, $share, $tally );
}

print "\n";
|