mirror of
git://xwords.git.sourceforge.net/gitroot/xwords/xwords
synced 2024-12-30 10:26:58 +01:00
first checked in.
This commit is contained in:
parent
3251a8d1d8
commit
0721f63227
1 changed files with 64 additions and 0 deletions
64
dawg/dictstats.pl
Executable file
64
dawg/dictstats.pl
Executable file
|
@ -0,0 +1,64 @@
|
|||
#!/usr/bin/perl
|
||||
|
||||
# print stats about in input stream that's assumed to be a dictionary.
|
||||
# Counts and percentages of each letter, as well as total numbers of
|
||||
# words. This is not part of the dictionary build process. I use it
|
||||
# for creating info.txt files for new languages and debugging the
|
||||
# creation of dictionaries from new wordlists.
|
||||
#
|
||||
# Something like this might form the basis for choosing counts and
|
||||
# values for tiles without using the conventions established by
|
||||
# Scrabble players. This isn't enough, though: the frequency of
|
||||
# letter tuples and triples -- how often letters appear together -- is
|
||||
# a better indicator than just letter count.
|
||||
|
||||
use strict;
|
||||
|
||||
# Running tallies filled in while the input is read.
# $wordSizeCounts[N] is the number of input lines whose length is N;
# $letterCounts[C] is the number of occurrences of the character whose
# ord() is C.
my @wordSizeCounts;
my @letterCounts;
# The scalar totals start at 0 so that empty input is reported as "0"
# rather than as an empty string.
my $wordCount = 0;
my $letterCount = 0;
|
||||
|
||||
# Read the word list one line at a time (files named on the command
# line, or stdin) and tally line lengths and individual characters.
while ( defined( my $word = <> ) ) {
    chomp $word;

    $wordSizeCounts[ length $word ] += 1;
    $wordCount += 1;

    # The pattern matches between any two characters and also swallows
    # runs of spaces, so the line is broken into its non-space pieces.
    for my $piece ( split( / */, $word ) ) {
        my $i = ord($piece);
        # Anything at or below the space character (code 32) is treated
        # as bad input and aborts the run.
        die "$0: this is a letter?: $i" if $i <= 32;
        $letterCounts[$i] += 1;
        $letterCount += 1;
    }
}
|
||||
|
||||
# Print the overall totals, then one row per line length that occurred.
print "Number of words: $wordCount\n";
print "Number of letters: $letterCount\n\n";

print "**** word sizes ****\n";
print "SIZE COUNT PERCENT\n";
# Walk every length that was recorded rather than a fixed 1..15 range:
# longer words, and blank lines (length 0), are included in $wordCount,
# so leaving them out of the table would make the percentages fall short
# of 100 without any hint as to why.
for my $size ( 0 .. $#wordSizeCounts ) {
    my $count = $wordSizeCounts[$size];
    next unless $count;    # no line of this length was seen
    my $pct = ( 100.00 * $count ) / $wordCount;
    printf "%2d %5d %.2f\n", $size, $count, $pct;
}
|
||||
|
||||
|
||||
|
||||
# Print one row per character that occurred: the character itself, its
# code in decimal and hex, its share of all counted characters, and the
# raw count.
print "\n\n**** Letter counts ****\n";
print "ASCII ORD HEX PCT (of $letterCount)\n";
# Run up to the highest code that was recorded.  The previous bound
# ($i < 255) stopped one short and never reported character 255.
for my $code ( 32 .. $#letterCounts ) {
    my $count = $letterCounts[$code];
    next unless $count;    # this character never occurred
    my $pct = ( 100.00 * $count ) / $letterCount;
    printf( "%3s %3d %x %5.2f (%d)\n",
            chr($code), $code, $code, $pct, $count );
}

print "\n";
|
Loading…
Reference in a new issue