xwords/xwords4/dawg/dictstats.py
2024-01-20 16:10:46 -08:00

107 lines
3.3 KiB
Python
Executable file

#!/usr/bin/env python3
import argparse, sys
from collections import defaultdict
"""
Print stats about an input stream that's assumed to be a dictionary.
Counts and percentages of each letter, as well as total numbers of
words. This is not part of the dictionary build process. I use it
for creating info.txt files for new languages and debugging the
creation of dictionaries from new wordlists.
Something like this might form the basis for choosing counts and
values for tiles without using the conventions established by
Scrabble players. This isn't enough, though: the frequency of
letter tuples and triples -- how often letters appear together -- is
a better indicator than just letter count.
"""
class Letter:
    """Occurrence tally for a single character seen in the input."""

    def __init__(self, ch):
        self.ch = ch
        self.count = 0

    def increment(self):
        """Record one more occurrence of the character."""
        self.count = self.count + 1

    def getChr(self):
        """Return the character being tallied."""
        return self.ch

    def getCount(self):
        """Return the number of occurrences recorded so far."""
        return self.count

    def format(self, total):
        """Return the report row for this character.

        The row holds the character right-aligned in six columns, its
        code point in decimal and in hex, its count as a percentage of
        ``total`` (two decimals), and the raw count in parentheses.
        ``total`` must be non-zero because it is used as a divisor.
        """
        codePoint = ord(self.ch)
        share = (100.00 * self.count) / total
        template = '{: >6s} {:2d} {:x} {:5.2f} ({:d})'
        return template.format(self.ch, codePoint, codePoint, share, self.count)
def mkParser():
    """Build the command-line parser for this script.

    Two string options are defined: ``--sort-by`` (stored as ``SORT``,
    default ``'ASCII'``) and ``--enc`` (stored as ``ENC``, default
    ``'utf8'``).
    """
    parser = argparse.ArgumentParser()
    # (flag, destination attribute, default value, help text)
    options = (
        ('--sort-by', 'SORT', 'ASCII', 'sort output by ASCII or COUNT'),
        ('--enc', 'ENC', 'utf8', 'encoding'),
    )
    for flag, dest, default, helpText in options:
        parser.add_argument(flag, dest=dest, type=str, default=default,
                            help=helpText)
    return parser
def main():
    """Read a word list from stdin and print word-size and letter statistics.

    Each input line is stripped of surrounding whitespace and counted as
    one word. The report lists the number of words and letters, a
    histogram of word lengths with percentages, and a per-character table
    ordered according to ``--sort-by`` (``ASCII`` or ``COUNT``).

    Exits with status 1 if ``--sort-by`` is not a recognized value.

    Raises:
        ValueError: if a word contains a character whose code point is
            in the range 4..32 inclusive.
    """
    args = mkParser().parse_args()

    # Validate the sort choice before consuming stdin, so that a bad
    # argument fails immediately instead of after printing half a report.
    sortKeys = {
        'ASCII': lambda letter: ord(letter.getChr()),
        'COUNT': lambda letter: -letter.getCount(),
    }
    key = sortKeys.get(args.SORT)
    if key is None:
        print('error: bad sort arg: {}'.format(args.SORT), file=sys.stderr)
        sys.exit(1)

    letters = {}
    wordSizeCounts = defaultdict(int)
    wordCount = 0
    letterCount = 0

    sys.stdin.reconfigure(encoding=args.ENC)
    # Iterate the stream lazily; there is no need to hold the whole word
    # list in memory at once.
    for line in sys.stdin:
        line = line.strip()
        wordSizeCounts[len(line)] += 1
        wordCount += 1

        for letter in line:
            ii = ord(letter)
            # Code points 4..32 (control characters and space) are
            # rejected; anything above 32 or below 4 is accepted.
            # Raised explicitly rather than asserted so the check still
            # runs under ``python -O``.
            if 4 <= ii <= 32:
                raise ValueError(
                    'letter {} in word {} out of range'.format(ii, line))
            if letter not in letters:
                letters[letter] = Letter(letter)
            letters[letter].increment()
            letterCount += 1

    print( 'Number of words: {}'.format(wordCount))
    print( 'Number of letters: {}'.format(letterCount))
    print('')

    print( '**** word sizes ****' )
    print( 'SIZE COUNT PERCENT' )
    pctTotal = 0.0
    wordTotal = 0
    for ii in sorted(wordSizeCounts):
        count = wordSizeCounts[ii]
        wordTotal += count
        pct = (100.00 * count)/wordCount
        pctTotal += pct
        print( '{:2d} {:6d} {:02.2f}'.format(ii, count, pct))

    # The totals row doubles as a sanity check: it should show the word
    # count again and (up to rounding) 100 percent.
    print( '-------------------------------' )
    print(' {:6d} {:.2f}'.format( wordTotal, pctTotal))
    print('')

    print( '**** Letter counts ****' )
    print( ' ASCII ORD HEX PCT (of {})'.format(letterCount))
    ordered = sorted(letters.values(), key=key)
    for lineNo, letter in enumerate(ordered, start=1):
        print('{:2d}: {}'.format(lineNo, letter.format(letterCount)))

    print('')
##############################################################################
# Run the report only when executed as a script, not when imported.
if __name__ == '__main__':
    main()