add sorting options

This commit is contained in:
Eric House 2022-01-16 21:26:59 -08:00
parent 9dd69e86e7
commit 7d869a6bda

View file

@ -1,6 +1,7 @@
#!/usr/bin/env python3 #!/usr/bin/env python3
import sys import argparse, sys
from collections import defaultdict
""" """
print stats about an input stream that's assumed to be a dictionary. print stats about an input stream that's assumed to be a dictionary.
@ -16,29 +17,52 @@ letter tuples and triples -- how often letters appear together -- is
a better indicator than just letter count. a better indicator than just letter count.
""" """
class Letter:
    """Occurrence counter for a single character of the input.

    Holds the character and the number of times it has been seen, and can
    render itself as one row of a letter-count listing.
    """

    def __init__(self, ch):
        """Start counting occurrences of the character `ch` at zero."""
        self.ch = ch
        self.count = 0

    def increment(self):
        """Record one more occurrence of this character."""
        self.count += 1

    def getChr(self):
        """Return the character being counted."""
        return self.ch

    def getCount(self):
        """Return the number of occurrences recorded so far."""
        return self.count

    def format(self, total):
        """Return this letter's row as a string.

        The row holds the character right-aligned in six columns, its code
        point in decimal and in hex, its share of `total` as a percentage
        with two decimals, and the raw count in parentheses.

        `total` is the number the count is taken as a percentage of. A total
        of zero yields 0.00 percent rather than raising ZeroDivisionError.
        """
        code = ord(self.ch)
        pct = (100.00 * self.count) / total if total else 0.0
        return '{: >6s} {:2d} {:x} {:5.2f} ({:d})' \
            .format(self.ch, code, code, pct, self.count)
def mkParser():
    """Build the command-line parser for this script.

    Options:
      --sort-by  ordering of the letter listing, ASCII (default) or COUNT;
                 stored on the parsed namespace as SORT
      --enc      encoding name, default utf8; stored as ENC

    Returns the configured argparse.ArgumentParser.
    """
    parser = argparse.ArgumentParser()
    # choices makes argparse reject an unknown sort key at parse time, with a
    # usage message, instead of the mistake surfacing only later.
    parser.add_argument('--sort-by', dest='SORT', type=str, default='ASCII',
                        choices=('ASCII', 'COUNT'),
                        help='sort output by ASCII or COUNT')
    parser.add_argument('--enc', dest='ENC', type=str, default='utf8',
                        help='encoding')
    return parser
def main(): def main():
wordSizeCounts = {} args = mkParser().parse_args()
letterCounts = {}
letters = {}
wordSizeCounts = defaultdict(int)
# letterCounts = defaultdict(int)
wordCount = 0 wordCount = 0
letterCount = 0 letterCount = 0
enc = 'utf8' # this could be a cmdline arg....
for line in sys.stdin.readlines(): for line in sys.stdin.readlines():
line = line.strip() line = line.strip()
length = len(line) length = len(line)
if not length in wordSizeCounts: wordSizeCounts[length] = 0 # if not length in wordSizeCounts: wordSizeCounts[length] = 0
wordSizeCounts[length] += 1 wordSizeCounts[length] += 1
wordCount += 1 wordCount += 1
for letter in line: for letter in line:
ii = ord(letter) ii = ord(letter)
# perl did this: die "$0: this is a letter?: $ii" if $ii <= 32 && $ii >= 4 && $ii != 0;
assert ii > 32 or ii < 4 or ii == 0, 'letter {} out of range'.format(ii) assert ii > 32 or ii < 4 or ii == 0, 'letter {} out of range'.format(ii)
if not letter in letterCounts: letterCounts[letter] = 0 if not letter in letters: letters[letter] = Letter(letter)
letterCounts[letter] += 1 letters[letter].increment()
letterCount += 1 letterCount += 1
print( 'Number of words: {}'.format(wordCount)) print( 'Number of words: {}'.format(wordCount))
@ -60,19 +84,21 @@ def main():
print(' {:6d} {:.2f}'.format( wordTotal, pctTotal)) print(' {:6d} {:.2f}'.format( wordTotal, pctTotal))
print('') print('')
lineNo = 1
pctTotal = 0.0
print( '**** Letter counts ****' ) print( '**** Letter counts ****' )
print( ' ASCII ORD HEX PCT (of {})'.format(letterCount)) print( ' ASCII ORD HEX PCT (of {})'.format(letterCount))
for letter in sorted(letterCounts):
count = letterCounts[letter] if args.SORT == 'ASCII':
pct = (100.00 * count) / letterCount key = lambda letter: ord(letter.getChr())
pctTotal += pct elif args.SORT == 'COUNT':
print( '{:2d}: {: >6s} {:2d} {:x} {:5.2f} ({:d})' \ key = lambda letter: -letter.getCount()
.format(lineNo, letter, ord(letter), ord(letter), pct, count ) ) else:
print('error: bad sort arg: {}'.format(args.SORT))
sys.exit(1)
lineNo = 1
for letter in sorted(letters.values(), key=key):
print('{:2d}: {}'.format(lineNo, letter.format(letterCount)))
lineNo += 1 lineNo += 1
print('percent total {:.2f}'.format( pctTotal))
print('') print('')
############################################################################## ##############################################################################