mirror of
git://xwords.git.sourceforge.net/gitroot/xwords/xwords
synced 2025-01-18 22:26:30 +01:00
add sorting options
This commit is contained in:
parent
9dd69e86e7
commit
7d869a6bda
1 changed files with 43 additions and 17 deletions
|
@ -1,6 +1,7 @@
|
||||||
#!/usr/bin/env python3
|
#!/usr/bin/env python3
|
||||||
|
|
||||||
import sys
|
import argparse, sys
|
||||||
|
from collections import defaultdict
|
||||||
|
|
||||||
"""
|
"""
|
||||||
print stats about an input stream that's assumed to be a dictionary.
|
print stats about an input stream that's assumed to be a dictionary.
|
||||||
|
@ -16,29 +17,52 @@ letter tuples and triples -- how often letters appear together -- is
|
||||||
a better indicator than just letter count.
|
a better indicator than just letter count.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
|
class Letter:
    """Occurrence tally for one character.

    Stores the character together with a counter that starts at zero and
    goes up by one on every call to increment().
    """

    def __init__(self, ch):
        """Create a tally for ch with a count of zero.

        ch must be a one-character string: format() passes it to ord().
        """
        self.ch = ch
        self.count = 0

    def increment(self):
        """Add one occurrence to the tally."""
        self.count += 1

    def getChr(self):
        """Return the character being tallied."""
        return self.ch

    def getCount(self):
        """Return how many times increment() has been called."""
        return self.count

    def format(self, total):
        """Return a report row for this character.

        The row holds the character (right-aligned in six columns), its
        code point in decimal and in hex, its count as a percentage of
        total, and the raw count in parentheses.

        total is the number the count is measured against.  A total of 0
        yields a percentage of 0.00 rather than raising ZeroDivisionError.
        """
        count = self.count
        code = ord(self.ch)
        # Guard the division so an empty denominator cannot crash the report.
        pct = (100.00 * count) / total if total else 0.0
        return '{: >6s} {:2d} {:x} {:5.2f} ({:d})' \
            .format(self.ch, code, code, pct, count)
|
||||||
|
|
||||||
|
|
||||||
|
def mkParser():
    """Build and return the argparse parser for this script.

    Two string options are registered:
      --sort-by  stored as SORT, defaults to 'ASCII'
      --enc      stored as ENC, defaults to 'utf8'
    """
    # (flag, destination attribute, default value, help text)
    optionTable = (
        ('--sort-by', 'SORT', 'ASCII', 'sort output by ASCII or COUNT'),
        ('--enc', 'ENC', 'utf8', 'encoding'),
    )

    cli = argparse.ArgumentParser()
    for flag, dest, default, helpText in optionTable:
        cli.add_argument(flag, dest=dest, type=str, default=default,
                         help=helpText)
    return cli
|
||||||
|
|
||||||
def main():
|
def main():
|
||||||
wordSizeCounts = {}
|
args = mkParser().parse_args()
|
||||||
letterCounts = {}
|
|
||||||
|
letters = {}
|
||||||
|
wordSizeCounts = defaultdict(int)
|
||||||
|
# letterCounts = defaultdict(int)
|
||||||
wordCount = 0
|
wordCount = 0
|
||||||
letterCount = 0
|
letterCount = 0
|
||||||
enc = 'utf8' # this could be a cmdline arg....
|
|
||||||
|
|
||||||
for line in sys.stdin.readlines():
|
for line in sys.stdin.readlines():
|
||||||
line = line.strip()
|
line = line.strip()
|
||||||
|
|
||||||
length = len(line)
|
length = len(line)
|
||||||
if not length in wordSizeCounts: wordSizeCounts[length] = 0
|
# if not length in wordSizeCounts: wordSizeCounts[length] = 0
|
||||||
wordSizeCounts[length] += 1
|
wordSizeCounts[length] += 1
|
||||||
wordCount += 1
|
wordCount += 1
|
||||||
|
|
||||||
for letter in line:
|
for letter in line:
|
||||||
ii = ord(letter)
|
ii = ord(letter)
|
||||||
# perl did this: die "$0: this is a letter?: $ii" if $ii <= 32 && $ii >= 4 && $ii != 0;
|
|
||||||
assert ii > 32 or ii < 4 or ii == 0, 'letter {} out of range'.format(ii)
|
assert ii > 32 or ii < 4 or ii == 0, 'letter {} out of range'.format(ii)
|
||||||
if not letter in letterCounts: letterCounts[letter] = 0
|
if not letter in letters: letters[letter] = Letter(letter)
|
||||||
letterCounts[letter] += 1
|
letters[letter].increment()
|
||||||
letterCount += 1
|
letterCount += 1
|
||||||
|
|
||||||
print( 'Number of words: {}'.format(wordCount))
|
print( 'Number of words: {}'.format(wordCount))
|
||||||
|
@ -60,19 +84,21 @@ def main():
|
||||||
print(' {:6d} {:.2f}'.format( wordTotal, pctTotal))
|
print(' {:6d} {:.2f}'.format( wordTotal, pctTotal))
|
||||||
print('')
|
print('')
|
||||||
|
|
||||||
lineNo = 1
|
|
||||||
pctTotal = 0.0
|
|
||||||
print( '**** Letter counts ****' )
|
print( '**** Letter counts ****' )
|
||||||
print( ' ASCII ORD HEX PCT (of {})'.format(letterCount))
|
print( ' ASCII ORD HEX PCT (of {})'.format(letterCount))
|
||||||
for letter in sorted(letterCounts):
|
|
||||||
count = letterCounts[letter]
|
if args.SORT == 'ASCII':
|
||||||
pct = (100.00 * count) / letterCount
|
key = lambda letter: ord(letter.getChr())
|
||||||
pctTotal += pct
|
elif args.SORT == 'COUNT':
|
||||||
print( '{:2d}: {: >6s} {:2d} {:x} {:5.2f} ({:d})' \
|
key = lambda letter: -letter.getCount()
|
||||||
.format(lineNo, letter, ord(letter), ord(letter), pct, count ) )
|
else:
|
||||||
|
print('error: bad sort arg: {}'.format(args.SORT))
|
||||||
|
sys.exit(1)
|
||||||
|
lineNo = 1
|
||||||
|
for letter in sorted(letters.values(), key=key):
|
||||||
|
print('{:2d}: {}'.format(lineNo, letter.format(letterCount)))
|
||||||
lineNo += 1
|
lineNo += 1
|
||||||
|
|
||||||
print('percent total {:.2f}'.format( pctTotal))
|
|
||||||
print('')
|
print('')
|
||||||
|
|
||||||
##############################################################################
|
##############################################################################
|
||||||
|
|
Loading…
Reference in a new issue