-
Notifications
You must be signed in to change notification settings - Fork 0
/
Copy pathdictogram.py
102 lines (83 loc) · 3.71 KB
/
dictogram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
import random
class Dictogram:
    '''Word-frequency histogram backed by a plain dict.

    Attributes set by __init__:
        word_list: the iterable of words the histogram was built from.
        dictionary_histogram: dict mapping each distinct word to its count.
        tokens: total number of words counted (sum of all counts).
        types: number of distinct words.
    '''

    def __init__(self, word_list):
        '''Build the histogram for word_list and cache its token/type totals.'''
        self.word_list = word_list
        self.dictionary_histogram = self.build_dictogram()
        self.tokens = sum(self.dictionary_histogram.values())
        self.types = self.unique_words()

    def build_dictogram(self):
        '''Count the words in self.word_list and return the counts as a dict.

        Keys appear in first-seen order because a plain dict is used.
        '''
        histogram = {}
        for word in self.word_list:
            histogram[word] = histogram.get(word, 0) + 1
        return histogram

    def frequency(self, word):
        '''Return how many times word was counted, or 0 if it never was.'''
        # The default is the integer 0 (not False) so the result is always a
        # count; 0 == False, so truthiness and arithmetic are unaffected.
        return self.dictionary_histogram.get(word, 0)

    def unique_words(self):
        '''Return the number of distinct words in the histogram (the types).'''
        return len(self.dictionary_histogram)

    def sample(self):
        '''Return a random word, weighted by how often each word was counted.

        Raises:
            ValueError: if the histogram holds no tokens to sample from.
        '''
        if self.tokens <= 0:
            raise ValueError('cannot sample from an empty dictogram')
        # Pick a token index, then walk the cumulative counts until the
        # running total passes it; each word owns `count` of the indices.
        random_value = random.randrange(self.tokens)
        position = 0
        for word, count in self.dictionary_histogram.items():
            position += count
            if position > random_value:
                return word
        # Falling out of the loop (returning None) is only possible if
        # self.tokens no longer matches the counts in the histogram.

    def dictogram_samples(self, count):
        '''Return count sampled words joined by single spaces.

        A count of zero or less yields an empty string.
        '''
        return ' '.join(self.sample() for _ in range(count))
def print_dictogram(word_list):
    '''Build a Dictogram from word_list and print a report about it.

    The report shows the input list, the resulting histogram, the token and
    type totals, the counts of the last two words of the list, and finally
    the sampling comparison printed by print_dictogram_samples.
    '''
    print()
    print('Dictionary Histogram:')
    print('word list: {}'.format(word_list))

    # Build the histogram, then show its contents and totals
    histogram = Dictogram(word_list)
    totals_line = '{} tokens, {} types'.format(histogram.tokens, histogram.types)
    print('dictogram: {}'.format(histogram.dictionary_histogram))
    print(totals_line)

    # Spot-check the counts of the final two words of the input
    trailing_words = word_list[-2:]
    for word in trailing_words:
        print('{!r} occurs {} times'.format(word, histogram.frequency(word)))
    print()

    print_dictogram_samples(histogram)
def print_dictogram_samples(dictogram, num_samples=10000):
    '''Print a table comparing sampled word frequencies with observed ones.

    Draws num_samples words with dictogram.sample(), counts the draws in a
    second Dictogram, and prints one row per word of the original histogram:
    its observed count and share, its sampled count and share, and the
    relative error between the two shares. The error is wrapped in ANSI
    color codes: green below 5%, yellow below 10%, red otherwise.

    Args:
        dictogram: the Dictogram to sample from.
        num_samples: how many words to draw (default 10000).

    Raises:
        ValueError: if num_samples is less than 1, since the sampled
            frequencies would otherwise divide by zero.
    '''
    if num_samples < 1:
        raise ValueError('num_samples must be at least 1')

    print('Dictionary Histogram samples:')
    # Sample the histogram num_samples times and count frequency of results
    samples_list = [dictogram.sample() for _ in range(num_samples)]
    samples_hist = Dictogram(samples_list)
    print('samples: {}'.format(samples_hist.dictionary_histogram))
    print()
    print('Sampled frequency and error from observed frequency:')
    header = '| word type | observed freq | sampled freq | error |'
    divider = '-' * len(header)
    print(divider)
    print(header)
    print(divider)

    # ANSI escape codes used to color the error column
    green = '\033[32m'
    yellow = '\033[33m'
    red = '\033[31m'
    reset = '\033[m'

    # One row per word of the original histogram
    for word, count in dictogram.dictionary_histogram.items():
        # Share of the original tokens that are this word
        observed_freq = count / dictogram.tokens
        # Share of the drawn samples that are this word
        samples = samples_hist.frequency(word)
        sampled_freq = samples / samples_hist.tokens
        # Relative error of the sampled share against the observed share
        error = (sampled_freq - observed_freq) / observed_freq
        magnitude = abs(error)
        if magnitude < 0.05:
            color = green
        elif magnitude < 0.1:
            color = yellow
        else:
            color = red
        print('| {!r:<9} '.format(word)
              + '| {:>4} = {:>6.2%} '.format(count, observed_freq)
              + '| {:>4} = {:>6.2%} '.format(samples, sampled_freq)
              + '| {}{:>+7.2%}{} |'.format(color, error, reset))
    print(divider)
    print()
if __name__ == '__main__':
    # Run the demo only when this file is executed as a script, so that
    # importing Dictogram from another module does not print the report.
    print_dictogram(['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish'])