-
Notifications
You must be signed in to change notification settings - Fork 0
/
test_dictogram.py
61 lines (53 loc) · 2.27 KB
/
test_dictogram.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
# To run these tests, type: pytest test_dictogram.py
from dictogram import Dictogram
# Known inputs and their expected results.
# fish_words: input word list of 8 tokens, in which 'fish' occurs 4 times.
fish_words = ['one', 'fish', 'two', 'fish', 'red', 'fish', 'blue', 'fish']
# fish_dict: the {word: count} mapping for fish_words (5 distinct words).
fish_dict = {'one': 1, 'fish': 4, 'two': 1, 'red': 1, 'blue': 1}
def test_entries():
    """Check the number of entries in the histogram built from fish_words."""
    histogram = Dictogram(fish_words).dictionary_histogram
    # The histogram is expected to hold {word: count} entries, so its size
    # should equal the size of the reference mapping.
    entry_count = len(histogram)
    assert entry_count == 5
    assert entry_count == len(fish_dict)
def test_contains():
    """Check membership: input words are present, unrelated words are not."""
    histogram = Dictogram(fish_words).dictionary_histogram

    # Every word that was fed in must be found.
    missing = [word for word in fish_words if word not in histogram]
    assert not missing

    # Words that were never fed in must not be found.
    unexpected = [word for word in ('fishy', 'food') if word in histogram]
    assert not unexpected
def test_frequency():
    """Check that frequency() reports the expected count for each word."""
    histogram = Dictogram(fish_words)
    # (word, expected count) pairs, checked in this order.
    expected_counts = (
        ('one', 1),
        ('two', 1),
        ('red', 1),
        ('blue', 1),
        ('fish', 4),
    )
    for word, expected in expected_counts:
        assert histogram.frequency(word) == expected
def test_tokens():
    """Check that the tokens attribute equals the total number of input words."""
    histogram = Dictogram(fish_words)
    expected_total = 8
    # Sanity-check the fixture itself before checking the histogram.
    assert len(fish_words) == expected_total
    assert histogram.tokens == expected_total
def test_types():
    """Check that the types attribute equals the number of distinct input words."""
    histogram = Dictogram(fish_words)
    expected_distinct = 5
    # Sanity-check the fixture itself before checking the histogram.
    assert len(set(fish_words)) == expected_distinct
    assert histogram.types == expected_distinct
def test_sample():
    """Check that sampled word frequencies stay within 10% of the observed ones.

    Draws 10,000 samples from a histogram of fish_words, tallies them in a
    second histogram, and compares each word's share of the samples with its
    share of the original tokens.
    """
    source = Dictogram(fish_words)

    # Draw 10,000 words from the source histogram.
    draws = []
    for _ in range(10000):
        draws.append(source.sample())

    # Tally how often each word was drawn.
    tally = Dictogram(draws)

    for word, count in source.dictionary_histogram.items():
        # Share of the original tokens that this word accounts for.
        expected_share = count / source.tokens
        # Share of the drawn samples that this word accounts for.
        drawn = tally.frequency(word)
        drawn_share = drawn / tally.tokens
        # Accept anything from 90% to 110% of the expected share.
        floor = expected_share * 0.9
        ceiling = expected_share * 1.1
        assert floor <= drawn_share and drawn_share <= ceiling