forked from denisecase/datafun-01-getting-started
-
Notifications
You must be signed in to change notification settings - Fork 0
/
easy_stats.py
181 lines (145 loc) · 4.71 KB
/
easy_stats.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
"""
Purpose: Illustrate the built-in statistics module.
In VS Code, choose an interpreter via View / Command Palette / Python: Select Interpreter.
Python must be 3.10 or greater to get the correlation and linear regression functions.
Uses only Python Standard Library modules.
@ uses statistics module for descriptive stats
@ uses sys module for checking Python version
"""
# ----------------- INSTRUCTOR GENERATED CODE -----------------
# Logging setup: the rest of this script reports its results through
# `logger` instead of print().
# Import the setup_logger function from the instructor-provided util_logger module.
from util_logger import setup_logger
# Set up the logger, passing this file's path (__file__ is set by Python).
# setup_logger returns two values, unpacked here as `logger` and `logname`.
# NOTE(review): logname presumably identifies the log destination -- confirm in util_logger.
logger, logname = setup_logger(__file__)
# ----------------- END INSTRUCTOR GENERATED CODE -----------------
# Import from Python Standard Library
import statistics
import sys
# Descriptive: Univariate Data..................................
# Univariate data: many readings of a single variable (50 readings here).
uni_data = [
    105, 129, 87, 86, 111, 111, 89, 81, 108, 92,
    110, 100, 75, 105, 103, 109, 76, 119, 99, 91,
    103, 129, 106, 101, 84, 111, 74, 87, 86, 103,
    103, 106, 86, 111, 75, 87, 102, 121, 111, 88,
    89, 101, 106, 95, 103, 107, 101, 81, 109, 104,
]
# Log the raw readings before summarizing them.
logger.info("uni_data = " + str(uni_data))
# Descriptive: averages and measures of central tendency.
mean, median, mode = (
    statistics.mean(uni_data),
    statistics.median(uni_data),
    statistics.mode(uni_data),
)
# The ":.2f" format spec keeps each logged value to two decimal places.
for _msg in (
    f"mean = {mean:.2f}",
    f"median = {median:.2f}",
    f"mode = {mode:.2f}",
):
    logger.info(_msg)

# Descriptive: measures of spread (sample variance and standard deviation,
# plus the smallest and largest readings).
var, stdev = statistics.variance(uni_data), statistics.stdev(uni_data)
lowest, highest = min(uni_data), max(uni_data)
for _msg in (
    f"var = {var:.2f}",
    f"stdev = {stdev:.2f}",
    f"lowest = {lowest:.2f}",
    f"highest = {highest:.2f}",
):
    logger.info(_msg)
# Descriptive: Univariate Time Series Data.........................
# Time series data follows one variable over time. Typically x (time) is
# the independent variable and y depends on x
# (e.g. temperature vs hour of day).
xtimes = [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12]
yvalues = [2, 5, 8, 20, 21, 23, 24, 27, 30, 31, 31, 32]

# The two series are paired point by point, so they must be the same size.
# If they are not, log an error and stop the program.
if len(xtimes) != len(yvalues):
    logger.error("ERROR: The related sets are not the same size.")
    logger.error(f" {len(xtimes)}!={len(yvalues)}")
    # Use sys.exit() rather than quit(): quit() is installed by the site
    # module for interactive sessions and should not be relied on in scripts.
    # Both raise SystemExit, so the exit behavior is unchanged.
    sys.exit()
# Check the Python version before using the correlation function.
# NOTE(review): logger.warning() assumes setup_logger returns a standard
# logging.Logger, where warn() is only a deprecated alias of warning().
logger.warning("Correlation requires Python version 3.10 or greater.")
logger.warning(f"Your version is {sys.version_info.major}.{sys.version_info.minor}")

# If the Python version is too old, we can quit now.
# Compare (major, minor) as a pair: testing only the minor number would
# misjudge any interpreter whose major version is not 3.
if sys.version_info < (3, 10):
    logger.error("Please update Python to 3.10 or greater")
    logger.error("or use View / Command Palette / Python: Select Interpreter")
    logger.error("to get a newer one.")
    # sys.exit() raises SystemExit just like quit(), but does not depend on
    # the interactive helpers installed by the site module.
    sys.exit()
# Still running, so statistics.correlation() can be used.
# Correlate the times with themselves (a perfect-match baseline)
# and then with the observed values.
xx_corr, xy_corr = (
    statistics.correlation(xtimes, series) for series in (xtimes, yvalues)
)

# Report the data, followed by both correlation coefficients
# (two decimal places each).
for _msg in (
    "Here's some time series data:",
    f"xtimes:{xtimes}",
    f"yvalues:{yvalues}",
    f"correlation between xtimes and xtimes = {xx_corr:.2f}",
    f"correlation between xtimes and yvalues = {xy_corr:.2f}",
):
    logger.info(_msg)
# Fit a straight line through bivariate data (two series observed together)
# and use it to predict a value beyond the observed range.
# NOTE(review): arrayX contains 50 twice while its other values step evenly
# by 50 -- possibly -50 was intended for the first one; confirm the data.
arrayX = [-200, -150, -100, 50, 0, 50, 100, 150]
arrayY = [-240, -165, -99, 35, 19, 75, 130, 125]

# linear_regression() describes the 'best fit' line through the data
# by its slope and intercept.
best_fit = statistics.linear_regression(arrayX, arrayY)
slope, intercept = best_fit.slope, best_fit.intercept

# Pick an x value past the end of the data, extend the line out to it,
# and read off the predicted y, rounded to a whole number.
future_x = 250
future_y = round(slope * future_x + intercept)

# Report the data, the fitted line, and the prediction.
for _msg in (
    "Here's some bivariant data (2 variables, together):",
    f"x:{arrayX}",
    f"y:{arrayY}",
    "Calculate the slope and intercept of a best fit straight line:",
    f" slope = {slope:.2f}",
    f" intercept = { intercept:.2f}",
    "Let's use our best fit line to PREDICT a future value.",
    f" At future x = {future_x:d},",
    f" we predict the value of y will be { future_y:d}.",
    "How'd we do? Does this make sense given the data?",
):
    logger.info(_msg)