-
Notifications
You must be signed in to change notification settings - Fork 0
/
datavisualization.py
71 lines (49 loc) · 2.01 KB
/
datavisualization.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import math
def generateHist(data, bins, title, xlabel, ylabel):
    """Draw a bar-style histogram of ``data`` on the current axes and show it.

    Args:
        data: Values to histogram; forwarded unchanged to the histogram call.
        bins: Bin specification; forwarded unchanged to the histogram call.
        title: Text used as the plot title.
        xlabel: Text used as the x-axis label.
        ylabel: Text used as the y-axis label.
    """
    axes = plt.gca()
    axes.hist(data, bins, histtype='bar', rwidth=0.7)
    axes.set_xlabel(xlabel)
    axes.set_ylabel(ylabel)
    axes.set_title(title)
    # NOTE(review): the histogram is drawn without a ``label``, so there is
    # nothing for the legend to list -- confirm whether this call is needed.
    axes.legend()
    plt.show()
def getClusterStatistics(x_input):
    """Compute, print and return the mean and sample variance of ``x_input``.

    Args:
        x_input: A sized collection of numbers. ``len()`` is taken and the
            values are iterated more than once, so a one-shot iterator is
            not supported.

    Returns:
        A ``(mean, variance)`` tuple. The variance is the sample variance
        (sum of squared deviations divided by ``n - 1``). With a single
        value the sample variance is undefined and ``nan`` is returned.

    Raises:
        ValueError: If ``x_input`` is empty.
    """
    count = len(x_input)
    if count == 0:
        raise ValueError("x_input must contain at least one value")

    mean = sum(x_input) / count

    if count > 1:
        # Sum the squared deviation of *every* value; keeping only the last
        # one (plain assignment instead of accumulation) under-reports the
        # variance.
        squared_deviations = sum((value - mean) ** 2 for value in x_input)
        variance = squared_deviations / (count - 1)
    else:
        # n - 1 == 0: avoid dividing by zero for a single sample.
        variance = float("nan")

    print(
        "Mean: {}, Variance: {}".format(mean, variance)
    )
    return mean, variance
def getStats(filename, data_file, title=""):
    """Print summary statistics and show a histogram for four fixed columns.

    Args:
        filename: Path of a CSV file. When non-empty, the file is read with
            ``pd.read_csv`` and replaces ``data_file``.
        data_file: Table indexed by column name; used as-is only when
            ``filename`` is empty.
        title: Prefix prepended to every plot title.

    For each of the columns ``num_states``, ``state_variance``,
    ``1_day_max_reviews`` and ``star_variance`` the function stores the
    result of ``getClusterStatistics`` in a dict and draws a histogram whose
    bin edges are the sorted unique values of that column. The dict is
    printed at the end; nothing is returned.
    """
    if len(filename) > 0:
        data_file = pd.read_csv(filename)

    dataset = "Dataset 1"
    # (column name, title fragment, x-axis label) for each plot, in order.
    plot_specs = (
        ('num_states', "Number of States, ", "Number of States Normalized"),
        ('state_variance', "State Variance, ", "Variance Normalized"),
        ('1_day_max_reviews', "Max Reviews in a Day, ", "Number of Reviews Normalized"),
        ('star_variance', "Star Variance, ", "Number of Stars Normalized"),
    )

    results = {}
    for col_name, heading, x_axis_label in plot_specs:
        column = data_file[col_name]
        results[col_name] = getClusterStatistics(column)
        bins = np.sort(list(column.unique()))
        generateHist(column, bins, title + heading + dataset, x_axis_label, "Number of Users")

    print(results)
# TODO: implement getDBSCANStatistics()
if __name__ == '__main__':
    # Script entry point. The empty list is only a placeholder for
    # ``data_file``: a non-empty filename is passed, so getStats loads the
    # table from the CSV instead.
    getStats(filename="cleaned_data_1000.csv", data_file=[])