-
Notifications
You must be signed in to change notification settings - Fork 1
/
preprocessing.py
87 lines (66 loc) · 2.64 KB
/
preprocessing.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
import numpy as np
# Returns encoded input and number of new features (bits needed)
def one_hot_encoding(input):
    """One-hot encode integer-coded categorical features.

    ``input`` is read as ``input[pattern][0][feature]``, i.e. an array-like of
    shape ``(n_patterns, 1, n_features)``. Only row 0 of each pattern is used.
    Each feature gets ``max - min + 1`` bits (its observed integer range), and
    the per-feature bit groups are concatenated in feature order.

    Integer-valued floats (e.g. ``2.0``) are accepted and converted, because
    the codes are used as array indices.

    Returns:
        A tuple ``(encoded, bits)`` where ``encoded`` is an integer array of
        shape ``(n_patterns, 1, bits)`` and ``bits`` is the total number of
        one-hot positions. Empty input yields ``(np.array([]), 0)``.

    Raises:
        ValueError: if a non-empty input is not 3-dimensional, or if it holds
            values that are not whole numbers.
    """
    data = np.asarray(input)
    if data.size == 0:
        # Nothing to encode: no patterns and therefore no bits.
        return np.array([]), 0
    if data.ndim != 3:
        raise ValueError("input must have shape (n_patterns, 1, n_features)")

    # pattern[0] of every pattern, stacked as an (n_patterns, n_features) table.
    rows = data[:, 0]
    if not np.issubdtype(rows.dtype, np.integer):
        codes = rows.astype(int)
        if not np.array_equal(codes, rows):
            raise ValueError("one_hot_encoding expects integer category codes")
        rows = codes

    # Range of the integers representing each categorical feature.
    mins = rows.min(axis=0)
    offsets = rows.max(axis=0) - mins + 1
    # Total number of bits needed.
    bits = offsets.sum()

    # First bit of each feature's group, then the hot bit of every value.
    starts = np.cumsum(offsets) - offsets
    hot = starts + (rows - mins)

    n_patterns = len(rows)
    encoded_input = np.zeros((n_patterns, bits), dtype=int)
    # Pair every row index with that row's hot positions and set them at once.
    encoded_input[np.arange(n_patterns)[:, np.newaxis], hot] = 1
    # Restore the singleton middle axis so the output mirrors the input layout.
    return encoded_input[:, np.newaxis, :], bits
# Returns standardized input, means and standard deviations of each feature (pre standardization)
def continuous_standardizer(input, mean = None, std = None):
    """Standardize every feature by subtracting a mean and dividing by a std.

    ``input`` is read as ``input[pattern][0][feature]``, i.e. an array-like of
    shape ``(n_patterns, 1, n_features)``.

    Two modes, selected by ``mean``:

    * ``mean is None``: the per-feature mean and population standard
      deviation (``np.std`` default) are computed from ``input``. Returns
      ``(standardized, mean, std)``.
    * ``mean`` given: ``mean`` and ``std`` are applied as-is (e.g. statistics
      obtained from a previous call). Returns only the standardized array.

    Features whose standard deviation is 0 are divided by 1 instead, so a
    constant feature becomes 0 rather than NaN/inf. The returned ``std`` is
    the unmodified statistic.

    Raises:
        ValueError: if a non-empty input is not 3-dimensional.
        TypeError: if ``mean`` is given without ``std``.
    """
    data = np.asarray(input)
    if data.size and data.ndim != 3:
        raise ValueError("input must have shape (n_patterns, 1, n_features)")

    fit = mean is None
    if fit:
        # Row 0 of each transposed slice holds one feature's value for every
        # pattern.
        columns = [feature[0] for feature in np.transpose(data)]
        mean = np.array([np.mean(column) for column in columns])
        std = np.array([np.std(column) for column in columns])
    elif std is None:
        raise TypeError("std must be given together with mean")

    # Guard the division: a zero spread would otherwise yield NaN/inf.
    spread = np.asarray(std)
    scale = np.where(spread == 0, 1.0, spread)
    # Broadcasting scales every pattern of the dataset at once.
    standardized_input = (data - mean) / scale

    if fit:
        return standardized_input, mean, std
    return standardized_input
# Returns normalized input, min and max of each feature (pre normalization)
def min_max_normalizer(input, maxs = None, mins = None):
    """Rescale every feature with ``(value - min) / (max - min)``.

    ``input`` is read as ``input[pattern][0][feature]``, i.e. an array-like of
    shape ``(n_patterns, 1, n_features)``.

    Two modes, selected by ``maxs``:

    * ``maxs is None``: per-feature minima and maxima are computed from
      ``input``. Returns ``(normalized, mins, maxs)``.
    * ``maxs`` given: ``maxs`` and ``mins`` are applied as-is. Returns only
      the normalized array.

    NOTE: the returned order is ``mins, maxs`` while the parameters are
    ``maxs, mins``. When feeding the returned statistics back in, pass them by
    keyword (``maxs=..., mins=...``) to avoid swapping them.

    Features with ``max == min`` are divided by 1 instead of 0, so a constant
    feature becomes 0 rather than NaN/inf.

    Raises:
        ValueError: if a non-empty input is not 3-dimensional.
        TypeError: if ``maxs`` is given without ``mins``.
    """
    data = np.asarray(input)
    if data.size and data.ndim != 3:
        raise ValueError("input must have shape (n_patterns, 1, n_features)")

    fit = maxs is None
    if fit:
        # Row 0 of each transposed slice holds one feature's value for every
        # pattern.
        columns = [feature[0] for feature in np.transpose(data)]
        mins = np.array([np.min(column) for column in columns])
        maxs = np.array([np.max(column) for column in columns])
    elif mins is None:
        raise TypeError("mins must be given together with maxs")

    lows = np.asarray(mins)
    diffs = np.asarray(maxs) - lows
    # Guard the division: a zero range would otherwise yield NaN/inf.
    scale = np.where(diffs == 0, 1, diffs)
    # Broadcasting normalizes every pattern of the dataset at once.
    normalized_input = (data - lows) / scale

    if fit:
        return normalized_input, mins, maxs
    return normalized_input