import pandas as pd
import logging
import numpy as np
import sys
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler
from sklearn.feature_selection import VarianceThreshold
### Assignment Owner: Tian Wang
#######################################
#### Normalization
def test_train_split(X, y, test_size=0.2):
    try:
        assert X.shape[0] == y.shape[0]
    except AssertionError:
        print('check that X and y are numpy arrays with an equal number of rows')
        return
    train_size = round((1 - test_size) * len(y))
    test_size = len(y) - train_size
    random_arr = np.random.permutation(len(y))
    train_id = random_arr[:train_size]
    test_id = random_arr[train_size:]
    return X[train_id, :], X[test_id, :], y[train_id], y[test_id]
def feature_normalization(train, test):
    """Rescale the data so that each feature in the training set is in
    the interval [0,1], and apply the same transformations to the test
    set, using the statistics computed on the training set.

    Args:
        train - training set, a 2D numpy array of size (num_instances, num_features)
        test - test set, a 2D numpy array of size (num_instances, num_features)

    Returns:
        train_normalized - training set after normalization
        test_normalized - test set after normalization
    """
    # remove zero variance features (fit on the training set only)
    selector = VarianceThreshold()
    train = selector.fit_transform(train)
    test = selector.transform(test)

    # rescale with min-max scaling, again using training-set statistics
    scaler = MinMaxScaler()
    scaler.fit(train)
    train_normalized = scaler.transform(train)
    test_normalized = scaler.transform(test)
    return train_normalized, test_normalized
########################################
#### The square loss function
def compute_square_loss(X, y, theta):
    """
    Given a set of X, y, theta, compute the square loss for predicting y with X*theta.

    Args:
        X - the feature vector, 2D numpy array of size (num_instances, num_features)
        y - the label vector, 1D numpy array of size (num_instances)
        theta - the parameter vector, 1D array of size (num_features)

    Returns:
        loss - the square loss, scalar
    """
    z = np.matmul(X, theta) - y
    m = len(y)
    loss = (1 / m) * np.dot(z, z)
    return loss
########################################
### compute the gradient of square loss function
def compute_square_loss_gradient(X, y, theta):
    """
    Compute gradient of the square loss (as defined in compute_square_loss), at the point theta.

    Args:
        X - the feature vector, 2D numpy array of size (num_instances, num_features)
        y - the label vector, 1D numpy array of size (num_instances)
        theta - the parameter vector, 1D numpy array of size (num_features)

    Returns:
        grad - gradient vector, 1D numpy array of size (num_features)
    """
    m = len(y)
    z = np.matmul(X, theta) - y
    loss_grad = (2 / m) * np.matmul(z, X)
    return loss_grad
###########################################
### Gradient Checker
#Getting the gradient calculation correct is often the trickiest part
#of any gradient-based optimization algorithm. Fortunately, it's very
#easy to check that the gradient calculation is correct using the
#definition of gradient.
#See http://ufldl.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization
def grad_checker(X, y, theta, epsilon=0.01, tolerance=1e-4):
    """Implement Gradient Checker
    Check that the function compute_square_loss_gradient returns the
    correct gradient for the given X, y, and theta.

    Let d be the number of features. Here we numerically estimate the
    gradient by approximating the directional derivative in each of
    the d coordinate directions:
    (e_1 = (1,0,0,...,0), e_2 = (0,1,0,...,0), ..., e_d = (0,...,0,1))

    The approximation for the directional derivative of J at the point
    theta in the direction e_i is given by:
    ( J(theta + epsilon * e_i) - J(theta - epsilon * e_i) ) / (2*epsilon).

    We then look at the Euclidean distance between the gradient
    computed using this approximation and the gradient computed by
    compute_square_loss_gradient(X, y, theta). If the Euclidean
    distance exceeds tolerance, we say the gradient is incorrect.

    Args:
        X - the feature vector, 2D numpy array of size (num_instances, num_features)
        y - the label vector, 1D numpy array of size (num_instances)
        theta - the parameter vector, 1D numpy array of size (num_features)
        epsilon - the epsilon used in approximation
        tolerance - the tolerance error

    Return:
        A boolean value indicating whether the gradient is correct or not
    """
    true_gradient = compute_square_loss_gradient(X, y, theta)  # the analytic gradient
    num_features = theta.shape[0]

    # central-difference approximation of the directional derivative along h
    limit = lambda h: (compute_square_loss(X, y, theta + epsilon * h)
                       - compute_square_loss(X, y, theta - epsilon * h)) / (2 * epsilon)
    unit_vectors = [np.array([1 if i == index else 0 for i in range(num_features)])
                    for index in range(num_features)]
    approx_grad = list(map(limit, unit_vectors))
    norm = np.linalg.norm(true_gradient - np.array(approx_grad), 2)
    return norm <= tolerance
#################################################
### Generic Gradient Checker
def generic_gradient_checker(X, y, theta, objective_func, gradient_func, epsilon=0.01, tolerance=1e-4):
    """
    The function takes objective_func and gradient_func as parameters and checks whether
    gradient_func(X, y, theta) returns the true gradient of objective_func(X, y, theta).

    E.g. for least squares regression, objective_func = compute_square_loss and
    gradient_func = compute_square_loss_gradient.
    """
    #TODO
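    # Below is a minimal reference sketch, not the official solution: it reuses the
    # same central-difference scheme as grad_checker above, but calls the passed-in
    # objective_func and gradient_func instead of the square-loss versions.
    true_gradient = gradient_func(X, y, theta)
    num_features = theta.shape[0]
    approx_grad = np.zeros(num_features)
    for i in range(num_features):
        e_i = np.zeros(num_features)
        e_i[i] = 1.0
        approx_grad[i] = (objective_func(X, y, theta + epsilon * e_i)
                          - objective_func(X, y, theta - epsilon * e_i)) / (2 * epsilon)
    return np.linalg.norm(true_gradient - approx_grad, 2) <= tolerance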
####################################
#### Batch Gradient Descent
def batch_grad_descent(X, y, alpha=0.1, num_iter=1000, check_gradient=False):
    """
    In this question you will implement batch gradient descent to
    minimize the square loss objective.

    Args:
        X - the feature vector, 2D numpy array of size (num_instances, num_features)
        y - the label vector, 1D numpy array of size (num_instances)
        alpha - step size in gradient descent
        num_iter - number of iterations to run
        check_gradient - a boolean value indicating whether to check the gradient at each update

    Returns:
        theta_hist - the history of the parameter vector across iterations, 2D numpy array of size (num_iter+1, num_features);
                     for instance, theta in iteration 0 is theta_hist[0] and theta in iteration num_iter is theta_hist[-1]
        loss_hist - the history of the objective function value, 1D numpy array of size (num_iter+1)
    """
    num_instances, num_features = X.shape[0], X.shape[1]
    theta_hist = np.zeros((num_iter+1, num_features))  # Initialize theta_hist
    loss_hist = np.zeros(num_iter+1)  # Initialize loss_hist
    theta = np.zeros(num_features)  # Initialize theta
    #TODO
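    # Below is a minimal sketch of plain batch gradient descent with a fixed step
    # size, not the official solution; the backtracking line search variant
    # described in the next section is left out of this sketch.
    theta_hist[0] = theta
    loss_hist[0] = compute_square_loss(X, y, theta)
    for t in range(num_iter):
        grad = compute_square_loss_gradient(X, y, theta)
        if check_gradient:
            assert grad_checker(X, y, theta), 'gradient check failed at iteration %d' % t
        theta = theta - alpha * grad
        theta_hist[t + 1] = theta
        loss_hist[t + 1] = compute_square_loss(X, y, theta)
    return theta_hist, loss_hist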
####################################
###Q2.4b: Implement backtracking line search in batch_grad_descent
###Check http://en.wikipedia.org/wiki/Backtracking_line_search for details
#TODO
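# A minimal sketch of backtracking line search for the square loss, not the
# official solution. It shrinks a trial step size by the factor `beta` until the
# Armijo sufficient-decrease condition with constant `c` holds; `beta`, `c`, and
# the function name are illustrative choices, not part of the original skeleton.
def backtracking_line_search(X, y, theta, grad, alpha_init=1.0, beta=0.5, c=1e-4):
    alpha = alpha_init
    loss = compute_square_loss(X, y, theta)
    grad_norm_sq = np.dot(grad, grad)
    # Armijo condition along the negative-gradient direction
    while compute_square_loss(X, y, theta - alpha * grad) > loss - c * alpha * grad_norm_sq:
        alpha *= beta
    return alpha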
###################################################
### Compute the gradient of Regularized Batch Gradient Descent
def compute_regularized_square_loss_gradient(X, y, theta, lambda_reg):
    """
    Compute the gradient of L2-regularized square loss function given X, y and theta.

    Args:
        X - the feature vector, 2D numpy array of size (num_instances, num_features)
        y - the label vector, 1D numpy array of size (num_instances)
        theta - the parameter vector, 1D numpy array of size (num_features)
        lambda_reg - the regularization coefficient

    Returns:
        grad - gradient vector, 1D numpy array of size (num_features)
    """
    #TODO
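    # Minimal sketch, not the official solution: assuming the L2 penalty is
    # lambda_reg * ||theta||^2, its contribution to the gradient is 2 * lambda_reg * theta.
    return compute_square_loss_gradient(X, y, theta) + 2 * lambda_reg * theta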
###################################################
### Batch Gradient Descent with regularization term
def regularized_grad_descent(X, y, alpha=0.1, lambda_reg=1, num_iter=1000):
    """
    Args:
        X - the feature vector, 2D numpy array of size (num_instances, num_features)
        y - the label vector, 1D numpy array of size (num_instances)
        alpha - step size in gradient descent
        lambda_reg - the regularization coefficient
        num_iter - number of iterations to run

    Returns:
        theta_hist - the history of the parameter vector, 2D numpy array of size (num_iter+1, num_features)
        loss_hist - the history of the loss function without the regularization term, 1D numpy array
    """
    (num_instances, num_features) = X.shape
    theta = np.zeros(num_features)  # Initialize theta
    theta_hist = np.zeros((num_iter+1, num_features))  # Initialize theta_hist
    loss_hist = np.zeros(num_iter+1)  # Initialize loss_hist
    #TODO
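    # Minimal sketch of the regularized batch update, not the official solution;
    # loss_hist records the unregularized square loss, as the docstring requests.
    theta_hist[0] = theta
    loss_hist[0] = compute_square_loss(X, y, theta)
    for t in range(num_iter):
        grad = compute_regularized_square_loss_gradient(X, y, theta, lambda_reg)
        theta = theta - alpha * grad
        theta_hist[t + 1] = theta
        loss_hist[t + 1] = compute_square_loss(X, y, theta)
    return theta_hist, loss_hist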
#############################################
## Visualization of Regularized Batch Gradient Descent
##X-axis: log(lambda_reg)
##Y-axis: square_loss
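# A minimal sketch of the suggested plot, not the official solution: the lambda
# grid and step size are illustrative, and the square loss plotted here is the
# test-set loss of the final iterate from regularized_grad_descent. The function
# name is an assumption, not part of the original skeleton.
def plot_loss_vs_lambda(X_train, y_train, X_test, y_test,
                        lambdas=(1e-7, 1e-5, 1e-3, 1e-1, 1, 10, 100)):
    test_losses = []
    for lam in lambdas:
        theta_hist, _ = regularized_grad_descent(X_train, y_train, alpha=0.05, lambda_reg=lam)
        test_losses.append(compute_square_loss(X_test, y_test, theta_hist[-1]))
    plt.plot(np.log10(np.array(lambdas)), test_losses, marker='o')
    plt.xlabel('log10(lambda_reg)')
    plt.ylabel('square loss')
    plt.show()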
#############################################
### Stochastic Gradient Descent
def stochastic_grad_descent(X, y, alpha=0.1, lambda_reg=1, num_iter=1000):
    """
    In this question you will implement stochastic gradient descent with a regularization term.

    Args:
        X - the feature vector, 2D numpy array of size (num_instances, num_features)
        y - the label vector, 1D numpy array of size (num_instances)
        alpha - string or float, the step size in gradient descent.
                NOTE: In SGD, it's not always a good idea to use a fixed step size; it is usually set to 1/sqrt(t) or 1/t.
                If alpha is a float, then the step size in every iteration is alpha.
                If alpha == "1/sqrt(t)", alpha = 1/sqrt(t).
                If alpha == "1/t", alpha = 1/t.
        lambda_reg - the regularization coefficient
        num_iter - number of epochs (i.e. number of times) to go through the whole training set

    Returns:
        theta_hist - the history of the parameter vector, 3D numpy array of size (num_iter, num_instances, num_features)
        loss_hist - the history of the regularized loss function, 2D numpy array of size (num_iter, num_instances)
    """
    num_instances, num_features = X.shape[0], X.shape[1]
    theta = np.ones(num_features)  # Initialize theta
    theta_hist = np.zeros((num_iter, num_instances, num_features))  # Initialize theta_hist
    loss_hist = np.zeros((num_iter, num_instances))  # Initialize loss_hist
    #TODO
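    # Below is a minimal sketch of SGD on the regularized objective, not the
    # official solution. The counter t starts at 1 so that the 1/t and 1/sqrt(t)
    # schedules are well defined.
    t = 1
    for epoch in range(num_iter):
        order = np.random.permutation(num_instances)
        for j, i in enumerate(order):
            if alpha == "1/sqrt(t)":
                step = 1.0 / np.sqrt(t)
            elif alpha == "1/t":
                step = 1.0 / t
            else:
                step = alpha
            # gradient of the regularized loss at the single instance i
            grad = 2 * (np.dot(X[i], theta) - y[i]) * X[i] + 2 * lambda_reg * theta
            theta = theta - step * grad
            theta_hist[epoch, j] = theta
            loss_hist[epoch, j] = compute_square_loss(X, y, theta) + lambda_reg * np.dot(theta, theta)
            t += 1
    return theta_hist, loss_hist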
################################################
### Visualization that compares the convergence speed of batch
###and stochastic gradient descent for various approaches to step_size
##X-axis: Step number (for gradient descent) or Epoch (for SGD)
##Y-axis: log(objective_function_value) and/or objective_function_value
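# A minimal sketch of the convergence comparison, not the official solution: it
# plots the log objective value per batch-GD step against the end-of-epoch value
# from SGD with a 1/t step size. The function name, lambda_reg=0.01, and the
# default step size are illustrative assumptions; it relies on the batch and SGD
# sketches above returning (theta_hist, loss_hist).
def plot_convergence_comparison(X_train, y_train, alpha=0.05, num_iter=1000):
    _, batch_loss = batch_grad_descent(X_train, y_train, alpha=alpha, num_iter=num_iter)
    _, sgd_loss = stochastic_grad_descent(X_train, y_train, alpha="1/t", lambda_reg=0.01, num_iter=num_iter)
    plt.plot(np.log(batch_loss), label='batch gradient descent (per step)')
    plt.plot(np.log(sgd_loss[:, -1]), label='SGD with 1/t step size (per epoch)')
    plt.xlabel('step / epoch')
    plt.ylabel('log(objective value)')
    plt.legend()
    plt.show()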
def main():
    # Loading the dataset
    print('loading the dataset')
    df = pd.read_csv('data.csv', delimiter=',')
    X = df.values[:, :-1]
    y = df.values[:, -1]

    print('Split into Train and Test')
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=100, random_state=10)

    print("Scaling all to [0, 1]")
    X_train, X_test = feature_normalization(X_train, X_test)
    X_train = np.hstack((X_train, np.ones((X_train.shape[0], 1))))  # Add bias term
    X_test = np.hstack((X_test, np.ones((X_test.shape[0], 1))))  # Add bias term

    # TODO
    #risk =
if __name__ == "__main__":
    #main()
    df = pd.read_csv('C:\\Users\\nsadeh\\Documents\\hw1\\hw1-sgd\\data.csv', delimiter=',')
    X = df.values[:, :-1]
    y = df.values[:, -1]

    print('Split into Train and Test')
    X_train, X_test, y_train, y_test = test_train_split(X, y)
    print(X_train.shape)
    print(X_test.shape)

    print("Scaling all to [0, 1]")
    X_train, X_test = feature_normalization(X_train, X_test)
    X_train = np.hstack((X_train, np.ones((X_train.shape[0], 1))))  # Add bias term
    X_test = np.hstack((X_test, np.ones((X_test.shape[0], 1))))  # Add bias term

    # Use the post-normalization feature count (the bias column is already included),
    # since feature_normalization may drop zero-variance columns.
    theta = np.random.rand(X_train.shape[1])
    print(compute_square_loss(X_train, y_train, theta))
    print(grad_checker(X_train, y_train, theta, epsilon=0.0001))