# sgd.py
import numpy as np
import matplotlib.pyplot as plt
# Set random seed for reproducibility
np.random.seed(42)

def loss_function(X):
    """
    Computes the loss (1 - tr(X^T X))^2, where tr(X^T X) is the
    squared Frobenius norm of X.
    """
    product = np.trace(X.T @ X)
    return (1 - product) ** 2

def gradient(X):
    """
    Computes the analytical gradient of the loss with respect to X.
    """
    product = np.trace(X.T @ X)
    # d/dX tr(X^T X) = 2X, so by the chain rule
    # d/dX (1 - tr(X^T X))^2 = 2 * (1 - tr(X^T X)) * (-2X) = -4 * (1 - tr(X^T X)) * X
    return -4 * (1 - product) * X

def numerical_gradient(X, epsilon=1e-7):
    """
    Computes the gradient numerically using finite differences
    """
    grad = np.zeros_like(X)
    # Compute partial derivative for each element
    for i in range(X.shape[0]):
        for j in range(X.shape[1]):
            # Create small perturbation matrix
            perturbation = np.zeros_like(X)
            perturbation[i, j] = epsilon
            # Central difference formula
            forward = loss_function(X + perturbation)
            backward = loss_function(X - perturbation)
            grad[i, j] = (forward - backward) / (2 * epsilon)
    return grad
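
# Optional sanity check (a small addition, not in the original script): the
# analytical gradient should match the central-difference estimate closely.
# A fixed test matrix is used so the seeded random stream below is left untouched.
_X_check = np.array([[0.3, -0.2], [0.5, 0.1]])
assert np.allclose(gradient(_X_check), numerical_gradient(_X_check), atol=1e-5), \
    "Analytical and numerical gradients disagree"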

def sgd_optimizer(learning_rate=0.01, n_iterations=100):
    """
    Minimizes the loss with gradient descent starting from a random 2x2 matrix.
    """
    # Initialize a random 2x2 matrix
    X = np.random.randn(2, 2)
    # Lists to store loss values and X values for plotting
    loss_history = []
    X_history = []
    # SGD iterations
    for i in range(n_iterations):
        current_loss = loss_function(X)
        loss_history.append(current_loss)
        X_history.append(X.copy())
        grad = gradient(X)
        # Alternatively, switch to the numerical gradient by uncommenting the line below:
        # grad = numerical_gradient(X)
        # Update X using gradient descent
        X = X - learning_rate * grad
        if (i + 1) % 10 == 0:
            print(f"Iteration {i+1}: Loss = {current_loss:.6f}")
    return X, loss_history, X_history
# Run optimization
final_X, loss_history, X_history = sgd_optimizer(learning_rate=0.01, n_iterations=100)
# Print final results
print("\nFinal X matrix:")
print(final_X)
print("\nFinal loss:", loss_function(final_X))
# Plot loss history
plt.figure(figsize=(10, 5))
plt.plot(loss_history)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Loss vs. Iteration')
plt.grid(True)
plt.show()
# Verify the result
product = np.trace(final_X.T @ final_X)
print("\nX^T * X =", product) # Should be close to 1 for optimal solution

def loss_function_with_l2(X, lambda_l2):
    """
    Computes the loss with L2 regularization:
    (1 - tr(X^T X))^2 + lambda_l2 * ||X||_F^2
    """
    main_loss = loss_function(X)
    l2_reg = lambda_l2 * np.sum(X ** 2)
    return main_loss + l2_reg
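
# Side note (added, not part of the original script): writing t = tr(X^T X),
# the regularized objective is f(t) = (1 - t)^2 + lambda_l2 * t, and
# f'(t) = -2(1 - t) + lambda_l2 = 0 gives t* = 1 - lambda_l2 / 2.
# So L2 regularization shrinks the optimal tr(X^T X) slightly below 1.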

def sgd_l2_optimizer(learning_rate=0.01, n_iterations=100, lambda_l2=0.01, momentum=0.9):
    """
    Gradient descent with momentum on the L2-regularized loss.
    """
    X = np.random.randn(2, 2)
    velocity = np.zeros_like(X)
    loss_history = []
    for i in range(n_iterations):
        current_loss = loss_function_with_l2(X, lambda_l2)
        loss_history.append(current_loss)
        grad = gradient(X) + 2 * lambda_l2 * X  # Add the L2 gradient
        # Momentum update
        velocity = momentum * velocity - learning_rate * grad
        X = X + velocity
        if (i + 1) % 10 == 0:
            print(f"SGD+L2 Iteration {i+1}: Loss = {current_loss:.6f}")
    return X, loss_history

def adam_l2_optimizer(learning_rate=0.01, n_iterations=100, lambda_l2=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    Adam on the L2-regularized loss (the L2 term is added to the gradient).
    """
    X = np.random.randn(2, 2)
    m = np.zeros_like(X)  # First moment estimate
    v = np.zeros_like(X)  # Second moment estimate
    loss_history = []
    for i in range(n_iterations):
        current_loss = loss_function_with_l2(X, lambda_l2)
        loss_history.append(current_loss)
        grad = gradient(X) + 2 * lambda_l2 * X  # Add the L2 gradient
        # Update biased moment estimates
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * (grad ** 2)
        # Bias correction
        m_hat = m / (1 - beta1 ** (i + 1))
        v_hat = v / (1 - beta2 ** (i + 1))
        # Update X
        X = X - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
        if (i + 1) % 10 == 0:
            print(f"Adam+L2 Iteration {i+1}: Loss = {current_loss:.6f}")
    return X, loss_history

def adamw_optimizer(learning_rate=0.01, n_iterations=100, weight_decay=0.01, beta1=0.9, beta2=0.999, epsilon=1e-8):
    """
    AdamW: Adam with decoupled weight decay applied directly to the weights.
    """
    X = np.random.randn(2, 2)
    m = np.zeros_like(X)
    v = np.zeros_like(X)
    loss_history = []
    for i in range(n_iterations):
        current_loss = loss_function(X)  # Note: weight decay is decoupled, so log the unregularized loss
        loss_history.append(current_loss)
        grad = gradient(X)
        # Update biased moment estimates
        m = beta1 * m + (1 - beta1) * grad
        v = beta2 * v + (1 - beta2) * (grad ** 2)
        # Bias correction
        m_hat = m / (1 - beta1 ** (i + 1))
        v_hat = v / (1 - beta2 ** (i + 1))
        # AdamW update: decay the weights separately, then apply the Adam step
        X = X * (1 - learning_rate * weight_decay)
        X = X - learning_rate * m_hat / (np.sqrt(v_hat) + epsilon)
        if (i + 1) % 10 == 0:
            print(f"AdamW Iteration {i+1}: Loss = {current_loss:.6f}")
    return X, loss_history
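
# Design note (added, not part of the original script): in adam_l2_optimizer the
# 2 * lambda_l2 * X term flows through the moment estimates m and v, so its
# effective strength is rescaled by 1 / (sqrt(v_hat) + epsilon). In
# adamw_optimizer the decay multiplies X directly and bypasses the adaptive
# scaling; this is the "decoupled weight decay" idea behind AdamW
# (Loshchilov & Hutter, 2019).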
# Compare optimizers with adjusted learning rates and regularization
lr_sgd = 0.01
lr_adam = 0.001 # Reduced learning rate for Adam
lr_adamw = 0.001 # Reduced learning rate for AdamW
n_iterations = 100
lambda_l2 = 0.001 # Reduced L2 regularization strength
# Run all optimizers
X_sgd_l2, loss_sgd_l2 = sgd_l2_optimizer(lr_sgd, n_iterations, lambda_l2)
X_adam_l2, loss_adam_l2 = adam_l2_optimizer(lr_adam, n_iterations, lambda_l2)
X_adamw, loss_adamw = adamw_optimizer(lr_adamw, n_iterations, lambda_l2)  # lambda_l2 reused as weight_decay
# Plot comparison
plt.figure(figsize=(12, 6))
plt.plot(loss_sgd_l2, label='SGD+L2')
plt.plot(loss_adam_l2, label='Adam+L2')
plt.plot(loss_adamw, label='AdamW')
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Loss vs. Iteration - Optimizer Comparison')
plt.legend()
plt.grid(True)
plt.show()
# Print final results
print("\nFinal Results:")
print("SGD+L2 final loss:", loss_sgd_l2[-1])
print("Adam+L2 final loss:", loss_adam_l2[-1])
print("AdamW final loss:", loss_adamw[-1])