# function_fitting.py
import sys
import numpy as np
from matplotlib import pyplot as plt
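
# This script fits the target f(x) = x + sin(x) with a shallow regressor
#     y_hat(x) = sum_k a[k] * activation(x + b[k]),
# minimizing the mean squared error on a small random training set, either by
# gradient descent (optionally with an l1 or l2 penalty on a and b) or by
# mirror descent with mirror map psi(.) = 1/2 * ||.||_p^2, p = norm_order.
# Example invocations: `python function_fitting.py gd 1` or `python function_fitting.py md 0`.
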
path = "data/function_fitting/"
# Parse command line input if any
if len(sys.argv) == 3:
    algo = sys.argv[1]
    lp_regulizer = int(sys.argv[2])
    if algo not in ("md", "gd") or lp_regulizer not in (0, 1, 2):
        print("usage: function_fitting.py <algorithm> <regularizer> \n"
              "\talgorithm:\tmd, gd \n"
              "\tregularizer:\t0, 1, 2")
        sys.exit()
else:
    algo = "md"
    lp_regulizer = 0
# Hyperparameters
n_samples = 10                # number of training (and test) samples
dim_regressor = 200           # number of regressor features, i.e. dimension of a and b
epochs = 120000               # number of gradient / mirror descent iterations
rla = 2e-4                    # step size for the weights a
rlb = 2e-4                    # step size for the offsets b
norm_order = 1.01             # p in the mirror map psi = 1/2 * ||.||_p^2 (mirror descent only)
activation_function = "ReLU"  # "ReLU" or "SiLU"
# Function to be fitted
def f(x):
    return x + np.sin(x)

def ReLU(x):
    return (x>0)*x

def SiLU(x):
    return x/(1 + np.exp(-x))

def grad_SiLU_b(x):
    return (1 + np.exp(-x)*(1+x))/(1+np.exp(-x))**2

def grad_ReLU_b(x):
    return (x>0)*1

def l2_reg(x):
    return np.linalg.norm(x, ord=2)

def l1_reg(x):
    return np.linalg.norm(x, ord=1)

def grad_l2(x):
    return x/np.linalg.norm(x, ord=2)

def grad_l1(x):
    return np.sign(x)
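
# Notes on the helpers above: grad_ReLU_b and grad_SiLU_b are the derivatives of the
# activations with respect to their argument (and hence with respect to the offsets b);
# grad_l2(x) = x/||x||_2 is the gradient of the l2 norm, and grad_l1(x) = sign(x) is a
# subgradient of the l1 norm (taken as 0 at x = 0 via np.sign).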
# Gradient of the (halved) squared p-norm, 1/2 * ||x||_p^2
def spnorm_grad(x, p):
    # Define gradient as zero close to x = 0 to avoid overflows
    if np.linalg.norm(x, ord=p) < 1e-3:
        return np.zeros(x.shape)
    else:
        return x * (abs(x)/np.linalg.norm(x, ord=p))**(p-2)

# Gradient of the (halved) squared conjugate p-norm (q-norm)
def spnorm_conj_grad(x, p):
    q = p/(p-1)
    return spnorm_grad(x, q)
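
# For psi(x) = 1/2 ||x||_p^2 the convex conjugate is psi*(z) = 1/2 ||z||_q^2 with
# 1/p + 1/q = 1, so the inverse mirror map (grad of psi*) is simply spnorm_grad
# evaluated with the conjugate exponent q; this is what spnorm_conj_grad implements.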
# Set random seed for repeatability
np.random.seed(99)
# Sample training and test set
x = np.random.uniform(0, 10, n_samples)
y = f(x)
xt = np.random.uniform(0, 10, n_samples)
yt = f(xt)
# Initialize parameters and errors
a = np.random.uniform(-0.1, 0.1, dim_regressor)
b = np.random.uniform(-10, 0, dim_regressor)
e = np.zeros(epochs)
et = np.zeros(epochs)
if activation_function == "SiLU":
    activation = SiLU
    grad_activation_b = grad_SiLU_b
else:
    activation = ReLU
    grad_activation_b = grad_ReLU_b
if algo == "gd":
    if lp_regulizer == 2:
        lambda_1 = 0.03
        lambda_2 = 0.1
        regulizer = l2_reg
        grad_regulizer = grad_l2
    elif lp_regulizer == 1:
        lambda_1 = 0.005
        lambda_2 = 0.02
        regulizer = l1_reg
        grad_regulizer = grad_l1
    else:
        lambda_1 = 0
        lambda_2 = 0
        regulizer = l1_reg
        grad_regulizer = grad_l1
    # Supertitle for plot
    if lambda_1 == 0 and lambda_2 == 0:
        optimization_details = "Gradient descent"
    else:
        optimization_details = (f"Gradient descent with $\\ell_{lp_regulizer}$ regularization\n"
                                f"$\\lambda_1 = {lambda_1}$, $\\lambda_2 = {lambda_2}$")
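    # Per-sample squared-error gradients accumulated below (with y_hat_i = activation(x_i + b) @ a):
    #   d/da (y_i - y_hat_i)^2 = -2 * (y_i - y_hat_i) * activation(x_i + b)
    #   d/db (y_i - y_hat_i)^2 = -2 * (y_i - y_hat_i) * activation'(x_i + b) * a
    # The sample-averaged gradients plus the regularizer gradients drive the parameter updates.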
    for j in range(epochs):
        d_a = 0
        d_b = 0
        for i in range(n_samples):
            xi = x[i]
            yi = y[i]
            ReLU_i = activation(xi + b)
            ReLU_i_a = ReLU_i@a
            G_b_i = grad_activation_b(xi + b)
            d_a += -2*(yi - ReLU_i_a)*ReLU_i
            d_b += -2*(yi - ReLU_i_a)*G_b_i*a
            e[j] += (yi - ReLU_i_a)**2
            et[j] += (yt[i] - activation(xt[i] + b)@a)**2
        e[j] /= n_samples
        et[j] /= n_samples
        # e[j] += lambda_1*regulizer(a) + lambda_2*regulizer(b)
        # et[j] += lambda_1*regulizer(a) + lambda_2*regulizer(b)
        a -= rla*(d_a/n_samples + lambda_1*grad_regulizer(a))
        b -= rlb*(d_b/n_samples + lambda_2*grad_regulizer(b))
    np.save(f"{path}gd_l{lp_regulizer}_training", e)
    np.save(f"{path}gd_l{lp_regulizer}_test", et)
else:
    optimization_details = f"Mirror descent with $\\psi(\\cdot) = \\frac{{1}}{{2}} \\|\\|\\cdot\\|\\|_{{{norm_order:.2f}}}^2$"
    g_a = spnorm_grad(a, norm_order)
    g_b = spnorm_grad(b, norm_order)
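    # Mirror descent: keep dual (mirror) variables g_a = grad psi(a) and g_b = grad psi(b),
    # take gradient steps on the loss in the dual space, and map back to the primal
    # parameters with the inverse mirror map grad psi* at the start of each iteration.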
    for j in range(epochs):
        a = spnorm_conj_grad(g_a, norm_order)
        b = spnorm_conj_grad(g_b, norm_order)
        d_g_a = 0
        d_g_b = 0
        for i in range(n_samples):
            xi = x[i]
            yi = y[i]
            ReLU_i = activation(xi + b)
            ReLU_i_a = ReLU_i@a
            G_b_i = grad_activation_b(xi + b)
            d_g_a += -2*(yi - ReLU_i_a)*ReLU_i
            d_g_b += -2*(yi - ReLU_i_a)*G_b_i*a
            e[j] += (yi - ReLU_i_a)**2
            et[j] += (yt[i] - activation(xt[i] + b)@a)**2
        e[j] /= n_samples
        et[j] /= n_samples
        g_a -= rla*d_g_a/n_samples
        g_b -= rlb*d_g_b/n_samples
    np.save(f"{path}md_l{norm_order:.0f}_training", e)
    np.save(f"{path}md_l{norm_order:.0f}_test", et)
plt.figure()
plt.subplot(2,2,1)
xp = np.linspace(-2, 12, 100)
yp = np.array([activation(xi + b)@a for xi in xp])
plt.plot(xp, yp)
plt.plot(xp, f(xp), linestyle="--")
plt.plot(x, y, "xr")
plt.legend(("Regressor approximation", "True function", "Training samples"))
plt.title("Function approximation")
plt.xlabel("$x$")
plt.ylabel("$y$")
plt.suptitle(optimization_details)
plt.subplot(2,2,2)
plt.semilogy(e)
plt.semilogy(et)
plt.ylim((1e-4, 30))
plt.legend(("Training error", "Test error"))
plt.xlabel("Iteration")
plt.ylabel(r"$\Vert \hat{y}-y \Vert_2^2$")
plt.title("Squared fitting error")
plt.subplot(2,2,3)
plt.hist(a, bins=50, range=(-0.5, 0.5))
plt.ylim((0, 80))
plt.xlabel(r"$\alpha$")
plt.ylabel("Frequency")
plt.title(r"$\alpha$ distribution")
plt.subplot(2,2,4)
plt.hist(b, bins=50, range=(-12,2))
plt.ylim((0, 20))
plt.xlabel(r"$\beta$")
plt.ylabel("Frequency")
plt.title(r"$\beta$ distribution")
plt.subplots_adjust(left=0.05, right=0.95, top=0.91, bottom=0.06)
plt.show()