-
Notifications
You must be signed in to change notification settings - Fork 1
/
Copy pathgradientdescent.py
186 lines (173 loc) · 8.97 KB
/
gradientdescent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
"""
Performs optimization to find optimal importance weights for the sources
of tokens using Gradient Descent
"""
import numpy as np
import json
class GradientDescentOptimizer:
@classmethod
def __init__( self, number_iterations, data_sources, seed=2 ):
self.number_iterations = number_iterations
self.sources = data_sources
self.best_similarity_score = 1.0
np.random.seed( seed )
@classmethod
def get_random_weights( self ):
"""
Initialize the uniform weight matrices
"""
weights = dict()
random_weights = np.random.rand( len( self.sources ), 1 )
random_weights = [ float( item ) / np.sum( random_weights ) for item in random_weights ]
for index, item in enumerate( self.sources ):
weights[ item ] = random_weights[ index ]
return weights
@classmethod
def normalize_weights( self, weights ):
"""
Normalize the weights
"""
sum_weights = np.sum( [ weights[ item ] for item in weights ] )
for source in weights:
weights[ source ] = weights[ source ] / float( sum_weights )
return weights
@classmethod
def update_weights( self, weights, gradients, prev_update, iteration, eta, gamma=0.9, lr_decay=0.1 ):
"""
Update weights with momentum and time-based decay of learning rate
"""
eta = eta / ( 1. + ( lr_decay * iteration ) )
for source in weights:
new_update = gamma * prev_update[ source ] + eta * gradients[ source ]
weights[ source ] = weights[ source ] - new_update
prev_update[ source ] = new_update
return self.normalize_weights( weights ), prev_update, eta
@classmethod
def update_weights_nag( self, weights, gradients, prev_update, iteration, eta, tool_index, similarity_matrix, n_tools, gamma=0.9, lr_decay=0.1 ):
"""
Update weights with momentum and time-based decay of learning rate
"""
eta = eta / ( 1. + ( lr_decay * iteration ) )
forward_weights = dict()
ideal_score = np.repeat( self.best_similarity_score, n_tools )
for source in weights:
forward_weights[ source ] = weights[ source ] + gamma * prev_update[ source ]
for source in weights:
tool_score = similarity_matrix[ source ][ tool_index ]
loss = forward_weights[ source ] * similarity_matrix[ source ][ tool_index ] - ideal_score
forward_gradient = 2 * np.dot( tool_score, loss ) / n_tools
new_update = gamma * prev_update[ source ] - eta * forward_gradient
weights[ source ] = weights[ source ] + new_update
prev_update[ source ] = new_update
return self.normalize_weights( weights ), prev_update, eta
@classmethod
def compute_loss( self, weight, uniform_weight, tool_scores, ideal_tool_score ):
"""
Compute loss between the ideal score and hypothesised similarity scores
"""
loss = weight * tool_scores - ideal_tool_score
uniform_loss = uniform_weight * tool_scores - ideal_tool_score
return loss, uniform_loss
@classmethod
def verify_gradients( self, ideal_score, weights, tool_index, sources_gradient, similarity_matrix, epsilon=1e-4 ):
"""
Find the approximation of the gradient using mean squared error
"""
derivative_sources = dict()
for source in weights:
weight = weights[ source ]
tools_scores_source = similarity_matrix[ source ][ tool_index ]
loss_0 = ( weight + epsilon ) * tools_scores_source - ideal_score
sq_error_0 = np.mean( loss_0 ** 2 )
loss_1 = ( weight - epsilon ) * tools_scores_source - ideal_score
sq_error_1 = np.mean( loss_1 ** 2 )
derivative_sources[ source ] = ( sq_error_0 - sq_error_1 ) / ( 2 * epsilon )
return derivative_sources
@classmethod
def gradient_descent( self, similarity_matrix, tools_list ):
"""
Apply gradient descent optimizer to find the weights for the sources of annotations of tools
"""
num_all_tools = len( tools_list )
tools_optimal_weights = dict()
cost_tools = dict()
uniform_cost_tools = dict()
learning_rates = dict()
gradients = dict()
learning_rates_iterations_tool = dict()
approx_gd_tools = dict()
actual_gd_tools = dict()
uniform_weight = 1. / len( self.sources )
for tool_index in range( num_all_tools ):
tool_id = tools_list[ tool_index ]
print "Tool index: %d and tool name: %s" % ( tool_index, tool_id )
# uniform weights to start with for each tool
weights = self.get_random_weights()
print weights
cost_iteration = list()
gradient_io_iteration = list()
gradient_nd_iteration = list()
gradient_ht_iteration = list()
uniform_cost_iteration = list()
approx_gd = list()
actual_gd = list()
lr_iteration = list()
lr_rates = list()
ideal_tool_score = np.repeat( self.best_similarity_score, num_all_tools )
prev_weights_updates = { self.sources[ 0 ]: 0.0, self.sources[ 1 ]: 0.0, self.sources[ 2 ]: 0.0 }
eta = 0.5e-1
# find optimal weights through these iterations
for iteration in range( self.number_iterations ):
sources_gradient = dict()
cost_sources = list()
uniform_cost_sources = list()
tool_similarity_scores = dict()
# compute gradient, loss and update weight for each source
for source in similarity_matrix:
weight = weights[ source ]
tools_score_source = similarity_matrix[ source ][ tool_index ]
# adjust for the position of the tool itself so that it does not account for the loss computation
tool_similarity_scores[ source ] = tools_score_source
# compute losses
loss, uniform_loss = self.compute_loss( weight, uniform_weight, tools_score_source, ideal_tool_score )
# compute average gradient
gradient = 2 * np.dot( tools_score_source, loss ) / num_all_tools
# gather gradient for a source
sources_gradient[ source ] = gradient
squared_loss = np.mean( loss ** 2 )
squared_uniform_loss = np.mean( uniform_loss ** 2 )
# add cost for a tool's source
cost_sources.append( squared_loss )
uniform_cost_sources.append( squared_uniform_loss )
mean_cost = np.mean( cost_sources )
derivative_sources = self.verify_gradients( ideal_tool_score, weights, tool_index, sources_gradient, similarity_matrix )
approx_gd.append( derivative_sources )
actual_gd.append( sources_gradient )
#weights, prev_weights_updates, learning_rate = self.update_weights( weights, sources_gradient, prev_weights_updates, iteration, eta )
weights, prev_weights_updates, learning_rate = self.update_weights_nag( weights, sources_gradient, prev_weights_updates, iteration, eta, tool_index, similarity_matrix, num_all_tools )
lr_iteration.append( learning_rate )
# gather cost for each iteration
cost_iteration.append( mean_cost )
# gather gradients
gradient_io_iteration.append( sources_gradient[ self.sources[ 0 ] ] )
gradient_nd_iteration.append( sources_gradient[ self.sources[ 1 ] ] )
gradient_ht_iteration.append( sources_gradient[ self.sources[ 2 ] ] )
uniform_cost_iteration.append( np.mean( uniform_cost_sources ) )
# optimal weights learned
print weights
print "=================================================="
tools_optimal_weights[ tool_id ] = weights
learning_rates[ tool_id ] = lr_iteration
cost_tools[ tool_id ] = cost_iteration
uniform_cost_tools[ tool_id ] = uniform_cost_iteration[ 0 ]
gradients[ tool_id ] = { self.sources[ 0 ]: gradient_io_iteration, self.sources[ 1 ]: gradient_nd_iteration, self.sources[ 2 ]: gradient_ht_iteration }
approx_gd_tools[ tool_id ] = approx_gd
actual_gd_tools[ tool_id ] = actual_gd
with open( "data/approx_gd_tools.json", 'w' ) as file:
file.write( json.dumps( approx_gd_tools ) )
with open( "data/actual_gd_tools.json", 'w' ) as file:
file.write( json.dumps( actual_gd_tools ) )
# write learning rates for all tools as json file iterations
with open( "data/learning_rates.json", 'w' ) as file:
file.write( json.dumps( learning_rates ) )
return tools_optimal_weights, cost_tools, learning_rates, uniform_cost_tools, gradients