#!/usr/bin/env python3
#-----------------------------------------------------------------------------------------------
# Imports
#-----------------------------------------------------------------------------------------------
import sys, os, pickle
import logging
import time
import numpy as np
from algorithms.rl_algorithm import RLAlgorithm
#-----------------------------------------------------------------------------------------------
# Functions
#-----------------------------------------------------------------------------------------------
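#Note: the runner functions below assume a gym API where env.reset() returns just the
#observation and env.step() returns a 4-tuple of (observation, reward, done, info)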
def run_gym_q_learning_multi_agent(env, n_agents: int=1, render: bool=False, episodes: int=100, time_steps: int=10000, gamma: float=0.99, epsilon_max: float=1.0, epsilon_min: float=0.01, lr: float=0.7, decay: float=0.999, saved_path: str=None):
    """
    function to run the independent q-learning algorithm on a gym env with multiple agents
    env is the gym env object
    n_agents is the number of agents
    render determines whether to render the env
    episodes is the number of episodes to simulate
    time_steps is the maximum number of time steps per episode
    returns obvs, actions and rewards of all agents, the robot-maze paths (if applicable) and the time of each episode in seconds
    """
    if n_agents < 1:
        raise ValueError("Cannot have fewer than 1 agent")
    elif n_agents < 2:
        logging.error("Running multi-agent function with only 1 agent. Use the single-agent function for single-agent environments")
    #get env variables
    n_actions = env.action_space.n #number of actions
    #value range limited to -1000 to 1000 as the maximum memory may otherwise be reached
    #any env whose observation space requires more memory than this cannot be used with this implementation of q-learning
    low = np.clip(env.observation_space.low, -1000, 1000) #minimum values of observation space
    high = np.clip(env.observation_space.high, -1000, 1000) #maximum values of observation space
    n_states = np.prod(high - low + 1, dtype=int) #number of discretised states
    agents = [QLearning(n_states, n_actions, gamma=gamma, epsilon_max=epsilon_max, epsilon_min=epsilon_min, lr=lr, decay=decay, saved_path=saved_path) for _ in range(n_agents)]
    #init arrays to collect data
    all_times = []
    all_obvs = []
    all_actions = []
    all_rewards = []
    #robot-maze env can save the path taken by the agents each episode
    robot_paths = []
    #render env if enabled
    if render:
        env.render()
    for e in range(episodes):
        obvs = env.reset()
        states = np.zeros(n_agents, dtype=int)
        next_states = np.zeros(n_agents, dtype=int)
        for i in range(n_agents):
            states[i] = agents[i].index_obv(obvs[i], low, high)
        start_time = time.time()
        ep_obvs = []
        ep_actions = []
        total_rewards = np.zeros(n_agents)
        done = False
        for t in range(time_steps):
            if render:
                env.render()
            actions = np.zeros(n_agents, dtype=int)
            for i in range(n_agents):
                actions[i] = agents[i].get_action(states[i])
            next_obvs, rewards, done, info = env.step(actions)
            for i in range(n_agents):
                next_states[i] = agents[i].index_obv(next_obvs[i], low, high)
                agents[i].train(states[i], actions[i], rewards[i], next_states[i])
            ep_obvs.append(obvs)
            ep_actions.append(actions)
            obvs = next_obvs
            #copy so that updating next_states on the next time step does not also overwrite states
            states = np.copy(next_states)
            total_rewards += rewards
            if done:
                logging.info("Episode %u completed, after %u time steps, with total reward = %s", e, t, str(total_rewards))
                ep_time = round((time.time() - start_time), 3)
                all_times.append(ep_time)
                all_obvs.append(ep_obvs)
                all_actions.append(ep_actions)
                all_rewards.append(total_rewards)
                if env.unwrapped.spec.id.startswith("gym_robot_maze"):
                    robot_paths.append(info["robot_path"])
                break
            elif t >= (time_steps - 1):
                logging.info("Episode %u timed out, with total reward = %s", e, str(total_rewards))
                ep_time = round((time.time() - start_time), 3)
                all_times.append(ep_time)
                all_obvs.append(ep_obvs)
                all_actions.append(ep_actions)
                all_rewards.append(total_rewards)
                if env.unwrapped.spec.id.startswith("gym_robot_maze"):
                    robot_paths.append(info["robot_path"])
                break
        if env.unwrapped.spec.id.startswith("maze-") and env.is_game_over():
            sys.exit(0)
        for i in range(n_agents):
            agents[i].update_parameters(e)
    return all_obvs, all_actions, all_rewards, robot_paths, all_times
def run_gym_q_learning_single_agent(env, render: bool=False, episodes: int=100, time_steps: int=10000, gamma: float=0.99, epsilon_max: float=1.0, epsilon_min: float=0.01, lr: float=0.7, decay: float=0.999, saved_path: str=None):
    """
    function to run the q-learning algorithm on a gym env with a single agent
    env is the gym env object
    render determines whether to render the env
    episodes is the number of episodes to simulate
    time_steps is the maximum number of time steps per episode
    returns obvs, actions and rewards of the agent, the robot-maze paths (if applicable) and the time of each episode in seconds
    """
    #get env variables
    n_actions = env.action_space.n #number of actions
    #value range limited to -1000 to 1000 as the maximum memory may otherwise be reached
    #any env whose observation space requires more memory than this cannot be used with this implementation of q-learning
    low = np.clip(env.observation_space.low, -1000, 1000) #minimum values of observation space
    high = np.clip(env.observation_space.high, -1000, 1000) #maximum values of observation space
    n_states = np.prod(high - low + 1, dtype=int) #number of discretised states
    agent = QLearning(n_states, n_actions, gamma=gamma, epsilon_max=epsilon_max, epsilon_min=epsilon_min, lr=lr, decay=decay, saved_path=saved_path)
    #init arrays to collect data
    all_times = []
    all_obvs = []
    all_actions = []
    all_rewards = []
    #robot-maze env can save the path taken by the agent each episode
    robot_paths = []
    #render env if enabled
    if render:
        env.render()
    for e in range(episodes):
        obv = env.reset()
        state = agent.index_obv(obv, low, high)
        start_time = time.time()
        ep_obvs = []
        ep_actions = []
        total_reward = 0
        done = False
        for t in range(time_steps):
            if render:
                env.render()
            action = agent.get_action(state)
            next_obv, reward, done, info = env.step(action)
            next_state = agent.index_obv(next_obv, low, high)
            agent.train(state, action, reward, next_state)
            ep_obvs.append(obv)
            ep_actions.append(action)
            obv = next_obv
            state = next_state
            total_reward += reward
            if done:
                logging.info("Episode %u completed, after %u time steps, with total reward = %f", e, t, total_reward)
                all_times.append(round((time.time() - start_time), 3))
                all_obvs.append(ep_obvs)
                all_actions.append(ep_actions)
                all_rewards.append(total_reward)
                if env.unwrapped.spec.id.startswith("gym_robot_maze"):
                    robot_paths.append(info["robot_path"])
                break
            elif t >= (time_steps - 1):
                logging.info("Episode %u timed out, with total reward = %f", e, total_reward)
                all_times.append(round((time.time() - start_time), 3))
                all_obvs.append(ep_obvs)
                all_actions.append(ep_actions)
                all_rewards.append(total_reward)
                if env.unwrapped.spec.id.startswith("gym_robot_maze"):
                    robot_paths.append(info["robot_path"])
                break
        if env.unwrapped.spec.id.startswith("maze-") and env.is_game_over():
            sys.exit(0)
        agent.update_parameters(e)
    return all_obvs, all_actions, all_rewards, robot_paths, all_times
#-----------------------------------------------------------------------------------------------
# Classes
#-----------------------------------------------------------------------------------------------
class QLearning(RLAlgorithm):
    """
    Class to contain the Q-table and all parameters, with methods to update the Q-table and get actions
    """
    def __init__(self, n_states: int, n_actions: int, gamma: float=0.99, epsilon_max: float=1.0, epsilon_min: float=0.01, lr: float=0.7, decay: float=0.999, saved_path: str=None):
        """
        function to initialise the QLearning class
        n_states is the number of discrete (discretised if continuous) states in the environment
        n_actions is the number of discrete actions (q-learning will only perform with a discrete action space)
        gamma is a float which is the discount factor of future rewards
        epsilon_max is the maximum exploration rate of the agent
        epsilon_min is the minimum exploration rate of the agent
        lr is the learning rate of the agent
        decay is the rate at which epsilon and the learning rate decay each episode
        saved_path is the path to a previously saved model (pickled q-table) to load, if provided
        """
        self.gamma = gamma
        self.lr = lr
        self.decay = decay
        self.n_actions = n_actions
        self.epsilon = epsilon_max
        self._epsilon_max = epsilon_max
        self._epsilon_min = epsilon_min
        self._q_table = np.zeros((n_states, self.n_actions))
        #load a saved model (q-table) if provided
        if saved_path:
            if os.path.isfile(saved_path):
                with open(saved_path, "rb") as handle:
                    self._q_table = pickle.load(handle)
            else:
                raise FileNotFoundError(f"no saved model found at {saved_path}")
    #-------------------------------------------------------------------------------------------
    # Properties
    #-------------------------------------------------------------------------------------------
    @property
    def epsilon(self) -> float:
        return self._epsilon

    @epsilon.setter
    def epsilon(self, val: float):
        if val < 0 or val > 1:
            raise ValueError("epsilon (exploration probability) must have a value between 0 and 1 (inclusive).")
        if not isinstance(val, float):
            raise TypeError("epsilon (exploration probability) must be a float")
        self._epsilon = val

    @property
    def epsilon_max(self) -> float:
        return self._epsilon_max

    @property
    def epsilon_min(self) -> float:
        return self._epsilon_min

    @property
    def q_table(self):
        return self._q_table

    #-------------------------------------------------------------------------------------------
    # Methods
    #-------------------------------------------------------------------------------------------
    def save_model(self, path: str):
        """
        function to save the model (q-table) to a file
        path is a string of the path to the file where the model will be saved
        """
        with open(path, "wb") as handle:
            pickle.dump(self.q_table, handle, protocol=pickle.HIGHEST_PROTOCOL)

    def get_action(self, obv_i: int):
        """
        function to get the action based on the current observation using an epsilon-greedy policy
        obv_i is the current observation indexed for the q_table (using the index_obv function)
        returns the action to take as an int
        Note: index_obv should be called outside of this class to prevent performance issues
        """
        #take a random action with probability epsilon (exploration rate)
        if np.random.uniform(0, 1) < self.epsilon:
            action = np.random.choice(self.n_actions)
        else:
            #otherwise act greedily with respect to the q-table
            action = np.argmax(self.q_table[obv_i])
        return action
    def update_parameters(self, n_t: int):
        """
        function to reduce the value of the learning parameters (epsilon and lr), decaying at the rate of the decay property
        n_t is the current episode number
        epsilon is clamped so that it never falls below epsilon_min
        """
        self.epsilon = max(self.epsilon_min, self.epsilon * self.decay ** (n_t + 1))
        self.lr *= self.decay ** (n_t + 1)
    def train(self, obv_i: int, action: int, reward: float, next_obv_i: int):
        """
        function to train the agent by applying the q-value update rule to the q-table
        obv_i is the observation from the environment indexed for the q_table (using the index_obv function)
        action is the action taken by the agent
        reward is the reward provided by the environment after taking action in the current state
        next_obv_i is the observation after taking action in the current state, indexed for the q_table (using the index_obv function)
        Note: index_obv should be called outside of this class to prevent performance issues
        """
        #ensure action is an int for indexing the q-table
        action = int(action)
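        #apply the standard tabular q-learning update:
        #    Q(s, a) <- Q(s, a) + lr * (r + gamma * max_a' Q(s', a') - Q(s, a))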
        self.q_table[obv_i, action] += self.lr * (reward + (self.gamma * np.max(self.q_table[next_obv_i])) - self.q_table[obv_i, action])
    def index_obv(self, obv: np.ndarray, low: np.ndarray, high: np.ndarray) -> int:
        """
        function to turn an observation from the environment into an index for the q-table (int)
        obv is the observation to be indexed
        low is an array of the lowest values for each observation dimension (e.g. for 4 dimensions
            with the lowest value of each being 0, an array of [0, 0, 0, 0] should be passed to this
            function. If using an openai gym env this array is equivalent to env.observation_space.low)
        high is an array of the highest values for each observation dimension (same as for low,
            except the highest values should be passed. If using an openai gym env this array is
            equivalent to env.observation_space.high)
        returns the index as an int
        Note: this function should not be called inside of the QLearning class for performance
        """
        index = 0
        scaler = 1
        #loop through the observation array in reverse so each dimension is scaled by the combined
        #size of all the dimensions after it (mixed-radix encoding), giving a unique index per state
        for i in range((np.size(obv) - 1), -1, -1):
            index += (obv[i] - low[i]) * scaler
            scaler *= (high[i] + 1 - low[i])
        return int(index)
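#-----------------------------------------------------------------------------------------------
# Example usage
#-----------------------------------------------------------------------------------------------
#Minimal sketch of how this module might be used. It assumes the classic gym package is installed
#and that an integer-observation environment has been registered; "MyGridEnv-v0" below is a
#placeholder id, not a real environment, and should be replaced with a real one.
if __name__ == "__main__":
    logging.basicConfig(level=logging.INFO)

    #sanity check of the q-table indexing on a small 3x4x5 grid of integer observations
    demo_agent = QLearning(n_states=3 * 4 * 5, n_actions=2)
    demo_low = np.array([0, 0, 0])
    demo_high = np.array([2, 3, 4])
    print(demo_agent.index_obv(np.array([1, 2, 3]), demo_low, demo_high)) #prints 33 (= 1*20 + 2*5 + 3)

    #uncomment to train on a registered gym environment using the 4-tuple step API
    #import gym
    #env = gym.make("MyGridEnv-v0") #placeholder env id
    #obvs, actions, rewards, paths, times = run_gym_q_learning_single_agent(env, episodes=100)
    #print("mean episode reward:", np.mean(rewards))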