# QLearner.py
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import numpy.random as rnd
eps = np.finfo(float).eps
############# Defining Agent To Be Trained ##################
class greedye_QL:
    # Training agent: epsilon-greedy Q-learner over a discretised (X, N, time) state space
    def __init__(self, num_actions, modulus, state_UB, alpha, disc1, disc2, movements):
        self.name = "e-Greedy"
        self.num_actions, self.modulus, self.UB, self.movements = num_actions, modulus, state_UB, movements
        self.alpha, self.disc1, self.disc2 = alpha, disc1, disc2
        # discretised grids for the two state variables, from 0 to their upper bounds in steps of modulus
        self.X_s = np.linspace(0, int(self.UB[0]), int(self.UB[0]/self.modulus[0] + 1), dtype=np.float64)
        self.N_s = np.linspace(0, int(self.UB[1]), int(self.UB[1]/self.modulus[1] + 1), dtype=np.float64)
        # number of decimal places in each step size, used to round grid values into consistent dictionary keys
        # (assumes a fractional modulus such as 0.5; an integer modulus has no '.' in its string form)
        f1, f2 = str(self.modulus[0]), str(self.modulus[1])
        decimal1, decimal2 = f1[::-1].find('.'), f2[::-1].find('.')
        # visit counter and Q-table, keyed by (X, N, time-to-termination); Q-values are initialised randomly
        self.sitevisit = {(X_si, N_si, ti): np.zeros((1))
                          for X_si in np.round(self.X_s, decimal1)
                          for N_si in np.round(self.N_s, decimal2)
                          for ti in np.arange(1, self.movements + 1, dtype="int")}
        self.d = {(X_si, N_si, ti): np.random.randn(num_actions)
                  for X_si in np.round(self.X_s, decimal1)
                  for N_si in np.round(self.N_s, decimal2)
                  for ti in np.arange(1, self.movements + 1, dtype="int")}
    def act(self, state, eps_prob, s):
        # epsilon-greedy action selection: explore with probability eps_prob,
        # otherwise take the greedy action for the current (X, N, time-to-termination) entry
        self.eps_prob = eps_prob
        time_to_term = int(self.movements - s)
        if np.random.uniform(0, 1) <= self.eps_prob:
            action = np.random.randint(0, self.num_actions)
        else:
            action = np.argmax(self.d[(state[0], state[1], time_to_term)])
        return action
    def Learn(self, state, action, reward):
        # Q-learning update over a full trajectory: `state` holds the visited states
        # (one row per step, including the terminal state) and `action` the chosen action indices
        self.reward = reward                 # stored, but the update below builds its own per-step reward
        XT, NT = state[-1, 0], state[-1, 1]  # terminal values, used to normalise the reward terms
        for i in range(0, action.shape[0]):
            indx = action[i]
            time_to_term = int(self.movements - i)
            if i < action.shape[0] - 1:
                # bootstrap from the best action value in the next state
                opt_action = np.argmax(self.d[(state[i+1, 0], state[i+1, 1], int(time_to_term - 1))])
                opt_future = self.d[(state[i+1, 0], state[i+1, 1], int(time_to_term - 1))][opt_action]
            else:
                opt_future = 0
            # per-step reward from the change in X and N, weighted by dR/dX and dR/dN
            # and normalised by the terminal values
            dXdt, dNdt = (state[i+1, 0] - state[i, 0]), (state[i+1, 1] - state[i, 1])
            dRdX, dRdN = 100, -1
            varderivX, varderivN = dXdt * dRdX * 1/(XT + 1), dNdt * dRdN * 1/(NT + 1)
            Rtp1 = (varderivX + varderivN)
            # standard Q-learning update: Q <- (1 - alpha) * Q + alpha * (r + disc2 * max Q')
            self.d[(state[i, 0], state[i, 1], time_to_term)][int(indx)] = (
                self.d[(state[i, 0], state[i, 1], time_to_term)][int(indx)] * (1 - self.alpha)
                + self.alpha * (Rtp1 + opt_future * self.disc2))
        return
    def learned(self):
        # return the learned Q-dictionary so it can be handed to the trained agent below
        return self.d
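
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, not part of the original training code).
# The dynamics in `toy_step` below are a hypothetical placeholder: this file
# does not define an environment. States are assumed to land exactly on the
# `modulus` grid used as keys of the Q-dictionary `d`.
def _example_training_run(episodes=100):
    modulus, state_UB, movements = (0.5, 0.5), (10, 10), 5
    agent = greedye_QL(num_actions=3, modulus=modulus, state_UB=state_UB,
                       alpha=0.1, disc1=0.99, disc2=0.99, movements=movements)

    def toy_step(state, action):
        # placeholder dynamics: the action moves X down/hold/up by one grid step, N decays by one step
        X = float(np.clip(state[0] + (action - 1) * modulus[0], 0, state_UB[0]))
        N = float(np.clip(state[1] - modulus[1], 0, state_UB[1]))
        return [X, N]

    for episode in range(episodes):
        eps_prob = max(0.05, 1.0 - episode / episodes)  # decaying exploration
        states = np.zeros((movements + 1, 2))
        states[0] = [0.0, state_UB[1]]                  # arbitrary grid-aligned start state
        actions = np.zeros(movements, dtype=int)
        for s in range(movements):
            actions[s] = agent.act(states[s], eps_prob, s)
            states[s + 1] = toy_step(states[s], actions[s])
        agent.Learn(states, actions, reward=0.0)        # the reward argument is stored but unused in the update
    return agent.learned()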
############ Defining learned Agent #################
class Greedye_QLearned:
    # Trained agent: acts from a Q-dictionary learned by greedye_QL
    def __init__(self, num_actions, d, movements):
        self.name = "e-Greedy"
        self.num_actions = num_actions
        self.d, self.movements = d, movements
    def act(self, state, eps_prob, s):
        # epsilon-greedy action selection, identical to the training agent;
        # pass eps_prob = 0 to act purely greedily from the learned values
        self.eps_prob = eps_prob
        time_to_term = int(self.movements - s)
        if np.random.uniform(0, 1) <= self.eps_prob:
            action = np.random.randint(0, self.num_actions)
        else:
            action = np.argmax(self.d[(state[0], state[1], time_to_term)])
        return action
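
# ---------------------------------------------------------------------------
# Deployment sketch (illustrative only): wrapping a learned Q-dictionary in the
# trained-agent class and acting greedily (eps_prob = 0). `trained_dict` would
# typically come from greedye_QL.learned(), e.g. the _example_training_run
# sketch above; the state and step index follow the same conventions as act().
def _example_greedy_action(trained_dict, num_actions, movements, state, s):
    policy = Greedye_QLearned(num_actions=num_actions, d=trained_dict, movements=movements)
    return policy.act(state, eps_prob=0.0, s=s)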