GlobalParams.py
import numpy as np
"""Define the environment"""
# Actions
A = np.array(["n", "w", "s", "e"])
# Policy = random uniform
piProbs = {A[0]: .25, A[1]: .25, A[2]: .25, A[3]: .25}
# Rewards
rewards = {"pos": 1, "neg": -1}
neg_reward_states = np.array([[3, 3], [4, 5], [4, 6], [5, 6], [5, 8], [6, 8], [7, 3], [7, 5], [7, 6]])
pos_reward_states = np.array([[5, 5]])
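
# Illustrative helper (an assumption, not part of the original file): one way the
# reward tables above might be queried for the reward of entering a [row, col] state.
def reward_for(state):
    if any((state == s).all() for s in pos_reward_states):
        return rewards["pos"]
    if any((state == s).all() for s in neg_reward_states):
        return rewards["neg"]
    return 0
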
# Walls
Walls = np.array([[2, 1], [2, 2], [2, 3], [2, 4],
                  [3, 4], [4, 4], [5, 4], [6, 4], [7, 4],
                  [2, 6], [2, 7], [2, 8]])
# Discount factor for the episodic MDP
Gamma = 0.9
# Grid dimension
NROWS, NCOLS = 10, 10
# State Value Function
v = np.zeros((NROWS, NCOLS))
# Policy
pi = np.full([NROWS, NCOLS], "nwse")
# Map each flat state index to its [row, col] grid coordinates (row-major order)
states_encoded = dict()
for row in range(NROWS):
    for col in range(NCOLS):
        ind = row * NCOLS + col
        states_encoded[ind] = np.array([row, col])
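
# Illustrative helper (an assumption, not in the original file): the inverse of the
# row-major mapping above, turning a [row, col] state into its flat index for Q.
def encode_state(state):
    return state[0] * NCOLS + state[1]
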
# State-Action Value Function
Q = np.zeros((len(states_encoded), len(A)))
terminal_states = np.array([[5, 5]])
starting_state = np.array([0, 0])
# Stopping criterion
eps = 1e-4
# Learning rate for value function
Alpha = 0.1
# Initial epsilon for epsilon-greedy policy
epsilon = 0.2
# Walls are unreachable states; mark them with -inf in the value function
for wall in Walls:
    v[wall[0], wall[1]] = -np.inf
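
# Minimal epsilon-greedy sketch (an assumption, not part of the original file): with
# probability `epsilon` explore a random action, otherwise exploit the greedy action
# under the current Q table for the given flat state index.
def epsilon_greedy_action(state_index, rng=None):
    rng = rng or np.random.default_rng()
    if rng.random() < epsilon:
        return rng.choice(A)
    return A[np.argmax(Q[state_index])]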