GlobalParams.py
import numpy as np
"""Define the environment"""
# Actions
A = np.array(["n", "w", "s", "e"])
# Policy = random uniform
piProbs = {A[0]: .25, A[1]: .25, A[2]: .25, A[3]: .25}
# Rewards
rewards = {"pos": 1, "neg": -1}
neg_reward_states = np.array([[3, 3], [4, 5], [4, 6], [5, 6], [5, 8], [6, 8], [7, 3], [7, 5], [7, 6]])
pos_reward_states = np.array([[5, 5]])
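
# Illustrative helper (an assumption, not part of the original file): one way the
# reward tables above might be queried for the reward of entering a [row, col] state.
def reward_for(state):
    if any((state == s).all() for s in pos_reward_states):
        return rewards["pos"]
    if any((state == s).all() for s in neg_reward_states):
        return rewards["neg"]
    return 0
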
# Walls
Walls = np.array([[2, 1], [2, 2], [2, 3], [2, 4],
                  [3, 4], [4, 4], [5, 4], [6, 4], [7, 4],
                  [2, 6], [2, 7], [2, 8]])
# Discount factor for the episodic MDP
Gamma = 0.9
# Grid dimension
NROWS, NCOLS = 10, 10
# State Value Function
v = np.zeros((NROWS, NCOLS))
# Policy
pi = np.full([NROWS, NCOLS], "nwse")
# Map each flat state index to its [row, col] grid coordinates (row-major order)
states_encoded = dict()
for row in range(NROWS):
    for col in range(NCOLS):
        ind = row * NCOLS + col
        states_encoded[ind] = np.array([row, col])
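
# Illustrative helper (an assumption, not in the original file): the inverse of the
# row-major mapping above, turning a [row, col] state into its flat index for Q.
def encode_state(state):
    return state[0] * NCOLS + state[1]
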
# State-Action Value Function
Q = np.zeros((len(states_encoded), len(A)))
terminal_states = np.array([[5, 5]])
starting_state = np.array([0, 0])
# Stopping criterion
eps = 1e-4
# Learning rate for value function
Alpha = 0.1
# Initial epsilon for epsilon-greedy policy
epsilon = 0.2
# Walls are unreachable states; mark them with -inf in the value function
for wall in Walls:
    v[wall[0], wall[1]] = -np.inf
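
# Minimal epsilon-greedy sketch (an assumption, not part of the original file): with
# probability `epsilon` explore a random action, otherwise exploit the greedy action
# under the current Q table for the given flat state index.
def epsilon_greedy_action(state_index, rng=None):
    rng = rng or np.random.default_rng()
    if rng.random() < epsilon:
        return rng.choice(A)
    return A[np.argmax(Q[state_index])]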