q_agent.py
import random
from agent import Agent
from qlearning import get_possible_actions


class QAgent(Agent):
    '''
    Agent implementing Q-learning for the game Nim.

    Parameters
    ----------
    epsilon : float, in [0, 1].
        Exploration rate of the epsilon-greedy policy.
    alpha : float.
        Learning rate.
    gamma : float.
        Discount factor.
    '''

    def __init__(self, epsilon, alpha=0.1, gamma=0.99):
        self.epsilon = epsilon
        self.alpha = alpha
        self.gamma = gamma
        self.qtable = {}

    def get_qvalues(self, heaps):
        '''
        Get the Q-values of all the allowed actions, for a given state.

        Parameters
        ----------
        heaps : list of integers
            list of heap sizes.

        Returns
        -------
        qvalues : dict
            mapping action -> Q-value for the given state.
        '''
        state = tuple(heaps)
        if state not in self.qtable:
            # Lazily initialise unseen states with a zero Q-value for every legal action.
            self.qtable[state] = {action: 0 for action in get_possible_actions(state)}
        return self.qtable[state]

    def _pick_best_move(self, heaps):
        '''
        Get the move with the highest Q-value for a given state, randomly breaking ties.

        Parameters
        ----------
        heaps : list of integers
            list of heap sizes.

        Returns
        -------
        best_value : float.
            highest Q-value for the given state.
        best_move : tuple.
            Action with the highest Q-value for the given state.
            best_move[0] is the heap to take from (starts at 1),
            best_move[1] is the number of elements to take from heap best_move[0].
        '''
        # Shuffle the (action, Q-value) pairs so that ties are broken at random.
        qvalues = list(self.get_qvalues(heaps).items())
        random.shuffle(qvalues)
        qvalues = dict(qvalues)
        best_move = None
        for act, val in qvalues.items():
            if best_move is None or val > qvalues[best_move]:
                best_move = act
        best_value = qvalues[best_move] if best_move is not None else 0
        return best_value, best_move

    def get_max_qvalue(self, heaps):
        '''
        Get the highest Q-value associated to a possible action for the given state.

        Parameters
        ----------
        heaps : list of integers
            list of heap sizes.

        Returns
        -------
        best_value : float.
            highest Q-value for the given state (0 if the state has no legal action).
        '''
        best_value, _ = self._pick_best_move(heaps)
        return best_value

    def act(self, heaps):
        '''
        Take an action, given the current state, following the epsilon-greedy policy.

        Parameters
        ----------
        heaps : list of integers
            list of heap sizes.

        Returns
        -------
        move : tuple
            move[0] is the heap to take from (starts at 1),
            move[1] is the number of objects to take from heap move[0].
        '''
        if random.random() < self.epsilon:
            # Explore: pick a legal move uniformly at random.
            move = random.choice(list(self.get_qvalues(heaps).keys()))
        else:
            # Exploit: pick the move with the highest Q-value.
            move = self._pick_best_move(heaps)[1]
        return move

    def on_step(self, state, action, reward, new_state, debug=False):
        '''
        Update the agent's Q-values with the off-policy (Q-learning) update,
        after a step of the environment.

        Parameters
        ----------
        state : list of integers
            list of heap sizes before the action.
        action : list
            action[0] is the heap to take from (starts at 1),
            action[1] is the number of elements taken from heap action[0].
        reward : int.
            reward obtained for this step.
        new_state : list of integers
            list of heap sizes after the action.
        debug : bool.
            if True, print debug information.
        '''
        action = tuple(action)
        # Q-learning update: Q(s, a) <- Q(s, a) + alpha * (r + gamma * max_a' Q(s', a') - Q(s, a)).
        old_q_value = self.get_qvalues(state)[action]
        new_q_value = old_q_value + self.alpha * (reward + self.gamma * self.get_max_qvalue(new_state) - old_q_value)
        self.get_qvalues(state)[action] = new_q_value
        if debug:
            print('Last state: ', state)
            print('Last action: ', action)
            print('Old qvalue: ', old_q_value, '; new qvalue: ', new_q_value, '; reward: ', reward)
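

# --- Usage sketch (illustration only, not part of the original file) ---
# A minimal self-play loop showing how QAgent's methods fit together.
# It assumes, as the docstrings above describe, that actions returned by
# get_possible_actions are (heap, count) tuples with 1-based heap indices,
# and it uses a simple assumed reward scheme (+1 for taking the last object,
# 0 otherwise); the repository's own training script may use a different
# environment and reward convention. The starting heaps [3, 4, 5] and the
# episode count are arbitrary illustrative choices.
if __name__ == '__main__':
    agent = QAgent(epsilon=0.3, alpha=0.1, gamma=0.99)
    for episode in range(1000):
        heaps = [3, 4, 5]
        while sum(heaps) > 0:
            state = list(heaps)                    # snapshot before the move
            action = agent.act(heaps)
            heaps[action[0] - 1] -= action[1]      # apply the chosen move
            reward = 1 if sum(heaps) == 0 else 0   # assumed reward: win by taking the last object
            agent.on_step(state, action, reward, heaps)
    print('States visited during training:', len(agent.qtable))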