###### Q-LEARNING #######################################
#########################################################
from mdp import PokerMDP
from collections import defaultdict
import random

#### DECLARE GLOBAL VARIABLES HERE #################################
max_iterations = 10            # max steps per simulated episode
learning_rate = 0.8            # step size for the Q-learning update
discount = 1                   # discount factor on future rewards
weights = defaultdict(float)   # Q-value estimate per (features, action) key
explorationProb = 0.15         # epsilon for epsilon-greedy action selection
numRounds = 10                 # number of episodes to simulate in runGame()
numPlayers = 2                 # assumed player count (not set in the original); adjust to match PokerMDP
#### Q-LEARNING HELPER FUNCTIONS #################################

# Simplify the state down to the cards the current player can see
# (the board cards plus the player's own hand), paired with the action.
def featureExtractor(state, action):
    # tuple(...) keeps the key hashable so it can index the weights dict
    return (tuple(sorted(state.board + state.players[state.curPlayer][0])), action)
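# For illustration only (hypothetical card encoding and action name, not taken from
# mdp.py): a board of [3, 7, 11] and a hand of [7, 12] with action 'call' would map
# to the key ((3, 7, 7, 11, 12), 'call'), so states with the same visible cards share
# a single learned weight per action.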
#### Q-LEARNING MAIN FUNCTIONS #################################

# Return the Q-value associated with the weights for the given (state, action) features.
def getQ(state, action):
    key = featureExtractor(state, action)
    return weights[key]
# Produce an action for the given state by following some strategy.
# Here we use the epsilon-greedy algorithm: with probability |explorationProb|,
# take a random action; otherwise take an action with the highest Q-value.
def chooseAction(state, actions):
    if random.random() < explorationProb:
        return random.choice(actions)
    else:
        max_actions = []
        max_q = float('-inf')
        for a in actions:
            q = getQ(state, a)
            if q > max_q:
                max_q = q
                max_actions = [a]
            elif q == max_q:
                max_actions.append(a)
        return random.choice(max_actions)
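# Note: with explorationProb = 0.15, roughly 15% of decisions explore at random;
# the rest greedily follow the current Q estimates, breaking ties uniformly at random.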
# Call this function with (s, a, r, s') to update |weights|.
# Note that if s is a terminal state, then s' will be None.
# Use getQ() to compute the current estimate of the parameters.
def incorporateFeedback(state, action, reward, newState, actions, newState_is_end):
    # Value of the successor state: max over actions of Q(s', a'), or 0 at a terminal state.
    qp = 0
    if not newState_is_end and actions:
        qp = max(getQ(newState, a) for a in actions)
    update = learning_rate * (reward + (discount * qp) - getQ(state, action))
    weights[featureExtractor(state, action)] += update
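# This implements the standard Q-learning update on the feature key produced by
# featureExtractor:
#     Q(s, a) <- Q(s, a) + learning_rate * (reward + discount * max_a' Q(s', a') - Q(s, a))
# Each observed transition nudges the stored weight toward the one-step lookahead target.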
#### SIMULATE (RUN Q-LEARNING) #####################################

def simulateQLearning(playerWallets):
    mdp = PokerMDP()
    state = mdp.initState()
    total_rewards = 0
    actions = mdp.getActions(state)
    for i in range(max_iterations):
        # CASE: Game over
        if mdp.isEnd(state):
            break
        # Choose action based on Q and the epsilon-greedy search strategy.
        best_action = chooseAction(state, actions)
        # Observe newState and the associated reward.
        newState, reward = mdp.sampleNextState(state, best_action)
        total_rewards += reward
        # Update Q weights.
        actions = mdp.getActions(newState)
        incorporateFeedback(state, best_action, reward, newState, actions, mdp.isEnd(newState))
        # Update state.
        state = newState
    return total_rewards
def runGame():
    playerWallets = [1000 for _ in range(numPlayers)]
    playerMoney = 0
    for i in range(numRounds):
        playerMoney += simulateQLearning(playerWallets)
        print('Player money after round ' + str(i + 1) + ': ' + str(playerMoney) + '.')

if __name__ == '__main__':
    runGame()
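# Run with: python q-learn.py (requires mdp.py, which provides PokerMDP, on the path).
# The PokerMDP interface (initState, getActions, sampleNextState, isEnd) is assumed
# from the calls above, not verified here.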