# policy.py
from model import VPGModel
import torch
import numpy as np
import torch.distributions as td
from collections import deque
import matplotlib.pyplot as plt
from paddle import PingPong
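
# VPGModel (from the local model module) is assumed to map a state batch to
# action probabilities (e.g. via a softmax head) and to expose an .optimizer
# attribute; both assumptions come from how the model is called below.

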
class VPGPolicy():
    def __init__(self, state_space, action_space):
        self.batch_size = 64
        self.gamma = .95
        self.memory = []  # deque(maxlen=100000)
        self.model = VPGModel(
            state_space=state_space, action_space=action_space, lr=1e-3)

    def act(self, state):
        # Sample an action from the categorical distribution defined by
        # the policy network's action probabilities.
        probs = self.model(state)
        dist = td.Categorical(probs)
        return dist.sample().item()
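
    # Example with hypothetical values: if self.model(state) returned
    # tensor([0.1, 0.7, 0.2]), act() would pick action 1 about 70% of the
    # time, so exploration stays proportional to the policy's confidence.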

    def remember(self, state, action, reward, next_state, done):
        # Store one time-ordered transition of the current trajectory.
        self.memory.append([state, action, reward, next_state, done])

    def compute_loss(self, states, actions, weights):
        # REINFORCE loss: -mean(log pi(a|s) * weight). The minus sign turns
        # gradient ascent on expected return into a descent objective that
        # the optimizer can minimize.
        logp = td.Categorical(self.model(states)).log_prob(actions)
        return -(logp * weights).mean()
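
    # A minimal sketch of the value this produces, assuming a single state
    # with action probabilities [0.2, 0.5, 0.3]: taking action 1 with
    # weight 2.0 contributes -log(0.5) * 2.0 ~= 1.39, and increasing
    # pi(a=1|s) decreases the loss, as a policy gradient step should.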

    def learn(self):
        # if len(self.memory) > 2000:
        #     self.memory.pop(0)
        # Take the most recent transitions in order: the return-to-go below
        # assumes a time-ordered trajectory, so sampling memory at random
        # would scramble the reward sequence.
        batch = self.memory[-min(len(self.memory), self.batch_size):]
        # A policy-gradient update only needs states, actions and rewards;
        # next_state and done are stored for generality but unused here.
        states_np = np.squeeze(np.array([i[0] for i in batch]))
        actions_np = np.squeeze(np.array([i[1] for i in batch]))
        rewards_np = np.array([i[2] for i in batch])
        states = torch.from_numpy(states_np).to(torch.float32)
        actions = torch.from_numpy(actions_np)
        # Discounted return-to-go: R_t = sum_{k>=t} gamma^(k-t) * r_k.
        R = torch.tensor([np.sum(rewards_np[i:] * self.gamma ** np.arange(len(rewards_np) - i))
                          for i in range(len(rewards_np))], dtype=torch.float32)
        probs = self.model(states)
        sampler = td.Categorical(probs)
        log_probs = sampler.log_prob(actions)
        # Pseudo-loss whose autograd gradient is -grad J(theta); the "-" is
        # needed because the optimizer performs gradient descent while
        # policy gradient ascends on expected return.
        pseudo_loss = -torch.sum(log_probs * R)
        # Update policy weights.
        self.model.optimizer.zero_grad()
        pseudo_loss.backward()
        self.model.optimizer.step()
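
    # Worked example of the return-to-go above, assuming gamma = 0.95 and
    # rewards [1, 0, 1]: R_0 = 1 + 0.95*0 + 0.95**2*1 = 1.9025,
    # R_1 = 0 + 0.95*1 = 0.95, and R_2 = 1.

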
game = PingPong()
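
# PingPong (from the local paddle module) is assumed to expose reset(),
# returning a 5-dimensional state, and step(action), returning the tuple
# (reward, next_state, done); these are the only calls made below.
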
def train_vpg(episode):
    scores = []
    action_space = 3
    state_space = 5
    max_steps = 1000
    agent = VPGPolicy(state_space, action_space)
    for e in range(episode):
        state = game.reset()
        state = np.reshape(state, (1, state_space))
        state = torch.from_numpy(state).to(torch.float32)
        score = 0
        for i in range(max_steps):
            action = agent.act(state)
            reward, next_state, done = game.step(action)
            score += reward
            next_state = np.reshape(next_state, (1, state_space))
            agent.remember(state.numpy(), action,
                           reward, next_state, done)
            state = torch.from_numpy(next_state).to(torch.float32)
            agent.learn()
            if done:
                print("episode: {}/{}, score: {}".format(e, episode, score))
                break
        scores.append(score)
    return scores


if __name__ == '__main__':
    ep = 500
    scores = train_vpg(ep)
    plt.plot(range(ep), scores)
    plt.xlabel('episodes')
    plt.ylabel('reward')
    plt.show()