run_acrobot.py
import gym
from policy_gradient import PolicyGradient
import matplotlib.pyplot as plt
import numpy as np
env = gym.make('Acrobot-v1')
env = env.unwrapped
# Policy gradient has high variance; seed for reproducibility
env.seed(1)
print("env.action_space", env.action_space)
print("env.observation_space", env.observation_space)
print("env.observation_space.high", env.observation_space.high)
print("env.observation_space.low", env.observation_space.low)
RENDER_ENV = False        # flipped to True once the agent performs well enough
EPISODES = 500
rewards = []
RENDER_REWARD_MIN = -500  # reward threshold that enables rendering
if __name__ == "__main__":
    PG = PolicyGradient(
        n_x=env.observation_space.shape[0],
        n_y=env.action_space.n,
        learning_rate=0.02,
        reward_decay=0.99,
    )

    for episode in range(EPISODES):
        observation = env.reset()
        episode_reward = 0

        while True:
            if RENDER_ENV:
                env.render()

            # 1. Choose an action based on the current observation
            action = PG.choose_action(observation)

            # 2. Take the action in the environment
            observation_, reward, done, info = env.step(action)

            # 3. Store the transition for training
            PG.store_transition(observation, action, reward)

            if done:
                episode_rewards_sum = sum(PG.episode_rewards)
                rewards.append(episode_rewards_sum)
                max_reward_so_far = np.amax(rewards)

                print("==========================================")
                print("Episode:", episode)
                print("Reward:", episode_rewards_sum)
                print("Max reward so far:", max_reward_so_far)

                # 4. Train the policy network on the completed episode
                discounted_episode_rewards_norm = PG.learn()

                # Start rendering once the agent clears the reward threshold
                if max_reward_so_far > RENDER_REWARD_MIN:
                    RENDER_ENV = True

                break

            # Save the new observation for the next step
            observation = observation_
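
The script imports `PolicyGradient` from a sibling `policy_gradient` module that is not shown on this page. For reference, below is a minimal NumPy sketch of the interface the script relies on: the `(n_x, n_y, learning_rate, reward_decay)` constructor, `choose_action`, `store_transition`, the `episode_rewards` buffer, and `learn` returning normalized discounted returns. Only those names are taken from the script above; the linear softmax policy and the REINFORCE update details are assumptions, and the repository's actual implementation may well use a neural network instead.

import numpy as np

class PolicyGradient:
    """Hypothetical minimal REINFORCE agent with a linear softmax policy."""

    def __init__(self, n_x, n_y, learning_rate=0.01, reward_decay=0.95):
        self.n_x = n_x                 # observation dimension
        self.n_y = n_y                 # number of discrete actions
        self.lr = learning_rate
        self.gamma = reward_decay
        # Per-episode buffers, cleared after every call to learn()
        self.episode_observations = []
        self.episode_actions = []
        self.episode_rewards = []
        # Linear policy parameters: logits = W @ x + b
        self.W = np.zeros((n_y, n_x))
        self.b = np.zeros(n_y)

    def _softmax(self, logits):
        z = logits - np.max(logits)    # shift for numerical stability
        e = np.exp(z)
        return e / e.sum()

    def choose_action(self, observation):
        # Sample an action from the current policy distribution
        probs = self._softmax(self.W @ np.asarray(observation) + self.b)
        return np.random.choice(self.n_y, p=probs)

    def store_transition(self, observation, action, reward):
        self.episode_observations.append(np.asarray(observation))
        self.episode_actions.append(action)
        self.episode_rewards.append(reward)

    def _discount_and_norm_rewards(self):
        # Discounted returns G_t = sum_k gamma^k * r_{t+k}, then normalized
        discounted = np.zeros(len(self.episode_rewards))
        running = 0.0
        for t in reversed(range(len(self.episode_rewards))):
            running = self.episode_rewards[t] + self.gamma * running
            discounted[t] = running
        discounted -= discounted.mean()
        std = discounted.std()
        if std > 0:
            discounted /= std
        return discounted

    def learn(self):
        returns = self._discount_and_norm_rewards()
        # REINFORCE update: ascend grad log pi(a|s) weighted by the return
        for obs, action, G in zip(self.episode_observations,
                                  self.episode_actions, returns):
            probs = self._softmax(self.W @ obs + self.b)
            grad_logits = -probs
            grad_logits[action] += 1.0   # d log softmax / d logits
            self.W += self.lr * G * np.outer(grad_logits, obs)
            self.b += self.lr * G * grad_logits
        # Reset buffers for the next episode
        self.episode_observations = []
        self.episode_actions = []
        self.episode_rewards = []
        return returns

Note how `learn()` both clears `episode_rewards` and returns the normalized returns, which matches the script's ordering: `sum(PG.episode_rewards)` is read before `PG.learn()` is called, and the result is bound to `discounted_episode_rewards_norm`.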