-
Notifications
You must be signed in to change notification settings - Fork 0
/
rainbow_dqn_agent.py
138 lines (108 loc) · 6.29 KB
/
rainbow_dqn_agent.py
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
#
# Copyright (c) 2017 Intel Corporation
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from typing import Union
import numpy as np
from rl_coach.agents.categorical_dqn_agent import CategoricalDQNAlgorithmParameters, \
CategoricalDQNAgent, CategoricalDQNAgentParameters
from rl_coach.agents.dqn_agent import DQNNetworkParameters
from rl_coach.architectures.head_parameters import RainbowQHeadParameters
from rl_coach.architectures.middleware_parameters import FCMiddlewareParameters
from rl_coach.base_parameters import MiddlewareScheme
from rl_coach.exploration_policies.parameter_noise import ParameterNoiseParameters
from rl_coach.memories.non_episodic.prioritized_experience_replay import PrioritizedExperienceReplayParameters, \
PrioritizedExperienceReplay
class RainbowDQNNetworkParameters(DQNNetworkParameters):
def __init__(self):
super().__init__()
self.heads_parameters = [RainbowQHeadParameters()]
self.middleware_parameters = FCMiddlewareParameters(scheme=MiddlewareScheme.Empty)
class RainbowDQNAlgorithmParameters(CategoricalDQNAlgorithmParameters):
"""
:param n_step: (int)
The number of steps to bootstrap the network over. The first N-1 steps actual rewards will be accumulated
using an exponentially growing discount factor, and the Nth step will be bootstrapped from the network
prediction.
:param store_transitions_only_when_episodes_are_terminated: (bool)
If set to True, the transitions will be stored in an Episode object until the episode ends, and just then
written to the memory. This is useful since we want to calculate the N-step discounted rewards before saving the
transitions into the memory, and to do so we need the entire episode first.
"""
def __init__(self):
super().__init__()
self.n_step = 3
# needed for n-step updates to work. i.e. waiting for a full episode to be closed before storing each transition
self.store_transitions_only_when_episodes_are_terminated = True
class RainbowDQNAgentParameters(CategoricalDQNAgentParameters):
def __init__(self):
super().__init__()
self.algorithm = RainbowDQNAlgorithmParameters()
# ParameterNoiseParameters is changing the network wrapper parameters. This line needs to be done first.
self.network_wrappers = {"main": RainbowDQNNetworkParameters()}
self.exploration = ParameterNoiseParameters(self)
self.memory = PrioritizedExperienceReplayParameters()
@property
def path(self):
return 'rl_coach.agents.rainbow_dqn_agent:RainbowDQNAgent'
# Rainbow Deep Q Network - https://arxiv.org/abs/1710.02298
# Agent implementation is composed of:
# 1. NoisyNets
# 2. C51
# 3. Prioritized ER
# 4. DDQN
# 5. Dueling DQN
# 6. N-step returns
class RainbowDQNAgent(CategoricalDQNAgent):
def __init__(self, agent_parameters, parent: Union['LevelManager', 'CompositeAgent']=None):
super().__init__(agent_parameters, parent)
def learn_from_batch(self, batch):
network_keys = self.ap.network_wrappers['main'].input_embedders_parameters.keys()
ddqn_selected_actions = np.argmax(self.distribution_prediction_to_q_values(
self.networks['main'].online_network.predict(batch.next_states(network_keys))), axis=1)
# for the action we actually took, the error is calculated by the atoms distribution
# for all other actions, the error is 0
distributional_q_st_plus_n, TD_targets = self.networks['main'].parallel_prediction([
(self.networks['main'].target_network, batch.next_states(network_keys)),
(self.networks['main'].online_network, batch.states(network_keys))
])
# add Q value samples for logging
self.q_values.add_sample(self.distribution_prediction_to_q_values(TD_targets))
# only update the action that we have actually done in this transition (using the Double-DQN selected actions)
target_actions = ddqn_selected_actions
m = np.zeros((batch.size, self.z_values.size))
batches = np.arange(batch.size)
for j in range(self.z_values.size):
# we use batch.info('should_bootstrap_next_state') instead of (1 - batch.game_overs()) since with n-step,
# we will not bootstrap for the last n-step transitions in the episode
tzj = np.fmax(np.fmin(batch.n_step_discounted_rewards() + batch.info('should_bootstrap_next_state') *
(self.ap.algorithm.discount ** self.ap.algorithm.n_step) * self.z_values[j],
self.z_values[-1]), self.z_values[0])
bj = (tzj - self.z_values[0])/(self.z_values[1] - self.z_values[0])
u = (np.ceil(bj)).astype(int)
l = (np.floor(bj)).astype(int)
m[batches, l] += (distributional_q_st_plus_n[batches, target_actions, j] * (u - bj))
m[batches, u] += (distributional_q_st_plus_n[batches, target_actions, j] * (bj - l))
# total_loss = cross entropy between actual result above and predicted result for the given action
TD_targets[batches, batch.actions()] = m
# update errors in prioritized replay buffer
importance_weights = batch.info('weight') if isinstance(self.memory, PrioritizedExperienceReplay) else None
result = self.networks['main'].train_and_sync_networks(batch.states(network_keys), TD_targets,
importance_weights=importance_weights)
total_loss, losses, unclipped_grads = result[:3]
# TODO: fix this spaghetti code
if isinstance(self.memory, PrioritizedExperienceReplay):
errors = losses[0][np.arange(batch.size), batch.actions()]
self.call_memory('update_priorities', (batch.info('idx'), errors))
return total_loss, losses, unclipped_grads