
Commit

Cleaned up reacher Td3 and removed reacher DDPG due to slow/poor convergence
lajd committed Sep 27, 2020
1 parent 16ef5db commit 96ad6b1
Showing 21 changed files with 113 additions and 729 deletions.
31 changes: 26 additions & 5 deletions agents/models/components/mlp.py
@@ -17,38 +17,57 @@ def __init__(
output_layer_initialization_fn: Optional[Callable] = None,
with_batchnorm: bool = False
):
"""
:param layer_sizes: Size for each linear layer
:param activation_function: Activation between layers
:param output_function: Any output torch.nn.Module to be applied at the head
:param dropout: Dropout for linear layers
:param seed: Random seed
:param hidden_layer_initialization_fn: How to initialize hidden linear layers
:param output_layer_initialization_fn: How to initialize the last layer of the MLP defined by layer_sizes
:param with_batchnorm: Apply batchnorm between linear layers
Order is always (input_bn)->FC->BN->Activation->Dropout->FC
"""
super().__init__()

if len(layer_sizes) < 2:
raise ValueError("Must provide at least 2 layer sizes")
if seed:
self.set_seed(seed)

mlp_layers = torch.nn.ModuleList([])

# Input BN (batchnorm applied to the network inputs)
if with_batchnorm:
mlp_layers.append(torch.nn.BatchNorm1d(layer_sizes[0]))

# HL 1
first_layer = torch.nn.Linear(layer_sizes[0], layer_sizes[1])
if hidden_layer_initialization_fn:
first_layer.weight.data.uniform_(*hidden_layer_initialization_fn(first_layer))

mlp_layers.append(first_layer)

if len(layer_sizes) == 2:
mlp_layers.append(activation_function)

# HL 2-N
previous_output = layer_sizes[1]
for n_out in layer_sizes[2:]:
# BN
if with_batchnorm:
mlp_layers.append(torch.nn.BatchNorm1d(previous_output))

# Activation
mlp_layers.append(activation_function)

# Dropout
if dropout:
mlp_layers.append(torch.nn.Dropout(dropout))

# Next FC
next_layer = torch.nn.Linear(previous_output, n_out)
if hidden_layer_initialization_fn:
next_layer.weight.data.uniform_(*hidden_layer_initialization_fn(next_layer))

mlp_layers.append(next_layer)

previous_output = n_out
@@ -57,9 +76,11 @@ def __init__(
mlp_layers[-1].weight.data.uniform_(*output_layer_initialization_fn(mlp_layers[-1]))
mlp_layers[-1].bias.data.uniform_(*output_layer_initialization_fn(mlp_layers[-1]))

# Apply output function -- Can be an Activation or a module
if output_function:
mlp_layers.append(output_function)

# Stack
self.mlp_layers = torch.nn.Sequential(*mlp_layers)

def forward(self, x: torch.FloatTensor) -> torch.Tensor:
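
For reference, a minimal usage sketch of the cleaned-up MLP component follows. The class name MLP, the import path usage, and the fan-in initialization rule are assumptions for illustration; the keyword arguments mirror the docstring above, and the layer order follows (input_bn)->FC->BN->Activation->Dropout->FC.

import numpy as np
import torch
from agents.models.components.mlp import MLP  # class name assumed

def fan_in_init(layer: torch.nn.Linear):
    # Hypothetical (low, high) bounds for uniform_ initialization based on fan-in
    bound = 1.0 / np.sqrt(layer.weight.data.size(1))
    return -bound, bound

mlp = MLP(
    layer_sizes=(33, 256, 128, 4),             # input dim, two hidden layers, output dim
    activation_function=torch.nn.LeakyReLU(),
    output_function=torch.nn.Tanh(),           # e.g. bound continuous actions to [-1, 1]
    dropout=0.1,
    seed=123,
    hidden_layer_initialization_fn=fan_in_init,
    output_layer_initialization_fn=lambda layer: (-3e-3, 3e-3),
    with_batchnorm=True,
)

states = torch.randn(64, 33)                   # batch of 64 observations
actions = mlp(states)                          # -> shape (64, 4)
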
1 change: 1 addition & 0 deletions agents/policies/ddpg_policy.py
@@ -74,6 +74,7 @@ def get_actions_():
action = np.clip(action, self.action_range[0], self.action_range[1])
else:
raise ValueError('Must provide either epsilon_scheduler or noise')

return Action(value=action)

def get_random_action(self, *args) -> Action:
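
The hunk above ends the noisy-action branch by clipping to the policy's action range. A hedged, standalone sketch of that pattern is shown below; the actor output, noise sample, and bounds here are placeholders rather than the actual ddpg_policy.py members.

import numpy as np

def noisy_clipped_action(actor_output: np.ndarray,
                         noise_sample: np.ndarray,
                         action_range=(-1.0, 1.0)) -> np.ndarray:
    # Add exploration noise to the deterministic actor output, then clip to the valid range
    action = actor_output + noise_sample
    return np.clip(action, action_range[0], action_range[1])

# Example: 4-dimensional continuous action with Gaussian exploration noise
a = noisy_clipped_action(np.array([0.9, -0.2, 0.5, -1.1]),
                         np.random.normal(0.0, 0.1, size=4))
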
5 changes: 3 additions & 2 deletions agents/ppo_agent.py
@@ -198,8 +198,8 @@ def process_trajectory(self):
values = torch.cat(values).detach()
states = torch.cat(states)
actions = torch.cat(actions)
joint_states = torch.cat(joint_states) if len(joint_states) > 1 else None
joint_actions = torch.cat(joint_actions) if len(joint_actions) > 1 else None
joint_states = torch.cat(joint_states) if joint_states[0] is not None else joint_states
joint_actions = torch.cat(joint_actions) if joint_actions[0] is not None else joint_actions

advantage = returns - values

@@ -239,6 +239,7 @@ def step_episode(self, episode: int, *args, **kwargs):
self.process_trajectory()
if len(self.current_trajectory_memory) >= self.batch_size * self.min_batches_for_training:
for _ in range(self.num_learning_updates):
print('learning')
for sampled_states, sampled_actions, sampled_log_probs, sampled_returns, sampled_advantages, _, _ in self.current_trajectory_memory.sample(self.batch_size):
self._learn(sampled_log_probs, sampled_states, sampled_actions, sampled_advantages, sampled_returns)
self.current_trajectory_memory.reset()
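
The guard on joint_states/joint_actions now keys off whether joint information was actually collected rather than off trajectory length. A minimal sketch of the assumed semantics (maybe_cat is a hypothetical helper, not part of ppo_agent.py):

import torch

def maybe_cat(tensors):
    # Concatenate only when joint observations were recorded; in the
    # single-agent case every entry is None and the list is passed through unchanged.
    return torch.cat(tensors) if tensors[0] is not None else tensors

single_agent = [None, None, None]                      # no joint states collected
multi_agent = [torch.ones(2, 4), torch.zeros(2, 4)]    # joint states per time step

assert maybe_cat(single_agent) is single_agent         # left untouched
assert maybe_cat(multi_agent).shape == (4, 4)          # concatenated along dim 0
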
14 changes: 14 additions & 0 deletions simulation/utils.py
@@ -25,6 +25,20 @@ def default_step_episode_agents_fn(brain_set: BrainSet, episode_number: int):
agent.step_episode(episode_number)


def single_agent_step_agents_fn(brain_set: BrainSet, next_brain_environment: dict, t: int):
for brain_name, brain_environment in next_brain_environment.items():
agent = brain_set[brain_name].agents[0]
brain_agent_experience = Experience(
state=brain_environment['states'],
action=brain_environment['actions'][0],
reward=brain_environment['rewards'],
next_state=brain_environment['next_states'],
done=torch.LongTensor(brain_environment['dones']),
t_step=t,
)
agent.step(brain_agent_experience)


def default_preprocess_brain_actions_for_env_fn(brain_actions: Dict[str, List[Action]]) -> Dict[str, List[Action]]:

assert len(brain_actions) > 0 and isinstance(list(brain_actions.values())[0][0], Action), brain_actions
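
The new single_agent_step_agents_fn packages one brain's transition into an Experience and steps that brain's single agent. A hedged sketch of the per-brain payload it reads is below; the brain name, state dimension, and reward value are placeholders, and in the real code the 'actions' entries are Action objects rather than None.

import torch

next_brain_environment = {
    'ReacherBrain': {                      # hypothetical brain name
        'states': torch.randn(1, 33),      # current observation(s)
        'actions': [None],                 # Action object for the single agent in the real code
        'rewards': [0.1],
        'next_states': torch.randn(1, 33),
        'dones': [0],                      # wrapped in torch.LongTensor by the helper
    }
}

# single_agent_step_agents_fn(brain_set, next_brain_environment, t=0)  # requires a real BrainSet
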

This file was deleted.

4 changes: 0 additions & 4 deletions tasks/crawler/solutions/ddpg/__init__.py

This file was deleted.

15 changes: 0 additions & 15 deletions tasks/crawler/solutions/ddpg/eval_td3.py

This file was deleted.

228 changes: 0 additions & 228 deletions tasks/crawler/solutions/ddpg/train_td3.py

This file was deleted.
