Commit
chore: cleaned up base reward functions
AugustDG committed Dec 14, 2024
1 parent abf04d6 commit 08af3ce
Showing 7 changed files with 98 additions and 72 deletions.
1 change: 0 additions & 1 deletion .idea/misc.xml

Some generated files are not rendered by default.

2 changes: 1 addition & 1 deletion configs/tasks/newton_idle_task.yaml
@@ -5,7 +5,7 @@ seed: 14321

n_envs: 96
timesteps_per_env: 100_000
base_lr: 3e-5
base_lr: 6e-7
episode_length: 500

newton:
2 changes: 1 addition & 1 deletion configs/world.yaml
@@ -18,7 +18,7 @@ sim_params:
enable_scene_query_support: False # true to interact with the scene with the mouse (doesn't work with GPU dynamics)
enable_enhanced_determinism: False # true for deterministic simulation, more computationally expensive

use_fabric: False # true for fabric, NVIDIA's interop layer between Hydra & PhysX
use_fabric: True # true for fabric, NVIDIA's interop layer between Hydra & PhysX
enable_ccd: False # true for continuous collision detection, needs to be enabled on each rigid body & pair filter

# Solver parameters
2 changes: 1 addition & 1 deletion core/tasks/newton_base_task.py
@@ -76,7 +76,7 @@ def _on_step(self) -> bool:
)
self.logger.record(
"observations/projected_gravities",
np.linalg.norm(agent_observations["projected_gravities"], axis=1).mean(),
agent_observations["projected_gravities"][:, -1].mean(),
)

return True
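The logging change above swaps the norm of the projected gravity for its z-component. As a rough sketch of why (the quaternion convention and the helper below are illustrative assumptions, not code from this repo): the projected gravity is the unit world gravity expressed in the base frame, so its norm is always about 1 and tells you nothing, while its z-component sits near -1 when the robot is upright and climbs toward 0 (and past it) as the base tips over.

    import numpy as np


    def projected_gravity(quat_wxyz: np.ndarray) -> np.ndarray:
        """Unit world gravity (0, 0, -1) expressed in the base frame, i.e. R^T @ g."""
        w, x, y, z = quat_wxyz
        # R^T @ (0, 0, -1) is the negated third row of the base-to-world rotation matrix.
        return -np.array([2 * (x * z - w * y), 2 * (y * z + w * x), 1 - 2 * (x * x + y * y)])


    upright = projected_gravity(np.array([1.0, 0.0, 0.0, 0.0]))       # identity pose
    tipped = projected_gravity(np.array([0.7071, 0.7071, 0.0, 0.0]))  # ~90 degree roll

    print(np.linalg.norm(upright), upright[-1])  # ~1.0, ~-1.0
    print(np.linalg.norm(tipped), tipped[-1])    # ~1.0,  ~0.0 -> the norm is uninformative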
98 changes: 50 additions & 48 deletions core/tasks/newton_idle_task.py
@@ -108,10 +108,11 @@ def step_wait(self) -> VecEnvStepReturn:
self.last_actions_buf[1, :, :] = self.last_actions_buf[0, :, :]
self.last_actions_buf[0, :, :] = self.actions_buf.copy()

heights = self.env.get_observations()["positions"][:, 2]
gravity_on_z = self.env.get_observations()["projected_gravities"][:, 2]
# terminated
self.dones_buf = np.where(heights <= self.reset_height, True, False)
self.dones_buf = np.where(heights >= 1.0, True, self.dones_buf)
self.dones_buf = np.where(
gravity_on_z >= 0.0, True, False
) # when we're not upright

self.dones_buf = np.where(
self.progress_buf > self.max_episode_length,
@@ -182,9 +183,9 @@ def _calculate_rewards(self) -> None:
# The current reward function is missing many features, all in comments below

obs = self.env.get_observations()
positions = obs["positions"]
angular_velocities = obs["angular_velocities"]
linear_velocities = obs["linear_velocities"]
projected_gravities = obs["projected_gravities"]

joints_order = self.agent.joints_controller.art_view.joint_names

@@ -239,59 +240,61 @@ def _calculate_rewards(self) -> None:
.numpy()
) # [-1, 1] unitless

from core.utils.math import magnitude_sqr_n
from core.utils.rl import (
exp_squared_norm,
fd_first_order_squared_norm,
fd_second_order_squared_norm,
)

# base position reward
# base orientation reward
base_linear_velocity_xy_reward = np.exp(
magnitude_sqr_n(
base_linear_velocity_xy - np.zeros_like(base_linear_velocity_xy)
)
* -8
base_linear_velocity_xy_reward = exp_squared_norm(
base_linear_velocity_xy,
mult=-8.0,
weight=1.0,
)
base_linear_velocity_z_reward = np.exp(
np.square(base_linear_velocity_z - np.zeros_like(base_linear_velocity_z))
* -8
base_linear_velocity_z_reward = exp_squared_norm(
base_linear_velocity_z,
mult=-8.0,
weight=1.0,
)
base_angular_velocity_xy_reward = (
np.exp(
magnitude_sqr_n(
base_angular_velocity_xy - np.zeros_like(base_angular_velocity_xy)
)
* -2
)
* 0.5
base_angular_velocity_xy_reward = exp_squared_norm(
base_angular_velocity_xy,
mult=-2,
weight=0.5,
)
base_angular_velocity_z_reward = (
np.exp(
np.square(
base_angular_velocity_z - np.zeros_like(base_angular_velocity_z)
)
* -2
)
* 0.5
base_angular_velocity_z_reward = exp_squared_norm(
base_angular_velocity_z,
mult=-2,
weight=0.5,
)
joint_positions_reward = (
-magnitude_sqr_n(joint_positions - animation_joint_positions) * 15.0
joint_positions_reward = fd_first_order_squared_norm(
joint_positions,
animation_joint_positions,
weight=15.0,
)
joint_velocities_reward = (
-magnitude_sqr_n(joint_velocities - animation_joint_velocities) * 0.1
joint_velocities_reward = fd_first_order_squared_norm(
joint_velocities,
animation_joint_velocities,
weight=0.01,
)
joint_efforts_reward = (
-magnitude_sqr_n(joint_efforts - np.zeros_like(joint_efforts)) * 0.001
joint_efforts_reward = fd_first_order_squared_norm(
joint_efforts,
np.zeros_like(joint_efforts),
weight=0.001,
)
# joint accelerations reward
joint_action_rate_reward = (
-magnitude_sqr_n(self.actions_buf - self.last_actions_buf[0]) * 1.5
joint_action_rate_reward = fd_first_order_squared_norm(
self.actions_buf,
self.last_actions_buf[0],
weight=1.0,
)
joint_action_acceleration_reward = fd_second_order_squared_norm(
self.actions_buf,
self.last_actions_buf[0],
self.last_actions_buf[1],
weight=0.45,
)
joint_action_acceleration_reward = (
-magnitude_sqr_n(
self.actions_buf
- 2 * self.last_actions_buf[0]
+ self.last_actions_buf[1]
)
* 0.45
) # not sure why this is the formula, but it comes from Disney

self.rewards_buf = (
base_linear_velocity_xy_reward
@@ -305,11 +308,10 @@ def _calculate_rewards(self) -> None:
+ joint_action_acceleration_reward
)

heights = positions[:, 2]
gravity_on_z = projected_gravities[:, 2]
self.rewards_buf = np.where(
heights <= self.reset_height, -20.0, self.rewards_buf
gravity_on_z >= 0.0, self.rewards_buf - 10.0, self.rewards_buf
)
self.rewards_buf = np.where(heights >= 1.0, -2.0, self.rewards_buf)

# TODO: should this be here?
self.rewards_buf = np.clip(
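For context on the reward refactor above: each helper pulled in from core.utils.rl reduces a term to a per-environment squared L2 norm, optionally pushed through an exponential. A minimal sketch of the pattern with stand-in arrays (n_envs matches the config's 96; the joint count of 12 and the random values are purely illustrative):

    import numpy as np

    rng = np.random.default_rng(0)
    n_envs, n_joints = 96, 12


    def squared_norm(a: np.ndarray, weight: float = 1.0) -> np.ndarray:
        # Per-env squared L2 norm over the last axis, scaled by a weight.
        return np.sum(a ** 2, axis=-1) * weight


    # Exponential shaping (exp_squared_norm): exp(mult * ||v||^2) is largest (1.0)
    # when the xy velocity is zero, mirroring the old np.exp(magnitude_sqr_n(v) * -8).
    xy_velocity = rng.normal(size=(n_envs, 2))
    xy_term = np.exp(-8.0 * squared_norm(xy_velocity))

    # First-order finite difference (fd_first_order_squared_norm): distance of the
    # joints from the animation targets, one value per environment.
    joint_positions = rng.normal(size=(n_envs, n_joints))
    animation_targets = rng.normal(size=(n_envs, n_joints))
    tracking_term = squared_norm(joint_positions - animation_targets, weight=15.0)

    # Second-order finite difference (fd_second_order_squared_norm): a - 2*b + c is
    # a discrete acceleration of the action history, which is what the removed
    # "comes from Disney" formula computed.
    a, b, c = rng.normal(size=(3, n_envs, n_joints))
    accel_term = squared_norm(a - 2 * b + c, weight=0.45)

    print(xy_term.shape, tracking_term.shape, accel_term.shape)  # all (96,)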
20 changes: 0 additions & 20 deletions core/utils/math.py
@@ -69,23 +69,3 @@ def quat_slerp_n(a: np.ndarray, b: np.ndarray, t: float) -> np.ndarray:
qc[3] = a[3] * ratio_a + b[3] * ratio_b

return qc


def difference_length_squared_n(a: np.ndarray, b: np.ndarray) -> np.ndarray:
diff = a - b

return np.sum(diff * diff, axis=1)


def difference_length_n(a: np.ndarray, b: np.ndarray) -> np.ndarray:
diff = a - b

return np.linalg.norm(diff)


def magnitude_sqr_n(a: np.ndarray, axis: int = 1) -> np.ndarray:
return np.sum(a * a, axis=axis)


def length_n(a: np.ndarray) -> np.ndarray:
return np.linalg.norm(a)
45 changes: 45 additions & 0 deletions core/utils/rl.py
@@ -0,0 +1,45 @@
import numpy as np


def squared_norm(a: np.ndarray, weight: float = 1.0) -> np.ndarray:
    # Row-wise squared L2 norm over the last axis, scaled by `weight`.
    return np.sum(a ** 2, axis=-1) * weight


def fd_first_order_squared_norm(
    a: np.ndarray,
    b: np.ndarray,
    weight: float = 1.0,
) -> np.ndarray:
    # Squared norm of the first-order difference a - b.
    return squared_norm(a - b, weight)


def fd_second_order_squared_norm(
    a: np.ndarray,
    b: np.ndarray,
    c: np.ndarray,
    weight: float = 1.0,
) -> np.ndarray:
    # Squared norm of the second-order finite difference a - 2b + c
    # (a discrete acceleration when a, b, c are consecutive samples).
    return squared_norm(a - 2 * b + c, weight)


def exp_squared_norm(
    a: np.ndarray, mult: float = 1.0, weight: float = 1.0
) -> np.ndarray:
    # exp(mult * ||a||^2) * weight; with a negative `mult` this peaks at `weight`
    # when a is zero and decays with the squared norm.
    return np.exp(mult * squared_norm(a)) * weight


def exp_fd_first_order_squared_norm(
    a: np.ndarray,
    b: np.ndarray,
    mult: float = 1.0,
    weight: float = 1.0,
) -> np.ndarray:
    return np.exp(mult * fd_first_order_squared_norm(a, b)) * weight


def exp_fd_second_order_squared_norm(
    a: np.ndarray,
    b: np.ndarray,
    c: np.ndarray,
    mult: float = 1.0,
    weight: float = 1.0,
) -> np.ndarray:
    return np.exp(mult * fd_second_order_squared_norm(a, b, c)) * weight
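A quick usage sketch for the module above (assuming the repo root is on PYTHONPATH; the shapes and values are illustrative): every helper reduces over the last axis, so batched (n_envs, dim) inputs yield one reward value per environment.

    import numpy as np

    from core.utils.rl import exp_squared_norm, fd_second_order_squared_norm

    a = np.zeros((96, 12))  # e.g. current actions, one row per environment
    b = np.ones((96, 12))   # previous actions
    c = np.ones((96, 12))   # actions two steps back

    print(fd_second_order_squared_norm(a, b, c, weight=0.45).shape)  # (96,)
    print(exp_squared_norm(a[:, :2], mult=-8.0, weight=1.0).mean())  # 1.0, i.e. exp(0) * weight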
