End truncation at discount convergence to zero, to avoid spurious rewards equal to zero cancelling truncation too soon.
MatthewGerber committed Aug 13, 2024
1 parent 6b40b91 commit 95d639a
Showing 4 changed files with 44 additions and 28 deletions.
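
The change in this commit: the post-truncation exit test previously multiplied the next reward by the accumulated discount, so any reward that happened to equal zero made the test pass immediately and cut the post-truncation phase short; the test now looks at the discount alone. A minimal standalone sketch of the difference (the discount factor and reward values below are hypothetical, not taken from the repository):

import numpy as np

gamma = 0.99   # hypothetical discount factor
reward = 0.0   # a reward of zero arrives one step after truncation
k = 1          # number of post-truncation steps taken so far

# old check: discounted *reward* close to zero -- fires immediately because the
# reward itself is zero, even though later rewards could still matter.
old_check = np.isclose(reward * gamma ** k, 0.0)   # True (spurious exit)

# new check: *discount* close to zero -- unaffected by zero rewards, so the
# episode keeps running until gamma ** k has decayed to numerical zero.
new_check = np.isclose(gamma ** k, 0.0)            # False (keep running)

print(old_check, new_check)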
26 changes: 12 additions & 14 deletions src/rlai/gpi/monte_carlo/evaluation.py
@@ -76,15 +76,14 @@ def evaluate_v_pi(
             t += 1
             agent.sense(state, t)

-            # if we've truncated and the discounted reward has converged to zero, then there's no point in running
-            # longer.
+            # if we've truncated and the discount has converged to zero, then the return at the truncation time will
+            # not change by running longer. we've got an accurate return estimate at truncation. exit the episode.
             if truncation_time_step is not None:
-                steps_past_truncation = (t - truncation_time_step)
-                discounted_reward = next_reward.r * (agent.gamma ** steps_past_truncation)
-                if np.isclose(discounted_reward, 0.0):
+                num_post_truncation_steps = (t - truncation_time_step)
+                post_truncation_discount = agent.gamma ** num_post_truncation_steps
+                if np.isclose(post_truncation_discount, 0.0):
                     raise ValueError(
-                        f'Discounted reward converged to zero after {steps_past_truncation} post-truncation '
-                        f'step(s).'
+                        f'Post-truncation discount converged to zero after {num_post_truncation_steps} step(s).'
                     )

             # if anything blows up, then let the environment know that we are exiting the episode.
@@ -208,15 +207,14 @@ def evaluate_q_pi(
             t += 1
             episode_generation_agent.sense(state, t)

-            # if we've truncated and the discounted reward has converged to zero, then there's no point in running
-            # longer.
+            # if we've truncated and the discount has converged to zero, then the return at the truncation time will
+            # not change by running longer. we've got an accurate return estimate at truncation. exit the episode.
             if truncation_time_step is not None:
-                steps_past_truncation = (t - truncation_time_step)
-                discounted_reward = next_reward.r * (agent.gamma ** steps_past_truncation)
-                if np.isclose(discounted_reward, 0.0):
+                num_post_truncation_steps = (t - truncation_time_step)
+                post_truncation_discount = agent.gamma ** num_post_truncation_steps
+                if np.isclose(post_truncation_discount, 0.0):
                     raise ValueError(
-                        f'Discounted reward converged to zero after {steps_past_truncation} post-truncation '
-                        f'step(s).'
+                        f'Post-truncation discount converged to zero after {num_post_truncation_steps} step(s).'
                     )

             # if anything blows up, then let the environment know that we are exiting the episode.
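
With the new test, the length of the post-truncation phase is determined by gamma and np.isclose's default tolerances: against a target of 0.0 the relative term vanishes, so the test fires once gamma ** k <= 1e-8 (the default atol), i.e., after roughly log(1e-8) / log(gamma) steps. A small sketch under that assumption (the discount factors are hypothetical):

import numpy as np

def steps_until_discount_vanishes(gamma: float, atol: float = 1e-8) -> int:
    """Smallest k such that gamma ** k <= atol, which is when
    np.isclose(gamma ** k, 0.0) first returns True at the default tolerances."""
    return int(np.ceil(np.log(atol) / np.log(gamma)))

for gamma in (0.9, 0.99, 0.999):
    print(gamma, steps_until_discount_vanishes(gamma))
# approximately: 0.9 -> 175 steps, 0.99 -> 1833, 0.999 -> 18412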
26 changes: 12 additions & 14 deletions src/rlai/policy_gradient/monte_carlo/reinforce.py
@@ -213,15 +213,14 @@ def improve(
             t += 1
             agent.sense(state, t)

-            # if we've truncated and the discounted reward has converged to zero, then there's no point in running
-            # longer.
+            # if we've truncated and the discount has converged to zero, then the return at the truncation time will
+            # not change by running longer. we've got an accurate return estimate at truncation. exit the episode.
             if truncation_time_step is not None:
-                steps_past_truncation = (t - truncation_time_step)
-                discounted_reward = next_reward.r * (gamma ** steps_past_truncation)
-                if np.isclose(discounted_reward, 0.0):
+                num_post_truncation_steps = (t - truncation_time_step)
+                post_truncation_discount = gamma ** num_post_truncation_steps
+                if np.isclose(post_truncation_discount, 0.0):
                     raise ValueError(
-                        f'Discounted reward converged to zero after {steps_past_truncation} post-truncation '
-                        f'step(s).'
+                        f'Post-truncation discount converged to zero after {num_post_truncation_steps} step(s).'
                     )

             # if anything blows up, then let the environment know that we are exiting the episode.
@@ -639,15 +638,14 @@ def iterate(
             t += 1
             self.agent.sense(state, t)

-            # if we've truncated and the discounted reward has converged to zero, then there's no point in running
-            # longer.
+            # if we've truncated and the discount has converged to zero, then the return at the truncation time will
+            # not change by running longer. we've got an accurate return estimate at truncation. exit the episode.
             if truncation_time_step is not None:
-                steps_past_truncation = (t - truncation_time_step)
-                discounted_reward = next_reward.r * (self.agent.gamma ** steps_past_truncation)
-                if np.isclose(discounted_reward, 0.0):
+                num_post_truncation_steps = (t - truncation_time_step)
+                post_truncation_discount = self.agent.gamma ** num_post_truncation_steps
+                if np.isclose(post_truncation_discount, 0.0):
                     raise ValueError(
-                        f'Discounted reward converged to zero after {steps_past_truncation} post-truncation '
-                        f'step(s).'
+                        f'Post-truncation discount converged to zero after {num_post_truncation_steps} step(s).'
                     )

             # if anything blows up, then let the environment know that we are exiting the episode.
20 changes: 20 additions & 0 deletions src/rlai/utils.py
@@ -7,6 +7,7 @@
 from typing import List, Any, Optional, Callable, Tuple, TextIO

 import numpy as np
+import scipy.stats
 from numpy import linalg as la
 from numpy.random import RandomState
@@ -587,3 +588,22 @@ def insert_index_into_path(
     path_parts.insert(1, f'-{index}')

     return ''.join(path_parts)
+
+
+def get_sample_size(
+        confidence: float,
+        std: float,
+        margin_of_error: float
+) -> int:
+    """
+    Get the sample size needed to estimate a mean, given a confidence level, standard deviation, and margin of error.
+
+    :param confidence: Confidence in (0.0, 1.0).
+    :param std: Standard deviation.
+    :param margin_of_error: Margin of error.
+    :return: Sample size.
+    """
+
+    z = scipy.stats.norm.ppf(1.0 - ((1.0 - confidence) / 2.0))
+
+    return int(np.ceil(((z * std) / margin_of_error) ** 2.0))
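
A hypothetical usage of the new helper (the numbers are made up for illustration, and the rounding up to a whole sample assumes the integer ceiling applied above): at 95% confidence the two-sided z value is about 1.96, so estimating a mean with standard deviation 10 to within a margin of error of 2 needs n = ((1.96 * 10) / 2) ** 2 ≈ 96.04, i.e., 97 samples.

from rlai.utils import get_sample_size

# 95% confidence, standard deviation 10, margin of error 2 (hypothetical values)
n = get_sample_size(confidence=0.95, std=10.0, margin_of_error=2.0)
print(n)  # 97: z = scipy.stats.norm.ppf(0.975) ≈ 1.96, and ((1.96 * 10) / 2) ** 2 ≈ 96.04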
Binary file not shown.
