WIP add option for not fitting intercept in beta-distribution policy.
MatthewGerber committed Dec 25, 2023
1 parent 35ac59b commit 874576b
Showing 3 changed files with 16 additions and 11 deletions.
2 changes: 1 addition & 1 deletion run_configurations/MountainCar continuous train FA.run.xml
@@ -14,7 +14,7 @@
     <option name="ADD_SOURCE_ROOTS" value="true" />
     <EXTENSION ID="PythonCoverageRunConfigurationExtension" runner="coverage.py" />
     <option name="SCRIPT_NAME" value="$PROJECT_DIR$/src/rlai/runners/trainer.py" />
-    <option name="PARAMETERS" value="--random-seed 12345 --agent rlai.policy_gradient.ParameterizedMdpAgent --gamma 0.99 --environment rlai.core.environments.gymnasium.Gym --gym-id MountainCarContinuous-v0 --render-every-nth-episode 50 --video-directory ~/Desktop/mountaincar_continuous_videos --T 1000 --plot-environment --train-function rlai.policy_gradient.monte_carlo.reinforce.improve --num-episodes 1000 --plot-state-value True --v-S rlai.state_value.function_approximation.ApproximateStateValueEstimator --feature-extractor rlai.core.environments.gymnasium.ContinuousMountainCarFeatureExtractor --function-approximation-model rlai.models.sklearn.SKLearnSGD --loss squared_error --sgd-alpha 0.0 --learning-rate constant --eta0 0.001 --policy rlai.policy_gradient.policies.continuous_action.ContinuousActionBetaDistributionPolicy --policy-feature-extractor rlai.core.environments.gymnasium.ContinuousMountainCarFeatureExtractor --alpha 0.001 --update-upon-every-visit True --plot-policy --save-agent-path ~/Desktop/mountaincar_continuous_agent.pickle --log DEBUG" />
+    <option name="PARAMETERS" value="--random-seed 12345 --agent rlai.policy_gradient.ParameterizedMdpAgent --gamma 0.99 --environment rlai.core.environments.gymnasium.Gym --gym-id MountainCarContinuous-v0 --render-every-nth-episode 50 --video-directory ~/Desktop/mountaincar_continuous_videos --T 1000 --plot-environment --train-function rlai.policy_gradient.monte_carlo.reinforce.improve --num-episodes 1000 --plot-state-value True --v-S rlai.state_value.function_approximation.ApproximateStateValueEstimator --feature-extractor rlai.core.environments.gymnasium.ContinuousMountainCarFeatureExtractor --function-approximation-model rlai.models.sklearn.SKLearnSGD --no-intercept --loss squared_error --sgd-alpha 0.0 --learning-rate constant --eta0 0.001 --policy rlai.policy_gradient.policies.continuous_action.ContinuousActionBetaDistributionPolicy --policy-feature-extractor rlai.core.environments.gymnasium.ContinuousMountainCarFeatureExtractor --alpha 0.001 --update-upon-every-visit True --plot-policy --save-agent-path ~/Desktop/mountaincar_continuous_agent.pickle --log DEBUG" />
     <option name="SHOW_COMMAND_LINE" value="false" />
     <option name="EMULATE_TERMINAL" value="false" />
     <option name="MODULE_MODE" value="false" />
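The two long PARAMETERS values above differ only in the new flag passed to the function-approximation model:

--function-approximation-model rlai.models.sklearn.SKLearnSGD --no-intercept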
4 changes: 2 additions & 2 deletions src/rlai/core/environments/gymnasium.py
@@ -1206,10 +1206,10 @@ def extract(
         """

         # extract raw feature values
-        scaled_feature_matrix = super().extract(state, refit_scaler)
+        scaled_feature_vector = np.append(super().extract(state, refit_scaler), 1.0)
         interacted_feature_vector = self.state_category_interacter.interact(
             np.array([state.observation]),
-            np.array([scaled_feature_matrix])
+            np.array([scaled_feature_vector])
         )[0]

         return interacted_feature_vector
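The net effect of this change is that the intercept term now lives inside the feature extractor: a constant 1.0 is appended to the scaled feature vector before the state-category interaction, so downstream models no longer need to prepend one themselves. A minimal sketch of just the append step, with the interacter and state plumbing omitted and made-up feature values:

import numpy as np

def append_intercept(scaled_features: np.ndarray) -> np.ndarray:
    # append a constant 1.0 so a linear model's bias is learned as one more coefficient
    return np.append(scaled_features, 1.0)

scaled = np.array([0.25, -0.5])   # e.g. scaled position and velocity
print(append_intercept(scaled))   # [ 0.25 -0.5   1.  ]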
21 changes: 13 additions & 8 deletions src/rlai/policy_gradient/policies/continuous_action.py
@@ -537,8 +537,10 @@ def __commit_updates__(
         ])

         # prepend an intercept column
-        intercept_state_feature_matrix = np.ones(shape=np.add(state_feature_matrix.shape, (0, 1)))
-        intercept_state_feature_matrix[:, 1:] = state_feature_matrix
+        if self.fit_intercept:
+            intercept_state_feature_matrix = np.ones(shape=np.add(state_feature_matrix.shape, (0, 1)))
+            intercept_state_feature_matrix[:, 1:] = state_feature_matrix
+            state_feature_matrix = intercept_state_feature_matrix

         # invert action values back to [0.0, 1.0] (the domain of the beta distribution), creating one row per action
         # taken and one column per action dimension.
@@ -563,7 +565,7 @@
            ) = self.get_action_density_gradients_vmap(
                action_i_theta_a,
                action_i_theta_b,
-               intercept_state_feature_matrix,
+               state_feature_matrix,
                action_i_values
            )

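A self-contained sketch of the conditional intercept logic above, assuming a fit_intercept flag equivalent to the attribute this commit introduces:

import numpy as np

def maybe_prepend_intercept_column(X: np.ndarray, fit_intercept: bool) -> np.ndarray:
    # when fitting an intercept, prepend a column of ones to the feature matrix
    if not fit_intercept:
        return X
    X_with_intercept = np.ones(shape=np.add(X.shape, (0, 1)))
    X_with_intercept[:, 1:] = X
    return X_with_intercept

X = np.array([[0.25, -0.5],
              [0.75, 0.1]])
print(maybe_prepend_intercept_column(X, True))   # columns: [1, x1, x2]
print(maybe_prepend_intercept_column(X, False))  # X unchanged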
@@ -762,23 +764,26 @@ def __getitem__(

         self.set_action(state)

-        intercept_state_features = np.append([1.0], self.feature_extractor.extract(state, True))
+        state_feature_vector = self.feature_extractor.extract(state, True)

+        if self.fit_intercept:
+            state_feature_vector = np.append([1.0], state_feature_vector)
+
         # initialize coefficients for each action's a-shape parameter
         if self.action_theta_a is None:
             self.action_theta_a = np.zeros(
-                shape=(self.environment.get_action_space_dimensionality(), intercept_state_features.shape[0])
+                shape=(self.environment.get_action_space_dimensionality(), state_feature_vector.shape[0])
             )

         # initialize coefficients for each action's b-shape parameter
         if self.action_theta_b is None:
             self.action_theta_b = np.zeros(
-                shape=(self.environment.get_action_space_dimensionality(), intercept_state_features.shape[0])
+                shape=(self.environment.get_action_space_dimensionality(), state_feature_vector.shape[0])
             )

         # calculate the modeled shape parameters of each action dimension
-        action_a = np.exp(self.action_theta_a.dot(intercept_state_features))
-        action_b = np.exp(self.action_theta_b.dot(intercept_state_features))
+        action_a = np.exp(self.action_theta_a.dot(state_feature_vector))
+        action_b = np.exp(self.action_theta_b.dot(state_feature_vector))

         try:
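For context, the policy models each action dimension's beta-distribution shape parameters as exponentiated linear functions of the state features, a = exp(theta_a . x) and b = exp(theta_b . x), with the intercept feature included only when the intercept is fit. A hedged, standalone sketch of that computation; the dimensions and feature values are illustrative, and scipy's beta distribution stands in for the policy's own sampling machinery:

import numpy as np
from scipy.stats import beta

fit_intercept = True
x = np.array([0.25, -0.5])        # scaled state features
if fit_intercept:
    x = np.append([1.0], x)       # prepend the intercept feature

action_dims = 1                   # MountainCarContinuous has a 1-d action space
theta_a = np.zeros((action_dims, x.shape[0]))  # zero-initialized, as in the commit
theta_b = np.zeros((action_dims, x.shape[0]))

a = np.exp(theta_a.dot(x))        # exp keeps the shape parameters positive
b = np.exp(theta_b.dot(x))        # zero thetas give a = b = 1, i.e. the uniform beta(1, 1)

sample = beta.rvs(a, b)           # one draw per action dimension, in [0.0, 1.0]
print(a, b, sample)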
