fix: fix step calculation in jacobian matrix approximation
Previously, the step calculation incorrectly used xj.copysign(1) instead
of 1.copysign(xj). This made the sign determination useless, because the
sign of 1 is always positive, so xj.copysign(1) is just |xj|. Worse, it
multiplied the step by |xj| again. This was also the reason why the step
was zero when xj was zero, hence the check for the step being zero.
Gradient and Hessian were implemented correctly.
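The argument-order pitfall can be sketched in plain Rust (a standalone
illustration with a made-up value of xj, not the crate's code):

```rust
fn main() {
    let xj: f64 = -3.0;

    // Buggy variant: `xj.copysign(1.0)` takes the magnitude of `xj` with
    // the sign of 1.0, i.e., it is just `xj.abs()` -- the sign of `xj` is
    // lost and the factor scales the step by |xj| once more.
    assert_eq!(xj.copysign(1.0), 3.0);

    // Intended variant: `1.0f64.copysign(xj)` is 1 with the sign of `xj`,
    // i.e., sign(xj), which is +1.0 or -1.0.
    assert_eq!(1.0f64.copysign(xj), -1.0);

    // And when xj == 0, the buggy factor is zero, zeroing out the whole
    // step -- which is why the extra zero check existed.
    assert_eq!(0.0f64.copysign(1.0), 0.0);
}
```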

However, there is one problem. When I fixed the step calculation for the
Jacobian matrix, the scaled Rosenbrock test with the second initial
point started to fail for the trust region algorithm, because in the
first step the Jacobian was suddenly ill-defined (one column was zero),
which caused the LM step to be used instead of the Newton-based one. The
real culprit here is probably a poor implementation of the LM step, but
for the time being we will always use positive step sizes, which
hopefully does not harm anyone in practice.

It is also interesting that using a larger epsilon fixed the issue even
with the negative step size. This suggests that we may need some
adaptive step size technique to avoid Jacobians that are singular merely
because of a too small step size.
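A minimal sketch of the fixed, always-positive step rule applied to a
one-dimensional forward difference (toy function and values assumed for
illustration; not the crate's actual code):

```rust
fn main() {
    // A typical epsilon for forward differences: sqrt(machine epsilon).
    let eps = f64::EPSILON.sqrt();
    let xj: f64 = -3.0;
    let magnitude = 1.0; // stands in for 1 / scale[j], the typical magnitude

    // Fixed rule: always positive, and nonzero even when xj == 0,
    // so no separate zero check is needed.
    let step = eps * xj.abs().max(magnitude);
    assert!(step > 0.0);

    // Forward-difference approximation of f'(x) for f(x) = x^2.
    let f = |x: f64| x * x;
    let derivative = (f(xj + step) - f(xj)) / step;

    // The true derivative is 2 * xj; the approximation is close.
    assert!((derivative - 2.0 * xj).abs() < 1e-5);
}
```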
pnevyk committed Nov 15, 2023
1 parent 30d44a9 commit d9dd2dc
Showing 1 changed file with 14 additions and 10 deletions.
src/derivatives.rs
@@ -6,7 +6,7 @@ use nalgebra::{
     storage::{Storage, StorageMut},
     ComplexField, DimName, Dyn, IsContiguous, OMatrix, OVector, RealField, Vector, U1,
 };
-use num_traits::{One, Zero};
+use num_traits::One;
 
 use crate::core::{Function, Problem, RealField as _, System};

@@ -74,16 +74,22 @@ impl<R: System> Jacobian<R> {
 
             // Compute the step size. We would like to have the step as small as
             // possible (to be as close to the zero -- i.e., real derivative --
-            // the real derivative as possible). But at the same time, very
-            // small step could cause r(x + e_j * step_j) ~= r(x) with very
-            // small number of good digits.
+            // as possible). But at the same time, very small step could cause
+            // r(x + e_j * step_j) ~= r(x) with very small number of good
+            // digits.
             //
             // A reasonable way to balance these competing needs is to scale
             // each component by x_j itself. To avoid problems when x_j is close
             // to zero, it is modified to take the typical magnitude instead.
+            //
+            // Note that you can find in the literature that the rule for step
+            // size is actually eps * max(|xj|, magnitude) * sign(xj), that is,
+            // the step has the same sign as xj. But that caused scaled
+            // Rosenbrock test to fail for trust region algorithm. This is very
+            // anecdotal evidence and I would like to understand why the sign of
+            // the step is important.
             let magnitude = R::Field::one() / scale[j];
-            let step = eps * xj.abs().max(magnitude) * xj.copysign(R::Field::one());
-            let step = if step == R::Field::zero() { eps } else { step };
+            let step = eps * xj.abs().max(magnitude);
 
             // Update the point.
             x[j] = xj + step;
@@ -170,8 +176,7 @@ impl<F: Function> Gradient<F> {
 
             // See the implementation of Jacobian for details on computing step size.
             let magnitude = F::Field::one() / scale[i];
-            let step = eps * xi.abs().max(magnitude) * F::Field::one().copysign(xi);
-            let step = if step == F::Field::zero() { eps } else { step };
+            let step = eps * xi.abs().max(magnitude);
 
             // Update the point.
             x[i] = xi + step;
@@ -261,8 +266,7 @@ impl<F: Function> Hessian<F> {
 
             // See the implementation of Jacobian for details on computing step size.
             let magnitude = F::Field::one() / scale[i];
-            let step = eps * xi.abs().max(magnitude) * F::Field::one().copysign(xi);
-            let step = if step == F::Field::zero() { eps } else { step };
+            let step = eps * xi.abs().max(magnitude);
 
             // Store the step for Hessian calculation.
             self.steps[i] = step;
