GPU: Use architecture-provided fast 1/sqrt if possible

AliceO2Group · Mar 4, 2024 · commit c5bc59e
1 parent: 1dc5123
Show file tree

Hide file tree

Showing 3 changed files with 8 additions and 5 deletions.
diff --git a/GPU/Common/GPUCommonMath.h b/GPU/Common/GPUCommonMath.h
@@ -55,7 +55,7 @@ class GPUCommonMath
     return Max(lo, Min(v, hi));
   }
   GPUhdni() static float Sqrt(float x);
-  GPUd() static float FastInvSqrt(float x);
+  GPUd() static float InvSqrt(float x);
   template <class T>
   GPUhd() static T Abs(T x);
   GPUd() static float ASin(float x);
@@ -363,12 +363,15 @@ GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ,
   return retVal;
 }
 
-GPUdi() float GPUCommonMath::FastInvSqrt(float _x)
+GPUdi() float GPUCommonMath::InvSqrt(float _x)
 {
 #ifdef GPUCA_NO_FAST_MATH
   return 1.f / Sqrt(_x);
+#elif defined(__CUDACC__) || defined(__HIPCC__)
+  return __frsqrt_rn(_x);
+#elif defined(__FAST_MATH__)
+  return 1.f / sqrtf(_x);
 #else
-  // the function calculates fast inverse sqrt
   union {
     float f;
     int i;

diff --git a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx
@@ -322,7 +322,7 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int iT
         }
 #endif
         float time = merger->Param().par.earlyTpcTransform ? -1.f : merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num].getTime();
-        float tmpCharge = merger->GetConstantMem()->ioPtrs.clustersNative ? CAMath::FastInvSqrt(merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num].qMax) : 0.f;
+        float tmpCharge = merger->GetConstantMem()->ioPtrs.clustersNative ? CAMath::InvSqrt(merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num].qMax) : 0.f;
         retVal = prop.Update(yy, zz, cluster.row, param, clusterState, rejectChi2, &interpolation.hit[ihit], refit, cluster.slice, time, (avgCharge += tmpCharge) / ++nAvgCharge, tmpCharge GPUCA_DEBUG_STREAMER_CHECK(, iTrk)); // TODO: Use avgCharge
         GPUCA_DEBUG_STREAMER_CHECK(if (o2::utils::DebugStreamer::checkStream(o2::utils::StreamFlags::streamUpdateTrack, iTrk)) {
           merger->DebugStreamerUpdate(iTrk, ihit, xx, yy, zz, cluster, merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num], *this, prop, interpolation.hit[ihit], rejectChi2, refit, retVal);

diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx
@@ -129,7 +129,7 @@ GPUdi() void GPUTPCSliceData::CreateGrid(GPUconstantref() const MEM_CONSTANT(GPU
     tfFactor = dz / GPUTPCGeometry::TPCLength();
     dz = GPUTPCGeometry::TPCLength();
   }
-  const float norm = CAMath::FastInvSqrt(row->mNHits / tfFactor);
+  const float norm = CAMath::InvSqrt(row->mNHits / tfFactor);
   float sy = CAMath::Min(CAMath::Max((yMax - yMin) * norm, GPUCA_MIN_BIN_SIZE), GPUCA_MAX_BIN_SIZE);
   float sz = CAMath::Min(CAMath::Max(dz * norm, GPUCA_MIN_BIN_SIZE), GPUCA_MAX_BIN_SIZE);
   int maxy, maxz;