From c5bc59e3551aec7255c23b7afdd312da7a98d40f Mon Sep 17 00:00:00 2001 From: David Rohr Date: Mon, 4 Mar 2024 18:02:23 +0100 Subject: [PATCH] GPU: Use architecture-provided fast 1/sqrt if possible --- GPU/Common/GPUCommonMath.h | 9 ++++++--- GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx | 2 +- GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx | 2 +- 3 files changed, 8 insertions(+), 5 deletions(-) diff --git a/GPU/Common/GPUCommonMath.h b/GPU/Common/GPUCommonMath.h index c7bf40d73004e..391527266e635 100644 --- a/GPU/Common/GPUCommonMath.h +++ b/GPU/Common/GPUCommonMath.h @@ -55,7 +55,7 @@ class GPUCommonMath return Max(lo, Min(v, hi)); } GPUhdni() static float Sqrt(float x); - GPUd() static float FastInvSqrt(float x); + GPUd() static float InvSqrt(float x); template GPUhd() static T Abs(T x); GPUd() static float ASin(float x); @@ -363,12 +363,15 @@ GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ, return retVal; } -GPUdi() float GPUCommonMath::FastInvSqrt(float _x) +GPUdi() float GPUCommonMath::InvSqrt(float _x) { #ifdef GPUCA_NO_FAST_MATH return 1.f / Sqrt(_x); +#elif defined(__CUDACC__) || defined(__HIPCC__) + return __frsqrt_rn(_x); +#elif defined(__FAST_MATH__) + return 1.f / sqrtf(_x); #else - // the function calculates fast inverse sqrt union { float f; int i; diff --git a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx index 8a2be598442bb..6dc53454c94c8 100644 --- a/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx +++ b/GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx @@ -322,7 +322,7 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int iT } #endif float time = merger->Param().par.earlyTpcTransform ? -1.f : merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num].getTime(); - float tmpCharge = merger->GetConstantMem()->ioPtrs.clustersNative ? 
CAMath::FastInvSqrt(merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num].qMax) : 0.f; + float tmpCharge = merger->GetConstantMem()->ioPtrs.clustersNative ? CAMath::InvSqrt(merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num].qMax) : 0.f; retVal = prop.Update(yy, zz, cluster.row, param, clusterState, rejectChi2, &interpolation.hit[ihit], refit, cluster.slice, time, (avgCharge += tmpCharge) / ++nAvgCharge, tmpCharge GPUCA_DEBUG_STREAMER_CHECK(, iTrk)); // TODO: Use avgCharge GPUCA_DEBUG_STREAMER_CHECK(if (o2::utils::DebugStreamer::checkStream(o2::utils::StreamFlags::streamUpdateTrack, iTrk)) { merger->DebugStreamerUpdate(iTrk, ihit, xx, yy, zz, cluster, merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num], *this, prop, interpolation.hit[ihit], rejectChi2, refit, retVal); diff --git a/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx b/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx index 11ccb488b7f6d..a6d0a0ea53be3 100644 --- a/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx +++ b/GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx @@ -129,7 +129,7 @@ GPUdi() void GPUTPCSliceData::CreateGrid(GPUconstantref() const MEM_CONSTANT(GPU tfFactor = dz / GPUTPCGeometry::TPCLength(); dz = GPUTPCGeometry::TPCLength(); } - const float norm = CAMath::FastInvSqrt(row->mNHits / tfFactor); + const float norm = CAMath::InvSqrt(row->mNHits / tfFactor); float sy = CAMath::Min(CAMath::Max((yMax - yMin) * norm, GPUCA_MIN_BIN_SIZE), GPUCA_MAX_BIN_SIZE); float sz = CAMath::Min(CAMath::Max(dz * norm, GPUCA_MIN_BIN_SIZE), GPUCA_MAX_BIN_SIZE); int maxy, maxz;