Skip to content

Commit

Permalink
GPU: Use architecture-provided fast 1/sqrt if possible
Browse files Browse the repository at this point in the history
  • Loading branch information
davidrohr committed Mar 4, 2024
1 parent 1dc5123 commit c5bc59e
Show file tree
Hide file tree
Showing 3 changed files with 8 additions and 5 deletions.
9 changes: 6 additions & 3 deletions GPU/Common/GPUCommonMath.h
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ class GPUCommonMath
return Max(lo, Min(v, hi));
}
GPUhdni() static float Sqrt(float x);
GPUd() static float FastInvSqrt(float x);
GPUd() static float InvSqrt(float x);
template <class T>
GPUhd() static T Abs(T x);
GPUd() static float ASin(float x);
Expand Down Expand Up @@ -363,12 +363,15 @@ GPUdi() T GPUCommonMath::MaxWithRef(T x, T y, T z, T w, S refX, S refY, S refZ,
return retVal;
}

GPUdi() float GPUCommonMath::FastInvSqrt(float _x)
GPUdi() float GPUCommonMath::InvSqrt(float _x)
{
#ifdef GPUCA_NO_FAST_MATH
return 1.f / Sqrt(_x);
#elif defined(__CUDACC__) || defined(__HIPCC__)
return __frsqrt_rn(_x);
#elif defined(__FAST_MATH__)
return 1.f / sqrtf(_x);
#else
// the function calculates fast inverse sqrt
union {
float f;
int i;
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/Merger/GPUTPCGMTrackParam.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -322,7 +322,7 @@ GPUd() bool GPUTPCGMTrackParam::Fit(GPUTPCGMMerger* GPUrestrict() merger, int iT
}
#endif
float time = merger->Param().par.earlyTpcTransform ? -1.f : merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num].getTime();
float tmpCharge = merger->GetConstantMem()->ioPtrs.clustersNative ? CAMath::FastInvSqrt(merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num].qMax) : 0.f;
float tmpCharge = merger->GetConstantMem()->ioPtrs.clustersNative ? CAMath::InvSqrt(merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num].qMax) : 0.f;
retVal = prop.Update(yy, zz, cluster.row, param, clusterState, rejectChi2, &interpolation.hit[ihit], refit, cluster.slice, time, (avgCharge += tmpCharge) / ++nAvgCharge, tmpCharge GPUCA_DEBUG_STREAMER_CHECK(, iTrk)); // TODO: Use avgCharge
GPUCA_DEBUG_STREAMER_CHECK(if (o2::utils::DebugStreamer::checkStream(o2::utils::StreamFlags::streamUpdateTrack, iTrk)) {
merger->DebugStreamerUpdate(iTrk, ihit, xx, yy, zz, cluster, merger->GetConstantMem()->ioPtrs.clustersNative->clustersLinear[cluster.num], *this, prop, interpolation.hit[ihit], rejectChi2, refit, retVal);
Expand Down
2 changes: 1 addition & 1 deletion GPU/GPUTracking/SliceTracker/GPUTPCSliceData.cxx
Original file line number Diff line number Diff line change
Expand Up @@ -129,7 +129,7 @@ GPUdi() void GPUTPCSliceData::CreateGrid(GPUconstantref() const MEM_CONSTANT(GPU
tfFactor = dz / GPUTPCGeometry::TPCLength();
dz = GPUTPCGeometry::TPCLength();
}
const float norm = CAMath::FastInvSqrt(row->mNHits / tfFactor);
const float norm = CAMath::InvSqrt(row->mNHits / tfFactor);
float sy = CAMath::Min(CAMath::Max((yMax - yMin) * norm, GPUCA_MIN_BIN_SIZE), GPUCA_MAX_BIN_SIZE);
float sz = CAMath::Min(CAMath::Max(dz * norm, GPUCA_MIN_BIN_SIZE), GPUCA_MAX_BIN_SIZE);
int maxy, maxz;
Expand Down

0 comments on commit c5bc59e

Please sign in to comment.