From 926d344b4faadc9754290ac529816b48f3942ee1 Mon Sep 17 00:00:00 2001
From: David Rohr
Date: Thu, 9 Jan 2025 01:39:41 +0100
Subject: [PATCH] GPU: Fix copying of cluster data to GPU when not all
 processing steps are running on GPU

---
 GPU/GPUTracking/Global/GPUChainTracking.h     |  1 +
 .../Global/GPUChainTrackingClusterizer.cxx    |  2 +-
 .../Global/GPUChainTrackingTransformation.cxx | 29 ++++++++++---------
 3 files changed, 18 insertions(+), 14 deletions(-)

diff --git a/GPU/GPUTracking/Global/GPUChainTracking.h b/GPU/GPUTracking/Global/GPUChainTracking.h
index 11443b52504e2..6eb20f3093b2f 100644
--- a/GPU/GPUTracking/Global/GPUChainTracking.h
+++ b/GPU/GPUTracking/Global/GPUChainTracking.h
@@ -313,6 +313,7 @@ class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelega
   void RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice, int8_t mergeMode, GPUReconstruction::krnlDeviceType deviceType);
   void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType);
   void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function allocator, bool applyClusterCuts);
+  bool NeedTPCClustersOnGPU();
 
   std::atomic_flag mLockAtomicOutputBuffer = ATOMIC_FLAG_INIT;
   std::mutex mMutexUpdateCalib;
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
index 4bc0ee4e91ff1..ff4133d9b2ce3 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx
@@ -629,7 +629,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
   auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr;
 
-  bool buildNativeGPU = (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCConversion) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCMerging) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCCompression);
+  bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
   bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output
 
   mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor;
diff --git a/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx b/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx
index 67f1ff63e9cb3..5b7cf945a15c9 100644
--- a/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx
+++ b/GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx
@@ -32,6 +32,11 @@
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
 
+bool GPUChainTracking::NeedTPCClustersOnGPU()
+{
+  return (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCConversion) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCMerging) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCCompression);
+}
+
 int32_t GPUChainTracking::ConvertNativeToClusterData()
 {
 #ifdef GPUCA_HAVE_O2HEADERS
@@ -42,19 +47,17 @@ int32_t GPUChainTracking::ConvertNativeToClusterData()
   GPUTPCConvert& convertShadow = doGPU ? processorsShadow()->tpcConverter : convert;
 
   bool transferClusters = false;
-  if (doGPU) {
-    if (!(mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding)) {
-      mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mIOPtrs.clustersNative->nClustersTotal;
-      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
-      processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
-      WriteToConstantMemory(RecoStep::TPCConversion, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
-      *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
-      mInputsHost->mPclusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
-      mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
-      GPUMemCpy(RecoStep::TPCConversion, mInputsShadow->mPclusterNativeBuffer, mIOPtrs.clustersNative->clustersLinear, sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * mIOPtrs.clustersNative->nClustersTotal, 0, true);
-      TransferMemoryResourceLinkToGPU(RecoStep::TPCConversion, mInputsHost->mResourceClusterNativeAccess, 0);
-      transferClusters = true;
-    }
+  if (mRec->IsGPU() && !(mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding) && NeedTPCClustersOnGPU()) {
+    mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mIOPtrs.clustersNative->nClustersTotal;
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+    processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+    WriteToConstantMemory(RecoStep::TPCConversion, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
+    *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
+    mInputsHost->mPclusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+    mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
+    GPUMemCpy(RecoStep::TPCConversion, mInputsShadow->mPclusterNativeBuffer, mIOPtrs.clustersNative->clustersLinear, sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * mIOPtrs.clustersNative->nClustersTotal, 0, true);
+    TransferMemoryResourceLinkToGPU(RecoStep::TPCConversion, mInputsHost->mResourceClusterNativeAccess, 0);
+    transferClusters = true;
   }
   if (!param().par.earlyTpcTransform) {
     if (GetProcessingSettings().debugLevel >= 3) {
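
Note on the logic: the patch consolidates the four per-step checks into the new NeedTPCClustersOnGPU() helper and keys the host-to-GPU upload in ConvertNativeToClusterData() on mRec->IsGPU() plus that helper, rather than on the conversion step's own doGPU flag. The stand-alone C++ sketch below illustrates the gating only; the enum bit values and the free-function names (needTPCClustersOnGPU, mustUploadClusters) are hypothetical simplifications, not O2's actual GPUDataTypes definitions.

// Minimal sketch, assuming simplified bitmasks in place of O2's
// GPUDataTypes::RecoStep; not part of the patch.
#include <cstdint>
#include <cstdio>

enum RecoStep : uint32_t {
  TPCClusterFinding = 1u << 0,
  TPCConversion = 1u << 1,
  TPCSliceTracking = 1u << 2,
  TPCMerging = 1u << 3,
  TPCCompression = 1u << 4,
};

// Counterpart of GPUChainTracking::NeedTPCClustersOnGPU(): true if any GPU
// step that consumes cluster-native data is enabled.
bool needTPCClustersOnGPU(uint32_t stepsOnGPU)
{
  return stepsOnGPU & (TPCConversion | TPCSliceTracking | TPCMerging | TPCCompression);
}

// Counterpart of the fixed condition in ConvertNativeToClusterData(): upload
// clusters only when a GPU is used at all, the clusters were not already
// produced on the GPU by the cluster finder, and some GPU step consumes them.
bool mustUploadClusters(bool isGPU, uint32_t stepsOnGPU)
{
  return isGPU && !(stepsOnGPU & TPCClusterFinding) && needTPCClustersOnGPU(stepsOnGPU);
}

int main()
{
  std::printf("%d\n", mustUploadClusters(true, TPCMerging));                     // 1: the merger on the GPU needs the clusters
  std::printf("%d\n", mustUploadClusters(true, 0));                              // 0: no GPU step consumes them
  std::printf("%d\n", mustUploadClusters(true, TPCClusterFinding | TPCMerging)); // 0: already produced on the GPU
  return 0;
}

Since the same helper now also gates buildNativeGPU in RunTPCClusterizer(), the allocation of the GPU cluster buffers and the upload decision use one shared condition and can no longer diverge.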