GPU: Fix copying of cluster data to GPU when not all processing steps are running on GPU #13845

Merged (1 commit, Jan 10, 2025)
GPU/GPUTracking/Global/GPUChainTracking.h (1 addition, 0 deletions)

@@ -313,6 +313,7 @@ class GPUChainTracking : public GPUChain, GPUReconstructionHelpers::helperDelega
   void RunTPCTrackingMerger_MergeBorderTracks(int8_t withinSlice, int8_t mergeMode, GPUReconstruction::krnlDeviceType deviceType);
   void RunTPCTrackingMerger_Resolve(int8_t useOrigTrackParam, int8_t mergeAll, GPUReconstruction::krnlDeviceType deviceType);
   void RunTPCClusterFilter(o2::tpc::ClusterNativeAccess* clusters, std::function<o2::tpc::ClusterNative*(size_t)> allocator, bool applyClusterCuts);
+  bool NeedTPCClustersOnGPU();
 
   std::atomic_flag mLockAtomicOutputBuffer = ATOMIC_FLAG_INIT;
   std::mutex mMutexUpdateCalib;
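The new NeedTPCClustersOnGPU() helper centralizes the check for whether any GPU reconstruction step consumes the native cluster buffer. As a rough illustration of the bitmask pattern it relies on — a minimal compilable sketch using a hypothetical RecoStep enum and operator overloads in place of the real GPUDataTypes definitions:

#include <cstdint>

// Hypothetical stand-in for GPUDataTypes::RecoStep; the real flags live in
// GPUDataTypes.h and may differ in names and values.
enum class RecoStep : uint32_t {
  TPCClusterFinding = 1u << 0,
  TPCConversion = 1u << 1,
  TPCSliceTracking = 1u << 2,
  TPCMerging = 1u << 3,
  TPCCompression = 1u << 4,
};

// Mask test: true if flag b is set in mask a.
inline bool operator&(RecoStep a, RecoStep b)
{
  return (static_cast<uint32_t>(a) & static_cast<uint32_t>(b)) != 0;
}

// Combine two flags into one mask.
inline RecoStep operator|(RecoStep a, RecoStep b)
{
  return static_cast<RecoStep>(static_cast<uint32_t>(a) | static_cast<uint32_t>(b));
}

// Clusters must be resident on the GPU if any step that reads them runs there.
bool needTPCClustersOnGPU(RecoStep stepsGPU)
{
  return (stepsGPU & RecoStep::TPCConversion) || (stepsGPU & RecoStep::TPCSliceTracking) ||
         (stepsGPU & RecoStep::TPCMerging) || (stepsGPU & RecoStep::TPCCompression);
}

int main()
{
  // Example: cluster finding on the host, merging and compression on the GPU
  // -> clusters are still needed on the GPU.
  RecoStep stepsGPU = RecoStep::TPCMerging | RecoStep::TPCCompression;
  return needTPCClustersOnGPU(stepsGPU) ? 0 : 1;
}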
GPU/GPUTracking/Global/GPUChainTrackingClusterizer.cxx (1 addition, 1 deletion)

@@ -629,7 +629,7 @@ int32_t GPUChainTracking::RunTPCClusterizer(bool synchronizeOutput)
 
   auto* digitsMC = propagateMCLabels ? processors()->ioPtrs.tpcPackedDigits->tpcDigitsMC : nullptr;
 
-  bool buildNativeGPU = (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCConversion) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCMerging) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCCompression);
+  bool buildNativeGPU = doGPU && NeedTPCClustersOnGPU();
   bool buildNativeHost = (mRec->GetRecoStepsOutputs() & GPUDataTypes::InOutType::TPCClusters) || GetProcessingSettings().deterministicGPUReconstruction; // TODO: Should do this also when clusters are needed for later steps on the host but not requested as output
 
   mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mRec->MemoryScalers()->nTPCHits * tpcHitLowOccupancyScalingFactor;
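The effect of this change: previously the GPU-side native cluster buffer was built whenever any downstream GPU step wanted the clusters, even if the clusterizer itself ran on the host; now it is only built when the clusterizer runs on the GPU, and the host-produced case is handled by the upload in ConvertNativeToClusterData (see the next file). A hedged before/after sketch, reusing the hypothetical RecoStep helpers from the sketch above; doGPU is assumed here to mean "TPCClusterFinding runs on the GPU":

// Old condition: depends only on the downstream GPU steps, ignoring where
// the clusterizer itself runs.
bool buildNativeGPU_old(RecoStep stepsGPU)
{
  return (stepsGPU & RecoStep::TPCConversion) || (stepsGPU & RecoStep::TPCSliceTracking) ||
         (stepsGPU & RecoStep::TPCMerging) || (stepsGPU & RecoStep::TPCCompression);
}

// New condition: the clusterizer only builds the GPU buffer when it runs on
// the GPU itself and some later GPU step actually consumes the clusters.
bool buildNativeGPU_new(bool doGPU, RecoStep stepsGPU)
{
  return doGPU && needTPCClustersOnGPU(stepsGPU);
}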
GPU/GPUTracking/Global/GPUChainTrackingTransformation.cxx (16 additions, 13 deletions)

@@ -32,6 +32,11 @@
 using namespace GPUCA_NAMESPACE::gpu;
 using namespace o2::tpc;
 
+bool GPUChainTracking::NeedTPCClustersOnGPU()
+{
+  return (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCConversion) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCSliceTracking) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCMerging) || (mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCCompression);
+}
+
 int32_t GPUChainTracking::ConvertNativeToClusterData()
 {
 #ifdef GPUCA_HAVE_O2HEADERS
@@ -42,19 +47,17 @@ int32_t GPUChainTracking::ConvertNativeToClusterData()
   GPUTPCConvert& convertShadow = doGPU ? processorsShadow()->tpcConverter : convert;
 
   bool transferClusters = false;
-  if (doGPU) {
-    if (!(mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding)) {
-      mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mIOPtrs.clustersNative->nClustersTotal;
-      AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
-      processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
-      WriteToConstantMemory(RecoStep::TPCConversion, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
-      *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
-      mInputsHost->mPclusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
-      mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
-      GPUMemCpy(RecoStep::TPCConversion, mInputsShadow->mPclusterNativeBuffer, mIOPtrs.clustersNative->clustersLinear, sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * mIOPtrs.clustersNative->nClustersTotal, 0, true);
-      TransferMemoryResourceLinkToGPU(RecoStep::TPCConversion, mInputsHost->mResourceClusterNativeAccess, 0);
-      transferClusters = true;
-    }
+  if (mRec->IsGPU() && !(mRec->GetRecoStepsGPU() & GPUDataTypes::RecoStep::TPCClusterFinding) && NeedTPCClustersOnGPU()) {
+    mInputsHost->mNClusterNative = mInputsShadow->mNClusterNative = mIOPtrs.clustersNative->nClustersTotal;
+    AllocateRegisteredMemory(mInputsHost->mResourceClusterNativeBuffer);
+    processorsShadow()->ioPtrs.clustersNative = mInputsShadow->mPclusterNativeAccess;
+    WriteToConstantMemory(RecoStep::TPCConversion, (char*)&processors()->ioPtrs - (char*)processors(), &processorsShadow()->ioPtrs, sizeof(processorsShadow()->ioPtrs), 0);
+    *mInputsHost->mPclusterNativeAccess = *mIOPtrs.clustersNative;
+    mInputsHost->mPclusterNativeAccess->clustersLinear = mInputsShadow->mPclusterNativeBuffer;
+    mInputsHost->mPclusterNativeAccess->setOffsetPtrs();
+    GPUMemCpy(RecoStep::TPCConversion, mInputsShadow->mPclusterNativeBuffer, mIOPtrs.clustersNative->clustersLinear, sizeof(mIOPtrs.clustersNative->clustersLinear[0]) * mIOPtrs.clustersNative->nClustersTotal, 0, true);
+    TransferMemoryResourceLinkToGPU(RecoStep::TPCConversion, mInputsHost->mResourceClusterNativeAccess, 0);
+    transferClusters = true;
   }
   if (!param().par.earlyTpcTransform) {
     if (GetProcessingSettings().debugLevel >= 3) {
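Taken together with the clusterizer change, the two sites now split the responsibility for getting clusters onto the GPU: clusters produced by the GPU clusterizer stay there, while clusters produced on the host (or read from input) are uploaded in ConvertNativeToClusterData whenever a GPU step actually needs them, keying on mRec->IsGPU() and NeedTPCClustersOnGPU() rather than on whether the conversion step itself runs on the GPU. A sketch of the combined decision, again with the hypothetical helpers from the first sketch (decideUpload and its struct are illustrative names, not functions from the codebase):

// Which path is responsible for producing the GPU-resident cluster buffer.
struct UploadDecision {
  bool clusterizerBuildsOnGPU; // RunTPCClusterizer builds the buffer in place
  bool conversionUploads;      // ConvertNativeToClusterData copies it over
};

UploadDecision decideUpload(bool isGPU, RecoStep stepsGPU)
{
  bool clusterFindingOnGPU = stepsGPU & RecoStep::TPCClusterFinding;
  bool needed = needTPCClustersOnGPU(stepsGPU);
  return UploadDecision{
      clusterFindingOnGPU && needed,           // clusters born on the GPU stay there
      isGPU && !clusterFindingOnGPU && needed, // host-side clusters get copied once
  };
}

For any configuration where clusters are needed on the GPU, exactly one of the two flags is set. The diff suggests this is the invariant the old code broke: the upload was gated on the conversion step running on the GPU, so a host-side conversion combined with GPU-side tracking, merging, or compression left the cluster buffer missing on the device.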