From 33b421259e216f9e87c41438ae4c1fcc63691163 Mon Sep 17 00:00:00 2001 From: Matteo Concas Date: Tue, 3 Dec 2024 16:07:27 +0100 Subject: [PATCH] ITS-GPU: Move Tracklet finder on GPU (#13737) * Fix hybrid vertexer printouts * Move multiplicity mask to a vector * Add gpuSpan * Debugging getSpan * Checkpointing * Fix access in tracklet finding * Fix tracklet LUTs issue * Debugging small discrepancies * Fix bad PhiBins pick * Add tracklet counting * Fix indices for used clusters * Add tracklet writing on the buffer * tracklets on gpu * Tracklet finder on GPU --- .../include/ITSReconstruction/FastMultEst.h | 2 +- .../ITS/reconstruction/src/FastMultEst.cxx | 2 +- .../GPU/ITStrackingGPU/TimeFrameGPU.h | 63 +- .../GPU/ITStrackingGPU/TrackingKernels.h | 71 +- .../ITS/tracking/GPU/ITStrackingGPU/Utils.h | 43 ++ .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu | 168 +++- .../tracking/GPU/cuda/TrackerTraitsGPU.cxx | 335 +++----- .../ITS/tracking/GPU/cuda/TrackingKernels.cu | 723 +++++++++++------- .../tracking/include/ITStracking/TimeFrame.h | 25 +- .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx | 11 +- .../ITS/tracking/src/TrackingInterface.cxx | 2 +- .../ITSMFT/ITS/tracking/src/Vertexer.cxx | 8 +- .../ITS/workflow/src/CookedTrackerSpec.cxx | 2 +- 13 files changed, 848 insertions(+), 607 deletions(-) diff --git a/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h b/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h index 457381862cc42..9e8299e89b404 100644 --- a/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h +++ b/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h @@ -45,7 +45,7 @@ struct FastMultEst { static uint32_t getCurrentRandomSeed(); int selectROFs(const gsl::span rofs, const gsl::span clus, - const gsl::span trig, std::vector& sel); + const gsl::span trig, std::vector& sel); void fillNClPerLayer(const gsl::span& clusters); float process(const std::array ncl) diff --git a/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx b/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx index a55fafdf60409..c547996c6f356 100644 --- a/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx +++ b/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx @@ -125,7 +125,7 @@ float FastMultEst::processNoiseImposed(const std::array ncl) } int FastMultEst::selectROFs(const gsl::span rofs, const gsl::span clus, - const gsl::span trig, std::vector& sel) + const gsl::span trig, std::vector& sel) { int nrof = rofs.size(), nsel = 0; const auto& multEstConf = FastMultEstConfig::Instance(); // parameters for mult estimation and cuts diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h index ad8724f315ec8..37f392ebbd3a7 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h @@ -51,9 +51,19 @@ class TimeFrameGPU : public TimeFrame void initialise(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr); void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int); void initDeviceSAFitting(); + void loadIndexTableUtils(const int); void loadTrackingFrameInfoDevice(const int); void loadUnsortedClustersDevice(const int); void loadClustersDevice(const int); + void loadClustersIndexTables(const int iteration); + void createUsedClustersDevice(const int); + void loadUsedClustersDevice(); + void loadROframeClustersDevice(const int); + void loadMultiplicityCutMask(const int); + void loadVertices(const int); + + /// + void createTrackletsLUTDevice(const int); void loadTrackletsDevice(); void loadTrackletsLUTDevice(); void loadCellsDevice(); @@ -62,6 +72,7 @@ class TimeFrameGPU : public TimeFrame void loadTrackSeedsChi2Device(); void loadRoadsDevice(); void loadTrackSeedsDevice(std::vector&); + void createTrackletsBuffers(); void createCellsBuffers(const int); void createCellsDevice(); void createCellsLUTDevice(); @@ -93,7 +104,7 @@ class TimeFrameGPU : public TimeFrame std::vector>& getLabelsInChunks() { return mLabelsInChunks; } int getNAllocatedROFs() const { return mNrof; } // Allocated means maximum nROF for each chunk while populated is the number of loaded ones. StaticTrackingParameters* getDeviceTrackingParameters() { return mTrackingParamsDevice; } - Vertex* getDeviceVertices() { return mVerticesDevice; } + Vertex* getDeviceVertices() { return mPrimaryVerticesDevice; } int* getDeviceROFramesPV() { return mROFramesPVDevice; } unsigned char* getDeviceUsedClusters(const int); const o2::base::Propagator* getChainPropagator(); @@ -107,8 +118,12 @@ class TimeFrameGPU : public TimeFrame const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; } const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; } const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; } - const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; } - const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } + const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; } + std::vector getClusterSizes(); + const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; } + const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; } + Tracklet** getDeviceArrayTracklets() { return mTrackletsDeviceArray; } + int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; } int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; } int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; } CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; } @@ -116,17 +131,19 @@ class TimeFrameGPU : public TimeFrame o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; } float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; } int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; } + uint8_t* getDeviceMultCutMask() { return mMultMaskDevice; } void setDevicePropagator(const o2::base::PropagatorImpl*) override; // Host-specific getters - gsl::span getHostNTracklets(const int chunkId); - gsl::span getHostNCells(const int chunkId); + gsl::span getNTracklets() { return mNTracklets; } + gsl::span getNCells() { return mNCells; } // Host-available device getters + gsl::span getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; } gsl::span getDeviceCellLUTs() { return mCellsLUTDevice; } + gsl::span getDeviceTracklet() { return mTrackletsDevice; } gsl::span getDeviceCells() { return mCellsDevice; } - gsl::span getNCellsDevice() { return mNCells; } private: void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations @@ -136,31 +153,37 @@ class TimeFrameGPU : public TimeFrame StaticTrackingParameters mStaticTrackingParams; // Host-available device buffer sizes + std::array mNTracklets; std::array mNCells; // Device pointers StaticTrackingParameters* mTrackingParamsDevice; IndexTableUtils* mIndexTableUtilsDevice; - std::array mROFramesClustersDevice; - std::array mUsedClustersDevice; - Vertex* mVerticesDevice; - int* mROFramesPVDevice; // Hybrid pref + uint8_t* mMultMaskDevice; + Vertex* mPrimaryVerticesDevice; + int* mROFramesPVDevice; std::array mClustersDevice; std::array mUnsortedClustersDevice; + std::array mClustersIndexTablesDevice; + std::array mUsedClustersDevice; + std::array mROFramesClustersDevice; const Cluster** mClustersDeviceArray; const Cluster** mUnsortedClustersDeviceArray; + const int** mClustersIndexTablesDeviceArray; + const unsigned char** mUsedClustersDeviceArray; + const int** mROFrameClustersDeviceArray; std::array mTrackletsDevice; - const Tracklet** mTrackletsDeviceArray; - const int** mTrackletsLUTDeviceArray; - std::array mTrackletsLUTDevice; + Tracklet** mTrackletsDeviceArray; + std::array mTrackletsLUTDevice; std::array mCellsLUTDevice; std::array mNeighboursLUTDevice; int** mCellsLUTDeviceArray; int** mNeighboursCellDeviceArray; int** mNeighboursCellLUTDeviceArray; + int** mTrackletsLUTDeviceArray; std::array mCellsDevice; std::array mNeighboursIndexTablesDevice; CellSeed* mTrackSeedsDevice; @@ -186,10 +209,6 @@ class TimeFrameGPU : public TimeFrame std::vector> mNVerticesInChunks; std::vector> mLabelsInChunks; - // Host memory used only in GPU tracking - std::vector mHostNTracklets; - std::vector mHostNCells; - // Temporary buffer for storing output tracks from GPU tracking std::vector mTrackITSExt; }; @@ -215,6 +234,16 @@ inline int TimeFrameGPU::getNClustersInRofSpan(const int rofIdstart, co { return static_cast(mROFramesClusters[layerId][(rofIdstart + rofSpanSize) < mROFramesClusters.size() ? rofIdstart + rofSpanSize : mROFramesClusters.size() - 1] - mROFramesClusters[layerId][rofIdstart]); } + +template +inline std::vector TimeFrameGPU::getClusterSizes() +{ + std::vector sizes(mUnsortedClusters.size()); + std::transform(mUnsortedClusters.begin(), mUnsortedClusters.end(), sizes.begin(), + [](const auto& v) { return static_cast(v.size()); }); + return sizes; +} + } // namespace gpu } // namespace its } // namespace o2 diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h index 34e6165b9530f..54bdae302e643 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h @@ -50,11 +50,74 @@ GPUg() void fitTrackSeedsKernel( #endif } // namespace gpu +template +void countTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minR, + std::vector& maxR, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads); + +template +void computeTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minR, + std::vector& maxR, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads); + void countCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, - const int** trackletsLUT, + Tracklet** tracklets, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, @@ -70,8 +133,8 @@ void countCellsHandler(const Cluster** sortedClusters, void computeCellsHandler(const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, - const int** trackletsLUT, + Tracklet** tracklets, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h index 66244bf854b5f..a88e51742e84a 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h +++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h @@ -31,6 +31,49 @@ struct gpuPair { namespace gpu { +// Poor man implementation of a span-like struct. It is very limited. +template +struct gpuSpan { + using value_type = T; + using ptr = T*; + using ref = T&; + + GPUd() gpuSpan() : _data(nullptr), _size(0) {} + GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {} + GPUd() ref operator[](unsigned int idx) const { return _data[idx]; } + GPUd() unsigned int size() const { return _size; } + GPUd() bool empty() const { return _size == 0; } + GPUd() ref front() const { return _data[0]; } + GPUd() ref back() const { return _data[_size - 1]; } + GPUd() ptr begin() const { return _data; } + GPUd() ptr end() const { return _data + _size; } + + protected: + ptr _data; + unsigned int _size; +}; + +template +struct gpuSpan { + using value_type = T; + using ptr = const T*; + using ref = const T&; + + GPUd() gpuSpan() : _data(nullptr), _size(0) {} + GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {} + GPUd() gpuSpan(const gpuSpan& other) : _data(other._data), _size(other._size) {} + GPUd() ref operator[](unsigned int idx) const { return _data[idx]; } + GPUd() unsigned int size() const { return _size; } + GPUd() bool empty() const { return _size == 0; } + GPUd() ref front() const { return _data[0]; } + GPUd() ref back() const { return _data[_size - 1]; } + GPUd() ptr begin() const { return _data; } + GPUd() ptr end() const { return _data + _size; } + + protected: + ptr _data; + unsigned int _size; +}; enum class Task { Tracker = 0, diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu index 67144ba2c98ea..4bd15c0203d81 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu @@ -92,6 +92,19 @@ void TimeFrameGPU::setDevicePropagator(const o2::base::PropagatorImpl +void TimeFrameGPU::loadIndexTableUtils(const int iteration) +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils"); + if (!iteration) { + LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB); + allocMemAsync(reinterpret_cast(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, getExtAllocator()); + } + LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB); + checkGPUError(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + template void TimeFrameGPU::loadUnsortedClustersDevice(const int iteration) { @@ -128,6 +141,65 @@ void TimeFrameGPU::loadClustersDevice(const int iteration) } } +template +void TimeFrameGPU::loadClustersIndexTables(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::createUsedClustersDevice(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB); + allocMemAsync(reinterpret_cast(&mUsedClustersDevice[iLayer]), mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, getExtAllocator()); + checkGPUError(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::loadUsedClustersDevice() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading used clusters flags"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(unsigned char) / MB); + checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::loadROframeClustersDevice(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters"); + for (auto iLayer{0}; iLayer < nLayers; ++iLayer) { + LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + allocMemAsync(reinterpret_cast(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + template void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) { @@ -146,19 +218,76 @@ void TimeFrameGPU::loadTrackingFrameInfoDevice(const int iteration) STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } +template +void TimeFrameGPU::loadMultiplicityCutMask(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask"); + LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB); + allocMemAsync(reinterpret_cast(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::loadVertices(const int iteration) +{ + if (!iteration) { + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices"); + LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB); + allocMemAsync(reinterpret_cast(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); + } +} + +template +void TimeFrameGPU::createTrackletsLUTDevice(const int iteration) +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating tracklets LUTs"); + for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { + if (!iteration) { + LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", mClusters[iLayer].size() + 1, iLayer, (mClusters[iLayer].size() + 1) * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), (mClusters[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); + } + checkGPUError(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); + } + if (!iteration) { + allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), nullptr, getExtAllocator()); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + } + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + +template +void TimeFrameGPU::createTrackletsBuffers() +{ + START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); + for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { + mNTracklets[iLayer] = 0; + checkGPUError(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost)); + LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB); + allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), nullptr, getExtAllocator()); + } + allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator()); + checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); + STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); +} + template void TimeFrameGPU::loadTrackletsDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", mTracklets[iLayer].size(), iLayer, mTracklets[iLayer].size() * sizeof(Tracklet) / MB); - allocMemAsync(reinterpret_cast(&mTrackletsDevice[iLayer]), mTracklets[iLayer].size() * sizeof(Tracklet), nullptr, getExtAllocator()); checkGPUError(cudaHostRegister(mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable)); checkGPUError(cudaMemcpyAsync(mTrackletsDevice[iLayer], mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get())); } - allocMemAsync(reinterpret_cast(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -167,14 +296,12 @@ void TimeFrameGPU::loadTrackletsLUTDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mTrackletsLUTDevice[iLayer]), mTrackletsLookupTable[iLayer].size() * sizeof(int), nullptr, getExtAllocator()); + LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer + 1, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB); checkGPUError(cudaHostRegister(mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice)); } - allocMemAsync(reinterpret_cast(&mTrackletsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); - checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaHostRegisterPortable)); - checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice)); + checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable)); + checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice)); STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -214,9 +341,9 @@ void TimeFrameGPU::createCellsLUTDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs"); for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mTracklets[iLayer].size() + 1, iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB); - allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mTracklets[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator()); - checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mTracklets[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get())); + LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB); + allocMemAsync(reinterpret_cast(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator()); + checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get())); } allocMemAsync(reinterpret_cast(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator()); checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get())); @@ -228,7 +355,7 @@ void TimeFrameGPU::createCellsBuffers(const int layer) { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers"); mNCells[layer] = 0; - checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mTracklets[layer].size(), sizeof(int), cudaMemcpyDeviceToHost)); + checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost)); LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB); allocMemAsync(reinterpret_cast(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, getExtAllocator()); @@ -319,9 +446,9 @@ void TimeFrameGPU::downloadCellsLUTDevice() { START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cell luts"); for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) { - LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mTracklets[iLayer + 1].size() + 1)); - mCellsLookupTable[iLayer].resize(mTracklets[iLayer + 1].size() + 1); - checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mTracklets[iLayer + 1].size() + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); + LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1)); + mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1); + checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get())); } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } @@ -362,13 +489,6 @@ void TimeFrameGPU::unregisterRest() LOGP(debug, "unregistering rest of the host memory..."); checkGPUError(cudaHostUnregister(mCellsDevice.data())); checkGPUError(cudaHostUnregister(mTrackletsDevice.data())); - checkGPUError(cudaHostUnregister(mTrackletsLUTDevice.data())); - for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - if (iLayer < nLayers - 2) { - checkGPUError(cudaHostUnregister(mTrackletsLookupTable[iLayer].data())); - } - checkGPUError(cudaHostUnregister(mTracklets[iLayer].data())); - } STOP_GPU_STREAM_TIMER(mGpuStreams[0].get()); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx index 3c6a307fc4ff6..ae86507e46325 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx @@ -31,241 +31,18 @@ void TrackerTraitsGPU::initialiseTimeFrame(const int iteration) mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers); mTimeFrameGPU->loadClustersDevice(iteration); mTimeFrameGPU->loadUnsortedClustersDevice(iteration); + mTimeFrameGPU->loadClustersIndexTables(iteration); mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration); + mTimeFrameGPU->loadMultiplicityCutMask(iteration); + mTimeFrameGPU->loadVertices(iteration); + mTimeFrameGPU->loadROframeClustersDevice(iteration); + mTimeFrameGPU->createUsedClustersDevice(iteration); + mTimeFrameGPU->loadIndexTableUtils(iteration); } template void TrackerTraitsGPU::computeLayerTracklets(const int iteration, int, int) { - // if (!mTimeFrameGPU->getClusters().size()) { - // return; - // } - // const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); - // gsl::span diamondSpan(&diamondVert, 1); - // std::vector threads(mTimeFrameGPU->getNChunks()); - - // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) { - // int maxTracklets{static_cast(mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->clustersPerROfCapacity) * - // static_cast(mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxTrackletsPerCluster)}; - // int maxRofPerChunk{mTimeFrameGPU->mNrof / (int)mTimeFrameGPU->getNChunks()}; - // // Define workload - // auto doTrackReconstruction = [&, chunkId, maxRofPerChunk, iteration]() -> void { - // auto offset = chunkId * maxRofPerChunk; - // auto maxROF = offset + maxRofPerChunk; - // while (offset < maxROF) { - // auto rofs = mTimeFrameGPU->loadChunkData(chunkId, offset, maxROF); - // //////////////////// - // /// Tracklet finding - - // for (int iLayer{0}; iLayer < nLayers - 1; ++iLayer) { - // auto nclus = mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, iLayer); - // const float meanDeltaR{mTrkParams[iteration].LayerRadii[iLayer + 1] - mTrkParams[iteration].LayerRadii[iLayer]}; - // gpu::computeLayerTrackletsKernelMultipleRof<<getStream(chunkId).get()>>>( - // iLayer, // const int layerIndex, - // iteration, // const int iteration, - // offset, // const unsigned int startRofId, - // rofs, // const unsigned int rofSize, - // 0, // const unsigned int deltaRof, - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(iLayer), // const Cluster* clustersCurrentLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(iLayer + 1), // const Cluster* clustersNextLayer, - // mTimeFrameGPU->getDeviceROframesClusters(iLayer), // const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF - // mTimeFrameGPU->getDeviceROframesClusters(iLayer + 1), // const int* roFrameClustersNextLayer, // Number of clusters on layer 1 per ROF - // mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(iLayer + 1), // const int* indexTableNextLayer, - // mTimeFrameGPU->getDeviceUsedClusters(iLayer), // const int* usedClustersCurrentLayer, - // mTimeFrameGPU->getDeviceUsedClusters(iLayer + 1), // const int* usedClustersNextLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), // Tracklet* tracklets, // output data - // mTimeFrameGPU->getDeviceVertices(), // const Vertex* vertices, - // mTimeFrameGPU->getDeviceROframesPV(), // const int* pvROFrame, - // mTimeFrameGPU->getPhiCut(iLayer), // const float phiCut, - // mTimeFrameGPU->getMinR(iLayer + 1), // const float minR, - // mTimeFrameGPU->getMaxR(iLayer + 1), // const float maxR, - // meanDeltaR, // const float meanDeltaR, - // mTimeFrameGPU->getPositionResolution(iLayer), // const float positionResolution, - // mTimeFrameGPU->getMSangle(iLayer), // const float mSAngle, - // mTimeFrameGPU->getDeviceTrackingParameters(), // const StaticTrackingParameters* trkPars, - // mTimeFrameGPU->getDeviceIndexTableUtils(), // const IndexTableUtils* utils - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->clustersPerROfCapacity, // const int clustersPerROfCapacity, - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxTrackletsPerCluster); // const int maxTrackletsPerCluster - - // // Remove empty tracklets due to striding. - // auto nulltracklet = o2::its::Tracklet{}; - // auto thrustTrackletsBegin = thrust::device_ptr(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer)); - // auto thrustTrackletsEnd = thrust::device_ptr(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer) + (int)rofs * maxTracklets); - // auto thrustTrackletsAfterEraseEnd = thrust::remove(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()), - // thrustTrackletsBegin, - // thrustTrackletsEnd, - // nulltracklet); - // // Sort tracklets by first cluster index. - // thrust::sort(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()), - // thrustTrackletsBegin, - // thrustTrackletsAfterEraseEnd, - // gpu::trackletSortIndexFunctor()); - - // // Remove duplicates. - // auto thrustTrackletsAfterUniqueEnd = thrust::unique(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()), thrustTrackletsBegin, thrustTrackletsAfterEraseEnd); - - // discardResult(cudaStreamSynchronize(mTimeFrameGPU->getStream(chunkId).get())); - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer] = thrustTrackletsAfterUniqueEnd - thrustTrackletsBegin; - // // Compute tracklet lookup table. - // gpu::compileTrackletsLookupTableKernel<<getStream(chunkId).get()>>>(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer), - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer]); - // discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer), // d_out - // nclus, // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - - // // Create tracklets labels, at the moment on the host - // if (mTimeFrameGPU->hasMCinformation()) { - // std::vector tracklets(mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer]); - // checkGPUError(cudaHostRegister(tracklets.data(), tracklets.size() * sizeof(o2::its::Tracklet), cudaHostRegisterDefault)); - // checkGPUError(cudaMemcpyAsync(tracklets.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), tracklets.size() * sizeof(o2::its::Tracklet), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get())); - // for (auto& trk : tracklets) { - // MCCompLabel label; - // int currentId{mTimeFrameGPU->mClusters[iLayer][trk.firstClusterIndex].clusterId}; // This is not yet offsetted to the index of the first cluster of the chunk - // int nextId{mTimeFrameGPU->mClusters[iLayer + 1][trk.secondClusterIndex].clusterId}; // This is not yet offsetted to the index of the first cluster of the chunk - // for (auto& lab1 : mTimeFrameGPU->getClusterLabels(iLayer, currentId)) { - // for (auto& lab2 : mTimeFrameGPU->getClusterLabels(iLayer + 1, nextId)) { - // if (lab1 == lab2 && lab1.isValid()) { - // label = lab1; - // break; - // } - // } - // if (label.isValid()) { - // break; - // } - // } - // // TODO: implment label merging. - // // mTimeFrameGPU->getTrackletsLabel(iLayer).emplace_back(label); - // } - // checkGPUError(cudaHostUnregister(tracklets.data())); - // } - // } - - // //////////////// - // /// Cell finding - // for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - // // Compute layer cells. - // gpu::computeLayerCellsKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer + 1), - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], - // nullptr, - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), - // mTimeFrameGPU->getDeviceTrackingParameters()); - - // // Compute number of found Cells - // checkGPUError(cub::DeviceReduce::Sum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells() + iLayer, // d_out - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - // // Compute LUT - // discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), // d_out - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - - // gpu::computeLayerCellsKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer + 1), - // mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer], - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer), - // mTimeFrameGPU->getDeviceTrackingParameters()); - // } - // checkGPUError(cudaMemcpyAsync(mTimeFrameGPU->getHostNCells(chunkId).data(), - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), - // (nLayers - 2) * sizeof(int), - // cudaMemcpyDeviceToHost, - // mTimeFrameGPU->getStream(chunkId).get())); - - // // Create cells labels - // // TODO: make it work after fixing the tracklets labels - // if (mTimeFrameGPU->hasMCinformation()) { - // for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) { - // std::vector cells(mTimeFrameGPU->getHostNCells(chunkId)[iLayer]); - // // Async with not registered memory? - // checkGPUError(cudaMemcpyAsync(cells.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), mTimeFrameGPU->getHostNCells(chunkId)[iLayer] * sizeof(o2::its::Cell), cudaMemcpyDeviceToHost)); - // for (auto& cell : cells) { - // MCCompLabel currentLab{mTimeFrameGPU->getTrackletsLabel(iLayer)[cell.getFirstTrackletIndex()]}; - // MCCompLabel nextLab{mTimeFrameGPU->getTrackletsLabel(iLayer + 1)[cell.getSecondTrackletIndex()]}; - // mTimeFrameGPU->getCellsLabel(iLayer).emplace_back(currentLab == nextLab ? currentLab : MCCompLabel()); - // } - // } - // } - - // ///////////////////// - // /// Neighbour finding - // for (int iLayer{0}; iLayer < nLayers - 3; ++iLayer) { - // gpu::computeLayerCellNeighboursKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer + 1), - // iLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), - // nullptr, - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize); - - // // Compute Cell Neighbours LUT - // checkGPUError(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(), // d_temp_storage - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), // d_in - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), // d_out - // mTimeFrameGPU->getHostNCells(chunkId)[iLayer + 1], // num_items - // mTimeFrameGPU->getStream(chunkId).get())); - - // gpu::computeLayerCellNeighboursKernel<<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>( - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer + 1), - // iLayer, - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer + 1), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeighbours(iLayer), - // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), - // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize); - - // // if (!chunkId) { - // // gpu::printBufferLayerOnThread<<<1, 1, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(iLayer, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeighbours(iLayer), - // // mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize * rofs); - // // } - // } - // // Download cells into vectors - - // for (int iLevel{nLayers - 2}; iLevel >= mTrkParams[iteration].CellMinimumLevel(); --iLevel) { - // const int minimumLevel{iLevel - 1}; - // for (int iLayer{nLayers - 3}; iLayer >= minimumLevel; --iLayer) { - // // gpu::computeLayerRoadsKernel<<<1, 1, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(iLevel, // const int level, - // // iLayer, // const int layerIndex, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceArrayCells(), // const CellSeed** cells, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(), // const int* nCells, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceArrayNeighboursCell(), // const int** neighbours, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceArrayNeighboursCellLUT(), // const int** neighboursLUT, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceRoads(), // Road* roads, - // // mTimeFrameGPU->getChunk(chunkId).getDeviceRoadsLookupTables(iLayer)); // int* roadsLookupTable - // } - // } - - // // End of tracking for this chunk - // offset += rofs; - // } - // }; - // threads[chunkId] = std::thread(doTrackReconstruction); - // } - // for (auto& thread : threads) { - // thread.join(); - // } - - // mTimeFrameGPU->wipe(nLayers); } template @@ -299,7 +76,7 @@ int TrackerTraitsGPU::getTFNumberOfClusters() const template int TrackerTraitsGPU::getTFNumberOfTracklets() const { - return mTimeFrameGPU->getNumberOfTracklets(); + return std::accumulate(mTimeFrameGPU->getNTracklets().begin(), mTimeFrameGPU->getNTracklets().end(), 0); } template @@ -313,31 +90,94 @@ int TrackerTraitsGPU::getTFNumberOfCells() const template void TrackerTraitsGPU::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex) { - TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); + auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); + // TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex); + mTimeFrameGPU->createTrackletsLUTDevice(iteration); + + const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f); + gsl::span diamondSpan(&diamondVert, 1); + int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0}; + int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()}; + + countTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + mTimeFrameGPU->getDeviceMultCutMask(), + startROF, + endROF, + mTimeFrameGPU->getNrof(), + mTrkParams[iteration].DeltaROF, + iVertex, + mTimeFrameGPU->getDeviceVertices(), + mTimeFrameGPU->getDeviceROFramesPV(), + mTimeFrameGPU->getPrimaryVerticesNum(), + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getClusterSizes(), + mTimeFrameGPU->getDeviceROframeClusters(), + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), // Required for the exclusive sums + iteration, + mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getPhiCuts(), + mTrkParams[iteration].PVres, + mTimeFrameGPU->getMinRs(), + mTimeFrameGPU->getMaxRs(), + mTimeFrameGPU->getPositionResolutions(), + mTrkParams[iteration].LayerRadii, + mTimeFrameGPU->getMSangles(), + conf.nBlocks, + conf.nThreads); + mTimeFrameGPU->createTrackletsBuffers(); + computeTrackletsInROFsHandler(mTimeFrameGPU->getDeviceIndexTableUtils(), + mTimeFrameGPU->getDeviceMultCutMask(), + startROF, + endROF, + mTimeFrameGPU->getNrof(), + mTrkParams[iteration].DeltaROF, + iVertex, + mTimeFrameGPU->getDeviceVertices(), + mTimeFrameGPU->getDeviceROFramesPV(), + mTimeFrameGPU->getPrimaryVerticesNum(), + mTimeFrameGPU->getDeviceArrayClusters(), + mTimeFrameGPU->getClusterSizes(), + mTimeFrameGPU->getDeviceROframeClusters(), + mTimeFrameGPU->getDeviceArrayUsedClusters(), + mTimeFrameGPU->getDeviceArrayClustersIndexTables(), + mTimeFrameGPU->getDeviceArrayTracklets(), + mTimeFrameGPU->getDeviceTracklet(), + mTimeFrameGPU->getNTracklets(), + mTimeFrameGPU->getDeviceArrayTrackletsLUT(), + mTimeFrameGPU->getDeviceTrackletsLUTs(), + iteration, + mTrkParams[iteration].NSigmaCut, + mTimeFrameGPU->getPhiCuts(), + mTrkParams[iteration].PVres, + mTimeFrameGPU->getMinRs(), + mTimeFrameGPU->getMaxRs(), + mTimeFrameGPU->getPositionResolutions(), + mTrkParams[iteration].LayerRadii, + mTimeFrameGPU->getMSangles(), + conf.nBlocks, + conf.nThreads); } template void TrackerTraitsGPU::computeCellsHybrid(const int iteration) { - mTimeFrameGPU->loadTrackletsDevice(); - mTimeFrameGPU->loadTrackletsLUTDevice(); mTimeFrameGPU->createCellsLUTDevice(); auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); - // #pragma omp parallel for num_threads(nLayers) for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) { - if (mTimeFrameGPU->getTracklets()[iLayer + 1].empty() || - mTimeFrameGPU->getTracklets()[iLayer].empty()) { + if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) { continue; } - - const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getTracklets()[iLayer].size())}; + const int currentLayerTrackletsNum{static_cast(mTimeFrameGPU->getNTracklets()[iLayer])}; countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(), mTimeFrameGPU->getDeviceArrayUnsortedClusters(), mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getTracklets()[iLayer].size(), + mTimeFrameGPU->getNTracklets()[iLayer], iLayer, nullptr, mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -354,7 +194,7 @@ void TrackerTraitsGPU::computeCellsHybrid(const int iteration) mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(), mTimeFrameGPU->getDeviceArrayTracklets(), mTimeFrameGPU->getDeviceArrayTrackletsLUT(), - mTimeFrameGPU->getTracklets()[iLayer].size(), + mTimeFrameGPU->getNTracklets()[iLayer], iLayer, mTimeFrameGPU->getDeviceCells()[iLayer], mTimeFrameGPU->getDeviceArrayCellsLUT(), @@ -378,7 +218,7 @@ void TrackerTraitsGPU::findCellsNeighboursHybrid(const int iteration) auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance(); std::vector>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1); for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) { - const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCellsDevice()[iLayer + 1])}; + const int nextLayerCellsNum{static_cast(mTimeFrameGPU->getNCells()[iLayer + 1])}; mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear(); mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0); @@ -441,7 +281,7 @@ void TrackerTraitsGPU::findRoads(const int iteration) std::vector lastCellId, updatedCellId; std::vector lastCellSeed, updatedCellSeed; - processNeighbours(startLayer, startLevel, mTimeFrame->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId); + processNeighbours(startLayer, startLevel, mTimeFrameGPU->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId); int level = startLevel; for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) { @@ -495,8 +335,8 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (track.getClusterIndex(iLayer) == UnusedIndex) { continue; } - nShared += int(mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer))); - isFirstShared |= !iLayer && mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer)); + nShared += int(mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer))); + isFirstShared |= !iLayer && mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer)); } if (nShared > mTrkParams[0].ClusterSharing) { @@ -508,8 +348,8 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (track.getClusterIndex(iLayer) == UnusedIndex) { continue; } - mTimeFrame->markUsedCluster(iLayer, track.getClusterIndex(iLayer)); - int currentROF = mTimeFrame->getClusterROF(iLayer, track.getClusterIndex(iLayer)); + mTimeFrameGPU->markUsedCluster(iLayer, track.getClusterIndex(iLayer)); + int currentROF = mTimeFrameGPU->getClusterROF(iLayer, track.getClusterIndex(iLayer)); for (int iR{0}; iR < 3; ++iR) { if (rofs[iR] == INT_MAX) { rofs[iR] = currentROF; @@ -525,9 +365,10 @@ void TrackerTraitsGPU::findRoads(const int iteration) if (rofs[1] != INT_MAX) { track.setNextROFbit(); } - mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); + mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); } } + mTimeFrameGPU->loadUsedClustersDevice(); if (iteration == mTrkParams.size() - 1) { mTimeFrameGPU->unregisterHostMemory(0); } diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu index 73dcf3bcb4894..229827611c077 100644 --- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu +++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu @@ -32,6 +32,7 @@ #include "ITStracking/IndexTableUtils.h" #include "ITStracking/MathUtils.h" #include "DataFormatsITS/TrackITS.h" +#include "ReconstructionDataFormats/Vertex.h" #include "ITStrackingGPU/TrackerTraitsGPU.h" #include "ITStrackingGPU/TrackingKernels.h" @@ -70,12 +71,39 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort = } namespace o2::its - { using namespace constants::its2; +using Vertex = o2::dataformats::Vertex>; + +GPUd() float Sq(float v) +{ + return v * v; +} namespace gpu { + +GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, + const o2::its::IndexTableUtils& utils, + const float z1, const float z2, float maxdeltaz, float maxdeltaphi) +{ + const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz; + const float phiRangeMin = (maxdeltaphi > constants::math::Pi) ? 0.f : currentCluster.phi - maxdeltaphi; + const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz; + const float phiRangeMax = (maxdeltaphi > constants::math::Pi) ? constants::math::TwoPi : currentCluster.phi + maxdeltaphi; + + if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] || + zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) { + + return getEmptyBinsRect(); + } + + return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)), + utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), + o2::gpu::CAMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)), + utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; +} + GPUd() bool fitTrack(TrackITSExt& track, int start, int end, @@ -127,7 +155,7 @@ GPUd() bool fitTrack(TrackITSExt& track, } nCl++; } - return o2::gpu::GPUCommonMath::Abs(track.getQ2Pt()) < maxQoverPt && track.getChi2() < chi2ndfcut * (nCl * 2 - 5); + return o2::gpu::CAMath::Abs(track.getQ2Pt()) < maxQoverPt && track.getChi2() < chi2ndfcut * (nCl * 2 - 5); } GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, @@ -146,7 +174,7 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, const float y3 = tf3.positionTrackingFrame[0]; const float z3 = tf3.positionTrackingFrame[1]; - const bool zeroField{o2::gpu::GPUCommonMath::Abs(bz) < o2::constants::math::Almost0}; + const bool zeroField{o2::gpu::CAMath::Abs(bz) < o2::constants::math::Almost0}; const float tgp = zeroField ? o2::gpu::CAMath::ATan2(y3 - y1, x3 - x1) : 1.f; const float crv = zeroField ? 1.f : math_utils::computeCurvature(x3, y3, x2, y2, x1, y1); const float snp = zeroField ? tgp / o2::gpu::CAMath::Sqrt(1.f + tgp * tgp) : crv * (x3 - math_utils::computeCurvatureCentreX(x3, y3, x2, y2, x1, y1)); @@ -164,6 +192,17 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1, 0.f, 0.f, 0.f, 0.f, sg2q2pt}); } +// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }; +// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; }; + +struct sort_tracklets { + GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); } +}; + +struct equal_tracklets { + GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; } +}; + template struct pair_to_first : public thrust::unary_function, T1> { GPUhd() int operator()(const gpuPair& a) const @@ -196,6 +235,33 @@ struct is_valid_pair { } }; +GPUd() gpuSpan getPrimaryVertices(const int rof, + const int* roframesPV, + const int nROF, + const uint8_t* mask, + const Vertex* vertices) +{ + const int start_pv_id = roframesPV[rof]; + const int stop_rof = rof >= nROF - 1 ? nROF : rof + 1; + size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if ROF is excluded + return gpuSpan(&vertices[start_pv_id], delta); +}; + +GPUd() gpuSpan getClustersOnLayer(const int rof, + const int totROFs, + const int layer, + const int** roframesClus, + const Cluster** clusters) +{ + if (rof < 0 || rof >= totROFs) { + return gpuSpan(); + } + const int start_clus_id{roframesClus[layer][rof]}; + const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1; + const unsigned int delta = roframesClus[layer][stop_rof] - start_clus_id; + return gpuSpan(&(clusters[layer][start_clus_id]), delta); +} + template GPUg() void fitTrackSeedsKernel( CellSeed* trackSeeds, @@ -314,8 +380,8 @@ GPUg() void computeLayerCellsKernel( const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, - const int** trackletsLUT, + Tracklet** tracklets, + int** trackletsLUT, const int nTrackletsCurrent, const int layer, CellSeed* cells, @@ -331,8 +397,8 @@ GPUg() void computeLayerCellsKernel( for (int iCurrentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentTrackletIndex < nTrackletsCurrent; iCurrentTrackletIndex += blockDim.x * gridDim.x) { const Tracklet& currentTracklet = tracklets[layer][iCurrentTrackletIndex]; const int nextLayerClusterIndex{currentTracklet.secondClusterIndex}; - const int nextLayerFirstTrackletIndex{trackletsLUT[layer][nextLayerClusterIndex]}; - const int nextLayerLastTrackletIndex{trackletsLUT[layer][nextLayerClusterIndex + 1]}; + const int nextLayerFirstTrackletIndex{trackletsLUT[layer + 1][nextLayerClusterIndex]}; + const int nextLayerLastTrackletIndex{trackletsLUT[layer + 1][nextLayerClusterIndex + 1]}; if (nextLayerFirstTrackletIndex == nextLayerLastTrackletIndex) { continue; } @@ -342,7 +408,7 @@ GPUg() void computeLayerCellsKernel( break; } const Tracklet& nextTracklet = tracklets[layer + 1][iNextTrackletIndex]; - const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(currentTracklet.tanLambda - nextTracklet.tanLambda)}; + const float deltaTanLambda{o2::gpu::CAMath::Abs(currentTracklet.tanLambda - nextTracklet.tanLambda)}; if (deltaTanLambda / cellDeltaTanLambdaSigma < nSigmaCut) { const int clusId[3]{ @@ -394,35 +460,124 @@ GPUg() void computeLayerCellsKernel( } } -///////////////////////////////////////// -// Debug Kernels -///////////////////////////////////////// -GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex, - const o2::its::IndexTableUtils& utils, - const float z1, const float z2, float maxdeltaz, float maxdeltaphi) +template +GPUg() void computeLayerTrackletsMultiROFKernel( + const IndexTableUtils* utils, + const uint8_t* multMask, + const int layerIndex, + const int startROF, + const int endROF, + const int totalROFs, + const int deltaROF, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const int vertexId, + const Cluster** clusters, // Input data rof0 + const int** ROFClusters, // Number of clusters on layers per ROF + const unsigned char** usedClusters, // Used clusters + const int** indexTables, // Input data rof0-delta getNphiBins()}; + const int zBins{utils->getNzBins()}; + for (unsigned int iROF{blockIdx.x}; iROF < endROF - startROF; iROF += gridDim.x) { + const short rof0 = iROF + startROF; + auto primaryVertices = getPrimaryVertices(rof0, rofPV, totalROFs, multMask, vertices); + const auto startVtx{vertexId >= 0 ? vertexId : 0}; + const auto endVtx{vertexId >= 0 ? o2::gpu::CAMath::Min(vertexId + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; + const short minROF = o2::gpu::CAMath::Max(startROF, static_cast(rof0 - deltaROF)); + const short maxROF = o2::gpu::CAMath::Min(endROF - 1, static_cast(rof0 + deltaROF)); + auto clustersCurrentLayer = getClustersOnLayer(rof0, totalROFs, layerIndex, ROFClusters, clusters); + if (clustersCurrentLayer.empty()) { + continue; + } - if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] || - zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) { + for (int currentClusterIndex = threadIdx.x; currentClusterIndex < clustersCurrentLayer.size(); currentClusterIndex += blockDim.x) { + unsigned int storedTracklets{0}; + auto currentCluster{clustersCurrentLayer[currentClusterIndex]}; + const int currentSortedIndex{ROFClusters[layerIndex][rof0] + currentClusterIndex}; + if (usedClusters[layerIndex][currentCluster.clusterId]) { + continue; + } - return getEmptyBinsRect(); - } + const float inverseR0{1.f / currentCluster.radius}; + for (int iV{startVtx}; iV < endVtx; ++iV) { + auto& primaryVertex{primaryVertices[iV]}; + if (primaryVertex.isFlagSet(2) && iteration != 3) { + continue; + } + const float resolution = o2::gpu::CAMath::Sqrt(Sq(resolutionPV) / primaryVertex.getNContributors() + Sq(positionResolution)); + const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; + const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; + const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; + const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution + const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * MSAngle))}; + const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)}; + if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { + continue; + } + int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - return int4{o2::gpu::GPUCommonMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)), - utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)), - o2::gpu::GPUCommonMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)), - utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))}; -} + if (phiBinsNum < 0) { + phiBinsNum += phiBins; + } -GPUhd() float Sq(float q) -{ - return q * q; + const int tableSize{phiBins * zBins + 1}; + for (short rof1{minROF}; rof1 <= maxROF; ++rof1) { + auto clustersNextLayer = getClustersOnLayer(rof1, totalROFs, layerIndex + 1, ROFClusters, clusters); + if (clustersNextLayer.empty()) { + continue; + } + for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { + int iPhiBin = (selectedBinsRect.y + iPhiCount) % phiBins; + const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; + const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; + const int firstRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + firstBinIndex]; + const int maxRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + maxBinIndex]; + for (int nextClusterIndex{firstRowClusterIndex}; nextClusterIndex < maxRowClusterIndex; ++nextClusterIndex) { + if (nextClusterIndex >= clustersNextLayer.size()) { + break; + } + const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]}; + if (usedClusters[layerIndex + 1][nextCluster.clusterId]) { + continue; + } + const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)}; + const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; + const int nextSortedIndex{ROFClusters[layerIndex + 1][rof1] + nextClusterIndex}; + if (deltaZ / sigmaZ < NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut)) { + if constexpr (initRun) { + trackletsLUT[layerIndex][currentSortedIndex]++; // we need l0 as well for usual exclusive sums. + } else { + const float phi{o2::gpu::CAMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; + const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; + new (tracklets[layerIndex] + trackletsLUT[layerIndex][currentSortedIndex] + storedTracklets) Tracklet{currentSortedIndex, nextSortedIndex, tanL, phi, rof0, rof1}; + } + ++storedTracklets; + } + } + } + } + } + } + } } +///////////////////////////////////////// +// Debug Kernels +///////////////////////////////////////// + template GPUd() void pPointer(T* ptr) { @@ -437,7 +592,6 @@ GPUg() void printPointersKernel(std::tuple args) std::apply(print_all, args); } -// Functors to sort tracklets template struct trackletSortEmptyFunctor : public thrust::binary_function { GPUhd() bool operator()(const T& lhs, const T& rhs) const @@ -454,7 +608,6 @@ struct trackletSortIndexFunctor : public thrust::binary_function { } }; -// Print layer buffer GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int size, const int len = 150, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { @@ -494,52 +647,12 @@ GPUg() void printBufferPointersLayerOnThread(const int layer, void** v, unsigned } } -// Dump vertices GPUg() void printVertices(const Vertex* v, unsigned int size, const unsigned int tId = 0) { if (blockIdx.x * blockDim.x + threadIdx.x == tId) { - printf("vertices: "); + printf("vertices: \n"); for (int i{0}; i < size; ++i) { - printf("x=%f y=%f z=%f\n", v[i].getX(), v[i].getY(), v[i].getZ()); - } - } -} - -// Dump tracklets -GPUg() void printTracklets(const Tracklet* t, - const int offset, - const int startRof, - const int nrof, - const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF - const int* roFrameClustersNextLayer, // Number of clusters on layer 1 per ROF - const int maxClustersPerRof = 5e2, - const int maxTrackletsPerCluster = 50, - const unsigned int tId = 0) -{ - if (threadIdx.x == tId) { - auto offsetCurrent{roFrameClustersCurrentLayer[offset]}; - auto offsetNext{roFrameClustersNextLayer[offset]}; - auto offsetChunk{(startRof - offset) * maxClustersPerRof * maxTrackletsPerCluster}; - for (int i{offsetChunk}; i < offsetChunk + nrof * maxClustersPerRof * maxTrackletsPerCluster; ++i) { - if (t[i].firstClusterIndex != -1) { - t[i].dump(offsetCurrent, offsetNext); - } - } - } -} - -GPUg() void printTrackletsNotStrided(const Tracklet* t, - const int offset, - const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF - const int* roFrameClustersNextLayer, // Number of clusters on layer 1 per ROF - const int ntracklets, - const unsigned int tId = 0) -{ - if (threadIdx.x == tId) { - auto offsetCurrent{roFrameClustersCurrentLayer[offset]}; - auto offsetNext{roFrameClustersNextLayer[offset]}; - for (int i{0}; i < ntracklets; ++i) { - t[i].dump(offsetCurrent, offsetNext); + printf("\tx=%f y=%f z=%f\n", v[i].getX(), v[i].getY(), v[i].getZ()); } } } @@ -556,102 +669,25 @@ GPUg() void printNeighbours(const gpuPair* neighbours, } } -// Compute the tracklets for a given layer -template -GPUg() void computeLayerTrackletsKernelSingleRof( - const short rof0, - const short maxRofs, - const int layerIndex, - const Cluster* clustersCurrentLayer, // input data rof0 - const Cluster* clustersNextLayer, // input data rof0-delta * trkPars, - const IndexTableUtils* utils, - const unsigned int maxTrackletsPerCluster = 50) +GPUg() void printTrackletsLUTPerROF(const int layerId, + const int** ROFClusters, + int** luts, + const int tId = 0) { - for (int currentClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; currentClusterIndex < currentLayerClustersSize; currentClusterIndex += blockDim.x * gridDim.x) { - unsigned int storedTracklets{0}; - const Cluster& currentCluster{clustersCurrentLayer[currentClusterIndex]}; - const int currentSortedIndex{roFrameClusters[rof0] + currentClusterIndex}; - if (usedClustersLayer[currentSortedIndex]) { - continue; - } - short minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0; - short maxRof = (rof0 == static_cast(maxRofs - trkPars->DeltaROF)) ? rof0 : rof0 + trkPars->DeltaROF; - const float inverseR0{1.f / currentCluster.radius}; - for (int iPrimaryVertex{0}; iPrimaryVertex < nVertices; iPrimaryVertex++) { - const auto& primaryVertex{vertices[iPrimaryVertex]}; - if (primaryVertex.getX() == 0.f && primaryVertex.getY() == 0.f && primaryVertex.getZ() == 0.f) { - continue; - } - const float resolution{o2::gpu::GPUCommonMath::Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))}; - const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; - const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; - const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; - const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution - const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))}; - - const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)}; - if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { + if (blockIdx.x * blockDim.x + threadIdx.x == tId) { + for (auto rofId{0}; rofId < 2304; ++rofId) { + int nClus = ROFClusters[layerId][rofId + 1] - ROFClusters[layerId][rofId]; + if (!nClus) { continue; } - int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - if (phiBinsNum < 0) { - phiBinsNum += trkPars->PhiBins; - } - constexpr int tableSize{256 * 128 + 1}; // hardcoded for the time being + printf("rof: %d (%d) ==> ", rofId, nClus); - for (short rof1{minRof}; rof1 <= maxRof; ++rof1) { - if (!(roFrameClustersNext[rof1 + 1] - roFrameClustersNext[rof1])) { // number of clusters on next layer > 0 - continue; - } - for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { - int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins; - const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; - const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; - const int firstRowClusterIndex = indexTable[rof1 * tableSize + firstBinIndex]; - const int maxRowClusterIndex = indexTable[rof1 * tableSize + maxBinIndex]; - for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (iNextCluster >= (roFrameClustersNext[rof1 + 1] - roFrameClustersNext[rof1])) { - break; - } - const Cluster& nextCluster{getPtrFromRuler(rof1, clustersNextLayer, roFrameClustersNext)[iNextCluster]}; - if (usedClustersNextLayer[nextCluster.clusterId]) { - continue; - } - const float deltaPhi{o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi)}; - const float deltaZ{o2::gpu::GPUCommonMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; - - if (deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::GPUCommonMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut)) { - trackletsLookUpTable[currentSortedIndex]++; // Race-condition safe - const float phi{o2::gpu::GPUCommonMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; - const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; - const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster}; - new (tracklets + stride + storedTracklets) Tracklet{currentSortedIndex, roFrameClustersNext[rof1] + iNextCluster, tanL, phi, rof0, rof1}; - ++storedTracklets; - } - } - } + for (int iC{0}; iC < nClus; ++iC) { + int nT = luts[layerId][ROFClusters[layerId][rofId] + iC]; + printf("%d\t", nT); } + printf("\n"); } - // if (storedTracklets > maxTrackletsPerCluster) { - // printf("its-gpu-tracklet finder: found more tracklets per clusters (%d) than maximum set (%d), check the configuration!\n", maxTrackletsPerCluster, storedTracklets); - // } } } @@ -661,124 +697,7 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets, const int nTracklets) { for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) { - auto& tracklet{tracklets[currentTrackletIndex]}; - if (tracklet.firstClusterIndex >= 0) { - atomicAdd(trackletsLookUpTable + tracklet.firstClusterIndex, 1); - } - } -} - -template -GPUg() void computeLayerTrackletsKernelMultipleRof( - const int layerIndex, - const int iteration, - const unsigned int startRofId, - const unsigned int rofSize, - const int maxRofs, - const Cluster* clustersCurrentLayer, // input data rof0 - const Cluster* clustersNextLayer, // input data rof0-delta * trkPars, - const IndexTableUtils* utils, - const unsigned int maxClustersPerRof = 5e2, - const unsigned int maxTrackletsPerCluster = 50) -{ - const int phiBins{utils->getNphiBins()}; - const int zBins{utils->getNzBins()}; - for (unsigned int iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) { - auto rof0 = iRof + startRofId; - auto nClustersCurrentLayerRof = o2::gpu::GPUCommonMath::Min(roFrameClustersCurrentLayer[rof0 + 1] - roFrameClustersCurrentLayer[rof0], (int)maxClustersPerRof); - // if (nClustersCurrentLayerRof > maxClustersPerRof) { - // printf("its-gpu-tracklet finder: on layer %d found more clusters per ROF (%d) than maximum set (%d), check the configuration!\n", layerIndex, nClustersCurrentLayerRof, maxClustersPerRof); - // } - auto* clustersCurrentLayerRof = clustersCurrentLayer + (roFrameClustersCurrentLayer[rof0] - roFrameClustersCurrentLayer[startRofId]); - auto nVerticesRof0 = nVertices[rof0 + 1] - nVertices[rof0]; - auto trackletsRof0 = tracklets + maxTrackletsPerCluster * maxClustersPerRof * iRof; - for (int currentClusterIndex = threadIdx.x; currentClusterIndex < nClustersCurrentLayerRof; currentClusterIndex += blockDim.x) { - unsigned int storedTracklets{0}; - const Cluster& currentCluster{clustersCurrentLayerRof[currentClusterIndex]}; - const int currentSortedIndex{roFrameClustersCurrentLayer[rof0] + currentClusterIndex}; - const int currentSortedIndexChunk{currentSortedIndex - roFrameClustersCurrentLayer[startRofId]}; - if (usedClustersLayer[currentSortedIndex]) { - continue; - } - - int minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0; - int maxRof = (rof0 == maxRofs - trkPars->DeltaROF) ? rof0 : rof0 + trkPars->DeltaROF; // works with delta = {0, 1} - const float inverseR0{1.f / currentCluster.radius}; - - for (int iPrimaryVertex{0}; iPrimaryVertex < nVerticesRof0; iPrimaryVertex++) { - const auto& primaryVertex{vertices[nVertices[rof0] + iPrimaryVertex]}; - const float resolution{o2::gpu::GPUCommonMath::Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))}; - const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0}; - const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate}; - const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate}; - const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution - const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))}; - - const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)}; - - if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) { - continue; - } - int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1}; - if (phiBinsNum < 0) { - phiBinsNum += trkPars->PhiBins; - } - const int tableSize{phiBins * zBins + 1}; - for (int rof1{minRof}; rof1 <= maxRof; ++rof1) { - auto nClustersNext{roFrameClustersNextLayer[rof1 + 1] - roFrameClustersNextLayer[rof1]}; - if (!nClustersNext) { // number of clusters on next layer > 0 - continue; - } - for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { - int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins; - const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)}; - const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1}; - const int firstRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + firstBinIndex]; - const int maxRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + maxBinIndex]; - for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (iNextCluster >= nClustersNext) { - break; - } - auto nextClusterIndex{roFrameClustersNextLayer[rof1] - roFrameClustersNextLayer[startRofId] + iNextCluster}; - const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]}; - if (usedClustersNextLayer[nextCluster.clusterId]) { - continue; - } - const float deltaPhi{o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi)}; - const float deltaZ{o2::gpu::GPUCommonMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)}; - - if ((deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::GPUCommonMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut))) { - const float phi{o2::gpu::GPUCommonMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)}; - const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)}; - const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster}; - if (storedTracklets < maxTrackletsPerCluster) { - new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast(rof0), static_cast(rof1)}; - } - // else { - // printf("its-gpu-tracklet-finder: on rof %d layer: %d: found more tracklets (%d) than maximum allowed per cluster. This is lossy!\n", rof0, layerIndex, storedTracklets); - // } - ++storedTracklets; - } - } - } - } - } - } + atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1); } } @@ -803,12 +722,176 @@ GPUg() void removeDuplicateTrackletsEntriesLUTKernel( } // namespace gpu +template +void countTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads) +{ + for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { + gpu::computeLayerTrackletsMultiROFKernel<<>>( + utils, + multMask, + iLayer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + nullptr, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[iLayer], + resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + gpuCheckError(cudaFree(d_temp_storage)); + } +} + +template +void computeTrackletsInROFsHandler(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads) +{ + for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) { + gpu::computeLayerTrackletsMultiROFKernel<<>>(utils, + multMask, + iLayer, + startROF, + endROF, + maxROF, + deltaROF, + vertices, + rofPV, + nVertices, + vertexId, + clusters, + ROFClusters, + usedClusters, + clustersIndexTables, + tracklets, + trackletsLUTs, + iteration, + NSigmaCut, + phiCuts[iLayer], + resolutionPV, + minRs[iLayer + 1], + maxRs[iLayer + 1], + resolutions[iLayer], + radii[iLayer + 1] - radii[iLayer], + mulScatAng[iLayer]); + thrust::device_ptr tracklets_ptr(spanTracklets[iLayer]); + thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets()); + auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets()); + nTracklets[iLayer] = unique_end - tracklets_ptr; + if (iLayer > 0) { + gpuCheckError(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int))); + gpu::compileTrackletsLookupTableKernel<<>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]); + void* d_temp_storage = nullptr; + size_t temp_storage_bytes = 0; + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes)); + gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage, // d_temp_storage + temp_storage_bytes, // temp_storage_bytes + trackletsLUTsHost[iLayer], // d_in + trackletsLUTsHost[iLayer], // d_out + nClusters[iLayer] + 1, // num_items + 0)); // NOLINT: this is the offset of the sum, not a pointer + gpuCheckError(cudaFree(d_temp_storage)); + } + } +} + void countCellsHandler( const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, - const int** trackletsLUT, + Tracklet** tracklets, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, @@ -850,7 +933,6 @@ void countCellsHandler( cellsLUTsHost, // d_out nTracklets + 1, // num_items 0)); // NOLINT: this is the offset of the sum, not a pointer - // gpu::printBufferLayerOnThread<<<1, 1>>>(layer, cellsLUTsHost, nTracklets + 1); gpuCheckError(cudaFree(d_temp_storage)); } @@ -858,8 +940,8 @@ void computeCellsHandler( const Cluster** sortedClusters, const Cluster** unsortedClusters, const TrackingFrameInfo** tfInfo, - const Tracklet** tracklets, - const int** trackletsLUT, + Tracklet** tracklets, + int** trackletsLUT, const int nTracklets, const int layer, CellSeed* cells, @@ -963,8 +1045,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice, const int nThreads) { - gpu::computeLayerCellNeighboursKernel<<>>( + gpu::computeLayerCellNeighboursKernel<<>>( cellsLayersDevice, neighboursLUT, neighboursIndexTable, @@ -1032,4 +1114,65 @@ void trackSeedHandler(CellSeed* trackSeeds, gpuCheckError(cudaPeekAtLastError()); gpuCheckError(cudaDeviceSynchronize()); } -} // namespace o2::its + +template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads); + +template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils, + const uint8_t* multMask, + const int startROF, + const int endROF, + const int maxROF, + const int deltaROF, + const int vertexId, + const Vertex* vertices, + const int* rofPV, + const int nVertices, + const Cluster** clusters, + std::vector nClusters, + const int** ROFClusters, + const unsigned char** usedClusters, + const int** clustersIndexTables, + Tracklet** tracklets, + gsl::span spanTracklets, + gsl::span nTracklets, + int** trackletsLUTs, + gsl::span trackletsLUTsHost, + const int iteration, + const float NSigmaCut, + std::vector& phiCuts, + const float resolutionPV, + std::vector& minRs, + std::vector& maxRs, + std::vector& resolutions, + std::vector& radii, + std::vector& mulScatAng, + const int nBlocks, + const int nThreads); +} // namespace o2::its \ No newline at end of file diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h index 906eb0fa5c21e..fa4f33782d16a 100644 --- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h +++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h @@ -106,12 +106,16 @@ class TimeFrame float getBeamX() const; float getBeamY() const; - + std::vector& getMinRs() { return mMinR; } + std::vector& getMaxRs() { return mMaxR; } float getMinR(int layer) const { return mMinR[layer]; } float getMaxR(int layer) const { return mMaxR[layer]; } float getMSangle(int layer) const { return mMSangles[layer]; } + std::vector& getMSangles() { return mMSangles; } float getPhiCut(int layer) const { return mPhiCuts[layer]; } + std::vector& getPhiCuts() { return mPhiCuts; } float getPositionResolution(int layer) const { return mPositionResolution[layer]; } + std::vector& getPositionResolutions() { return mPositionResolution; } gsl::span getClustersOnLayer(int rofId, int layerId); gsl::span getClustersOnLayer(int rofId, int layerId) const; @@ -209,8 +213,8 @@ class TimeFrame const unsigned long long& getRoadLabel(int i) const; bool isRoadFake(int i) const; - void setMultiplicityCutMask(const std::vector& cutMask) { mMultiplicityCutMask = cutMask; } - void setROFMask(const std::vector& rofMask) { mROFMask = rofMask; } + void setMultiplicityCutMask(const std::vector& cutMask) { mMultiplicityCutMask = cutMask; } + void setROFMask(const std::vector& rofMask) { mROFMask = rofMask; } void swapMasks() { mMultiplicityCutMask.swap(mROFMask); } int hasBogusClusters() const { return std::accumulate(mBogusClusters.begin(), mBogusClusters.end(), 0); } @@ -289,6 +293,7 @@ class TimeFrame std::vector> mTracks; std::vector> mCellsNeighbours; std::vector> mCellsLookupTable; + std::vector mMultiplicityCutMask; const o2::base::PropagatorImpl* mPropagatorDevice = nullptr; // Needed only for GPU protected: @@ -311,8 +316,8 @@ class TimeFrame std::vector mPhiCuts; std::vector mPositionResolution; std::vector mClusterSize; - std::vector mMultiplicityCutMask; - std::vector mROFMask; + + std::vector mROFMask; std::vector> mPValphaX; /// PV x and alpha for track propagation std::vector> mTrackletLabels; std::vector> mCellLabels; @@ -439,33 +444,33 @@ inline gsl::span TimeFrame::getClustersPerROFrange(int rofMin, in return gsl::span(); } int startIdx{mROFramesClusters[layerId][rofMin]}; // First cluster of rofMin - int endIdx{mROFramesClusters[layerId][std::min(rofMin + range, mNrof)]}; + int endIdx{mROFramesClusters[layerId][o2::gpu::CAMath::Min(rofMin + range, mNrof)]}; return {&mClusters[layerId][startIdx], static_cast::size_type>(endIdx - startIdx)}; } inline gsl::span TimeFrame::getROFramesClustersPerROFrange(int rofMin, int range, int layerId) const { - int chkdRange{std::min(range, mNrof - rofMin)}; + int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)}; return {&mROFramesClusters[layerId][rofMin], static_cast::size_type>(chkdRange)}; } inline gsl::span TimeFrame::getNClustersROFrange(int rofMin, int range, int layerId) const { - int chkdRange{std::min(range, mNrof - rofMin)}; + int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)}; return {&mNClustersPerROF[layerId][rofMin], static_cast::size_type>(chkdRange)}; } inline int TimeFrame::getTotalClustersPerROFrange(int rofMin, int range, int layerId) const { int startIdx{rofMin}; // First cluster of rofMin - int endIdx{std::min(rofMin + range, mNrof)}; + int endIdx{o2::gpu::CAMath::Min(rofMin + range, mNrof)}; return mROFramesClusters[layerId][endIdx] - mROFramesClusters[layerId][startIdx]; } inline gsl::span TimeFrame::getIndexTablePerROFrange(int rofMin, int range, int layerId) const { const int iTableSize{mIndexTableUtils.getNphiBins() * mIndexTableUtils.getNzBins() + 1}; - int chkdRange{std::min(range, mNrof - rofMin)}; + int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)}; return {&mIndexTables[layerId][rofMin * iTableSize], static_cast::size_type>(chkdRange * iTableSize)}; } diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx index da0abbae9dc1f..409b20ea23235 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx @@ -75,9 +75,9 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in for (int rof0{startROF}; rof0 < endROF; ++rof0) { gsl::span primaryVertices = mTrkParams[iteration].UseDiamond ? diamondSpan : tf->getPrimaryVertices(rof0); const int startVtx{iVertex >= 0 ? iVertex : 0}; - const int endVtx{iVertex >= 0 ? std::min(iVertex + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; - int minRof = std::max(startROF, rof0 - mTrkParams[iteration].DeltaROF); - int maxRof = std::min(endROF - 1, rof0 + mTrkParams[iteration].DeltaROF); + const int endVtx{iVertex >= 0 ? o2::gpu::CAMath::Min(iVertex + 1, static_cast(primaryVertices.size())) : static_cast(primaryVertices.size())}; + int minRof = o2::gpu::CAMath::Max(startROF, rof0 - mTrkParams[iteration].DeltaROF); + int maxRof = o2::gpu::CAMath::Min(endROF - 1, rof0 + mTrkParams[iteration].DeltaROF); #pragma omp parallel for num_threads(mNThreads) for (int iLayer = 0; iLayer < mTrkParams[iteration].TrackletsPerRoad(); ++iLayer) { gsl::span layer0 = tf->getClustersOnLayer(rof0, iLayer); @@ -128,7 +128,6 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in if (layer1.empty()) { continue; } - for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) { int iPhiBin = (selectedBinsRect.y + iPhiCount) % mTrkParams[iteration].PhiBins; const int firstBinIndex{tf->mIndexTableUtils.getBinIndex(selectedBinsRect.x, iPhiBin)}; @@ -145,9 +144,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in } const int firstRowClusterIndex = tf->getIndexTable(rof1, iLayer + 1)[firstBinIndex]; const int maxRowClusterIndex = tf->getIndexTable(rof1, iLayer + 1)[maxBinIndex]; - for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) { - if (iNextCluster >= (int)layer1.size()) { break; } @@ -668,7 +665,7 @@ void TrackerTraits::findRoads(const int iteration) if (rofs[1] != INT_MAX) { track.setNextROFbit(); } - mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track); + mTimeFrame->getTracks(o2::gpu::CAMath::Min(rofs[0], rofs[1])).emplace_back(track); } } } diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx index f00d87164d7d6..5b8a9bb1cb0f2 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx @@ -174,7 +174,7 @@ void ITSTrackingInterface::run(framework::ProcessingContext& pc) auto errorLogger = [&](std::string s) { LOG(error) << s; }; FastMultEst multEst; // mult estimator - std::vector processingMask, processUPCMask; + std::vector processingMask, processUPCMask; int cutVertexMult{0}, cutUPCVertex{0}, cutRandomMult = int(trackROFvec.size()) - multEst.selectROFs(trackROFvec, compClusters, physTriggers, processingMask); processUPCMask.resize(processingMask.size(), false); mTimeFrame->setMultiplicityCutMask(processingMask); diff --git a/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx b/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx index 4eaddc8385b8a..e87e2289b49e7 100644 --- a/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx +++ b/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx @@ -90,7 +90,7 @@ float Vertexer::clustersToVerticesHybrid(std::function logg auto timeVertexingIteration = evaluateTask( &Vertexer::findVerticesHybrid, "Hybrid Vertexer vertex finding", [](std::string) {}, iteration); - printEpilog(logger, true, nTracklets01, nTracklets12, mTimeFrame->getNLinesTotal(), mTimeFrame->getTotVertIteration().size(), timeInit, timeTracklet, timeSelection, timeVertexing); + printEpilog(logger, true, nTracklets01, nTracklets12, mTimeFrame->getNLinesTotal(), mTimeFrame->getTotVertIteration()[iteration], timeInitIteration, timeTrackletIteration, timeSelectionIteration, timeVertexingIteration); timeInit += timeInitIteration; timeTracklet += timeTrackletIteration; timeSelection += timeSelectionIteration; @@ -142,9 +142,9 @@ void Vertexer::printEpilog(std::function logger, const float initT, const float trackletT, const float selecT, const float vertexT) { float total = initT + trackletT + selecT + vertexT; - logger(fmt::format(" - {}Vertexer: found {} | {} tracklets in: {} ms", isHybrid ? "Hybrid" : "", trackletN01, trackletN12, trackletT)); - logger(fmt::format(" - {}Vertexer: selected {} tracklets in: {} ms", isHybrid ? "Hybrid" : "", selectedN, selecT)); - logger(fmt::format(" - {}Vertexer: found {} vertices in: {} ms", isHybrid ? "Hybrid" : "", vertexN, vertexT)); + logger(fmt::format(" - {}Vertexer: found {} | {} tracklets in: {} ms", isHybrid ? "Hybrid " : "", trackletN01, trackletN12, trackletT)); + logger(fmt::format(" - {}Vertexer: selected {} tracklets in: {} ms", isHybrid ? "Hybrid " : "", selectedN, selecT)); + logger(fmt::format(" - {}Vertexer: found {} vertices in: {} ms", isHybrid ? "Hybrid " : "", vertexN, vertexT)); // logger(fmt::format(" - Timeframe {} vertexing completed in: {} ms, using {} thread(s).", mTimeFrameCounter++, total, mTraits->getNThreads())); } diff --git a/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx b/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx index 01e649f982896..4a0470adcf07a 100644 --- a/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx +++ b/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx @@ -132,7 +132,7 @@ void CookedTrackerDPL::run(ProcessingContext& pc) const auto& multEstConf = FastMultEstConfig::Instance(); // parameters for mult estimation and cuts FastMultEst multEst; // mult estimator - std::vector processingMask; + std::vector processingMask; int cutVertexMult{0}, cutRandomMult = int(rofsinput.size()) - multEst.selectROFs(rofsinput, compClusters, physTriggers, processingMask); // auto processingMask_ephemeral = processingMask;