From 33b421259e216f9e87c41438ae4c1fcc63691163 Mon Sep 17 00:00:00 2001
From: Matteo Concas <matteo.concas@cern.ch>
Date: Tue, 3 Dec 2024 16:07:27 +0100
Subject: [PATCH] ITS-GPU: Move Tracklet finder on GPU (#13737)

* Fix hybrid vertexer printouts

* Move multiplicity mask to a vector<uint8_t>

* Add gpuSpan

* Debugging getSpan

* Checkpointing

* Fix access in tracklet finding

* Fix tracklet LUTs issue

* Debugging small discrepancies

* Fix bad PhiBins pick

* Add tracklet counting

* Fix indices for used clusters

* Add tracklet writing on the buffer

* tracklets on gpu

* Tracklet finder on GPU
---
 .../include/ITSReconstruction/FastMultEst.h   |   2 +-
 .../ITS/reconstruction/src/FastMultEst.cxx    |   2 +-
 .../GPU/ITStrackingGPU/TimeFrameGPU.h         |  63 +-
 .../GPU/ITStrackingGPU/TrackingKernels.h      |  71 +-
 .../ITS/tracking/GPU/ITStrackingGPU/Utils.h   |  43 ++
 .../ITS/tracking/GPU/cuda/TimeFrameGPU.cu     | 168 +++-
 .../tracking/GPU/cuda/TrackerTraitsGPU.cxx    | 335 +++-----
 .../ITS/tracking/GPU/cuda/TrackingKernels.cu  | 723 +++++++++++-------
 .../tracking/include/ITStracking/TimeFrame.h  |  25 +-
 .../ITSMFT/ITS/tracking/src/TrackerTraits.cxx |  11 +-
 .../ITS/tracking/src/TrackingInterface.cxx    |   2 +-
 .../ITSMFT/ITS/tracking/src/Vertexer.cxx      |   8 +-
 .../ITS/workflow/src/CookedTrackerSpec.cxx    |   2 +-
 13 files changed, 848 insertions(+), 607 deletions(-)
diff --git a/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h b/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h
index 457381862cc42..9e8299e89b404 100644
--- a/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h
+++ b/Detectors/ITSMFT/ITS/reconstruction/include/ITSReconstruction/FastMultEst.h
@@ -45,7 +45,7 @@ struct FastMultEst {
 
   static uint32_t getCurrentRandomSeed();
   int selectROFs(const gsl::span<const o2::itsmft::ROFRecord> rofs, const gsl::span<const o2::itsmft::CompClusterExt> clus,
-                 const gsl::span<const o2::itsmft::PhysTrigger> trig, std::vector<bool>& sel);
+                 const gsl::span<const o2::itsmft::PhysTrigger> trig, std::vector<uint8_t>& sel);
 
   void fillNClPerLayer(const gsl::span<const o2::itsmft::CompClusterExt>& clusters);
   float process(const std::array<int, NLayers> ncl)
diff --git a/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx b/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx
index a55fafdf60409..c547996c6f356 100644
--- a/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx
+++ b/Detectors/ITSMFT/ITS/reconstruction/src/FastMultEst.cxx
@@ -125,7 +125,7 @@ float FastMultEst::processNoiseImposed(const std::array<int, NLayers> ncl)
 }
 
 int FastMultEst::selectROFs(const gsl::span<const o2::itsmft::ROFRecord> rofs, const gsl::span<const o2::itsmft::CompClusterExt> clus,
-                            const gsl::span<const o2::itsmft::PhysTrigger> trig, std::vector<bool>& sel)
+                            const gsl::span<const o2::itsmft::PhysTrigger> trig, std::vector<uint8_t>& sel)
 {
   int nrof = rofs.size(), nsel = 0;
   const auto& multEstConf = FastMultEstConfig::Instance(); // parameters for mult estimation and cuts
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
index ad8724f315ec8..37f392ebbd3a7 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TimeFrameGPU.h
@@ -51,9 +51,19 @@ class TimeFrameGPU : public TimeFrame
   void initialise(const int, const TrackingParameters&, const int, IndexTableUtils* utils = nullptr, const TimeFrameGPUParameters* pars = nullptr);
   void initDevice(IndexTableUtils*, const TrackingParameters& trkParam, const TimeFrameGPUParameters&, const int, const int);
   void initDeviceSAFitting();
+  void loadIndexTableUtils(const int);
   void loadTrackingFrameInfoDevice(const int);
   void loadUnsortedClustersDevice(const int);
   void loadClustersDevice(const int);
+  void loadClustersIndexTables(const int iteration);
+  void createUsedClustersDevice(const int);
+  void loadUsedClustersDevice();
+  void loadROframeClustersDevice(const int);
+  void loadMultiplicityCutMask(const int);
+  void loadVertices(const int);
+
+  ///
+  void createTrackletsLUTDevice(const int);
   void loadTrackletsDevice();
   void loadTrackletsLUTDevice();
   void loadCellsDevice();
@@ -62,6 +72,7 @@ class TimeFrameGPU : public TimeFrame
   void loadTrackSeedsChi2Device();
   void loadRoadsDevice();
   void loadTrackSeedsDevice(std::vector<CellSeed>&);
+  void createTrackletsBuffers();
   void createCellsBuffers(const int);
   void createCellsDevice();
   void createCellsLUTDevice();
@@ -93,7 +104,7 @@ class TimeFrameGPU : public TimeFrame
   std::vector<std::vector<o2::MCCompLabel>>& getLabelsInChunks() { return mLabelsInChunks; }
   int getNAllocatedROFs() const { return mNrof; } // Allocated means maximum nROF for each chunk while populated is the number of loaded ones.
   StaticTrackingParameters<nLayers>* getDeviceTrackingParameters() { return mTrackingParamsDevice; }
-  Vertex* getDeviceVertices() { return mVerticesDevice; }
+  Vertex* getDeviceVertices() { return mPrimaryVerticesDevice; }
   int* getDeviceROFramesPV() { return mROFramesPVDevice; }
   unsigned char* getDeviceUsedClusters(const int);
   const o2::base::Propagator* getChainPropagator();
@@ -107,8 +118,12 @@ class TimeFrameGPU : public TimeFrame
   const TrackingFrameInfo** getDeviceArrayTrackingFrameInfo() const { return mTrackingFrameInfoDeviceArray; }
   const Cluster** getDeviceArrayClusters() const { return mClustersDeviceArray; }
   const Cluster** getDeviceArrayUnsortedClusters() const { return mUnsortedClustersDeviceArray; }
-  const Tracklet** getDeviceArrayTracklets() const { return mTrackletsDeviceArray; }
-  const int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; }
+  const int** getDeviceArrayClustersIndexTables() const { return mClustersIndexTablesDeviceArray; }
+  std::vector<unsigned int> getClusterSizes();
+  const unsigned char** getDeviceArrayUsedClusters() const { return mUsedClustersDeviceArray; }
+  const int** getDeviceROframeClusters() const { return mROFrameClustersDeviceArray; }
+  Tracklet** getDeviceArrayTracklets() { return mTrackletsDeviceArray; }
+  int** getDeviceArrayTrackletsLUT() const { return mTrackletsLUTDeviceArray; }
   int** getDeviceArrayCellsLUT() const { return mCellsLUTDeviceArray; }
   int** getDeviceArrayNeighboursCellLUT() const { return mNeighboursCellLUTDeviceArray; }
   CellSeed** getDeviceArrayCells() const { return mCellsDeviceArray; }
@@ -116,17 +131,19 @@ class TimeFrameGPU : public TimeFrame
   o2::track::TrackParCovF** getDeviceArrayTrackSeeds() { return mCellSeedsDeviceArray; }
   float** getDeviceArrayTrackSeedsChi2() { return mCellSeedsChi2DeviceArray; }
   int* getDeviceNeighboursIndexTables(const int layer) { return mNeighboursIndexTablesDevice[layer]; }
+  uint8_t* getDeviceMultCutMask() { return mMultMaskDevice; }
 
   void setDevicePropagator(const o2::base::PropagatorImpl<float>*) override;
 
   // Host-specific getters
-  gsl::span<int> getHostNTracklets(const int chunkId);
-  gsl::span<int> getHostNCells(const int chunkId);
+  gsl::span<int, nLayers - 1> getNTracklets() { return mNTracklets; }
+  gsl::span<int, nLayers - 2> getNCells() { return mNCells; }
 
   // Host-available device getters
+  gsl::span<int*> getDeviceTrackletsLUTs() { return mTrackletsLUTDevice; }
   gsl::span<int*> getDeviceCellLUTs() { return mCellsLUTDevice; }
+  gsl::span<Tracklet*> getDeviceTracklet() { return mTrackletsDevice; }
   gsl::span<CellSeed*> getDeviceCells() { return mCellsDevice; }
-  gsl::span<int, nLayers - 2> getNCellsDevice() { return mNCells; }
 
  private:
   void allocMemAsync(void**, size_t, Stream*, bool); // Abstract owned and unowned memory allocations
@@ -136,31 +153,37 @@ class TimeFrameGPU : public TimeFrame
   StaticTrackingParameters<nLayers> mStaticTrackingParams;
 
   // Host-available device buffer sizes
+  std::array<int, nLayers - 1> mNTracklets;
   std::array<int, nLayers - 2> mNCells;
 
   // Device pointers
   StaticTrackingParameters<nLayers>* mTrackingParamsDevice;
   IndexTableUtils* mIndexTableUtilsDevice;
-  std::array<int*, nLayers> mROFramesClustersDevice;
-  std::array<unsigned char*, nLayers> mUsedClustersDevice;
-  Vertex* mVerticesDevice;
-  int* mROFramesPVDevice;
 
   // Hybrid pref
+  uint8_t* mMultMaskDevice;
+  Vertex* mPrimaryVerticesDevice;
+  int* mROFramesPVDevice;
   std::array<Cluster*, nLayers> mClustersDevice;
   std::array<Cluster*, nLayers> mUnsortedClustersDevice;
+  std::array<int*, nLayers> mClustersIndexTablesDevice;
+  std::array<unsigned char*, nLayers> mUsedClustersDevice;
+  std::array<int*, nLayers> mROFramesClustersDevice;
   const Cluster** mClustersDeviceArray;
   const Cluster** mUnsortedClustersDeviceArray;
+  const int** mClustersIndexTablesDeviceArray;
+  const unsigned char** mUsedClustersDeviceArray;
+  const int** mROFrameClustersDeviceArray;
   std::array<Tracklet*, nLayers - 1> mTrackletsDevice;
-  const Tracklet** mTrackletsDeviceArray;
-  const int** mTrackletsLUTDeviceArray;
-  std::array<int*, nLayers - 2> mTrackletsLUTDevice;
+  Tracklet** mTrackletsDeviceArray;
+  std::array<int*, nLayers - 1> mTrackletsLUTDevice;
   std::array<int*, nLayers - 2> mCellsLUTDevice;
   std::array<int*, nLayers - 3> mNeighboursLUTDevice;
 
   int** mCellsLUTDeviceArray;
   int** mNeighboursCellDeviceArray;
   int** mNeighboursCellLUTDeviceArray;
+  int** mTrackletsLUTDeviceArray;
   std::array<CellSeed*, nLayers - 2> mCellsDevice;
   std::array<int*, nLayers - 2> mNeighboursIndexTablesDevice;
   CellSeed* mTrackSeedsDevice;
@@ -186,10 +209,6 @@ class TimeFrameGPU : public TimeFrame
   std::vector<std::vector<int>> mNVerticesInChunks;
   std::vector<std::vector<o2::MCCompLabel>> mLabelsInChunks;
 
-  // Host memory used only in GPU tracking
-  std::vector<int> mHostNTracklets;
-  std::vector<int> mHostNCells;
-
   // Temporary buffer for storing output tracks from GPU tracking
   std::vector<TrackITSExt> mTrackITSExt;
 };
@@ -215,6 +234,16 @@ inline int TimeFrameGPU<nLayers>::getNClustersInRofSpan(const int rofIdstart, co
 {
   return static_cast<int>(mROFramesClusters[layerId][(rofIdstart + rofSpanSize) < mROFramesClusters.size() ? rofIdstart + rofSpanSize : mROFramesClusters.size() - 1] - mROFramesClusters[layerId][rofIdstart]);
 }
+
+template <int nLayers>
+inline std::vector<unsigned int> TimeFrameGPU<nLayers>::getClusterSizes()
+{
+  std::vector<unsigned int> sizes(mUnsortedClusters.size());
+  std::transform(mUnsortedClusters.begin(), mUnsortedClusters.end(), sizes.begin(),
+                 [](const auto& v) { return static_cast<unsigned int>(v.size()); });
+  return sizes;
+}
+
 } // namespace gpu
 } // namespace its
 } // namespace o2
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
index 34e6165b9530f..54bdae302e643 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/TrackingKernels.h
@@ -50,11 +50,74 @@ GPUg() void fitTrackSeedsKernel(
 #endif
 } // namespace gpu
 
+template <int nLayers = 7>
+void countTrackletsInROFsHandler(const IndexTableUtils* utils,
+                                 const uint8_t* multMask,
+                                 const int startROF,
+                                 const int endROF,
+                                 const int maxROF,
+                                 const int deltaROF,
+                                 const int vertexId,
+                                 const Vertex* vertices,
+                                 const int* rofPV,
+                                 const int nVertices,
+                                 const Cluster** clusters,
+                                 std::vector<unsigned int> nClusters,
+                                 const int** ROFClusters,
+                                 const unsigned char** usedClusters,
+                                 const int** clustersIndexTables,
+                                 int** trackletsLUTs,
+                                 gsl::span<int*> trackletsLUTsHost,
+                                 const int iteration,
+                                 const float NSigmaCut,
+                                 std::vector<float>& phiCuts,
+                                 const float resolutionPV,
+                                 std::vector<float>& minR,
+                                 std::vector<float>& maxR,
+                                 std::vector<float>& resolutions,
+                                 std::vector<float>& radii,
+                                 std::vector<float>& mulScatAng,
+                                 const int nBlocks,
+                                 const int nThreads);
+
+template <int nLayers = 7>
+void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
+                                   const uint8_t* multMask,
+                                   const int startROF,
+                                   const int endROF,
+                                   const int maxROF,
+                                   const int deltaROF,
+                                   const int vertexId,
+                                   const Vertex* vertices,
+                                   const int* rofPV,
+                                   const int nVertices,
+                                   const Cluster** clusters,
+                                   std::vector<unsigned int> nClusters,
+                                   const int** ROFClusters,
+                                   const unsigned char** usedClusters,
+                                   const int** clustersIndexTables,
+                                   Tracklet** tracklets,
+                                   gsl::span<Tracklet*> spanTracklets,
+                                   gsl::span<int> nTracklets,
+                                   int** trackletsLUTs,
+                                   gsl::span<int*> trackletsLUTsHost,
+                                   const int iteration,
+                                   const float NSigmaCut,
+                                   std::vector<float>& phiCuts,
+                                   const float resolutionPV,
+                                   std::vector<float>& minR,
+                                   std::vector<float>& maxR,
+                                   std::vector<float>& resolutions,
+                                   std::vector<float>& radii,
+                                   std::vector<float>& mulScatAng,
+                                   const int nBlocks,
+                                   const int nThreads);
+
 void countCellsHandler(const Cluster** sortedClusters,
                        const Cluster** unsortedClusters,
                        const TrackingFrameInfo** tfInfo,
-                       const Tracklet** tracklets,
-                       const int** trackletsLUT,
+                       Tracklet** tracklets,
+                       int** trackletsLUT,
                        const int nTracklets,
                        const int layer,
                        CellSeed* cells,
@@ -70,8 +133,8 @@ void countCellsHandler(const Cluster** sortedClusters,
 void computeCellsHandler(const Cluster** sortedClusters,
                          const Cluster** unsortedClusters,
                          const TrackingFrameInfo** tfInfo,
-                         const Tracklet** tracklets,
-                         const int** trackletsLUT,
+                         Tracklet** tracklets,
+                         int** trackletsLUT,
                          const int nTracklets,
                          const int layer,
                          CellSeed* cells,
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
index 66244bf854b5f..a88e51742e84a 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/ITStrackingGPU/Utils.h
@@ -31,6 +31,49 @@ struct gpuPair {
 
 namespace gpu
 {
+// Poor man implementation of a span-like struct. It is very limited.
+template <typename T>
+struct gpuSpan {
+  using value_type = T;
+  using ptr = T*;
+  using ref = T&;
+
+  GPUd() gpuSpan() : _data(nullptr), _size(0) {}
+  GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {}
+  GPUd() ref operator[](unsigned int idx) const { return _data[idx]; }
+  GPUd() unsigned int size() const { return _size; }
+  GPUd() bool empty() const { return _size == 0; }
+  GPUd() ref front() const { return _data[0]; }
+  GPUd() ref back() const { return _data[_size - 1]; }
+  GPUd() ptr begin() const { return _data; }
+  GPUd() ptr end() const { return _data + _size; }
+
+ protected:
+  ptr _data;
+  unsigned int _size;
+};
+
+template <typename T>
+struct gpuSpan<const T> {
+  using value_type = T;
+  using ptr = const T*;
+  using ref = const T&;
+
+  GPUd() gpuSpan() : _data(nullptr), _size(0) {}
+  GPUd() gpuSpan(ptr data, unsigned int dim) : _data(data), _size(dim) {}
+  GPUd() gpuSpan(const gpuSpan<T>& other) : _data(other._data), _size(other._size) {}
+  GPUd() ref operator[](unsigned int idx) const { return _data[idx]; }
+  GPUd() unsigned int size() const { return _size; }
+  GPUd() bool empty() const { return _size == 0; }
+  GPUd() ref front() const { return _data[0]; }
+  GPUd() ref back() const { return _data[_size - 1]; }
+  GPUd() ptr begin() const { return _data; }
+  GPUd() ptr end() const { return _data + _size; }
+
+ protected:
+  ptr _data;
+  unsigned int _size;
+};
 
 enum class Task {
   Tracker = 0,
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
index 67144ba2c98ea..4bd15c0203d81 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TimeFrameGPU.cu
@@ -92,6 +92,19 @@ void TimeFrameGPU<nLayers>::setDevicePropagator(const o2::base::PropagatorImpl<f
   mPropagatorDevice = propagator;
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadIndexTableUtils(const int iteration)
+{
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading indextable utils");
+  if (!iteration) {
+    LOGP(debug, "gpu-allocation: allocating IndexTableUtils buffer, for {} MB.", sizeof(IndexTableUtils) / MB);
+    allocMemAsync(reinterpret_cast<void**>(&mIndexTableUtilsDevice), sizeof(IndexTableUtils), nullptr, getExtAllocator());
+  }
+  LOGP(debug, "gpu-transfer: loading IndexTableUtils object, for {} MB.", sizeof(IndexTableUtils) / MB);
+  checkGPUError(cudaMemcpyAsync(mIndexTableUtilsDevice, &mIndexTableUtils, sizeof(IndexTableUtils), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+}
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadUnsortedClustersDevice(const int iteration)
 {
@@ -128,6 +141,65 @@ void TimeFrameGPU<nLayers>::loadClustersDevice(const int iteration)
   }
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadClustersIndexTables(const int iteration)
+{
+  if (!iteration) {
+    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading sorted clusters");
+    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+      LOGP(debug, "gpu-transfer: loading clusters indextable for layer {} with {} elements, for {} MB.", iLayer, mIndexTables[iLayer].size(), mIndexTables[iLayer].size() * sizeof(int) / MB);
+      allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDevice[iLayer]), mIndexTables[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
+      checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDevice[iLayer], mIndexTables[iLayer].data(), mIndexTables[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    }
+    allocMemAsync(reinterpret_cast<void**>(&mClustersIndexTablesDeviceArray), nLayers * sizeof(int), nullptr, getExtAllocator());
+    checkGPUError(cudaMemcpyAsync(mClustersIndexTablesDeviceArray, mClustersIndexTablesDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+  }
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createUsedClustersDevice(const int iteration)
+{
+  if (!iteration) {
+    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating used clusters flags");
+    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+      LOGP(debug, "gpu-transfer: creating {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mUsedClusters[iLayer].size() * sizeof(unsigned char) / MB);
+      allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDevice[iLayer]), mUsedClusters[iLayer].size() * sizeof(unsigned char), nullptr, getExtAllocator());
+      checkGPUError(cudaMemsetAsync(mUsedClustersDevice[iLayer], 0, mUsedClusters[iLayer].size() * sizeof(unsigned char), mGpuStreams[0].get()));
+    }
+    allocMemAsync(reinterpret_cast<void**>(&mUsedClustersDeviceArray), nLayers * sizeof(unsigned char*), nullptr, getExtAllocator());
+    checkGPUError(cudaMemcpyAsync(mUsedClustersDeviceArray, mUsedClustersDevice.data(), nLayers * sizeof(unsigned char*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+  }
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadUsedClustersDevice()
+{
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading used clusters flags");
+  for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+    LOGP(debug, "gpu-transfer: loading {} used clusters flags on layer {}, for {} MB.", mUsedClusters[iLayer].size(), iLayer, mClusters[iLayer].size() * sizeof(unsigned char) / MB);
+    checkGPUError(cudaMemcpyAsync(mUsedClustersDevice[iLayer], mUsedClusters[iLayer].data(), mUsedClusters[iLayer].size() * sizeof(unsigned char), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  }
+  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadROframeClustersDevice(const int iteration)
+{
+  if (!iteration) {
+    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading ROframe clusters");
+    for (auto iLayer{0}; iLayer < nLayers; ++iLayer) {
+      LOGP(debug, "gpu-transfer: loading {} ROframe clusters info on layer {}, for {} MB.", mROFramesClusters[iLayer].size(), iLayer, mROFramesClusters[iLayer].size() * sizeof(int) / MB);
+      allocMemAsync(reinterpret_cast<void**>(&mROFramesClustersDevice[iLayer]), mROFramesClusters[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
+      checkGPUError(cudaMemcpyAsync(mROFramesClustersDevice[iLayer], mROFramesClusters[iLayer].data(), mROFramesClusters[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    }
+    allocMemAsync(reinterpret_cast<void**>(&mROFrameClustersDeviceArray), nLayers * sizeof(int*), nullptr, getExtAllocator());
+    checkGPUError(cudaMemcpyAsync(mROFrameClustersDeviceArray, mROFramesClustersDevice.data(), nLayers * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+  }
+}
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
 {
@@ -146,19 +218,76 @@ void TimeFrameGPU<nLayers>::loadTrackingFrameInfoDevice(const int iteration)
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadMultiplicityCutMask(const int iteration)
+{
+  if (!iteration) {
+    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading multiplicity cut mask");
+    LOGP(debug, "gpu-transfer: loading multiplicity cut mask with {} elements, for {} MB.", mMultiplicityCutMask.size(), mMultiplicityCutMask.size() * sizeof(bool) / MB);
+    allocMemAsync(reinterpret_cast<void**>(&mMultMaskDevice), mMultiplicityCutMask.size() * sizeof(uint8_t), nullptr, getExtAllocator());
+    checkGPUError(cudaMemcpyAsync(mMultMaskDevice, mMultiplicityCutMask.data(), mMultiplicityCutMask.size() * sizeof(uint8_t), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+  }
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::loadVertices(const int iteration)
+{
+  if (!iteration) {
+    START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading seeding vertices");
+    LOGP(debug, "gpu-transfer: loading {} ROframes vertices, for {} MB.", mROFramesPV.size(), mROFramesPV.size() * sizeof(int) / MB);
+    allocMemAsync(reinterpret_cast<void**>(&mROFramesPVDevice), mROFramesPV.size() * sizeof(int), nullptr, getExtAllocator());
+    checkGPUError(cudaMemcpyAsync(mROFramesPVDevice, mROFramesPV.data(), mROFramesPV.size() * sizeof(int), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    LOGP(debug, "gpu-transfer: loading {} seeding vertices, for {} MB.", mPrimaryVertices.size(), mPrimaryVertices.size() * sizeof(Vertex) / MB);
+    allocMemAsync(reinterpret_cast<void**>(&mPrimaryVerticesDevice), mPrimaryVertices.size() * sizeof(Vertex), nullptr, getExtAllocator());
+    checkGPUError(cudaMemcpyAsync(mPrimaryVerticesDevice, mPrimaryVertices.data(), mPrimaryVertices.size() * sizeof(Vertex), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+    STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+  }
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createTrackletsLUTDevice(const int iteration)
+{
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating tracklets LUTs");
+  for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
+    if (!iteration) {
+      LOGP(debug, "gpu-transfer: creating tracklets LUT for {} elements on layer {}, for {} MB.", mClusters[iLayer].size() + 1, iLayer, (mClusters[iLayer].size() + 1) * sizeof(int) / MB);
+      allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDevice[iLayer]), (mClusters[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator());
+    }
+    checkGPUError(cudaMemsetAsync(mTrackletsLUTDevice[iLayer], 0, (mClusters[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
+  }
+  if (!iteration) {
+    allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDeviceArray), (nLayers - 1) * sizeof(int*), nullptr, getExtAllocator());
+    checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), mTrackletsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  }
+  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+}
+
+template <int nLayers>
+void TimeFrameGPU<nLayers>::createTrackletsBuffers()
+{
+  START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers");
+  for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
+    mNTracklets[iLayer] = 0;
+    checkGPUError(cudaMemcpyAsync(&mNTracklets[iLayer], mTrackletsLUTDevice[iLayer] + mClusters[iLayer].size(), sizeof(int), cudaMemcpyDeviceToHost));
+    LOGP(debug, "gpu-transfer: creating tracklets buffer for {} elements on layer {}, for {} MB.", mNTracklets[iLayer], iLayer, mNTracklets[iLayer] * sizeof(Tracklet) / MB);
+    allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iLayer]), mNTracklets[iLayer] * sizeof(Tracklet), nullptr, getExtAllocator());
+  }
+  allocMemAsync(reinterpret_cast<void**>(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator());
+  checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable));
+  checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
+  STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
+}
+
 template <int nLayers>
 void TimeFrameGPU<nLayers>::loadTrackletsDevice()
 {
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets");
   for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
     LOGP(debug, "gpu-transfer: loading {} tracklets on layer {}, for {} MB.", mTracklets[iLayer].size(), iLayer, mTracklets[iLayer].size() * sizeof(Tracklet) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mTrackletsDevice[iLayer]), mTracklets[iLayer].size() * sizeof(Tracklet), nullptr, getExtAllocator());
     checkGPUError(cudaHostRegister(mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaHostRegisterPortable));
     checkGPUError(cudaMemcpyAsync(mTrackletsDevice[iLayer], mTracklets[iLayer].data(), mTracklets[iLayer].size() * sizeof(Tracklet), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   }
-  allocMemAsync(reinterpret_cast<void**>(&mTrackletsDeviceArray), (nLayers - 1) * sizeof(Tracklet*), nullptr, getExtAllocator());
-  checkGPUError(cudaHostRegister(mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mTrackletsDeviceArray, mTrackletsDevice.data(), (nLayers - 1) * sizeof(Tracklet*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -167,14 +296,12 @@ void TimeFrameGPU<nLayers>::loadTrackletsLUTDevice()
 {
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "loading tracklets");
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
-    LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDevice[iLayer]), mTrackletsLookupTable[iLayer].size() * sizeof(int), nullptr, getExtAllocator());
+    LOGP(debug, "gpu-transfer: loading tracklets LUT for {} elements on layer {}, for {} MB", mTrackletsLookupTable[iLayer].size(), iLayer + 1, mTrackletsLookupTable[iLayer].size() * sizeof(int) / MB);
     checkGPUError(cudaHostRegister(mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaHostRegisterPortable));
-    checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice));
+    checkGPUError(cudaMemcpyAsync(mTrackletsLUTDevice[iLayer + 1], mTrackletsLookupTable[iLayer].data(), mTrackletsLookupTable[iLayer].size() * sizeof(int), cudaMemcpyHostToDevice));
   }
-  allocMemAsync(reinterpret_cast<void**>(&mTrackletsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator());
-  checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaHostRegisterPortable));
-  checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 2) * sizeof(int*), cudaMemcpyHostToDevice));
+  checkGPUError(cudaHostRegister(mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaHostRegisterPortable));
+  checkGPUError(cudaMemcpyAsync(mTrackletsLUTDeviceArray, mTrackletsLUTDevice.data(), (nLayers - 1) * sizeof(int*), cudaMemcpyHostToDevice));
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
@@ -214,9 +341,9 @@ void TimeFrameGPU<nLayers>::createCellsLUTDevice()
 {
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells LUTs");
   for (auto iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
-    LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mTracklets[iLayer].size() + 1, iLayer, (mTracklets[iLayer].size() + 1) * sizeof(int) / MB);
-    allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[iLayer]), (mTracklets[iLayer].size() + 1) * sizeof(int), nullptr, getExtAllocator());
-    checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mTracklets[iLayer].size() + 1) * sizeof(int), mGpuStreams[0].get()));
+    LOGP(debug, "gpu-transfer: creating cell LUT for {} elements on layer {}, for {} MB.", mNTracklets[iLayer] + 1, iLayer, (mNTracklets[iLayer] + 1) * sizeof(int) / MB);
+    allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDevice[iLayer]), (mNTracklets[iLayer] + 1) * sizeof(int), nullptr, getExtAllocator());
+    checkGPUError(cudaMemsetAsync(mCellsLUTDevice[iLayer], 0, (mNTracklets[iLayer] + 1) * sizeof(int), mGpuStreams[0].get()));
   }
   allocMemAsync(reinterpret_cast<void**>(&mCellsLUTDeviceArray), (nLayers - 2) * sizeof(int*), nullptr, getExtAllocator());
   checkGPUError(cudaMemcpyAsync(mCellsLUTDeviceArray, mCellsLUTDevice.data(), mCellsLUTDevice.size() * sizeof(int*), cudaMemcpyHostToDevice, mGpuStreams[0].get()));
@@ -228,7 +355,7 @@ void TimeFrameGPU<nLayers>::createCellsBuffers(const int layer)
 {
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "creating cells buffers");
   mNCells[layer] = 0;
-  checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mTracklets[layer].size(), sizeof(int), cudaMemcpyDeviceToHost));
+  checkGPUError(cudaMemcpyAsync(&mNCells[layer], mCellsLUTDevice[layer] + mNTracklets[layer], sizeof(int), cudaMemcpyDeviceToHost));
   LOGP(debug, "gpu-transfer: creating cell buffer for {} elements on layer {}, for {} MB.", mNCells[layer], layer, mNCells[layer] * sizeof(CellSeed) / MB);
   allocMemAsync(reinterpret_cast<void**>(&mCellsDevice[layer]), mNCells[layer] * sizeof(CellSeed), nullptr, getExtAllocator());
 
@@ -319,9 +446,9 @@ void TimeFrameGPU<nLayers>::downloadCellsLUTDevice()
 {
   START_GPU_STREAM_TIMER(mGpuStreams[0].get(), "downloading cell luts");
   for (auto iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
-    LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mTracklets[iLayer + 1].size() + 1));
-    mCellsLookupTable[iLayer].resize(mTracklets[iLayer + 1].size() + 1);
-    checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mTracklets[iLayer + 1].size() + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
+    LOGP(debug, "gpu-transfer: downloading cells lut on layer {} for {} elements", iLayer, (mNTracklets[iLayer + 1] + 1));
+    mCellsLookupTable[iLayer].resize(mNTracklets[iLayer + 1] + 1);
+    checkGPUError(cudaMemcpyAsync(mCellsLookupTable[iLayer].data(), mCellsLUTDevice[iLayer + 1], (mNTracklets[iLayer + 1] + 1) * sizeof(int), cudaMemcpyDeviceToHost, mGpuStreams[0].get()));
   }
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
@@ -362,13 +489,6 @@ void TimeFrameGPU<nLayers>::unregisterRest()
   LOGP(debug, "unregistering rest of the host memory...");
   checkGPUError(cudaHostUnregister(mCellsDevice.data()));
   checkGPUError(cudaHostUnregister(mTrackletsDevice.data()));
-  checkGPUError(cudaHostUnregister(mTrackletsLUTDevice.data()));
-  for (auto iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
-    if (iLayer < nLayers - 2) {
-      checkGPUError(cudaHostUnregister(mTrackletsLookupTable[iLayer].data()));
-    }
-    checkGPUError(cudaHostUnregister(mTracklets[iLayer].data()));
-  }
   STOP_GPU_STREAM_TIMER(mGpuStreams[0].get());
 }
 
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
index 3c6a307fc4ff6..ae86507e46325 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackerTraitsGPU.cxx
@@ -31,241 +31,18 @@ void TrackerTraitsGPU<nLayers>::initialiseTimeFrame(const int iteration)
   mTimeFrameGPU->initialise(iteration, mTrkParams[iteration], nLayers);
   mTimeFrameGPU->loadClustersDevice(iteration);
   mTimeFrameGPU->loadUnsortedClustersDevice(iteration);
+  mTimeFrameGPU->loadClustersIndexTables(iteration);
   mTimeFrameGPU->loadTrackingFrameInfoDevice(iteration);
+  mTimeFrameGPU->loadMultiplicityCutMask(iteration);
+  mTimeFrameGPU->loadVertices(iteration);
+  mTimeFrameGPU->loadROframeClustersDevice(iteration);
+  mTimeFrameGPU->createUsedClustersDevice(iteration);
+  mTimeFrameGPU->loadIndexTableUtils(iteration);
 }
 
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::computeLayerTracklets(const int iteration, int, int)
 {
-  // if (!mTimeFrameGPU->getClusters().size()) {
-  //   return;
-  // }
-  // const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f);
-  // gsl::span<const Vertex> diamondSpan(&diamondVert, 1);
-  // std::vector<std::thread> threads(mTimeFrameGPU->getNChunks());
-
-  // for (int chunkId{0}; chunkId < mTimeFrameGPU->getNChunks(); ++chunkId) {
-  //   int maxTracklets{static_cast<int>(mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->clustersPerROfCapacity) *
-  //                    static_cast<int>(mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxTrackletsPerCluster)};
-  //   int maxRofPerChunk{mTimeFrameGPU->mNrof / (int)mTimeFrameGPU->getNChunks()};
-  //   // Define workload
-  //   auto doTrackReconstruction = [&, chunkId, maxRofPerChunk, iteration]() -> void {
-  //     auto offset = chunkId * maxRofPerChunk;
-  //     auto maxROF = offset + maxRofPerChunk;
-  //     while (offset < maxROF) {
-  //       auto rofs = mTimeFrameGPU->loadChunkData<gpu::Task::Tracker>(chunkId, offset, maxROF);
-  //       ////////////////////
-  //       /// Tracklet finding
-
-  //       for (int iLayer{0}; iLayer < nLayers - 1; ++iLayer) {
-  //         auto nclus = mTimeFrameGPU->getTotalClustersPerROFrange(offset, rofs, iLayer);
-  //         const float meanDeltaR{mTrkParams[iteration].LayerRadii[iLayer + 1] - mTrkParams[iteration].LayerRadii[iLayer]};
-  //         gpu::computeLayerTrackletsKernelMultipleRof<<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-  //           iLayer,                                                                                // const int layerIndex,
-  //           iteration,                                                                             // const int iteration,
-  //           offset,                                                                                // const unsigned int startRofId,
-  //           rofs,                                                                                  // const unsigned int rofSize,
-  //           0,                                                                                     // const unsigned int deltaRof,
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(iLayer),                            // const Cluster* clustersCurrentLayer,
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceClusters(iLayer + 1),                        // const Cluster* clustersNextLayer,
-  //           mTimeFrameGPU->getDeviceROframesClusters(iLayer),                                      // const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF
-  //           mTimeFrameGPU->getDeviceROframesClusters(iLayer + 1),                                  // const int* roFrameClustersNextLayer,    // Number of clusters on layer 1 per ROF
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceIndexTables(iLayer + 1),                     // const int* indexTableNextLayer,
-  //           mTimeFrameGPU->getDeviceUsedClusters(iLayer),                                          // const int* usedClustersCurrentLayer,
-  //           mTimeFrameGPU->getDeviceUsedClusters(iLayer + 1),                                      // const int* usedClustersNextLayer,
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer),                           // Tracklet* tracklets,       // output data
-  //           mTimeFrameGPU->getDeviceVertices(),                                                    // const Vertex* vertices,
-  //           mTimeFrameGPU->getDeviceROframesPV(),                                                  // const int* pvROFrame,
-  //           mTimeFrameGPU->getPhiCut(iLayer),                                                      // const float phiCut,
-  //           mTimeFrameGPU->getMinR(iLayer + 1),                                                    // const float minR,
-  //           mTimeFrameGPU->getMaxR(iLayer + 1),                                                    // const float maxR,
-  //           meanDeltaR,                                                                            // const float meanDeltaR,
-  //           mTimeFrameGPU->getPositionResolution(iLayer),                                          // const float positionResolution,
-  //           mTimeFrameGPU->getMSangle(iLayer),                                                     // const float mSAngle,
-  //           mTimeFrameGPU->getDeviceTrackingParameters(),                                          // const StaticTrackingParameters<nLayers>* trkPars,
-  //           mTimeFrameGPU->getDeviceIndexTableUtils(),                                             // const IndexTableUtils* utils
-  //           mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->clustersPerROfCapacity,  // const int clustersPerROfCapacity,
-  //           mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxTrackletsPerCluster); // const int maxTrackletsPerCluster
-
-  //         // Remove empty tracklets due to striding.
-  //         auto nulltracklet = o2::its::Tracklet{};
-  //         auto thrustTrackletsBegin = thrust::device_ptr<o2::its::Tracklet>(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer));
-  //         auto thrustTrackletsEnd = thrust::device_ptr<o2::its::Tracklet>(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer) + (int)rofs * maxTracklets);
-  //         auto thrustTrackletsAfterEraseEnd = thrust::remove(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()),
-  //                                                            thrustTrackletsBegin,
-  //                                                            thrustTrackletsEnd,
-  //                                                            nulltracklet);
-  //         // Sort tracklets by first cluster index.
-  //         thrust::sort(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()),
-  //                      thrustTrackletsBegin,
-  //                      thrustTrackletsAfterEraseEnd,
-  //                      gpu::trackletSortIndexFunctor<o2::its::Tracklet>());
-
-  //         // Remove duplicates.
-  //         auto thrustTrackletsAfterUniqueEnd = thrust::unique(THRUST_NAMESPACE::par.on(mTimeFrameGPU->getStream(chunkId).get()), thrustTrackletsBegin, thrustTrackletsAfterEraseEnd);
-
-  //         discardResult(cudaStreamSynchronize(mTimeFrameGPU->getStream(chunkId).get()));
-  //         mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer] = thrustTrackletsAfterUniqueEnd - thrustTrackletsBegin;
-  //         // Compute tracklet lookup table.
-  //         gpu::compileTrackletsLookupTableKernel<<<rofs, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer),
-  //                                                                                                            mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer),
-  //                                                                                                            mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer]);
-  //         discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(),                       // d_temp_storage
-  //                                                     mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes
-  //                                                     mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer),        // d_in
-  //                                                     mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer),        // d_out
-  //                                                     nclus,                                                                          // num_items
-  //                                                     mTimeFrameGPU->getStream(chunkId).get()));
-
-  //         // Create tracklets labels, at the moment on the host
-  //         if (mTimeFrameGPU->hasMCinformation()) {
-  //           std::vector<o2::its::Tracklet> tracklets(mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer]);
-  //           checkGPUError(cudaHostRegister(tracklets.data(), tracklets.size() * sizeof(o2::its::Tracklet), cudaHostRegisterDefault));
-  //           checkGPUError(cudaMemcpyAsync(tracklets.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer), tracklets.size() * sizeof(o2::its::Tracklet), cudaMemcpyDeviceToHost, mTimeFrameGPU->getStream(chunkId).get()));
-  //           for (auto& trk : tracklets) {
-  //             MCCompLabel label;
-  //             int currentId{mTimeFrameGPU->mClusters[iLayer][trk.firstClusterIndex].clusterId};   // This is not yet offsetted to the index of the first cluster of the chunk
-  //             int nextId{mTimeFrameGPU->mClusters[iLayer + 1][trk.secondClusterIndex].clusterId}; // This is not yet offsetted to the index of the first cluster of the chunk
-  //             for (auto& lab1 : mTimeFrameGPU->getClusterLabels(iLayer, currentId)) {
-  //               for (auto& lab2 : mTimeFrameGPU->getClusterLabels(iLayer + 1, nextId)) {
-  //                 if (lab1 == lab2 && lab1.isValid()) {
-  //                   label = lab1;
-  //                   break;
-  //                 }
-  //               }
-  //               if (label.isValid()) {
-  //                 break;
-  //               }
-  //             }
-  //             // TODO: implment label merging.
-  //             // mTimeFrameGPU->getTrackletsLabel(iLayer).emplace_back(label);
-  //           }
-  //           checkGPUError(cudaHostUnregister(tracklets.data()));
-  //         }
-  //       }
-
-  //       ////////////////
-  //       /// Cell finding
-  //       for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
-  //         // Compute layer cells.
-  //         gpu::computeLayerCellsKernel<true><<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer),
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer + 1),
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer + 1),
-  //           mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer],
-  //           nullptr,
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer),
-  //           mTimeFrameGPU->getDeviceTrackingParameters());
-
-  //         // Compute number of found Cells
-  //         checkGPUError(cub::DeviceReduce::Sum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(),                       // d_temp_storage
-  //                                              mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes
-  //                                              mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer),            // d_in
-  //                                              mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells() + iLayer,               // d_out
-  //                                              mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer],                              // num_items
-  //                                              mTimeFrameGPU->getStream(chunkId).get()));
-  //         // Compute LUT
-  //         discardResult(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(),                       // d_temp_storage
-  //                                                     mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes
-  //                                                     mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer),            // d_in
-  //                                                     mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer),            // d_out
-  //                                                     mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer],                              // num_items
-  //                                                     mTimeFrameGPU->getStream(chunkId).get()));
-
-  //         gpu::computeLayerCellsKernel<false><<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer),
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceTracklets(iLayer + 1),
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceTrackletsLookupTables(iLayer + 1),
-  //           mTimeFrameGPU->getHostNTracklets(chunkId)[iLayer],
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer),
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer),
-  //           mTimeFrameGPU->getDeviceTrackingParameters());
-  //       }
-  //       checkGPUError(cudaMemcpyAsync(mTimeFrameGPU->getHostNCells(chunkId).data(),
-  //                                     mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(),
-  //                                     (nLayers - 2) * sizeof(int),
-  //                                     cudaMemcpyDeviceToHost,
-  //                                     mTimeFrameGPU->getStream(chunkId).get()));
-
-  //       // Create cells labels
-  //       // TODO: make it work after fixing the tracklets labels
-  //       if (mTimeFrameGPU->hasMCinformation()) {
-  //         for (int iLayer{0}; iLayer < nLayers - 2; ++iLayer) {
-  //           std::vector<o2::its::Cell> cells(mTimeFrameGPU->getHostNCells(chunkId)[iLayer]);
-  //           // Async with not registered memory?
-  //           checkGPUError(cudaMemcpyAsync(cells.data(), mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer), mTimeFrameGPU->getHostNCells(chunkId)[iLayer] * sizeof(o2::its::Cell), cudaMemcpyDeviceToHost));
-  //           for (auto& cell : cells) {
-  //             MCCompLabel currentLab{mTimeFrameGPU->getTrackletsLabel(iLayer)[cell.getFirstTrackletIndex()]};
-  //             MCCompLabel nextLab{mTimeFrameGPU->getTrackletsLabel(iLayer + 1)[cell.getSecondTrackletIndex()]};
-  //             mTimeFrameGPU->getCellsLabel(iLayer).emplace_back(currentLab == nextLab ? currentLab : MCCompLabel());
-  //           }
-  //         }
-  //       }
-
-  //       /////////////////////
-  //       /// Neighbour finding
-  //       for (int iLayer{0}; iLayer < nLayers - 3; ++iLayer) {
-  //         gpu::computeLayerCellNeighboursKernel<true><<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer),
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer + 1),
-  //           iLayer,
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer + 1),
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer),
-  //           nullptr,
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(),
-  //           mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize);
-
-  //         // Compute Cell Neighbours LUT
-  //         checkGPUError(cub::DeviceScan::ExclusiveSum(mTimeFrameGPU->getChunk(chunkId).getDeviceCUBTmpBuffer(),                       // d_temp_storage
-  //                                                     mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->tmpCUBBufferSize, // temp_storage_bytes
-  //                                                     mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer),    // d_in
-  //                                                     mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer),    // d_out
-  //                                                     mTimeFrameGPU->getHostNCells(chunkId)[iLayer + 1],                              // num_items
-  //                                                     mTimeFrameGPU->getStream(chunkId).get()));
-
-  //         gpu::computeLayerCellNeighboursKernel<false><<<10, 1024, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer),
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCells(iLayer + 1),
-  //           iLayer,
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCellsLookupTables(iLayer + 1),
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeigboursLookupTables(iLayer),
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeighbours(iLayer),
-  //           mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(),
-  //           mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize);
-
-  //         // if (!chunkId) {
-  //         //   gpu::printBufferLayerOnThread<<<1, 1, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(iLayer,
-  //         //                                                                                       mTimeFrameGPU->getChunk(chunkId).getDeviceCellNeighbours(iLayer),
-  //         //                                                                                       mTimeFrameGPU->getChunk(chunkId).getTimeFrameGPUParameters()->maxNeighboursSize * rofs);
-  //         // }
-  //       }
-  //       // Download cells into vectors
-
-  //       for (int iLevel{nLayers - 2}; iLevel >= mTrkParams[iteration].CellMinimumLevel(); --iLevel) {
-  //         const int minimumLevel{iLevel - 1};
-  //         for (int iLayer{nLayers - 3}; iLayer >= minimumLevel; --iLayer) {
-  //           // gpu::computeLayerRoadsKernel<true><<<1, 1, 0, mTimeFrameGPU->getStream(chunkId).get()>>>(iLevel,                                                               // const int level,
-  //           //  iLayer,                                                               // const int layerIndex,
-  //           //  mTimeFrameGPU->getChunk(chunkId).getDeviceArrayCells(),               // const CellSeed** cells,
-  //           //  mTimeFrameGPU->getChunk(chunkId).getDeviceNFoundCells(),              // const int* nCells,
-  //           //  mTimeFrameGPU->getChunk(chunkId).getDeviceArrayNeighboursCell(),      // const int** neighbours,
-  //           //  mTimeFrameGPU->getChunk(chunkId).getDeviceArrayNeighboursCellLUT(),   // const int** neighboursLUT,
-  //           //  mTimeFrameGPU->getChunk(chunkId).getDeviceRoads(),                    // Road* roads,
-  //           //  mTimeFrameGPU->getChunk(chunkId).getDeviceRoadsLookupTables(iLayer)); // int* roadsLookupTable
-  //         }
-  //       }
-
-  //       // End of tracking for this chunk
-  //       offset += rofs;
-  //     }
-  //   };
-  //   threads[chunkId] = std::thread(doTrackReconstruction);
-  // }
-  // for (auto& thread : threads) {
-  //   thread.join();
-  // }
-
-  // mTimeFrameGPU->wipe(nLayers);
 }
 
 template <int nLayers>
@@ -299,7 +76,7 @@ int TrackerTraitsGPU<nLayers>::getTFNumberOfClusters() const
 template <int nLayers>
 int TrackerTraitsGPU<nLayers>::getTFNumberOfTracklets() const
 {
-  return mTimeFrameGPU->getNumberOfTracklets();
+  return std::accumulate(mTimeFrameGPU->getNTracklets().begin(), mTimeFrameGPU->getNTracklets().end(), 0);
 }
 
 template <int nLayers>
@@ -313,31 +90,94 @@ int TrackerTraitsGPU<nLayers>::getTFNumberOfCells() const
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::computeTrackletsHybrid(const int iteration, int iROFslice, int iVertex)
 {
-  TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex);
+  auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
+  // TrackerTraits::computeLayerTracklets(iteration, iROFslice, iVertex);
+  mTimeFrameGPU->createTrackletsLUTDevice(iteration);
+
+  const Vertex diamondVert({mTrkParams[iteration].Diamond[0], mTrkParams[iteration].Diamond[1], mTrkParams[iteration].Diamond[2]}, {25.e-6f, 0.f, 0.f, 25.e-6f, 0.f, 36.f}, 1, 1.f);
+  gsl::span<const Vertex> diamondSpan(&diamondVert, 1);
+  int startROF{mTrkParams[iteration].nROFsPerIterations > 0 ? iROFslice * mTrkParams[iteration].nROFsPerIterations : 0};
+  int endROF{mTrkParams[iteration].nROFsPerIterations > 0 ? (iROFslice + 1) * mTrkParams[iteration].nROFsPerIterations + mTrkParams[iteration].DeltaROF : mTimeFrameGPU->getNrof()};
+
+  countTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
+                                       mTimeFrameGPU->getDeviceMultCutMask(),
+                                       startROF,
+                                       endROF,
+                                       mTimeFrameGPU->getNrof(),
+                                       mTrkParams[iteration].DeltaROF,
+                                       iVertex,
+                                       mTimeFrameGPU->getDeviceVertices(),
+                                       mTimeFrameGPU->getDeviceROFramesPV(),
+                                       mTimeFrameGPU->getPrimaryVerticesNum(),
+                                       mTimeFrameGPU->getDeviceArrayClusters(),
+                                       mTimeFrameGPU->getClusterSizes(),
+                                       mTimeFrameGPU->getDeviceROframeClusters(),
+                                       mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                       mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
+                                       mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
+                                       mTimeFrameGPU->getDeviceTrackletsLUTs(), // Required for the exclusive sums
+                                       iteration,
+                                       mTrkParams[iteration].NSigmaCut,
+                                       mTimeFrameGPU->getPhiCuts(),
+                                       mTrkParams[iteration].PVres,
+                                       mTimeFrameGPU->getMinRs(),
+                                       mTimeFrameGPU->getMaxRs(),
+                                       mTimeFrameGPU->getPositionResolutions(),
+                                       mTrkParams[iteration].LayerRadii,
+                                       mTimeFrameGPU->getMSangles(),
+                                       conf.nBlocks,
+                                       conf.nThreads);
+  mTimeFrameGPU->createTrackletsBuffers();
+  computeTrackletsInROFsHandler<nLayers>(mTimeFrameGPU->getDeviceIndexTableUtils(),
+                                         mTimeFrameGPU->getDeviceMultCutMask(),
+                                         startROF,
+                                         endROF,
+                                         mTimeFrameGPU->getNrof(),
+                                         mTrkParams[iteration].DeltaROF,
+                                         iVertex,
+                                         mTimeFrameGPU->getDeviceVertices(),
+                                         mTimeFrameGPU->getDeviceROFramesPV(),
+                                         mTimeFrameGPU->getPrimaryVerticesNum(),
+                                         mTimeFrameGPU->getDeviceArrayClusters(),
+                                         mTimeFrameGPU->getClusterSizes(),
+                                         mTimeFrameGPU->getDeviceROframeClusters(),
+                                         mTimeFrameGPU->getDeviceArrayUsedClusters(),
+                                         mTimeFrameGPU->getDeviceArrayClustersIndexTables(),
+                                         mTimeFrameGPU->getDeviceArrayTracklets(),
+                                         mTimeFrameGPU->getDeviceTracklet(),
+                                         mTimeFrameGPU->getNTracklets(),
+                                         mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
+                                         mTimeFrameGPU->getDeviceTrackletsLUTs(),
+                                         iteration,
+                                         mTrkParams[iteration].NSigmaCut,
+                                         mTimeFrameGPU->getPhiCuts(),
+                                         mTrkParams[iteration].PVres,
+                                         mTimeFrameGPU->getMinRs(),
+                                         mTimeFrameGPU->getMaxRs(),
+                                         mTimeFrameGPU->getPositionResolutions(),
+                                         mTrkParams[iteration].LayerRadii,
+                                         mTimeFrameGPU->getMSangles(),
+                                         conf.nBlocks,
+                                         conf.nThreads);
 }
 
 template <int nLayers>
 void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
 {
-  mTimeFrameGPU->loadTrackletsDevice();
-  mTimeFrameGPU->loadTrackletsLUTDevice();
   mTimeFrameGPU->createCellsLUTDevice();
   auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
 
-  // #pragma omp parallel for num_threads(nLayers)
   for (int iLayer = 0; iLayer < mTrkParams[iteration].CellsPerRoad(); ++iLayer) {
-    if (mTimeFrameGPU->getTracklets()[iLayer + 1].empty() ||
-        mTimeFrameGPU->getTracklets()[iLayer].empty()) {
+    if (!mTimeFrameGPU->getNTracklets()[iLayer + 1] || !mTimeFrameGPU->getNTracklets()[iLayer]) {
       continue;
     }
-
-    const int currentLayerTrackletsNum{static_cast<int>(mTimeFrameGPU->getTracklets()[iLayer].size())};
+    const int currentLayerTrackletsNum{static_cast<int>(mTimeFrameGPU->getNTracklets()[iLayer])};
     countCellsHandler(mTimeFrameGPU->getDeviceArrayClusters(),
                       mTimeFrameGPU->getDeviceArrayUnsortedClusters(),
                       mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
                       mTimeFrameGPU->getDeviceArrayTracklets(),
                       mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
-                      mTimeFrameGPU->getTracklets()[iLayer].size(),
+                      mTimeFrameGPU->getNTracklets()[iLayer],
                       iLayer,
                       nullptr,
                       mTimeFrameGPU->getDeviceArrayCellsLUT(),
@@ -354,7 +194,7 @@ void TrackerTraitsGPU<nLayers>::computeCellsHybrid(const int iteration)
                         mTimeFrameGPU->getDeviceArrayTrackingFrameInfo(),
                         mTimeFrameGPU->getDeviceArrayTracklets(),
                         mTimeFrameGPU->getDeviceArrayTrackletsLUT(),
-                        mTimeFrameGPU->getTracklets()[iLayer].size(),
+                        mTimeFrameGPU->getNTracklets()[iLayer],
                         iLayer,
                         mTimeFrameGPU->getDeviceCells()[iLayer],
                         mTimeFrameGPU->getDeviceArrayCellsLUT(),
@@ -378,7 +218,7 @@ void TrackerTraitsGPU<nLayers>::findCellsNeighboursHybrid(const int iteration)
   auto& conf = o2::its::ITSGpuTrackingParamConfig::Instance();
   std::vector<std::vector<std::pair<int, int>>> cellsNeighboursLayer(mTrkParams[iteration].CellsPerRoad() - 1);
   for (int iLayer{0}; iLayer < mTrkParams[iteration].CellsPerRoad() - 1; ++iLayer) {
-    const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCellsDevice()[iLayer + 1])};
+    const int nextLayerCellsNum{static_cast<int>(mTimeFrameGPU->getNCells()[iLayer + 1])};
     mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].clear();
     mTimeFrameGPU->getCellsNeighboursLUT()[iLayer].resize(nextLayerCellsNum, 0);
 
@@ -441,7 +281,7 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
       std::vector<int> lastCellId, updatedCellId;
       std::vector<CellSeed> lastCellSeed, updatedCellSeed;
 
-      processNeighbours(startLayer, startLevel, mTimeFrame->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId);
+      processNeighbours(startLayer, startLevel, mTimeFrameGPU->getCells()[startLayer], lastCellId, updatedCellSeed, updatedCellId);
 
       int level = startLevel;
       for (int iLayer{startLayer - 1}; iLayer > 0 && level > 2; --iLayer) {
@@ -495,8 +335,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
         if (track.getClusterIndex(iLayer) == UnusedIndex) {
           continue;
         }
-        nShared += int(mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer)));
-        isFirstShared |= !iLayer && mTimeFrame->isClusterUsed(iLayer, track.getClusterIndex(iLayer));
+        nShared += int(mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer)));
+        isFirstShared |= !iLayer && mTimeFrameGPU->isClusterUsed(iLayer, track.getClusterIndex(iLayer));
       }
 
       if (nShared > mTrkParams[0].ClusterSharing) {
@@ -508,8 +348,8 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
         if (track.getClusterIndex(iLayer) == UnusedIndex) {
           continue;
         }
-        mTimeFrame->markUsedCluster(iLayer, track.getClusterIndex(iLayer));
-        int currentROF = mTimeFrame->getClusterROF(iLayer, track.getClusterIndex(iLayer));
+        mTimeFrameGPU->markUsedCluster(iLayer, track.getClusterIndex(iLayer));
+        int currentROF = mTimeFrameGPU->getClusterROF(iLayer, track.getClusterIndex(iLayer));
         for (int iR{0}; iR < 3; ++iR) {
           if (rofs[iR] == INT_MAX) {
             rofs[iR] = currentROF;
@@ -525,9 +365,10 @@ void TrackerTraitsGPU<nLayers>::findRoads(const int iteration)
       if (rofs[1] != INT_MAX) {
         track.setNextROFbit();
       }
-      mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
+      mTimeFrameGPU->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
     }
   }
+  mTimeFrameGPU->loadUsedClustersDevice();
   if (iteration == mTrkParams.size() - 1) {
     mTimeFrameGPU->unregisterHostMemory(0);
   }
diff --git a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
index 73dcf3bcb4894..229827611c077 100644
--- a/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
+++ b/Detectors/ITSMFT/ITS/tracking/GPU/cuda/TrackingKernels.cu
@@ -32,6 +32,7 @@
 #include "ITStracking/IndexTableUtils.h"
 #include "ITStracking/MathUtils.h"
 #include "DataFormatsITS/TrackITS.h"
+#include "ReconstructionDataFormats/Vertex.h"
 
 #include "ITStrackingGPU/TrackerTraitsGPU.h"
 #include "ITStrackingGPU/TrackingKernels.h"
@@ -70,12 +71,39 @@ inline void gpuAssert(cudaError_t code, const char* file, int line, bool abort =
 }
 
 namespace o2::its
-
 {
 using namespace constants::its2;
+using Vertex = o2::dataformats::Vertex<o2::dataformats::TimeStamp<int>>;
+
+GPUd() float Sq(float v)
+{
+  return v * v;
+}
 
 namespace gpu
 {
+
+GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
+                              const o2::its::IndexTableUtils& utils,
+                              const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
+{
+  const float zRangeMin = o2::gpu::CAMath::Min(z1, z2) - maxdeltaz;
+  const float phiRangeMin = (maxdeltaphi > constants::math::Pi) ? 0.f : currentCluster.phi - maxdeltaphi;
+  const float zRangeMax = o2::gpu::CAMath::Max(z1, z2) + maxdeltaz;
+  const float phiRangeMax = (maxdeltaphi > constants::math::Pi) ? constants::math::TwoPi : currentCluster.phi + maxdeltaphi;
+
+  if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] ||
+      zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) {
+
+    return getEmptyBinsRect();
+  }
+
+  return int4{o2::gpu::CAMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)),
+              utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)),
+              o2::gpu::CAMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)),
+              utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))};
+}
+
 GPUd() bool fitTrack(TrackITSExt& track,
                      int start,
                      int end,
@@ -127,7 +155,7 @@ GPUd() bool fitTrack(TrackITSExt& track,
     }
     nCl++;
   }
-  return o2::gpu::GPUCommonMath::Abs(track.getQ2Pt()) < maxQoverPt && track.getChi2() < chi2ndfcut * (nCl * 2 - 5);
+  return o2::gpu::CAMath::Abs(track.getQ2Pt()) < maxQoverPt && track.getChi2() < chi2ndfcut * (nCl * 2 - 5);
 }
 
 GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1,
@@ -146,7 +174,7 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1,
   const float y3 = tf3.positionTrackingFrame[0];
   const float z3 = tf3.positionTrackingFrame[1];
 
-  const bool zeroField{o2::gpu::GPUCommonMath::Abs(bz) < o2::constants::math::Almost0};
+  const bool zeroField{o2::gpu::CAMath::Abs(bz) < o2::constants::math::Almost0};
   const float tgp = zeroField ? o2::gpu::CAMath::ATan2(y3 - y1, x3 - x1) : 1.f;
   const float crv = zeroField ? 1.f : math_utils::computeCurvature(x3, y3, x2, y2, x1, y1);
   const float snp = zeroField ? tgp / o2::gpu::CAMath::Sqrt(1.f + tgp * tgp) : crv * (x3 - math_utils::computeCurvatureCentreX(x3, y3, x2, y2, x1, y1));
@@ -164,6 +192,17 @@ GPUd() o2::track::TrackParCov buildTrackSeed(const Cluster& cluster1,
                              0.f, 0.f, 0.f, 0.f, sg2q2pt});
 }
 
+// auto sort_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); };
+// auto equal_tracklets = [] GPUhdni()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; };
+
+struct sort_tracklets {
+  GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex < b.firstClusterIndex || (a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex < b.secondClusterIndex); }
+};
+
+struct equal_tracklets {
+  GPUhd() bool operator()(const Tracklet& a, const Tracklet& b) { return a.firstClusterIndex == b.firstClusterIndex && a.secondClusterIndex == b.secondClusterIndex; }
+};
+
 template <typename T1, typename T2>
 struct pair_to_first : public thrust::unary_function<gpuPair<T1, T2>, T1> {
   GPUhd() int operator()(const gpuPair<T1, T2>& a) const
@@ -196,6 +235,33 @@ struct is_valid_pair {
   }
 };
 
+GPUd() gpuSpan<const Vertex> getPrimaryVertices(const int rof,
+                                                const int* roframesPV,
+                                                const int nROF,
+                                                const uint8_t* mask,
+                                                const Vertex* vertices)
+{
+  const int start_pv_id = roframesPV[rof];
+  const int stop_rof = rof >= nROF - 1 ? nROF : rof + 1;
+  size_t delta = mask[rof] ? roframesPV[stop_rof] - start_pv_id : 0; // return empty span if ROF is excluded
+  return gpuSpan<const Vertex>(&vertices[start_pv_id], delta);
+};
+
+GPUd() gpuSpan<const Cluster> getClustersOnLayer(const int rof,
+                                                 const int totROFs,
+                                                 const int layer,
+                                                 const int** roframesClus,
+                                                 const Cluster** clusters)
+{
+  if (rof < 0 || rof >= totROFs) {
+    return gpuSpan<const Cluster>();
+  }
+  const int start_clus_id{roframesClus[layer][rof]};
+  const int stop_rof = rof >= totROFs - 1 ? totROFs : rof + 1;
+  const unsigned int delta = roframesClus[layer][stop_rof] - start_clus_id;
+  return gpuSpan<const Cluster>(&(clusters[layer][start_clus_id]), delta);
+}
+
 template <int nLayers>
 GPUg() void fitTrackSeedsKernel(
   CellSeed* trackSeeds,
@@ -314,8 +380,8 @@ GPUg() void computeLayerCellsKernel(
   const Cluster** sortedClusters,
   const Cluster** unsortedClusters,
   const TrackingFrameInfo** tfInfo,
-  const Tracklet** tracklets,
-  const int** trackletsLUT,
+  Tracklet** tracklets,
+  int** trackletsLUT,
   const int nTrackletsCurrent,
   const int layer,
   CellSeed* cells,
@@ -331,8 +397,8 @@ GPUg() void computeLayerCellsKernel(
   for (int iCurrentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; iCurrentTrackletIndex < nTrackletsCurrent; iCurrentTrackletIndex += blockDim.x * gridDim.x) {
     const Tracklet& currentTracklet = tracklets[layer][iCurrentTrackletIndex];
     const int nextLayerClusterIndex{currentTracklet.secondClusterIndex};
-    const int nextLayerFirstTrackletIndex{trackletsLUT[layer][nextLayerClusterIndex]};
-    const int nextLayerLastTrackletIndex{trackletsLUT[layer][nextLayerClusterIndex + 1]};
+    const int nextLayerFirstTrackletIndex{trackletsLUT[layer + 1][nextLayerClusterIndex]};
+    const int nextLayerLastTrackletIndex{trackletsLUT[layer + 1][nextLayerClusterIndex + 1]};
     if (nextLayerFirstTrackletIndex == nextLayerLastTrackletIndex) {
       continue;
     }
@@ -342,7 +408,7 @@ GPUg() void computeLayerCellsKernel(
         break;
       }
       const Tracklet& nextTracklet = tracklets[layer + 1][iNextTrackletIndex];
-      const float deltaTanLambda{o2::gpu::GPUCommonMath::Abs(currentTracklet.tanLambda - nextTracklet.tanLambda)};
+      const float deltaTanLambda{o2::gpu::CAMath::Abs(currentTracklet.tanLambda - nextTracklet.tanLambda)};
 
       if (deltaTanLambda / cellDeltaTanLambdaSigma < nSigmaCut) {
         const int clusId[3]{
@@ -394,35 +460,124 @@ GPUg() void computeLayerCellsKernel(
   }
 }
 
-/////////////////////////////////////////
-// Debug Kernels
-/////////////////////////////////////////
-GPUd() const int4 getBinsRect(const Cluster& currentCluster, const int layerIndex,
-                              const o2::its::IndexTableUtils& utils,
-                              const float z1, const float z2, float maxdeltaz, float maxdeltaphi)
+template <bool initRun = true, int nLayers = 7>
+GPUg() void computeLayerTrackletsMultiROFKernel(
+  const IndexTableUtils* utils,
+  const uint8_t* multMask,
+  const int layerIndex,
+  const int startROF,
+  const int endROF,
+  const int totalROFs,
+  const int deltaROF,
+  const Vertex* vertices,
+  const int* rofPV,
+  const int nVertices,
+  const int vertexId,
+  const Cluster** clusters,           // Input data rof0
+  const int** ROFClusters,            // Number of clusters on layers per ROF
+  const unsigned char** usedClusters, // Used clusters
+  const int** indexTables,            // Input data rof0-delta <rof0< rof0+delta (up to 3 rofs)
+  Tracklet** tracklets,               // Output data
+  int** trackletsLUT,
+  const int iteration,
+  const float NSigmaCut,
+  const float phiCut,
+  const float resolutionPV,
+  const float minR,
+  const float maxR,
+  const float positionResolution,
+  const float meanDeltaR = -42.f,
+  const float MSAngle = -42.f)
 {
-  const float zRangeMin = o2::gpu::GPUCommonMath::Min(z1, z2) - maxdeltaz;
-  const float phiRangeMin = currentCluster.phi - maxdeltaphi;
-  const float zRangeMax = o2::gpu::GPUCommonMath::Max(z1, z2) + maxdeltaz;
-  const float phiRangeMax = currentCluster.phi + maxdeltaphi;
+  const int phiBins{utils->getNphiBins()};
+  const int zBins{utils->getNzBins()};
+  for (unsigned int iROF{blockIdx.x}; iROF < endROF - startROF; iROF += gridDim.x) {
+    const short rof0 = iROF + startROF;
+    auto primaryVertices = getPrimaryVertices(rof0, rofPV, totalROFs, multMask, vertices);
+    const auto startVtx{vertexId >= 0 ? vertexId : 0};
+    const auto endVtx{vertexId >= 0 ? o2::gpu::CAMath::Min(vertexId + 1, static_cast<int>(primaryVertices.size())) : static_cast<int>(primaryVertices.size())};
+    const short minROF = o2::gpu::CAMath::Max(startROF, static_cast<int>(rof0 - deltaROF));
+    const short maxROF = o2::gpu::CAMath::Min(endROF - 1, static_cast<int>(rof0 + deltaROF));
+    auto clustersCurrentLayer = getClustersOnLayer(rof0, totalROFs, layerIndex, ROFClusters, clusters);
+    if (clustersCurrentLayer.empty()) {
+      continue;
+    }
 
-  if (zRangeMax < -LayersZCoordinate()[layerIndex + 1] ||
-      zRangeMin > LayersZCoordinate()[layerIndex + 1] || zRangeMin > zRangeMax) {
+    for (int currentClusterIndex = threadIdx.x; currentClusterIndex < clustersCurrentLayer.size(); currentClusterIndex += blockDim.x) {
+      unsigned int storedTracklets{0};
+      auto currentCluster{clustersCurrentLayer[currentClusterIndex]};
+      const int currentSortedIndex{ROFClusters[layerIndex][rof0] + currentClusterIndex};
+      if (usedClusters[layerIndex][currentCluster.clusterId]) {
+        continue;
+      }
 
-    return getEmptyBinsRect();
-  }
+      const float inverseR0{1.f / currentCluster.radius};
+      for (int iV{startVtx}; iV < endVtx; ++iV) {
+        auto& primaryVertex{primaryVertices[iV]};
+        if (primaryVertex.isFlagSet(2) && iteration != 3) {
+          continue;
+        }
+        const float resolution = o2::gpu::CAMath::Sqrt(Sq(resolutionPV) / primaryVertex.getNContributors() + Sq(positionResolution));
+        const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0};
+        const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate};
+        const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate};
+        const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution
+        const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * MSAngle))};
+        const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * NSigmaCut, phiCut)};
+        if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) {
+          continue;
+        }
+        int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1};
 
-  return int4{o2::gpu::GPUCommonMath::Max(0, utils.getZBinIndex(layerIndex + 1, zRangeMin)),
-              utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMin)),
-              o2::gpu::GPUCommonMath::Min(ZBins - 1, utils.getZBinIndex(layerIndex + 1, zRangeMax)),
-              utils.getPhiBinIndex(math_utils::getNormalizedPhi(phiRangeMax))};
-}
+        if (phiBinsNum < 0) {
+          phiBinsNum += phiBins;
+        }
 
-GPUhd() float Sq(float q)
-{
-  return q * q;
+        const int tableSize{phiBins * zBins + 1};
+        for (short rof1{minROF}; rof1 <= maxROF; ++rof1) {
+          auto clustersNextLayer = getClustersOnLayer(rof1, totalROFs, layerIndex + 1, ROFClusters, clusters);
+          if (clustersNextLayer.empty()) {
+            continue;
+          }
+          for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) {
+            int iPhiBin = (selectedBinsRect.y + iPhiCount) % phiBins;
+            const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)};
+            const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1};
+            const int firstRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + firstBinIndex];
+            const int maxRowClusterIndex = indexTables[layerIndex + 1][(rof1 - startROF) * tableSize + maxBinIndex];
+            for (int nextClusterIndex{firstRowClusterIndex}; nextClusterIndex < maxRowClusterIndex; ++nextClusterIndex) {
+              if (nextClusterIndex >= clustersNextLayer.size()) {
+                break;
+              }
+              const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]};
+              if (usedClusters[layerIndex + 1][nextCluster.clusterId]) {
+                continue;
+              }
+              const float deltaPhi{o2::gpu::CAMath::Abs(currentCluster.phi - nextCluster.phi)};
+              const float deltaZ{o2::gpu::CAMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)};
+              const int nextSortedIndex{ROFClusters[layerIndex + 1][rof1] + nextClusterIndex};
+              if (deltaZ / sigmaZ < NSigmaCut && (deltaPhi < phiCut || o2::gpu::CAMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut)) {
+                if constexpr (initRun) {
+                  trackletsLUT[layerIndex][currentSortedIndex]++; // we need l0 as well for usual exclusive sums.
+                } else {
+                  const float phi{o2::gpu::CAMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)};
+                  const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)};
+                  new (tracklets[layerIndex] + trackletsLUT[layerIndex][currentSortedIndex] + storedTracklets) Tracklet{currentSortedIndex, nextSortedIndex, tanL, phi, rof0, rof1};
+                }
+                ++storedTracklets;
+              }
+            }
+          }
+        }
+      }
+    }
+  }
 }
 
+/////////////////////////////////////////
+// Debug Kernels
+/////////////////////////////////////////
+
 template <typename T>
 GPUd() void pPointer(T* ptr)
 {
@@ -437,7 +592,6 @@ GPUg() void printPointersKernel(std::tuple<Args...> args)
   std::apply(print_all, args);
 }
 
-// Functors to sort tracklets
 template <typename T>
 struct trackletSortEmptyFunctor : public thrust::binary_function<T, T, bool> {
   GPUhd() bool operator()(const T& lhs, const T& rhs) const
@@ -454,7 +608,6 @@ struct trackletSortIndexFunctor : public thrust::binary_function<T, T, bool> {
   }
 };
 
-// Print layer buffer
 GPUg() void printBufferLayerOnThread(const int layer, const int* v, unsigned int size, const int len = 150, const unsigned int tId = 0)
 {
   if (blockIdx.x * blockDim.x + threadIdx.x == tId) {
@@ -494,52 +647,12 @@ GPUg() void printBufferPointersLayerOnThread(const int layer, void** v, unsigned
   }
 }
 
-// Dump vertices
 GPUg() void printVertices(const Vertex* v, unsigned int size, const unsigned int tId = 0)
 {
   if (blockIdx.x * blockDim.x + threadIdx.x == tId) {
-    printf("vertices: ");
+    printf("vertices: \n");
     for (int i{0}; i < size; ++i) {
-      printf("x=%f y=%f z=%f\n", v[i].getX(), v[i].getY(), v[i].getZ());
-    }
-  }
-}
-
-// Dump tracklets
-GPUg() void printTracklets(const Tracklet* t,
-                           const int offset,
-                           const int startRof,
-                           const int nrof,
-                           const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF
-                           const int* roFrameClustersNextLayer,    // Number of clusters on layer 1 per ROF
-                           const int maxClustersPerRof = 5e2,
-                           const int maxTrackletsPerCluster = 50,
-                           const unsigned int tId = 0)
-{
-  if (threadIdx.x == tId) {
-    auto offsetCurrent{roFrameClustersCurrentLayer[offset]};
-    auto offsetNext{roFrameClustersNextLayer[offset]};
-    auto offsetChunk{(startRof - offset) * maxClustersPerRof * maxTrackletsPerCluster};
-    for (int i{offsetChunk}; i < offsetChunk + nrof * maxClustersPerRof * maxTrackletsPerCluster; ++i) {
-      if (t[i].firstClusterIndex != -1) {
-        t[i].dump(offsetCurrent, offsetNext);
-      }
-    }
-  }
-}
-
-GPUg() void printTrackletsNotStrided(const Tracklet* t,
-                                     const int offset,
-                                     const int* roFrameClustersCurrentLayer, // Number of clusters on layer 0 per ROF
-                                     const int* roFrameClustersNextLayer,    // Number of clusters on layer 1 per ROF
-                                     const int ntracklets,
-                                     const unsigned int tId = 0)
-{
-  if (threadIdx.x == tId) {
-    auto offsetCurrent{roFrameClustersCurrentLayer[offset]};
-    auto offsetNext{roFrameClustersNextLayer[offset]};
-    for (int i{0}; i < ntracklets; ++i) {
-      t[i].dump(offsetCurrent, offsetNext);
+      printf("\tx=%f y=%f z=%f\n", v[i].getX(), v[i].getY(), v[i].getZ());
     }
   }
 }
@@ -556,102 +669,25 @@ GPUg() void printNeighbours(const gpuPair<int, int>* neighbours,
   }
 }
 
-// Compute the tracklets for a given layer
-template <int nLayers = 7>
-GPUg() void computeLayerTrackletsKernelSingleRof(
-  const short rof0,
-  const short maxRofs,
-  const int layerIndex,
-  const Cluster* clustersCurrentLayer,        // input data rof0
-  const Cluster* clustersNextLayer,           // input data rof0-delta <rof0< rof0+delta (up to 3 rofs)
-  const int* indexTable,                      // input data rof0-delta <rof0< rof0+delta (up to 3 rofs)
-  const int* roFrameClusters,                 // input data O(1)
-  const int* roFrameClustersNext,             // input data O(1)
-  const unsigned char* usedClustersLayer,     // input data rof0
-  const unsigned char* usedClustersNextLayer, // input data rof1
-  const Vertex* vertices,                     // input data
-  int* trackletsLookUpTable,                  // output data
-  Tracklet* tracklets,                        // output data
-  const int nVertices,
-  const int currentLayerClustersSize,
-  const float phiCut,
-  const float minR,
-  const float maxR,
-  const float meanDeltaR,
-  const float positionResolution,
-  const float mSAngle,
-  const StaticTrackingParameters<nLayers>* trkPars,
-  const IndexTableUtils* utils,
-  const unsigned int maxTrackletsPerCluster = 50)
+GPUg() void printTrackletsLUTPerROF(const int layerId,
+                                    const int** ROFClusters,
+                                    int** luts,
+                                    const int tId = 0)
 {
-  for (int currentClusterIndex = blockIdx.x * blockDim.x + threadIdx.x; currentClusterIndex < currentLayerClustersSize; currentClusterIndex += blockDim.x * gridDim.x) {
-    unsigned int storedTracklets{0};
-    const Cluster& currentCluster{clustersCurrentLayer[currentClusterIndex]};
-    const int currentSortedIndex{roFrameClusters[rof0] + currentClusterIndex};
-    if (usedClustersLayer[currentSortedIndex]) {
-      continue;
-    }
-    short minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0;
-    short maxRof = (rof0 == static_cast<short>(maxRofs - trkPars->DeltaROF)) ? rof0 : rof0 + trkPars->DeltaROF;
-    const float inverseR0{1.f / currentCluster.radius};
-    for (int iPrimaryVertex{0}; iPrimaryVertex < nVertices; iPrimaryVertex++) {
-      const auto& primaryVertex{vertices[iPrimaryVertex]};
-      if (primaryVertex.getX() == 0.f && primaryVertex.getY() == 0.f && primaryVertex.getZ() == 0.f) {
-        continue;
-      }
-      const float resolution{o2::gpu::GPUCommonMath::Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))};
-      const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0};
-      const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate};
-      const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate};
-      const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution
-      const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))};
-
-      const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)};
-      if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) {
+  if (blockIdx.x * blockDim.x + threadIdx.x == tId) {
+    for (auto rofId{0}; rofId < 2304; ++rofId) {
+      int nClus = ROFClusters[layerId][rofId + 1] - ROFClusters[layerId][rofId];
+      if (!nClus) {
         continue;
       }
-      int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1};
-      if (phiBinsNum < 0) {
-        phiBinsNum += trkPars->PhiBins;
-      }
-      constexpr int tableSize{256 * 128 + 1}; // hardcoded for the time being
+      printf("rof: %d (%d) ==> ", rofId, nClus);
 
-      for (short rof1{minRof}; rof1 <= maxRof; ++rof1) {
-        if (!(roFrameClustersNext[rof1 + 1] - roFrameClustersNext[rof1])) { // number of clusters on next layer > 0
-          continue;
-        }
-        for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) {
-          int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins;
-          const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)};
-          const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1};
-          const int firstRowClusterIndex = indexTable[rof1 * tableSize + firstBinIndex];
-          const int maxRowClusterIndex = indexTable[rof1 * tableSize + maxBinIndex];
-          for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) {
-            if (iNextCluster >= (roFrameClustersNext[rof1 + 1] - roFrameClustersNext[rof1])) {
-              break;
-            }
-            const Cluster& nextCluster{getPtrFromRuler<Cluster>(rof1, clustersNextLayer, roFrameClustersNext)[iNextCluster]};
-            if (usedClustersNextLayer[nextCluster.clusterId]) {
-              continue;
-            }
-            const float deltaPhi{o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi)};
-            const float deltaZ{o2::gpu::GPUCommonMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)};
-
-            if (deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::GPUCommonMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut)) {
-              trackletsLookUpTable[currentSortedIndex]++; // Race-condition safe
-              const float phi{o2::gpu::GPUCommonMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)};
-              const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)};
-              const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster};
-              new (tracklets + stride + storedTracklets) Tracklet{currentSortedIndex, roFrameClustersNext[rof1] + iNextCluster, tanL, phi, rof0, rof1};
-              ++storedTracklets;
-            }
-          }
-        }
+      for (int iC{0}; iC < nClus; ++iC) {
+        int nT = luts[layerId][ROFClusters[layerId][rofId] + iC];
+        printf("%d\t", nT);
       }
+      printf("\n");
     }
-    // if (storedTracklets > maxTrackletsPerCluster) {
-    //   printf("its-gpu-tracklet finder: found more tracklets per clusters (%d) than maximum set (%d), check the configuration!\n", maxTrackletsPerCluster, storedTracklets);
-    // }
   }
 }
 
@@ -661,124 +697,7 @@ GPUg() void compileTrackletsLookupTableKernel(const Tracklet* tracklets,
                                               const int nTracklets)
 {
   for (int currentTrackletIndex = blockIdx.x * blockDim.x + threadIdx.x; currentTrackletIndex < nTracklets; currentTrackletIndex += blockDim.x * gridDim.x) {
-    auto& tracklet{tracklets[currentTrackletIndex]};
-    if (tracklet.firstClusterIndex >= 0) {
-      atomicAdd(trackletsLookUpTable + tracklet.firstClusterIndex, 1);
-    }
-  }
-}
-
-template <int nLayers = 7>
-GPUg() void computeLayerTrackletsKernelMultipleRof(
-  const int layerIndex,
-  const int iteration,
-  const unsigned int startRofId,
-  const unsigned int rofSize,
-  const int maxRofs,
-  const Cluster* clustersCurrentLayer,        // input data rof0
-  const Cluster* clustersNextLayer,           // input data rof0-delta <rof0< rof0+delta (up to 3 rofs)
-  const int* roFrameClustersCurrentLayer,     // Number of clusters on layer 0 per ROF
-  const int* roFrameClustersNextLayer,        // Number of clusters on layer 1 per ROF
-  const int* indexTablesNext,                 // input data rof0-delta <rof0< rof0+delta (up to 3 rofs)
-  const unsigned char* usedClustersLayer,     // input data rof0
-  const unsigned char* usedClustersNextLayer, // input data rof1
-  Tracklet* tracklets,                        // output data
-  const Vertex* vertices,
-  const int* nVertices,
-  const float phiCut,
-  const float minR,
-  const float maxR,
-  const float meanDeltaR,
-  const float positionResolution,
-  const float mSAngle,
-  const StaticTrackingParameters<nLayers>* trkPars,
-  const IndexTableUtils* utils,
-  const unsigned int maxClustersPerRof = 5e2,
-  const unsigned int maxTrackletsPerCluster = 50)
-{
-  const int phiBins{utils->getNphiBins()};
-  const int zBins{utils->getNzBins()};
-  for (unsigned int iRof{blockIdx.x}; iRof < rofSize; iRof += gridDim.x) {
-    auto rof0 = iRof + startRofId;
-    auto nClustersCurrentLayerRof = o2::gpu::GPUCommonMath::Min(roFrameClustersCurrentLayer[rof0 + 1] - roFrameClustersCurrentLayer[rof0], (int)maxClustersPerRof);
-    // if (nClustersCurrentLayerRof > maxClustersPerRof) {
-    //   printf("its-gpu-tracklet finder: on layer %d found more clusters per ROF (%d) than maximum set (%d), check the configuration!\n", layerIndex, nClustersCurrentLayerRof, maxClustersPerRof);
-    // }
-    auto* clustersCurrentLayerRof = clustersCurrentLayer + (roFrameClustersCurrentLayer[rof0] - roFrameClustersCurrentLayer[startRofId]);
-    auto nVerticesRof0 = nVertices[rof0 + 1] - nVertices[rof0];
-    auto trackletsRof0 = tracklets + maxTrackletsPerCluster * maxClustersPerRof * iRof;
-    for (int currentClusterIndex = threadIdx.x; currentClusterIndex < nClustersCurrentLayerRof; currentClusterIndex += blockDim.x) {
-      unsigned int storedTracklets{0};
-      const Cluster& currentCluster{clustersCurrentLayerRof[currentClusterIndex]};
-      const int currentSortedIndex{roFrameClustersCurrentLayer[rof0] + currentClusterIndex};
-      const int currentSortedIndexChunk{currentSortedIndex - roFrameClustersCurrentLayer[startRofId]};
-      if (usedClustersLayer[currentSortedIndex]) {
-        continue;
-      }
-
-      int minRof = (rof0 >= trkPars->DeltaROF) ? rof0 - trkPars->DeltaROF : 0;
-      int maxRof = (rof0 == maxRofs - trkPars->DeltaROF) ? rof0 : rof0 + trkPars->DeltaROF; // works with delta = {0, 1}
-      const float inverseR0{1.f / currentCluster.radius};
-
-      for (int iPrimaryVertex{0}; iPrimaryVertex < nVerticesRof0; iPrimaryVertex++) {
-        const auto& primaryVertex{vertices[nVertices[rof0] + iPrimaryVertex]};
-        const float resolution{o2::gpu::GPUCommonMath::Sqrt(Sq(trkPars->PVres) / primaryVertex.getNContributors() + Sq(positionResolution))};
-        const float tanLambda{(currentCluster.zCoordinate - primaryVertex.getZ()) * inverseR0};
-        const float zAtRmin{tanLambda * (minR - currentCluster.radius) + currentCluster.zCoordinate};
-        const float zAtRmax{tanLambda * (maxR - currentCluster.radius) + currentCluster.zCoordinate};
-        const float sqInverseDeltaZ0{1.f / (Sq(currentCluster.zCoordinate - primaryVertex.getZ()) + 2.e-8f)}; /// protecting from overflows adding the detector resolution
-        const float sigmaZ{o2::gpu::CAMath::Sqrt(Sq(resolution) * Sq(tanLambda) * ((Sq(inverseR0) + sqInverseDeltaZ0) * Sq(meanDeltaR) + 1.f) + Sq(meanDeltaR * mSAngle))};
-
-        const int4 selectedBinsRect{getBinsRect(currentCluster, layerIndex, *utils, zAtRmin, zAtRmax, sigmaZ * trkPars->NSigmaCut, phiCut)};
-
-        if (selectedBinsRect.x == 0 && selectedBinsRect.y == 0 && selectedBinsRect.z == 0 && selectedBinsRect.w == 0) {
-          continue;
-        }
-        int phiBinsNum{selectedBinsRect.w - selectedBinsRect.y + 1};
-        if (phiBinsNum < 0) {
-          phiBinsNum += trkPars->PhiBins;
-        }
-        const int tableSize{phiBins * zBins + 1};
-        for (int rof1{minRof}; rof1 <= maxRof; ++rof1) {
-          auto nClustersNext{roFrameClustersNextLayer[rof1 + 1] - roFrameClustersNextLayer[rof1]};
-          if (!nClustersNext) { // number of clusters on next layer > 0
-            continue;
-          }
-          for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) {
-            int iPhiBin = (selectedBinsRect.y + iPhiCount) % trkPars->PhiBins;
-            const int firstBinIndex{utils->getBinIndex(selectedBinsRect.x, iPhiBin)};
-            const int maxBinIndex{firstBinIndex + selectedBinsRect.z - selectedBinsRect.x + 1};
-            const int firstRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + firstBinIndex];
-            const int maxRowClusterIndex = indexTablesNext[(rof1 - startRofId) * tableSize + maxBinIndex];
-            for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) {
-              if (iNextCluster >= nClustersNext) {
-                break;
-              }
-              auto nextClusterIndex{roFrameClustersNextLayer[rof1] - roFrameClustersNextLayer[startRofId] + iNextCluster};
-              const Cluster& nextCluster{clustersNextLayer[nextClusterIndex]};
-              if (usedClustersNextLayer[nextCluster.clusterId]) {
-                continue;
-              }
-              const float deltaPhi{o2::gpu::GPUCommonMath::Abs(currentCluster.phi - nextCluster.phi)};
-              const float deltaZ{o2::gpu::GPUCommonMath::Abs(tanLambda * (nextCluster.radius - currentCluster.radius) + currentCluster.zCoordinate - nextCluster.zCoordinate)};
-
-              if ((deltaZ / sigmaZ < trkPars->NSigmaCut && (deltaPhi < phiCut || o2::gpu::GPUCommonMath::Abs(deltaPhi - constants::math::TwoPi) < phiCut))) {
-                const float phi{o2::gpu::GPUCommonMath::ATan2(currentCluster.yCoordinate - nextCluster.yCoordinate, currentCluster.xCoordinate - nextCluster.xCoordinate)};
-                const float tanL{(currentCluster.zCoordinate - nextCluster.zCoordinate) / (currentCluster.radius - nextCluster.radius)};
-                const unsigned int stride{currentClusterIndex * maxTrackletsPerCluster};
-                if (storedTracklets < maxTrackletsPerCluster) {
-                  new (trackletsRof0 + stride + storedTracklets) Tracklet{currentSortedIndexChunk, nextClusterIndex, tanL, phi, static_cast<short>(rof0), static_cast<short>(rof1)};
-                }
-                // else {
-                // printf("its-gpu-tracklet-finder: on rof %d layer: %d: found more tracklets (%d) than maximum allowed per cluster. This is lossy!\n", rof0, layerIndex, storedTracklets);
-                // }
-                ++storedTracklets;
-              }
-            }
-          }
-        }
-      }
-    }
+    atomicAdd(&trackletsLookUpTable[tracklets[currentTrackletIndex].firstClusterIndex], 1);
   }
 }
 
@@ -803,12 +722,176 @@ GPUg() void removeDuplicateTrackletsEntriesLUTKernel(
 
 } // namespace gpu
 
+template <int nLayers>
+void countTrackletsInROFsHandler(const IndexTableUtils* utils,
+                                 const uint8_t* multMask,
+                                 const int startROF,
+                                 const int endROF,
+                                 const int maxROF,
+                                 const int deltaROF,
+                                 const int vertexId,
+                                 const Vertex* vertices,
+                                 const int* rofPV,
+                                 const int nVertices,
+                                 const Cluster** clusters,
+                                 std::vector<unsigned int> nClusters,
+                                 const int** ROFClusters,
+                                 const unsigned char** usedClusters,
+                                 const int** clustersIndexTables,
+                                 int** trackletsLUTs,
+                                 gsl::span<int*> trackletsLUTsHost,
+                                 const int iteration,
+                                 const float NSigmaCut,
+                                 std::vector<float>& phiCuts,
+                                 const float resolutionPV,
+                                 std::vector<float>& minRs,
+                                 std::vector<float>& maxRs,
+                                 std::vector<float>& resolutions,
+                                 std::vector<float>& radii,
+                                 std::vector<float>& mulScatAng,
+                                 const int nBlocks,
+                                 const int nThreads)
+{
+  for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) {
+    gpu::computeLayerTrackletsMultiROFKernel<true><<<nBlocks, nThreads>>>(
+      utils,
+      multMask,
+      iLayer,
+      startROF,
+      endROF,
+      maxROF,
+      deltaROF,
+      vertices,
+      rofPV,
+      nVertices,
+      vertexId,
+      clusters,
+      ROFClusters,
+      usedClusters,
+      clustersIndexTables,
+      nullptr,
+      trackletsLUTs,
+      iteration,
+      NSigmaCut,
+      phiCuts[iLayer],
+      resolutionPV,
+      minRs[iLayer + 1],
+      maxRs[iLayer + 1],
+      resolutions[iLayer],
+      radii[iLayer + 1] - radii[iLayer],
+      mulScatAng[iLayer]);
+    void* d_temp_storage = nullptr;
+    size_t temp_storage_bytes = 0;
+    gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
+                                                temp_storage_bytes,        // temp_storage_bytes
+                                                trackletsLUTsHost[iLayer], // d_in
+                                                trackletsLUTsHost[iLayer], // d_out
+                                                nClusters[iLayer] + 1,     // num_items
+                                                0));                       // NOLINT: this is the offset of the sum, not a pointer
+    discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+    gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
+                                                temp_storage_bytes,        // temp_storage_bytes
+                                                trackletsLUTsHost[iLayer], // d_in
+                                                trackletsLUTsHost[iLayer], // d_out
+                                                nClusters[iLayer] + 1,     // num_items
+                                                0));                       // NOLINT: this is the offset of the sum, not a pointer
+    gpuCheckError(cudaFree(d_temp_storage));
+  }
+}
+
+template <int nLayers>
+void computeTrackletsInROFsHandler(const IndexTableUtils* utils,
+                                   const uint8_t* multMask,
+                                   const int startROF,
+                                   const int endROF,
+                                   const int maxROF,
+                                   const int deltaROF,
+                                   const int vertexId,
+                                   const Vertex* vertices,
+                                   const int* rofPV,
+                                   const int nVertices,
+                                   const Cluster** clusters,
+                                   std::vector<unsigned int> nClusters,
+                                   const int** ROFClusters,
+                                   const unsigned char** usedClusters,
+                                   const int** clustersIndexTables,
+                                   Tracklet** tracklets,
+                                   gsl::span<Tracklet*> spanTracklets,
+                                   gsl::span<int> nTracklets,
+                                   int** trackletsLUTs,
+                                   gsl::span<int*> trackletsLUTsHost,
+                                   const int iteration,
+                                   const float NSigmaCut,
+                                   std::vector<float>& phiCuts,
+                                   const float resolutionPV,
+                                   std::vector<float>& minRs,
+                                   std::vector<float>& maxRs,
+                                   std::vector<float>& resolutions,
+                                   std::vector<float>& radii,
+                                   std::vector<float>& mulScatAng,
+                                   const int nBlocks,
+                                   const int nThreads)
+{
+  for (int iLayer = 0; iLayer < nLayers - 1; ++iLayer) {
+    gpu::computeLayerTrackletsMultiROFKernel<false><<<nBlocks, nThreads>>>(utils,
+                                                                           multMask,
+                                                                           iLayer,
+                                                                           startROF,
+                                                                           endROF,
+                                                                           maxROF,
+                                                                           deltaROF,
+                                                                           vertices,
+                                                                           rofPV,
+                                                                           nVertices,
+                                                                           vertexId,
+                                                                           clusters,
+                                                                           ROFClusters,
+                                                                           usedClusters,
+                                                                           clustersIndexTables,
+                                                                           tracklets,
+                                                                           trackletsLUTs,
+                                                                           iteration,
+                                                                           NSigmaCut,
+                                                                           phiCuts[iLayer],
+                                                                           resolutionPV,
+                                                                           minRs[iLayer + 1],
+                                                                           maxRs[iLayer + 1],
+                                                                           resolutions[iLayer],
+                                                                           radii[iLayer + 1] - radii[iLayer],
+                                                                           mulScatAng[iLayer]);
+    thrust::device_ptr<Tracklet> tracklets_ptr(spanTracklets[iLayer]);
+    thrust::sort(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::sort_tracklets());
+    auto unique_end = thrust::unique(thrust::device, tracklets_ptr, tracklets_ptr + nTracklets[iLayer], gpu::equal_tracklets());
+    nTracklets[iLayer] = unique_end - tracklets_ptr;
+    if (iLayer > 0) {
+      gpuCheckError(cudaMemset(trackletsLUTsHost[iLayer], 0, nClusters[iLayer] * sizeof(int)));
+      gpu::compileTrackletsLookupTableKernel<<<nBlocks, nThreads>>>(spanTracklets[iLayer], trackletsLUTsHost[iLayer], nTracklets[iLayer]);
+      void* d_temp_storage = nullptr;
+      size_t temp_storage_bytes = 0;
+      gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
+                                                  temp_storage_bytes,        // temp_storage_bytes
+                                                  trackletsLUTsHost[iLayer], // d_in
+                                                  trackletsLUTsHost[iLayer], // d_out
+                                                  nClusters[iLayer] + 1,     // num_items
+                                                  0));                       // NOLINT: this is the offset of the sum, not a pointer
+      discardResult(cudaMalloc(&d_temp_storage, temp_storage_bytes));
+      gpuCheckError(cub::DeviceScan::ExclusiveSum(d_temp_storage,            // d_temp_storage
+                                                  temp_storage_bytes,        // temp_storage_bytes
+                                                  trackletsLUTsHost[iLayer], // d_in
+                                                  trackletsLUTsHost[iLayer], // d_out
+                                                  nClusters[iLayer] + 1,     // num_items
+                                                  0));                       // NOLINT: this is the offset of the sum, not a pointer
+      gpuCheckError(cudaFree(d_temp_storage));
+    }
+  }
+}
+
 void countCellsHandler(
   const Cluster** sortedClusters,
   const Cluster** unsortedClusters,
   const TrackingFrameInfo** tfInfo,
-  const Tracklet** tracklets,
-  const int** trackletsLUT,
+  Tracklet** tracklets,
+  int** trackletsLUT,
   const int nTracklets,
   const int layer,
   CellSeed* cells,
@@ -850,7 +933,6 @@ void countCellsHandler(
                                               cellsLUTsHost,      // d_out
                                               nTracklets + 1,     // num_items
                                               0));                // NOLINT: this is the offset of the sum, not a pointer
-  // gpu::printBufferLayerOnThread<<<1, 1>>>(layer, cellsLUTsHost, nTracklets + 1);
   gpuCheckError(cudaFree(d_temp_storage));
 }
 
@@ -858,8 +940,8 @@ void computeCellsHandler(
   const Cluster** sortedClusters,
   const Cluster** unsortedClusters,
   const TrackingFrameInfo** tfInfo,
-  const Tracklet** tracklets,
-  const int** trackletsLUT,
+  Tracklet** tracklets,
+  int** trackletsLUT,
   const int nTracklets,
   const int layer,
   CellSeed* cells,
@@ -963,8 +1045,8 @@ void computeCellNeighboursHandler(CellSeed** cellsLayersDevice,
                                   const int nThreads)
 {
 
-  gpu::computeLayerCellNeighboursKernel<false><<<o2::gpu::GPUCommonMath::Min(nBlocks, GPU_BLOCKS),
-                                                 o2::gpu::GPUCommonMath::Min(nThreads, GPU_THREADS)>>>(
+  gpu::computeLayerCellNeighboursKernel<false><<<o2::gpu::CAMath::Min(nBlocks, GPU_BLOCKS),
+                                                 o2::gpu::CAMath::Min(nThreads, GPU_THREADS)>>>(
     cellsLayersDevice,
     neighboursLUT,
     neighboursIndexTable,
@@ -1032,4 +1114,65 @@ void trackSeedHandler(CellSeed* trackSeeds,
   gpuCheckError(cudaPeekAtLastError());
   gpuCheckError(cudaDeviceSynchronize());
 }
-} // namespace o2::its
+
+template void countTrackletsInROFsHandler<7>(const IndexTableUtils* utils,
+                                             const uint8_t* multMask,
+                                             const int startROF,
+                                             const int endROF,
+                                             const int maxROF,
+                                             const int deltaROF,
+                                             const int vertexId,
+                                             const Vertex* vertices,
+                                             const int* rofPV,
+                                             const int nVertices,
+                                             const Cluster** clusters,
+                                             std::vector<unsigned int> nClusters,
+                                             const int** ROFClusters,
+                                             const unsigned char** usedClusters,
+                                             const int** clustersIndexTables,
+                                             int** trackletsLUTs,
+                                             gsl::span<int*> trackletsLUTsHost,
+                                             const int iteration,
+                                             const float NSigmaCut,
+                                             std::vector<float>& phiCuts,
+                                             const float resolutionPV,
+                                             std::vector<float>& minRs,
+                                             std::vector<float>& maxRs,
+                                             std::vector<float>& resolutions,
+                                             std::vector<float>& radii,
+                                             std::vector<float>& mulScatAng,
+                                             const int nBlocks,
+                                             const int nThreads);
+
+template void computeTrackletsInROFsHandler<7>(const IndexTableUtils* utils,
+                                               const uint8_t* multMask,
+                                               const int startROF,
+                                               const int endROF,
+                                               const int maxROF,
+                                               const int deltaROF,
+                                               const int vertexId,
+                                               const Vertex* vertices,
+                                               const int* rofPV,
+                                               const int nVertices,
+                                               const Cluster** clusters,
+                                               std::vector<unsigned int> nClusters,
+                                               const int** ROFClusters,
+                                               const unsigned char** usedClusters,
+                                               const int** clustersIndexTables,
+                                               Tracklet** tracklets,
+                                               gsl::span<Tracklet*> spanTracklets,
+                                               gsl::span<int> nTracklets,
+                                               int** trackletsLUTs,
+                                               gsl::span<int*> trackletsLUTsHost,
+                                               const int iteration,
+                                               const float NSigmaCut,
+                                               std::vector<float>& phiCuts,
+                                               const float resolutionPV,
+                                               std::vector<float>& minRs,
+                                               std::vector<float>& maxRs,
+                                               std::vector<float>& resolutions,
+                                               std::vector<float>& radii,
+                                               std::vector<float>& mulScatAng,
+                                               const int nBlocks,
+                                               const int nThreads);
+} // namespace o2::its
\ No newline at end of file
diff --git a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
index 906eb0fa5c21e..fa4f33782d16a 100644
--- a/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
+++ b/Detectors/ITSMFT/ITS/tracking/include/ITStracking/TimeFrame.h
@@ -106,12 +106,16 @@ class TimeFrame
 
   float getBeamX() const;
   float getBeamY() const;
-
+  std::vector<float>& getMinRs() { return mMinR; }
+  std::vector<float>& getMaxRs() { return mMaxR; }
   float getMinR(int layer) const { return mMinR[layer]; }
   float getMaxR(int layer) const { return mMaxR[layer]; }
   float getMSangle(int layer) const { return mMSangles[layer]; }
+  std::vector<float>& getMSangles() { return mMSangles; }
   float getPhiCut(int layer) const { return mPhiCuts[layer]; }
+  std::vector<float>& getPhiCuts() { return mPhiCuts; }
   float getPositionResolution(int layer) const { return mPositionResolution[layer]; }
+  std::vector<float>& getPositionResolutions() { return mPositionResolution; }
 
   gsl::span<Cluster> getClustersOnLayer(int rofId, int layerId);
   gsl::span<const Cluster> getClustersOnLayer(int rofId, int layerId) const;
@@ -209,8 +213,8 @@ class TimeFrame
   const unsigned long long& getRoadLabel(int i) const;
   bool isRoadFake(int i) const;
 
-  void setMultiplicityCutMask(const std::vector<bool>& cutMask) { mMultiplicityCutMask = cutMask; }
-  void setROFMask(const std::vector<bool>& rofMask) { mROFMask = rofMask; }
+  void setMultiplicityCutMask(const std::vector<uint8_t>& cutMask) { mMultiplicityCutMask = cutMask; }
+  void setROFMask(const std::vector<uint8_t>& rofMask) { mROFMask = rofMask; }
   void swapMasks() { mMultiplicityCutMask.swap(mROFMask); }
 
   int hasBogusClusters() const { return std::accumulate(mBogusClusters.begin(), mBogusClusters.end(), 0); }
@@ -289,6 +293,7 @@ class TimeFrame
   std::vector<std::vector<TrackITSExt>> mTracks;
   std::vector<std::vector<int>> mCellsNeighbours;
   std::vector<std::vector<int>> mCellsLookupTable;
+  std::vector<uint8_t> mMultiplicityCutMask;
 
   const o2::base::PropagatorImpl<float>* mPropagatorDevice = nullptr; // Needed only for GPU
  protected:
@@ -311,8 +316,8 @@ class TimeFrame
   std::vector<float> mPhiCuts;
   std::vector<float> mPositionResolution;
   std::vector<uint8_t> mClusterSize;
-  std::vector<bool> mMultiplicityCutMask;
-  std::vector<bool> mROFMask;
+
+  std::vector<uint8_t> mROFMask;
   std::vector<std::array<float, 2>> mPValphaX; /// PV x and alpha for track propagation
   std::vector<std::vector<MCCompLabel>> mTrackletLabels;
   std::vector<std::vector<MCCompLabel>> mCellLabels;
@@ -439,33 +444,33 @@ inline gsl::span<const Cluster> TimeFrame::getClustersPerROFrange(int rofMin, in
     return gsl::span<const Cluster>();
   }
   int startIdx{mROFramesClusters[layerId][rofMin]}; // First cluster of rofMin
-  int endIdx{mROFramesClusters[layerId][std::min(rofMin + range, mNrof)]};
+  int endIdx{mROFramesClusters[layerId][o2::gpu::CAMath::Min(rofMin + range, mNrof)]};
   return {&mClusters[layerId][startIdx], static_cast<gsl::span<Cluster>::size_type>(endIdx - startIdx)};
 }
 
 inline gsl::span<const int> TimeFrame::getROFramesClustersPerROFrange(int rofMin, int range, int layerId) const
 {
-  int chkdRange{std::min(range, mNrof - rofMin)};
+  int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)};
   return {&mROFramesClusters[layerId][rofMin], static_cast<gsl::span<int>::size_type>(chkdRange)};
 }
 
 inline gsl::span<const int> TimeFrame::getNClustersROFrange(int rofMin, int range, int layerId) const
 {
-  int chkdRange{std::min(range, mNrof - rofMin)};
+  int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)};
   return {&mNClustersPerROF[layerId][rofMin], static_cast<gsl::span<int>::size_type>(chkdRange)};
 }
 
 inline int TimeFrame::getTotalClustersPerROFrange(int rofMin, int range, int layerId) const
 {
   int startIdx{rofMin}; // First cluster of rofMin
-  int endIdx{std::min(rofMin + range, mNrof)};
+  int endIdx{o2::gpu::CAMath::Min(rofMin + range, mNrof)};
   return mROFramesClusters[layerId][endIdx] - mROFramesClusters[layerId][startIdx];
 }
 
 inline gsl::span<const int> TimeFrame::getIndexTablePerROFrange(int rofMin, int range, int layerId) const
 {
   const int iTableSize{mIndexTableUtils.getNphiBins() * mIndexTableUtils.getNzBins() + 1};
-  int chkdRange{std::min(range, mNrof - rofMin)};
+  int chkdRange{o2::gpu::CAMath::Min(range, mNrof - rofMin)};
   return {&mIndexTables[layerId][rofMin * iTableSize], static_cast<gsl::span<int>::size_type>(chkdRange * iTableSize)};
 }
 
diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
index da0abbae9dc1f..409b20ea23235 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/TrackerTraits.cxx
@@ -75,9 +75,9 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in
   for (int rof0{startROF}; rof0 < endROF; ++rof0) {
     gsl::span<const Vertex> primaryVertices = mTrkParams[iteration].UseDiamond ? diamondSpan : tf->getPrimaryVertices(rof0);
     const int startVtx{iVertex >= 0 ? iVertex : 0};
-    const int endVtx{iVertex >= 0 ? std::min(iVertex + 1, static_cast<int>(primaryVertices.size())) : static_cast<int>(primaryVertices.size())};
-    int minRof = std::max(startROF, rof0 - mTrkParams[iteration].DeltaROF);
-    int maxRof = std::min(endROF - 1, rof0 + mTrkParams[iteration].DeltaROF);
+    const int endVtx{iVertex >= 0 ? o2::gpu::CAMath::Min(iVertex + 1, static_cast<int>(primaryVertices.size())) : static_cast<int>(primaryVertices.size())};
+    int minRof = o2::gpu::CAMath::Max(startROF, rof0 - mTrkParams[iteration].DeltaROF);
+    int maxRof = o2::gpu::CAMath::Min(endROF - 1, rof0 + mTrkParams[iteration].DeltaROF);
 #pragma omp parallel for num_threads(mNThreads)
     for (int iLayer = 0; iLayer < mTrkParams[iteration].TrackletsPerRoad(); ++iLayer) {
       gsl::span<const Cluster> layer0 = tf->getClustersOnLayer(rof0, iLayer);
@@ -128,7 +128,6 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in
             if (layer1.empty()) {
               continue;
             }
-
             for (int iPhiCount{0}; iPhiCount < phiBinsNum; iPhiCount++) {
               int iPhiBin = (selectedBinsRect.y + iPhiCount) % mTrkParams[iteration].PhiBins;
               const int firstBinIndex{tf->mIndexTableUtils.getBinIndex(selectedBinsRect.x, iPhiBin)};
@@ -145,9 +144,7 @@ void TrackerTraits::computeLayerTracklets(const int iteration, int iROFslice, in
               }
               const int firstRowClusterIndex = tf->getIndexTable(rof1, iLayer + 1)[firstBinIndex];
               const int maxRowClusterIndex = tf->getIndexTable(rof1, iLayer + 1)[maxBinIndex];
-
               for (int iNextCluster{firstRowClusterIndex}; iNextCluster < maxRowClusterIndex; ++iNextCluster) {
-
                 if (iNextCluster >= (int)layer1.size()) {
                   break;
                 }
@@ -668,7 +665,7 @@ void TrackerTraits::findRoads(const int iteration)
       if (rofs[1] != INT_MAX) {
         track.setNextROFbit();
       }
-      mTimeFrame->getTracks(std::min(rofs[0], rofs[1])).emplace_back(track);
+      mTimeFrame->getTracks(o2::gpu::CAMath::Min(rofs[0], rofs[1])).emplace_back(track);
     }
   }
 }
diff --git a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx
index f00d87164d7d6..5b8a9bb1cb0f2 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/TrackingInterface.cxx
@@ -174,7 +174,7 @@ void ITSTrackingInterface::run(framework::ProcessingContext& pc)
   auto errorLogger = [&](std::string s) { LOG(error) << s; };
 
   FastMultEst multEst; // mult estimator
-  std::vector<bool> processingMask, processUPCMask;
+  std::vector<uint8_t> processingMask, processUPCMask;
   int cutVertexMult{0}, cutUPCVertex{0}, cutRandomMult = int(trackROFvec.size()) - multEst.selectROFs(trackROFvec, compClusters, physTriggers, processingMask);
   processUPCMask.resize(processingMask.size(), false);
   mTimeFrame->setMultiplicityCutMask(processingMask);
diff --git a/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx b/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx
index 4eaddc8385b8a..e87e2289b49e7 100644
--- a/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx
+++ b/Detectors/ITSMFT/ITS/tracking/src/Vertexer.cxx
@@ -90,7 +90,7 @@ float Vertexer::clustersToVerticesHybrid(std::function<void(std::string s)> logg
     auto timeVertexingIteration = evaluateTask(
       &Vertexer::findVerticesHybrid, "Hybrid Vertexer vertex finding", [](std::string) {}, iteration);
 
-    printEpilog(logger, true, nTracklets01, nTracklets12, mTimeFrame->getNLinesTotal(), mTimeFrame->getTotVertIteration().size(), timeInit, timeTracklet, timeSelection, timeVertexing);
+    printEpilog(logger, true, nTracklets01, nTracklets12, mTimeFrame->getNLinesTotal(), mTimeFrame->getTotVertIteration()[iteration], timeInitIteration, timeTrackletIteration, timeSelectionIteration, timeVertexingIteration);
     timeInit += timeInitIteration;
     timeTracklet += timeTrackletIteration;
     timeSelection += timeSelectionIteration;
@@ -142,9 +142,9 @@ void Vertexer::printEpilog(std::function<void(std::string s)> logger,
                            const float initT, const float trackletT, const float selecT, const float vertexT)
 {
   float total = initT + trackletT + selecT + vertexT;
-  logger(fmt::format(" - {}Vertexer: found {} | {} tracklets in: {} ms", isHybrid ? "Hybrid" : "", trackletN01, trackletN12, trackletT));
-  logger(fmt::format(" - {}Vertexer: selected {} tracklets in: {} ms", isHybrid ? "Hybrid" : "", selectedN, selecT));
-  logger(fmt::format(" - {}Vertexer: found {} vertices in: {} ms", isHybrid ? "Hybrid" : "", vertexN, vertexT));
+  logger(fmt::format(" - {}Vertexer: found {} | {} tracklets in: {} ms", isHybrid ? "Hybrid " : "", trackletN01, trackletN12, trackletT));
+  logger(fmt::format(" - {}Vertexer: selected {} tracklets in: {} ms", isHybrid ? "Hybrid " : "", selectedN, selecT));
+  logger(fmt::format(" - {}Vertexer: found {} vertices in: {} ms", isHybrid ? "Hybrid " : "", vertexN, vertexT));
   // logger(fmt::format(" - Timeframe {} vertexing completed in: {} ms, using {} thread(s).", mTimeFrameCounter++, total, mTraits->getNThreads()));
 }
 
diff --git a/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx b/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx
index 01e649f982896..4a0470adcf07a 100644
--- a/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx
+++ b/Detectors/ITSMFT/ITS/workflow/src/CookedTrackerSpec.cxx
@@ -132,7 +132,7 @@ void CookedTrackerDPL::run(ProcessingContext& pc)
 
   const auto& multEstConf = FastMultEstConfig::Instance(); // parameters for mult estimation and cuts
   FastMultEst multEst;                                     // mult estimator
-  std::vector<bool> processingMask;
+  std::vector<uint8_t> processingMask;
   int cutVertexMult{0}, cutRandomMult = int(rofsinput.size()) - multEst.selectROFs(rofsinput, compClusters, physTriggers, processingMask);
 
   // auto processingMask_ephemeral = processingMask;