From 14ae1887b401639a8e6d441f9a7a7770d6afe9d0 Mon Sep 17 00:00:00 2001 From: Agata Momot Date: Sun, 1 Dec 2024 20:07:30 +0100 Subject: [PATCH] Testing, testing, testing --- .github/workflows/bandit.yml | 30 -- .github/workflows/benchmarks-nightly.yml | 38 -- .github/workflows/benchmarks-reusable.yml | 27 +- .github/workflows/benchmarks.yml | 1 + .github/workflows/build-fuzz-reusable.yml | 75 --- .github/workflows/build-hw-reusable.yml | 125 ----- .github/workflows/cmake.yml | 335 -------------- .github/workflows/codeql.yml | 81 ---- .github/workflows/coverity.yml | 81 ---- .github/workflows/docs.yml | 81 ---- .github/workflows/e2e_core.yml | 214 --------- .github/workflows/e2e_cuda.yml | 24 - .github/workflows/e2e_level_zero.yml | 31 -- .github/workflows/e2e_opencl.yml | 24 - .github/workflows/examples-hw-level-zero.yml | 69 --- .github/workflows/labeler.yml | 23 - .github/workflows/multi_device.yml | 66 --- .github/workflows/nightly.yml | 17 - .github/workflows/prerelease.yml | 24 - .github/workflows/scorecard.yml | 60 --- .github/workflows/source-checks.yml | 71 --- .github/workflows/trivy.yml | 50 -- scripts/benchmarks/benches/base.py | 23 +- scripts/benchmarks/benches/compute.py | 77 ++- scripts/benchmarks/benches/llamacpp.py | 78 +--- scripts/benchmarks/benches/oneapi.py | 86 ++++ scripts/benchmarks/benches/options.py | 7 +- scripts/benchmarks/benches/result.py | 6 +- scripts/benchmarks/benches/test.py | 19 +- scripts/benchmarks/benches/umf.py | 172 +++++++ scripts/benchmarks/benches/velocity.py | 122 ++++- scripts/benchmarks/main.py | 154 ++++-- scripts/benchmarks/output_html.py | 463 ++++++++++--------- scripts/benchmarks/utils/utils.py | 9 +- 34 files changed, 857 insertions(+), 1906 deletions(-) delete mode 100644 .github/workflows/bandit.yml delete mode 100644 .github/workflows/benchmarks-nightly.yml delete mode 100644 .github/workflows/build-fuzz-reusable.yml delete mode 100644 .github/workflows/build-hw-reusable.yml delete mode 100644 .github/workflows/cmake.yml delete mode 100644 .github/workflows/codeql.yml delete mode 100644 .github/workflows/coverity.yml delete mode 100644 .github/workflows/docs.yml delete mode 100644 .github/workflows/e2e_core.yml delete mode 100644 .github/workflows/e2e_cuda.yml delete mode 100644 .github/workflows/e2e_level_zero.yml delete mode 100644 .github/workflows/e2e_opencl.yml delete mode 100644 .github/workflows/examples-hw-level-zero.yml delete mode 100644 .github/workflows/labeler.yml delete mode 100644 .github/workflows/multi_device.yml delete mode 100644 .github/workflows/nightly.yml delete mode 100644 .github/workflows/prerelease.yml delete mode 100644 .github/workflows/scorecard.yml delete mode 100644 .github/workflows/source-checks.yml delete mode 100644 .github/workflows/trivy.yml create mode 100644 scripts/benchmarks/benches/oneapi.py create mode 100644 scripts/benchmarks/benches/umf.py diff --git a/.github/workflows/bandit.yml b/.github/workflows/bandit.yml deleted file mode 100644 index 124046d258..0000000000 --- a/.github/workflows/bandit.yml +++ /dev/null @@ -1,30 +0,0 @@ -# Runs bandit security checker for code written in Python. 
-name: Bandit - -on: [push, pull_request, workflow_dispatch] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - bandit: - name: Bandit - strategy: - matrix: - os: [ubuntu-latest, windows-latest] - runs-on: ${{matrix.os}} - - steps: - - name: Clone the git repo - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install pip packages - run: pip install -r third_party/requirements.txt - - # Scan all files, except for dev. scripts - - name: Run Bandit - run: bandit -r . -x ./scripts/ diff --git a/.github/workflows/benchmarks-nightly.yml b/.github/workflows/benchmarks-nightly.yml deleted file mode 100644 index 3da0d09c7a..0000000000 --- a/.github/workflows/benchmarks-nightly.yml +++ /dev/null @@ -1,38 +0,0 @@ -name: Compute Benchmarks Nightly - -on: - schedule: - - cron: '0 0 * * *' # Runs at midnight UTC every day - -permissions: - contents: read - pull-requests: write - -jobs: - nightly: - name: Compute Benchmarks Nightly level-zero - uses: ./.github/workflows/benchmarks-reusable.yml - with: - str_name: 'level_zero' - unit: 'gpu' - pr_no: 0 - bench_script_params: '--save baseline' - sycl_config_params: '' - sycl_repo: 'intel/llvm' - sycl_commit: '' - - nightly2: - # we need to wait until previous job is done so that the html report - # contains both runs - needs: nightly - name: Compute Benchmarks Nightly level-zero v2 - uses: ./.github/workflows/benchmarks-reusable.yml - with: - str_name: 'level_zero_v2' - unit: 'gpu' - pr_no: 0 - bench_script_params: '--save baseline-v2' - sycl_config_params: '' - sycl_repo: 'intel/llvm' - sycl_commit: '' - upload_report: true diff --git a/.github/workflows/benchmarks-reusable.yml b/.github/workflows/benchmarks-reusable.yml index 79cb35748e..ccf1a12b74 100644 --- a/.github/workflows/benchmarks-reusable.yml +++ b/.github/workflows/benchmarks-reusable.yml @@ -90,6 +90,8 @@ jobs: pip install --force-reinstall -r ${{github.workspace}}/ur-repo/third_party/benchmark_requirements.txt # We need to fetch special ref for proper PR's merge commit. Note, this ref may be absent if the PR is already merged. 
+ + # REMOVE start - name: Fetch PR's merge commit if: ${{ inputs.pr_no != 0 }} working-directory: ${{github.workspace}}/ur-repo @@ -100,6 +102,7 @@ jobs: git checkout origin/pr/${PR_NO}/merge git rev-parse origin/pr/${PR_NO}/merge +# HERE - name: Checkout SYCL uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 with: @@ -155,6 +158,27 @@ jobs: - name: Install UR run: cmake --install ${{github.workspace}}/ur_build + # # remove end + + - name: Checkout UMF + uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 + with: + repository: EuphoricThinking/unified-memory-framework + ref: 6aef9bf11b47d0b1390209aeafb6998f0da475c9 + path: umf-repo + fetch-depth: 1 + fetch-tags: false + + - name: Configure UMF + run: > + cmake -DCMAKE_BUILD_TYPE=Release + -S${{github.workspace}}/umf-repo + -B${{github.workspace}}/umf_build + -DUMF_BUILD_BENCHMARKS=ON + -DUMF_TESTS_FAIL_ON_SKIP=ON + + - name: Build UMF + run: cmake --build ${{github.workspace}}/umf_build -j $(nproc) - name: Run benchmarks working-directory: ${{ github.workspace }}/ur-repo/ @@ -164,6 +188,7 @@ jobs: ~/bench_workdir --sycl ${{ github.workspace }}/sycl_build --ur ${{ github.workspace }}/ur_install + --umf ${{ github.workspace }}/umf_build --adapter ${{ matrix.adapter.str_name }} ${{ inputs.upload_report && '--output-html' || '' }} ${{ inputs.bench_script_params }} @@ -199,5 +224,5 @@ jobs: if: ${{ always() && inputs.upload_report }} uses: actions/cache/save@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 with: - path: ur-repo/benchmark_results.html + path: umf-repo/benchmark_results.html key: benchmark-results-${{ matrix.adapter.str_name }}-${{ github.run_id }} diff --git a/.github/workflows/benchmarks.yml b/.github/workflows/benchmarks.yml index af62d40e85..07ceb1b0ff 100644 --- a/.github/workflows/benchmarks.yml +++ b/.github/workflows/benchmarks.yml @@ -1,4 +1,5 @@ name: Compute Benchmarks +# for tests on: workflow_dispatch: diff --git a/.github/workflows/build-fuzz-reusable.yml b/.github/workflows/build-fuzz-reusable.yml deleted file mode 100644 index 2cbd1b87ff..0000000000 --- a/.github/workflows/build-fuzz-reusable.yml +++ /dev/null @@ -1,75 +0,0 @@ ---- -name: Build - Fuzztests on L0 HW - Reusable - -on: - workflow_call: - inputs: - test_label: - required: true - type: string - -permissions: - contents: read - -jobs: - fuzztest-build-hw: - name: Build and run fuzz tests on L0 HW - if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks won't have the HW - strategy: - matrix: - build_type: [Debug, Release] - compiler: [{c: clang, cxx: clang++}] - - runs-on: 'FUZZTESTS' - # In order to use sanitizers, vm.mmap_rnd_bits=28 must be set in the system, - # otherwise random SEGV at the start of the test occurs. - # Alternatively, clang 18.1.0 onwards with fixed sanitizers behavior can be used, - # if available. - # TODO: Remove this advice once clang 18.1.0 is available in the system (like ie. as an apt package). 
- - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install pip packages - run: pip install -r third_party/requirements.txt - - - name: Download DPC++ - run: | - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz - mkdir dpcpp_compiler - tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler - - - name: Build level zero with gcc - run: | - git clone -b v1.18.5 --depth=1 https://github.com/oneapi-src/level-zero.git ${{github.workspace}}/level-zero - cd ${{github.workspace}}/level-zero - cmake -B build -DCMAKE_BUILD_TYPE=${{matrix.build_type}} -DCMAKE_C_COMPILER=gcc -DCMAKE_CXX_COMPILER=g++ - cmake --build build -j $(nproc) - - - name: Configure CMake - run: > - cmake - -B${{github.workspace}}/build - -DCMAKE_C_COMPILER=${{matrix.compiler.c}} - -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} - -DUR_ENABLE_TRACING=ON - -DCMAKE_BUILD_TYPE=${{matrix.build_type}} - -DUR_BUILD_TESTS=ON - -DUR_USE_ASAN=ON - -DUR_USE_UBSAN=ON - -DUR_BUILD_ADAPTER_L0=ON - -DUR_LEVEL_ZERO_LOADER_LIBRARY=${{github.workspace}}/level-zero/build/lib/libze_loader.so - -DUR_LEVEL_ZERO_INCLUDE_DIR=${{github.workspace}}/level-zero/include/ - -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++ - -DUR_SYCL_LIBRARY_DIR=${{github.workspace}}/dpcpp_compiler/lib - - - name: Build - run: cmake --build ${{github.workspace}}/build -j $(nproc) - - - name: Fuzz test - working-directory: ${{github.workspace}}/build - run: ctest -C ${{matrix.build_type}} --output-on-failure -L "${{inputs.test_label}}" --verbose - - - name: Get information about platform - if: ${{ always() }} - run: .github/scripts/get_system_info.sh diff --git a/.github/workflows/build-hw-reusable.yml b/.github/workflows/build-hw-reusable.yml deleted file mode 100644 index 3e332c73fc..0000000000 --- a/.github/workflows/build-hw-reusable.yml +++ /dev/null @@ -1,125 +0,0 @@ ---- -name: Build - Adapters on HW - Reusable - -on: - workflow_call: - inputs: - adapter_name: - required: true - type: string - other_adapter_name: - required: false - type: string - default: "" - runner_name: - required: true - type: string - platform: - description: "Platform string, `UR_CTS_ADAPTER_PLATFORM` will be set to this." - required: false - type: string - default: "" - static_loader: - required: false - type: string - default: OFF - static_adapter: - required: false - type: string - default: OFF - -permissions: - contents: read - -env: - UR_LOG_CUDA: "level:error;flush:error" - UR_LOG_HIP: "level:error;flush:error" - UR_LOG_LEVEL_ZERO: "level:error;flush:error" - UR_LOG_NATIVE_CPU: "level:error;flush:error" - UR_LOG_OPENCL: "level:error;flush:error" - -jobs: - adapter-build-hw: - name: Build & Test HW - if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks won't have the HW - strategy: - matrix: - adapter: [{ - name: "${{inputs.adapter_name}}", - other_name: "${{inputs.other_adapter_name}}", - platform: "${{inputs.platform}}", - static_Loader: "${{inputs.static_loader}}", - static_adapter: "${{inputs.static_loader}}" - }] - build_type: [Debug, Release] - compiler: [{c: gcc, cxx: g++}, {c: clang, cxx: clang++}] - # TODO: The latest L0 loader segfaults when built with clang. - exclude: - - adapter: {name: L0, platform: ""} - compiler: {c: clang, cxx: clang++} - # Exclude these configurations to avoid overloading the runners. 
- - adapter: {static_Loader: ON} - build_type: Release - - adapter: {static_Loader: ON} - compiler: {c: clang, cxx: clang++} - - adapter: {static_adapter: ON} - build_type: Release - - adapter: {static_adapter: ON} - compiler: {c: clang, cxx: clang++} - - runs-on: ${{inputs.runner_name}} - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install pip packages - run: pip install -r third_party/requirements.txt - - - name: Download DPC++ - run: | - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz - mkdir dpcpp_compiler - tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler - - - name: Configure CMake - run: > - cmake - -B${{github.workspace}}/build - -DCMAKE_C_COMPILER=${{matrix.compiler.c}} - -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} - -DCMAKE_BUILD_TYPE=${{matrix.build_type}} - -DUR_ENABLE_TRACING=ON - -DUR_DEVELOPER_MODE=ON - -DUR_BUILD_TESTS=ON - -DUR_BUILD_ADAPTER_${{matrix.adapter.name}}=ON - -DUR_CONFORMANCE_TEST_LOADER=${{ matrix.adapter.other_name != '' && 'ON' || 'OFF' }} - ${{ matrix.adapter.other_name != '' && format('-DUR_BUILD_ADAPTER_{0}=ON', matrix.adapter.other_name) || '' }} - -DUR_STATIC_LOADER=${{matrix.adapter.static_Loader}} - -DUR_STATIC_ADAPTER_${{matrix.adapter.name}}=${{matrix.adapter.static_adapter}} - -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++ - -DUR_SYCL_LIBRARY_DIR=${{github.workspace}}/dpcpp_compiler/lib - -DCMAKE_INSTALL_PREFIX=${{github.workspace}}/install - ${{ matrix.adapter.name == 'HIP' && '-DUR_CONFORMANCE_AMD_ARCH=gfx1030' || '' }} - ${{ matrix.adapter.name == 'HIP' && '-DUR_HIP_PLATFORM=AMD' || '' }} - - - name: Build - # This is so that device binaries can find the sycl runtime library - run: cmake --build ${{github.workspace}}/build -j $(nproc) - - - name: Install - # This is to check that install command does not fail - run: cmake --install ${{github.workspace}}/build - - - name: Test adapter specific - working-directory: ${{github.workspace}}/build - run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" --timeout 180 - # Don't run adapter specific tests when building multiple adapters - if: ${{ matrix.adapter.other_name == '' }} - - - name: Test adapters - working-directory: ${{github.workspace}}/build - run: env UR_CTS_ADAPTER_PLATFORM="${{matrix.adapter.platform}}" ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" --timeout 180 - - - name: Get information about platform - if: ${{ always() }} - run: .github/scripts/get_system_info.sh diff --git a/.github/workflows/cmake.yml b/.github/workflows/cmake.yml deleted file mode 100644 index 0a4ae99a58..0000000000 --- a/.github/workflows/cmake.yml +++ /dev/null @@ -1,335 +0,0 @@ -name: Build and test - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read - pull-requests: write - -jobs: - ubuntu-build: - name: Build - Ubuntu - strategy: - matrix: - os: ['ubuntu-20.04', 'ubuntu-22.04'] - build_type: [Debug, Release] - compiler: [{c: gcc, cxx: g++}] - libbacktrace: ['-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF'] - pool_tracking: ['-DUMF_ENABLE_POOL_TRACKING=ON', '-DUMF_ENABLE_POOL_TRACKING=OFF'] - latency_tracking: ['-DUR_ENABLE_LATENCY_HISTOGRAM=OFF'] - include: - - os: 'ubuntu-22.04' - build_type: Release - compiler: {c: clang, cxx: clang++} - libbacktrace: 
'-DVAL_USE_LIBBACKTRACE_BACKTRACE=OFF' - - os: 'ubuntu-22.04' - build_type: Release - compiler: {c: gcc, cxx: g++} - libbacktrace: '-DVAL_USE_LIBBACKTRACE_BACKTRACE=ON' - - os: 'ubuntu-22.04' - build_type: Release - compiler: {c: clang, cxx: clang++} - libbacktrace: '-DVAL_USE_LIBBACKTRACE_BACKTRACE=ON' - - os: 'ubuntu-20.04' - build_type: Release - compiler: {c: gcc-7, cxx: g++-7} - - os: 'ubuntu-22.04' - build_type: Release - compiler: {c: clang, cxx: clang++} - latency_tracking: '-DUR_ENABLE_LATENCY_HISTOGRAM=ON' - runs-on: ${{ (matrix.os == 'ubuntu-22.04' && github.repository_owner == 'oneapi-src') && 'intel-ubuntu-22.04' || matrix.os }} - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install apt packages - run: | - sudo apt-get update - sudo apt-get install -y ${{matrix.compiler.c}} devscripts - - - name: Install libhwloc - run: .github/scripts/install_hwloc.sh - - - name: Setup PATH - run: echo "$HOME/.local/bin" >> $GITHUB_PATH - - - name: Install g++-7 - if: matrix.compiler.cxx == 'g++-7' - run: | - sudo apt-get install -y ${{matrix.compiler.cxx}} - - - name: Install libbacktrace - if: matrix.libbacktrace == '-DVAL_USE_LIBBACKTRACE_BACKTRACE=ON' - run: | - git clone https://github.com/ianlancetaylor/libbacktrace.git - cd libbacktrace - ./configure - make - sudo make install - cd .. - - - name: Download DPC++ - if: matrix.os == 'ubuntu-22.04' - run: | - sudo apt install libncurses5 - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-09-27/sycl_linux.tar.gz - mkdir -p ${{github.workspace}}/dpcpp_compiler - tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C ${{github.workspace}}/dpcpp_compiler - - - name: Configure CMake - if: matrix.os == 'ubuntu-22.04' - # WEXTRA: https://github.com/oneapi-src/unified-runtime/issues/2109 - run: > - cmake - -B${{github.workspace}}/build - -DCMAKE_C_COMPILER=${{matrix.compiler.c}} - -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} - -DUR_ENABLE_TRACING=ON - -DCMAKE_BUILD_TYPE=${{matrix.build_type}} - -DUR_BUILD_TESTS=ON - -DUR_FORMAT_CPP_STYLE=OFF - -DUR_DEVELOPER_MODE=ON - -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++ - -DUR_CONFORMANCE_TEST_LOADER=OFF - ${{matrix.libbacktrace}} - ${{matrix.pool_tracking}} - ${{matrix.latency_tracking}} - - - name: Configure CMake - if: matrix.os == 'ubuntu-20.04' - # WEXTRA: https://github.com/oneapi-src/unified-runtime/issues/2109 - # Note: Disable Werror, since 20.04 raises different ones than 22.04 - run: > - cmake - -B${{github.workspace}}/build - -DCMAKE_C_COMPILER=${{matrix.compiler.c}} - -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} - -DUR_ENABLE_TRACING=ON - -DCMAKE_BUILD_TYPE=${{matrix.build_type}} - -DUR_BUILD_TESTS=ON - -DUR_FORMAT_CPP_STYLE=OFF - -DUR_DEVELOPER_MODE=OFF - ${{matrix.libbacktrace}} - ${{matrix.pool_tracking}} - ${{matrix.latency_tracking}} - - - name: Build - run: cmake --build ${{github.workspace}}/build -j $(nproc) - - - name: Verify hardening flags have been set - run: cmake --build ${{github.workspace}}/build --target verify-hardening - # https://github.com/oneapi-src/unified-runtime/issues/2120 - if: ${{ matrix.compiler.cxx != 'clang++' && matrix.os != 'ubuntu-20.04' }} - - - name: Test - working-directory: ${{github.workspace}}/build - run: ctest -C ${{matrix.build_type}} --output-on-failure -L "umf|loader|validation|tracing|unit|urtrace" - - fuzztest: - name: Fuzz tests short - uses: ./.github/workflows/build-fuzz-reusable.yml - with: - test_label: 
"fuzz-short" - - level-zero: - name: Level Zero - uses: ./.github/workflows/build-hw-reusable.yml - with: - adapter_name: L0 - runner_name: L0 - - level-zero-v2: - name: Level Zero V2 - uses: ./.github/workflows/build-hw-reusable.yml - with: - adapter_name: L0_V2 - runner_name: L0 - - level-zero-static: - name: Level Zero static - uses: ./.github/workflows/build-hw-reusable.yml - with: - adapter_name: L0 - runner_name: L0 - static_loader: ON - static_adapter: ON - - opencl: - name: OpenCL - uses: ./.github/workflows/build-hw-reusable.yml - with: - adapter_name: OPENCL - runner_name: OPENCL - platform: "Intel(R) OpenCL" - - cuda: - name: CUDA - uses: ./.github/workflows/build-hw-reusable.yml - with: - adapter_name: CUDA - runner_name: CUDA - - hip: - name: HIP - uses: ./.github/workflows/build-hw-reusable.yml - with: - adapter_name: HIP - runner_name: HIP - - native-cpu: - name: Native CPU - uses: ./.github/workflows/build-hw-reusable.yml - with: - adapter_name: NATIVE_CPU - runner_name: NATIVE_CPU - - # Native CPU jobs are here to force the loader to be used (UR will not use the loader if there is only one target) - combined-opencl-native-cpu: - name: OpenCL + Native CPU (Loader) - uses: ./.github/workflows/build-hw-reusable.yml - with: - adapter_name: OPENCL - other_adapter_name: NATIVE_CPU - runner_name: OPENCL - platform: "OPENCL:Intel(R) OpenCL" - - combined-level-zero-native-cpu: - name: Level Zero + Native CPU (Loader) - uses: ./.github/workflows/build-hw-reusable.yml - with: - adapter_name: L0 - other_adapter_name: NATIVE_CPU - runner_name: L0 - - e2e-level-zero: - name: E2E L0 - permissions: - contents: read - pull-requests: write - needs: [ubuntu-build, level-zero] - uses: ./.github/workflows/e2e_level_zero.yml - - e2e-opencl: - name: E2E OpenCL - permissions: - contents: read - pull-requests: write - needs: [ubuntu-build, opencl] - uses: ./.github/workflows/e2e_opencl.yml - - # Causes hangs: https://github.com/oneapi-src/unified-runtime/issues/2398 - #e2e-cuda: - # name: E2E CUDA - # permissions: - # contents: read - # pull-requests: write - # needs: [ubuntu-build, cuda] - # uses: ./.github/workflows/e2e_cuda.yml - - windows-build: - name: Build - Windows - strategy: - matrix: - os: ['windows-2019', 'windows-2022'] - adapter: [ - {name: None, var: ''}, {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON'}, - {name: None, var: ''}, {name: L0_V2, var: '-DUR_BUILD_ADAPTER_L0_V2=ON'}, - {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON -DUR_STATIC_ADAPTER_L0=ON'} - ] - - # TODO: building level zero loader on windows-2019 and clang-cl is currently broken - exclude: - - os: 'windows-2019' - adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON'} - - os: 'windows-2019' - adapter: {name: L0_V2, var: '-DUR_BUILD_ADAPTER_L0_V2=ON'} - - os: 'windows-2019' - adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON -DUR_STATIC_ADAPTER_L0=ON'} - - adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON'} - compiler: {c: clang-cl, cxx: clang-cl} - - adapter: {name: L0_V2, var: '-DUR_BUILD_ADAPTER_L0_V2=ON'} - compiler: {c: clang-cl, cxx: clang-cl} - - adapter: {name: L0, var: '-DUR_BUILD_ADAPTER_L0=ON -DUR_STATIC_ADAPTER_L0=ON'} - compiler: {c: clang-cl, cxx: clang-cl} - - build_type: [Debug, Release] - # TODO: clang-cl seems to be fully broken (https://github.com/oneapi-src/unified-runtime/issues/2348) - #compiler: [{c: cl, cxx: cl}, {c: clang-cl, cxx: clang-cl}] - compiler: [{c: cl, cxx: cl}] - include: - #- compiler: {c: clang-cl, cxx: clang-cl} - # toolset: "-T ClangCL" - - os: 'windows-2022' - adapter: {name: L0, var: 
'-DUR_BUILD_ADAPTER_L0=ON -DUR_STATIC_ADAPTER_L0=ON'} - build_type: 'Release' - compiler: {c: cl, cxx: cl} - - runs-on: ${{matrix.os}} - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install hwloc - run: vcpkg install hwloc:x64-windows - - - name: Configure CMake - env: - VCPKG_PATH: "C:/vcpkg/packages/hwloc_x64-windows" - run: > - cmake - -B${{github.workspace}}/build - ${{matrix.toolset}} - -DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}" - -DCMAKE_C_COMPILER=${{matrix.compiler.c}} - -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} - -DCMAKE_POLICY_DEFAULT_CMP0094=NEW - -DUR_ENABLE_TRACING=ON - -DUR_DEVELOPER_MODE=ON - -DUR_BUILD_TESTS=ON - -DUR_FORMAT_CPP_STYLE=OFF - -DUR_CONFORMANCE_TEST_LOADER=OFF - ${{matrix.adapter.var}} - - - name: Build all - run: cmake --build ${{github.workspace}}/build --config ${{matrix.build_type}} -j $Env:NUMBER_OF_PROCESSORS - - - name: Test - working-directory: ${{github.workspace}}/build - run: ctest -C ${{matrix.build_type}} --output-on-failure -L "umf|loader|validation|tracing|unit|urtrace" - - macos-build: - name: Build - MacOS - strategy: - matrix: - os: ['macos-13'] - runs-on: ${{matrix.os}} - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 - with: - python-version: 3.9 - - - name: Install prerequisites - run: python3 -m pip install -r third_party/requirements.txt - - - name: Install hwloc - run: brew install hwloc - - - name: Configure CMake - run: > - cmake - -B${{github.workspace}}/build - -DUR_ENABLE_TRACING=ON - -DUR_DEVELOPER_MODE=ON - -DCMAKE_BUILD_TYPE=Release - -DUR_BUILD_TESTS=ON - -DUR_FORMAT_CPP_STYLE=ON - -DUMF_ENABLE_POOL_TRACKING=ON - - name: Build - run: cmake --build ${{github.workspace}}/build -j $(sysctl -n hw.logicalcpu) diff --git a/.github/workflows/codeql.yml b/.github/workflows/codeql.yml deleted file mode 100644 index fdc5d0c0c0..0000000000 --- a/.github/workflows/codeql.yml +++ /dev/null @@ -1,81 +0,0 @@ -name: "CodeQL" - -on: [push] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - analyze-ubuntu: - name: Analyze on Ubuntu - runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} - permissions: - security-events: write - - strategy: - fail-fast: false - - steps: - - name: Checkout repository - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Initialize CodeQL - uses: github/codeql-action/init@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 - with: - languages: cpp, python - - - name: Install pip packages - run: pip install -r third_party/requirements.txt - - - name: Install apt packages - run: | - sudo apt-get update - sudo apt-get install -y libhwloc-dev - - - name: Configure CMake - run: cmake -B ${{github.workspace}}/build -DUR_DEVELOPER_MODE=ON -DUR_BUILD_TESTS=ON -DUR_ENABLE_TRACING=ON -DUR_BUILD_TOOLS=ON -DUMF_ENABLE_POOL_TRACKING=ON - - - name: Build - run: cmake --build ${{github.workspace}}/build -j $(nproc) - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 - - analyze-windows: - name: Analyze on Windows - runs-on: windows-latest - permissions: - security-events: write - - strategy: - fail-fast: false - - steps: - - name: Checkout repository - uses: 
actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Initialize CodeQL - uses: github/codeql-action/init@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 - with: - languages: cpp, python - - - name: Install pip packages - run: python3 -m pip install -r third_party/requirements.txt - - - name: Install hwloc - run: vcpkg install hwloc:x64-windows - - - name: Configure CMake - env: - VCPKG_PATH: "C:/vcpkg/packages/hwloc_x64-windows" - run: cmake -B ${{github.workspace}}/build -DCMAKE_POLICY_DEFAULT_CMP0094=NEW -DUR_DEVELOPER_MODE=ON -DUR_BUILD_TESTS=ON -DUR_ENABLE_TRACING=ON -DUR_BUILD_TOOLS=ON -DUMF_ENABLE_POOL_TRACKING=ON -DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}" - - - name: Build - run: cmake --build ${{github.workspace}}/build -j $(nproc) --config Release - - - name: Perform CodeQL Analysis - uses: github/codeql-action/analyze@f079b8493333aace61c81488f8bd40919487bd9f # v3.25.7 diff --git a/.github/workflows/coverity.yml b/.github/workflows/coverity.yml deleted file mode 100644 index d7d8bf937b..0000000000 --- a/.github/workflows/coverity.yml +++ /dev/null @@ -1,81 +0,0 @@ -# Coverity - static analysis build. It requires Coverity's token (set in CI's secret). -name: coverity-unified-runtime - -on: - workflow_dispatch: - schedule: - # Run every day at 22:00 UTC - - cron: '0 22 * * *' - -permissions: - contents: read - -jobs: - coverity: - name: Coverity - # run only on upstream; forks don't have token for upstream's cov project - if: github.repository == 'oneapi-src/unified-runtime' - runs-on: ubuntu-latest - - steps: - - name: Checkout repository - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - with: - fetch-depth: 0 - - - name: Install dependencies - run: | - wget https://developer.download.nvidia.com/compute/cuda/repos/ubuntu2204/x86_64/cuda-keyring_1.1-1_all.deb - sudo dpkg -i cuda-keyring_1.1-1_all.deb - sudo apt-get update - sudo apt-get install -y libhwloc-dev libtbb-dev cuda-toolkit-12-6 - - - name: Install pip packages - run: pip install -r third_party/requirements.txt - - - name: Download Coverity - run: | - wget -O coverity_tool.tgz -nv https://scan.coverity.com/download/linux64 \ - --post-data "token=${{ secrets.COVERITY_SCAN_TOKEN }}&project=oneapi-src%2Funified-runtime" - - - name: Extract Coverity - run: tar xzf coverity_tool.tgz - - # TODO: enable HIP adapter as well (requires proper package(s) installation) - - name: Configure CMake - run: > - cmake - -B ${{github.workspace}}/build - -DCMAKE_BUILD_TYPE=Release - -DUR_DEVELOPER_MODE=OFF - -DUR_FORMAT_CPP_STYLE=ON - -DUR_ENABLE_TRACING=ON - -DUR_BUILD_TESTS=ON - -DUR_BUILD_ADAPTER_L0=ON - -DUR_BUILD_ADAPTER_CUDA=ON - -DCUDA_CUDA_LIBRARY=/usr/local/cuda-12.6/targets/x86_64-linux/lib/stubs/libcuda.so - -DUR_BUILD_ADAPTER_NATIVE_CPU=ON - -DUR_BUILD_ADAPTER_HIP=OFF - -DUR_BUILD_ADAPTER_OPENCL=ON - - - name: Build - run: | - export COVERITY_DIR=$(find . 
-maxdepth 1 -type d -name "cov-analysis-linux64-*" | head -n 1) - if [ -n "$COVERITY_DIR" ]; then - export PATH="$PATH:$COVERITY_DIR/bin" - fi - cov-build --dir ${{github.workspace}}/cov-int cmake --build ${{github.workspace}}/build --config Release -j$(nproc) - - - name: Create tarball to analyze - run: tar czvf cov-int_ur.tgz cov-int - - - name: Push tarball to scan - run: | - BRANCH_NAME=$(echo ${GITHUB_REF_NAME}) - COMMIT_ID=$(echo $GITHUB_SHA) - curl --form token=${{ secrets.COVERITY_SCAN_TOKEN }} \ - --form email=bb-ur@intel.com \ - --form file=@cov-int_ur.tgz \ - --form version="$COMMIT_ID" \ - --form description="$BRANCH_NAME:$COMMIT_ID" \ - https://scan.coverity.com/builds\?project\=oneapi-src%2Funified-runtime diff --git a/.github/workflows/docs.yml b/.github/workflows/docs.yml deleted file mode 100644 index b4c40334d4..0000000000 --- a/.github/workflows/docs.yml +++ /dev/null @@ -1,81 +0,0 @@ -# Simple workflow for deploying static content to GitHub Pages -name: Deploy documentation to Pages - -on: - # Runs on pushes targeting the default branch - push: - branches: ["main"] - - # Allows you to run this workflow manually from the Actions tab - workflow_dispatch: - -# Sets permissions of the GITHUB_TOKEN to allow deployment to GitHub Pages -permissions: - contents: read - pages: write - id-token: write - -# Allow one concurrent deployment -concurrency: - group: "pages" - cancel-in-progress: true - -jobs: - # Build job - build: - runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} - steps: - - name: Checkout - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 - with: - python-version: 3.9 - - - name: Install apt package - run: | - sudo apt-get update - sudo apt-get install -y doxygen - - - name: Install prerequisites - run: python3 -m pip install -r third_party/requirements.txt - - - name: Setup Pages - uses: actions/configure-pages@1f0c5cde4bc74cd7e1254d0cb4de8d49e9068c7d # v4.0.0 - - - name: Build Documentation - working-directory: ${{github.workspace}}/scripts - run: | - python3 run.py --core - mkdir -p ${{ github.workspace }}/ur-repo/ - mkdir -p ${{github.workspace}}/docs/html - - - name: Download benchmark HTML - id: download-bench-html - uses: actions/cache/restore@6849a6489940f00c2f30c0fb92c6274307ccb58a # v4.1.2 - with: - path: ur-repo/benchmark_results.html - key: benchmark-results- - - - name: Move benchmark HTML - # exact or partial cache hit - if: steps.download-bench-html.outputs.cache-hit != '' - run: | - mv ${{ github.workspace }}/ur-repo/benchmark_results.html ${{ github.workspace }}/docs/html/ - - - name: Upload artifact - uses: actions/upload-pages-artifact@0252fc4ba7626f0298f0cf00902a25c6afc77fa8 # v3.0.0 - with: - path: ${{github.workspace}}/docs/html - - # Deployment job - deploy: - environment: - name: github-pages - url: ${{ steps.deployment.outputs.page_url }} - runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} - needs: build - steps: - - name: Deploy to GitHub Pages - id: deployment - uses: actions/deploy-pages@87c3283f01cd6fe19a0ab93a23b2f6fcba5a8e42 # v4.0.3 diff --git a/.github/workflows/e2e_core.yml b/.github/workflows/e2e_core.yml deleted file mode 100644 index f12913c648..0000000000 --- a/.github/workflows/e2e_core.yml +++ /dev/null @@ -1,214 +0,0 @@ -name: E2E build & run - -on: - # this workflow can by only triggered by other workflows - # for example 
by: e2e_cuda.yml or e2e_opencl.yml - workflow_call: - # acceptable input from adapter-specific workflows - inputs: - name: - description: Adapter name - type: string - required: true - str_name: - description: Formatted adapter name - type: string - required: true - prefix: - description: Prefix for cmake parameter - type: string - required: true - config: - description: Params for sycl configuration - type: string - required: true - unit: - description: Test unit (cpu/gpu) - type: string - required: true - runner_tag: - description: Tag defifned for the runner - type: string - required: true - xfail: - description: Allow test failures - type: string - required: false - xfail_not: - description: Not xfail - type: string - required: false - filter_out: - description: Tests to filter out completely - type: string - required: false - extra_lit_flags: - description: Additional llvm-lit flags to use - type: string - required: false - -permissions: - contents: read - pull-requests: write - -jobs: - changed-files: - name: Check for changed files - runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} - outputs: - any_changed: ${{ steps.get-changed.outputs.any_changed }} - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - name: Get changed files - id: get-changed - uses: tj-actions/changed-files@d6babd6899969df1a11d14c368283ea4436bca78 # v44.5.2 - with: - files: | - source/adapters/${{inputs.str_name}}/** - source/loader/** - .github/workflows/e2e* - - e2e-build-hw: - # We want to run the job only if there are changes in the specific adapter - if: needs.changed-files.outputs.any_changed == 'true' - name: Build SYCL, UR, run E2E - needs: changed-files - permissions: - contents: read - pull-requests: write - - # Allow failures, since SYCL tests and API may be not stable - continue-on-error: true - strategy: - matrix: - adapter: [ - {name: "${{inputs.name}}", - str_name: "${{inputs.str_name}}", - prefix: "${{inputs.prefix}}", - config: "${{inputs.config}}", - unit: "${{inputs.unit}}", - extra_lit_flags: "${{inputs.extra_lit_flags}}"}, - ] - build_type: [Release] - compiler: [{c: clang, cxx: clang++}] - - runs-on: ${{inputs.runner_tag}} - - steps: - # Workspace on self-hosted runners is not cleaned automatically. - # We have to delete the files created outside of using actions. 
- - name: Cleanup self-hosted workspace - if: always() - run: | - ls -la ./ - rm -rf ./* || true - - - name: Checkout UR - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - with: - path: ur-repo - - - name: Checkout SYCL - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - with: - repository: intel/llvm - ref: refs/heads/sycl - path: sycl-repo - - - name: Set CUDA env vars - if: matrix.adapter.name == 'CUDA' - run: | - echo "CUDA_LIB_PATH=/usr/local/cuda/lib64/stubs" >> $GITHUB_ENV - echo "LD_LIBRARY_PATH=/usr/local/cuda/compat/:/usr/local/cuda/lib64:$LD_LIBRARY_PATH" >> $GITHUB_ENV - - - name: Configure SYCL - run: > - python3 sycl-repo/buildbot/configure.py - -t ${{matrix.build_type}} - -o ${{github.workspace}}/sycl_build - --cmake-gen "Ninja" - --ci-defaults ${{matrix.adapter.config}} - --cmake-opt="-DLLVM_INSTALL_UTILS=ON" - --cmake-opt="-DSYCL_PI_TESTS=OFF" - --cmake-opt="-DSYCL_UR_USE_FETCH_CONTENT=OFF" - --cmake-opt="-DSYCL_UR_SOURCE_DIR=${{github.workspace}}/ur-repo/" - --cmake-opt=-DCMAKE_C_COMPILER_LAUNCHER=ccache - --cmake-opt=-DCMAKE_CXX_COMPILER_LAUNCHER=ccache - - - name: Build SYCL - run: cmake --build ${{github.workspace}}/sycl_build -j - - - name: Set extra llvm-lit options - if: matrix.adapter.extra_lit_flags != '' - run: echo "LIT_OPTS=${{matrix.adapter.extra_lit_flags}}" >> $GITHUB_ENV - - - name: Run check-sycl - # Remove after fixing SYCL test :: abi/layout_handler.cpp - # This issue does not affect further execution of e2e with UR. - continue-on-error: true - run: cmake --build ${{github.workspace}}/sycl_build --target check-sycl - - - name: Set additional env. vars - run: | - echo "${{github.workspace}}/sycl_build/bin" >> $GITHUB_PATH - echo "LD_LIBRARY_PATH=${{github.workspace}}/sycl_build/lib:$LD_LIBRARY_PATH" >> $GITHUB_ENV - - # Running (newly built) sycl-ls sets up some extra variables - - name: Setup SYCL variables - run: | - which clang++ sycl-ls - SYCL_UR_TRACE=-1 sycl-ls - - - name: Build e2e tests - run: > - cmake - -GNinja - -B ${{github.workspace}}/build-e2e/ - -S ${{github.workspace}}/sycl-repo/sycl/test-e2e/ - -DSYCL_TEST_E2E_TARGETS="${{matrix.adapter.prefix}}${{matrix.adapter.str_name}}:${{matrix.adapter.unit}}" - -DCMAKE_CXX_COMPILER="$(which clang++)" - -DLLVM_LIT="${{github.workspace}}/sycl-repo/llvm/utils/lit/lit.py" - - - name: Set LIT_XFAIL - if: inputs.xfail != '' - run: echo "LIT_XFAIL=${{inputs.xfail}}" >> $GITHUB_ENV - - - name: Set LIT_FILTER_OUT - if: inputs.filter_out != '' - run: echo "LIT_FILTER_OUT=${{inputs.filter_out}}" >> $GITHUB_ENV - - - name: Set LIT_XFAIL_NOT - if: inputs.xfail_not != '' - run: echo "LIT_XFAIL_NOT=${{inputs.xfail_not}}" >> $GITHUB_ENV - - # TODO: remove once intel/llvm lit tests can properly recognize the GPU - - name: Configure hardware platform feature for L0 - if: matrix.adapter.name == 'L0' - run: | - sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc-1T")' build-e2e/lit.site.cfg.py - sed -i '/import lit.llvm/i config.available_features.add("gpu-intel-pvc")' build-e2e/lit.site.cfg.py - - - name: Run e2e tests - id: tests - run: ninja -C build-e2e check-sycl-e2e || echo "e2e tests have failed. Ignoring failure." - - # FIXME: Requires pull-request: write permissions but this is only granted - # on pull requests from forks if using pull_request_target workflow - # trigger but not the pull_request trigger.. 
- # - name: Add comment to PR - # uses: actions/github-script@60a0d83039c74a4aee543508d2ffcb1c3799cdea # v7.0.1 - # if: ${{ always() }} - # with: - # script: | - # const adapter = '${{ matrix.adapter.name }}'; - # const url = '${{ github.server_url }}/${{ github.repository }}/actions/runs/${{ github.run_id }}'; - # const test_status = '${{ steps.tests.outcome }}'; - # const job_status = '${{ job.status }}'; - # const body = `E2E ${adapter} build:\n${url}\nJob status: ${job_status}. Test status: ${test_status}`; - - # github.rest.issues.createComment({ - # issue_number: context.issue.number, - # owner: context.repo.owner, - # repo: context.repo.repo, - # body: body - # }) diff --git a/.github/workflows/e2e_cuda.yml b/.github/workflows/e2e_cuda.yml deleted file mode 100644 index c2f1d969b8..0000000000 --- a/.github/workflows/e2e_cuda.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: E2E Cuda - -on: - workflow_call: - -permissions: - contents: read - pull-requests: write - -jobs: - e2e-build-hw: - if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks will not have the HW - name: Start e2e job - # use core flow, run it with cuda specific parameters - uses: ./.github/workflows/e2e_core.yml - with: - name: "CUDA" - runner_tag: "CUDA_E2E" - str_name: "cuda" - prefix: "ext_oneapi_" - config: "--cuda" - unit: "gpu" - extra_lit_flags: "-sv --max-time=3600" - xfail: "Regression/device_num.cpp" diff --git a/.github/workflows/e2e_level_zero.yml b/.github/workflows/e2e_level_zero.yml deleted file mode 100644 index 1fd814f271..0000000000 --- a/.github/workflows/e2e_level_zero.yml +++ /dev/null @@ -1,31 +0,0 @@ -name: E2E Level Zero - -on: - workflow_call: - -permissions: - contents: read - pull-requests: write - -jobs: - e2e-build-hw: - if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks will not have the HW - name: Start e2e job - # use core flow, run it with L0 specific parameters - uses: ./.github/workflows/e2e_core.yml - with: - name: "L0" - runner_tag: "L0_E2E" - str_name: "level_zero" - prefix: "ext_oneapi_" - config: "" - unit: "gpu" - # Failing tests - xfail: "InvokeSimd/Regression/call_vadd_1d_spill.cpp;InvokeSimd/Regression/ImplicitSubgroup/call_vadd_1d_spill.cpp;ESIMD/mask_expand_load.cpp;Matrix/joint_matrix_prefetch.cpp;ESIMD/mask_expand_load.cpp;Matrix/SPVCooperativeMatrix/joint_matrix_prefetch.cpp;Matrix/joint_matrix_bf16_fill_k_cache_prefetch.cpp;Matrix/SPVCooperativeMatrix/element_wise_ops.cpp;" - # Unexpectedly Passed Tests - xfail_not: "" - # Flaky tests - filter_out: "Basic/accessor/accessor.cpp|DeviceArchitecture/device_architecture_comparison_on_device_aot.cpp|Graph/Explicit/interop-level-zero-launch-kernel.cpp|Graph/RecordReplay/interop-level-zero-launch-kernel.cpp|syclcompat/launch/launch_policy_lmem.cpp" - # These runners by default spawn upwards of 260 workers. 
- # We also add a time out just in case some test hangs - extra_lit_flags: "--param gpu-intel-pvc=True --param gpu-intel-pvc-1T=True -sv -j 100 --max-time=3600" diff --git a/.github/workflows/e2e_opencl.yml b/.github/workflows/e2e_opencl.yml deleted file mode 100644 index e4714b2434..0000000000 --- a/.github/workflows/e2e_opencl.yml +++ /dev/null @@ -1,24 +0,0 @@ -name: E2E OpenCL - -on: - workflow_call: - -permissions: - contents: read - pull-requests: write - -jobs: - e2e-build-hw: - if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks will not have the HW - name: Start e2e job - # use core flow, run it with OpenCL specific parameters - uses: ./.github/workflows/e2e_core.yml - with: - name: "OPENCL" - runner_tag: "OPENCL" - str_name: "opencl" - prefix: "" - config: "" - unit: "cpu" - xfail: "AOT/double.cpp;AOT/half.cpp;AOT/reqd-sg-size.cpp;Basic/built-ins/marray_geometric.cpp;KernelCompiler/kernel_compiler_spirv.cpp;KernelCompiler/opencl_queries.cpp;NonUniformGroups/ballot_group.cpp;NonUniformGroups/ballot_group_algorithms.cpp;NonUniformGroups/fixed_size_group_algorithms.cpp;NonUniformGroups/opportunistic_group.cpp;NonUniformGroups/opportunistic_group_algorithms.cpp;NonUniformGroups/tangle_group.cpp;NonUniformGroups/tangle_group_algorithms.cpp" - extra_lit_flags: "-sv --max-time=3600" diff --git a/.github/workflows/examples-hw-level-zero.yml b/.github/workflows/examples-hw-level-zero.yml deleted file mode 100644 index cf28b8e258..0000000000 --- a/.github/workflows/examples-hw-level-zero.yml +++ /dev/null @@ -1,69 +0,0 @@ ---- -name: Examples - Adapters on Level Zero HW - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - examples: - name: Examples on HW - # if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks won't have the HW - if: false # temporaily disabled due to conda env setup issues - strategy: - matrix: - adapter: [ - {name: L0} - ] - build_type: [Debug, Release] - compiler: [{c: gcc, cxx: g++}, {c: clang, cxx: clang++}] - - runs-on: ${{matrix.adapter.name}} - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install pip packages - run: pip install -r third_party/requirements.txt - - - name: Init conda env - uses: conda-incubator/setup-miniconda@9f54435e0e72c53962ee863144e47a4b094bfd35 # v2.3.0 - with: - miniconda-version: "latest" - activate-environment: examples - environment-file: third_party/deps.yml - auto-activate-base: false - - - name: Configure CMake - shell: bash -el {0} - run: > - cmake - -B${{github.workspace}}/build - -DCMAKE_C_COMPILER=${{matrix.compiler.c}} - -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} - -DCMAKE_BUILD_TYPE=${{matrix.build_type}} - -DUR_BUILD_ADAPTER_${{matrix.adapter.name}}=ON - -DUR_BUILD_EXAMPLE_CODEGEN=ON - -DUR_DEVELOPER_MODE=ON - - - name: Build - run: cmake --build ${{github.workspace}}/build -j $(nproc) - - - name: Test codegen example - working-directory: ${{github.workspace}}/build - run: bin/codegen - - # conda init adds content to user's profile making it failing (if conda is gone) - - name: Cleanup after conda init - run: | - cat ${HOME}/.profile || true - rm ${HOME}/.profile || true - - - name: Get information about platform - if: ${{ always() }} - run: .github/scripts/get_system_info.sh diff --git a/.github/workflows/labeler.yml b/.github/workflows/labeler.yml deleted file mode 100644 index 
faf7060503..0000000000 --- a/.github/workflows/labeler.yml +++ /dev/null @@ -1,23 +0,0 @@ -# Automatically add labels to pull requests based on globs in the -# .github/labeler.yml config file. For documentation see: -# https://github.com/marketplace/actions/labeler ---- -name: Pull Request Labeler - -on: [ pull_request_target ] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - labeler: - permissions: - contents: read - pull-requests: write - runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} - steps: - - uses: actions/labeler@8558fd74291d67161a8a78ce36a881fa63b766a9 # v5.0.0 diff --git a/.github/workflows/multi_device.yml b/.github/workflows/multi_device.yml deleted file mode 100644 index 48a804bdf8..0000000000 --- a/.github/workflows/multi_device.yml +++ /dev/null @@ -1,66 +0,0 @@ ---- -name: Multi Device testing - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - examples: - name: Multi Device testing - if: github.repository == 'oneapi-src/unified-runtime' # run only on upstream; forks won't have the HW - strategy: - matrix: - adapter: [ - {name: L0}, - {name: L0_V2} - ] - build_type: [Debug, Release] - compiler: [{c: gcc, cxx: g++}] # TODO: investigate why memory-adapter-level_zero hangs with clang - - runs-on: "${{matrix.adapter.name}}_2T" - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Install pip packages - run: pip install -r third_party/requirements.txt - - - name: Download DPC++ - run: | - wget -O ${{github.workspace}}/dpcpp_compiler.tar.gz https://github.com/intel/llvm/releases/download/nightly-2024-01-29/sycl_linux.tar.gz - mkdir dpcpp_compiler - tar -xvf ${{github.workspace}}/dpcpp_compiler.tar.gz -C dpcpp_compiler - - - name: Configure CMake - shell: bash -el {0} - run: > - cmake - -B${{github.workspace}}/build - -DCMAKE_C_COMPILER=${{matrix.compiler.c}} - -DCMAKE_CXX_COMPILER=${{matrix.compiler.cxx}} - -DCMAKE_BUILD_TYPE=${{matrix.build_type}} - -DUR_DEVELOPER_MODE=ON - -DUR_BUILD_TESTS=ON - -DUR_BUILD_ADAPTER_${{matrix.adapter.name}}=ON - -DUR_CONFORMANCE_TEST_LOADER=OFF - -DUR_TEST_DEVICES_COUNT=2 - -DUR_DPCXX=${{github.workspace}}/dpcpp_compiler/bin/clang++ - -DUR_SYCL_LIBRARY_DIR=${{github.workspace}}/dpcpp_compiler/lib - - - name: Build - run: cmake --build ${{github.workspace}}/build -j $(nproc) - - - name: Test adapter specific - working-directory: ${{github.workspace}}/build - run: ctest -C ${{matrix.build_type}} --output-on-failure -L "adapter-specific" -E "test-adapter-level_zero_multi_queue" --timeout 180 - # TODO: investigate why test-adapter-level_zero_multi_queue fails on newer driver - - - name: Test adapters - working-directory: ${{github.workspace}}/build - run: env UR_CTS_ADAPTER_PLATFORM="${{matrix.adapter.platform}}" ctest -C ${{matrix.build_type}} --output-on-failure -L "conformance" -E "exp_command_buffer" --timeout 180 diff --git a/.github/workflows/nightly.yml b/.github/workflows/nightly.yml deleted file mode 100644 index 06d4026676..0000000000 --- a/.github/workflows/nightly.yml +++ /dev/null @@ -1,17 +0,0 @@ -name: Nightly - -on: - workflow_dispatch: - schedule: - # Run every day at 23:00 UTC - - cron: '0 23 * * *' - -permissions: - contents: read - -jobs: - fuzztest: - name: Fuzz tests long - uses: 
./.github/workflows/build-fuzz-reusable.yml - with: - test_label: "fuzz-long" diff --git a/.github/workflows/prerelease.yml b/.github/workflows/prerelease.yml deleted file mode 100644 index f466cc693e..0000000000 --- a/.github/workflows/prerelease.yml +++ /dev/null @@ -1,24 +0,0 @@ ---- -name: Deploy weekly prerelease - -on: - schedule: - # At 23:00 on Friday, GitHub actions schedule is in UTC time. - - cron: 0 23 * * 5 - -permissions: - contents: read - -jobs: - weekly-prerelease: - runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} - permissions: - contents: write - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Create weekly prerelease - run: - gh release create --prerelease --title "Weekly Stable Snapshot $(date +%Y/%m/%d)" weekly-$(date +%Y-%m-%d) - env: - GITHUB_TOKEN: ${{secrets.GITHUB_TOKEN}} diff --git a/.github/workflows/scorecard.yml b/.github/workflows/scorecard.yml deleted file mode 100644 index 693cfdd9e5..0000000000 --- a/.github/workflows/scorecard.yml +++ /dev/null @@ -1,60 +0,0 @@ -# Scorecard analysis, looking for vulnerabilities and bad practices in the repo. -name: Scorecard supply-chain security -on: - # For Branch-Protection check. Only the default branch is supported. See - # https://github.com/ossf/scorecard/blob/main/docs/checks.md#branch-protection - branch_protection_rule: - workflow_dispatch: - schedule: - # Runs at 22:45 UTC on Thursday. - - cron: '45 22 * * 4' - push: - branches: [ "main" ] - -# Declare default permissions as read only. -permissions: read-all - -jobs: - analysis: - name: Scorecard analysis - runs-on: ubuntu-latest - permissions: - # Needed to upload the results to code-scanning dashboard. - security-events: write - # Needed to publish results and get a badge (see publish_results below). - id-token: write - - steps: - - name: "Checkout code" - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - with: - persist-credentials: false - - - name: "Run analysis" - uses: ossf/scorecard-action@0864cf19026789058feabb7e87baa5f140aac736 # v2.3.1 - with: - results_file: scorecard_results.sarif - results_format: sarif - # (Optional) "write" PAT token. Uncomment the `repo_token` line below if: - # - you want to enable the Branch-Protection check on a *public* repository, or - # - you are installing Scorecard on a *private* repository - # To create the PAT, follow the steps in https://github.com/ossf/scorecard-action#authentication-with-pat. - # repo_token: ${{ secrets.SCORECARD_TOKEN }} - - # Publish results to OpenSSF REST API for easy access by consumers - # Allows the repository to include the Scorecard badge. - # See https://github.com/ossf/scorecard-action#publishing-results. - publish_results: true - - - name: "Upload artifact" - uses: actions/upload-artifact@5d5d22a31266ced268874388b861e4b58bb5c2f3 # 4.3.1 - with: - name: Scorecard results - path: scorecard_results.sarif - retention-days: 5 - - # Upload the results to GitHub's code scanning dashboard. 
- - name: "Upload to code-scanning" - uses: github/codeql-action/upload-sarif@05963f47d870e2cb19a537396c1f668a348c7d8f # v3.24.8 - with: - sarif_file: scorecard_results.sarif diff --git a/.github/workflows/source-checks.yml b/.github/workflows/source-checks.yml deleted file mode 100644 index e73f403320..0000000000 --- a/.github/workflows/source-checks.yml +++ /dev/null @@ -1,71 +0,0 @@ -name: Source Checks - -on: [push, pull_request] - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - source-checks: - name: Source Checks - strategy: - matrix: - os: ['ubuntu-22.04', 'windows-2022'] - - runs-on: ${{matrix.os}} - - steps: - - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - uses: actions/setup-python@0a5c61591373683505ea898e09a3ea4f39ef2b9c # v5.0.0 - with: - python-version: 3.9 - - - name: Install pip packages - run: pip install -r third_party/requirements.txt - - - name: "[Lin] Install doxygen" - if: matrix.os == 'ubuntu-22.04' - run: | - sudo apt-get update - sudo apt-get install -y doxygen - - - name: "[Win] Install doxygen" - if: matrix.os == 'windows-2022' - run: | - $WorkingDir = $PWD.Path - Invoke-WebRequest -Uri https://github.com/doxygen/doxygen/releases/download/Release_1_9_8/doxygen-1.9.8.windows.x64.bin.zip -OutFile "$WorkingDir\doxygen.zip" - Expand-Archive -Path "$WorkingDir\doxygen.zip" - Add-Content $env:GITHUB_PATH "$WorkingDir\doxygen" - - - name: "[Lin] Install hwloc" - if: matrix.os == 'ubuntu-22.04' - run: .github/scripts/install_hwloc.sh - - - name: "[Win] Install hwloc" - if: matrix.os == 'windows-2022' - run: vcpkg install hwloc:x64-windows - - - name: Configure CMake - env: - VCPKG_PATH: "C:/vcpkg/packages/hwloc_x64-windows" - run: > - cmake - -B${{github.workspace}}/build - -DCMAKE_PREFIX_PATH="${{env.VCPKG_PATH}}" - -DUR_ENABLE_TRACING=OFF - -DCMAKE_BUILD_TYPE=Debug - -DUR_BUILD_TESTS=OFF - -DUR_FORMAT_CPP_STYLE=ON - - # Verifying license should be enough on a single OS - - name: Verify that each source file contains a license - if: matrix.os == 'ubuntu-22.04' - run: cmake --build ${{github.workspace}}/build --target verify-licenses - - - name: Generate source from spec, check for uncommitted diff - run: cmake --build ${{github.workspace}}/build --target check-generated diff --git a/.github/workflows/trivy.yml b/.github/workflows/trivy.yml deleted file mode 100644 index c2ef1d47e7..0000000000 --- a/.github/workflows/trivy.yml +++ /dev/null @@ -1,50 +0,0 @@ -# Runs linter for Docker files -name: Trivy - -on: - workflow_dispatch: - push: - pull_request: - paths: - - '.github/docker/*Dockerfile' - - '.github/workflows/trivy.yml' - -concurrency: - group: ${{ github.workflow }}-${{ github.head_ref || github.run_id }} - cancel-in-progress: true - -permissions: - contents: read - -jobs: - linux: - name: Trivy - runs-on: ${{ github.repository_owner == 'oneapi-src' && 'intel-ubuntu-22.04' || 'ubuntu-latest' }} - permissions: - security-events: write - - steps: - - name: Clone repo - uses: actions/checkout@b4ffde65f46336ab88eb53be808477a3936bae11 # v4.1.1 - - - name: Run Trivy - uses: aquasecurity/trivy-action@84384bd6e777ef152729993b8145ea352e9dd3ef # v0.17.0 - with: - scan-type: 'config' - hide-progress: false - format: 'sarif' - output: 'trivy-results.sarif' - exit-code: 1 # Fail if issue found - # file with suppressions: .trivyignore (in root dir) - - - name: Print report and trivyignore file - run: | - echo "### Trivy ignore content:" 
- cat .trivyignore - echo "### Trivy report:" - cat trivy-results.sarif - - - name: Upload results - uses: github/codeql-action/upload-sarif@e8893c57a1f3a2b659b6b55564fdfdbbd2982911 # v3.24.0 - with: - sarif_file: 'trivy-results.sarif' diff --git a/scripts/benchmarks/benches/base.py b/scripts/benchmarks/benches/base.py index feeaa568b6..13c2a8ef92 100644 --- a/scripts/benchmarks/benches/base.py +++ b/scripts/benchmarks/benches/base.py @@ -35,24 +35,26 @@ def run_bench(self, command, env_vars, ld_library=[]): return run( command=command, env_vars=env_vars_with_forced_adapter, - add_sycl=True, + add_sycl=options.sycl is not None, cwd=options.benchmark_cwd, ld_library=ld_library ).stdout.decode() - def create_data_path(self, name): - data_path = os.path.join(self.directory, "data", name) - - if options.rebuild and Path(data_path).exists(): - shutil.rmtree(data_path) + def create_data_path(self, name, skip_data_dir = False): + if skip_data_dir: + data_path = os.path.join(self.directory, name) + else: + data_path = os.path.join(self.directory, 'data', name) + if options.rebuild and Path(data_path).exists(): + shutil.rmtree(data_path) Path(data_path).mkdir(parents=True, exist_ok=True) return data_path - def download(self, name, url, file, untar = False): - self.data_path = self.create_data_path(name) - return download(self.data_path, url, file, True) + def download(self, name, url, file, untar = False, unzip = False, skip_data_dir = False): + self.data_path = self.create_data_path(name, skip_data_dir) + return download(self.data_path, url, file, untar, unzip) def name(self): raise NotImplementedError() @@ -69,9 +71,6 @@ def run(self, env_vars) -> list[Result]: def teardown(self): raise NotImplementedError() - def ignore_iterations(self): - return False - class Suite: def benchmarks(self) -> list[Benchmark]: raise NotImplementedError() diff --git a/scripts/benchmarks/benches/compute.py b/scripts/benchmarks/benches/compute.py index f872399e9e..229a50e84d 100644 --- a/scripts/benchmarks/benches/compute.py +++ b/scripts/benchmarks/benches/compute.py @@ -50,6 +50,8 @@ def benchmarks(self) -> list[Benchmark]: return [] benches = [ + SubmitKernelL0(self, 0), + SubmitKernelL0(self, 1), SubmitKernelSYCL(self, 0), SubmitKernelSYCL(self, 1), QueueInOrderMemcpy(self, 0, 'Device', 'Device', 1024), @@ -59,14 +61,16 @@ def benchmarks(self) -> list[Benchmark]: ExecImmediateCopyQueue(self, 0, 1, 'Device', 'Device', 1024), ExecImmediateCopyQueue(self, 1, 1, 'Device', 'Host', 1024), VectorSum(self), - MemcpyExecute(self, 400, 1, 102400, 10, 1, 1), - MemcpyExecute(self, 100, 8, 102400, 10, 1, 1), - MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1), - MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1), - MemcpyExecute(self, 400, 1, 102400, 10, 0, 1), - MemcpyExecute(self, 100, 8, 102400, 10, 0, 1), - MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1), - MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1), + MemcpyExecute(self, 400, 1, 102400, 10, 1, 1, 1), + MemcpyExecute(self, 100, 8, 102400, 10, 1, 1, 1), + MemcpyExecute(self, 400, 8, 1024, 1000, 1, 1, 1), + MemcpyExecute(self, 10, 16, 1024, 10000, 1, 1, 1), + MemcpyExecute(self, 400, 1, 102400, 10, 0, 1, 1), + MemcpyExecute(self, 100, 8, 102400, 10, 0, 1, 1), + MemcpyExecute(self, 400, 8, 1024, 1000, 0, 1, 1), + MemcpyExecute(self, 10, 16, 1024, 10000, 0, 1, 1), + MemcpyExecute(self, 4096, 1, 1024, 10, 0, 1, 0), + MemcpyExecute(self, 4096, 4, 1024, 10, 0, 1, 0), ] if options.ur is not None: @@ -82,7 +86,7 @@ def parse_unit_type(compute_unit): return "instr" elif "[us]" in 
compute_unit: return "μs" - return "unknown" + return compute_unit.replace("[", "").replace("]", "") class ComputeBenchmark(Benchmark): def __init__(self, bench, name, test): @@ -100,6 +104,9 @@ def extra_env_vars(self) -> dict: def setup(self): self.benchmark_bin = os.path.join(self.bench.directory, 'compute-benchmarks-build', 'bin', self.bench_name) + def explicit_group(self): + return "" + def run(self, env_vars) -> list[Result]: command = [ f"{self.benchmark_bin}", @@ -114,9 +121,10 @@ def run(self, env_vars) -> list[Result]: result = self.run_bench(command, env_vars) parsed_results = self.parse_output(result) ret = [] - for label, mean, unit in parsed_results: - extra_label = " CPU count" if parse_unit_type(unit) == "CPU count" else "" - ret.append(Result(label=self.name() + extra_label, value=mean, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit))) + for label, median, stddev, unit in parsed_results: + extra_label = " CPU count" if parse_unit_type(unit) == "instr" else "" + explicit_group = self.explicit_group() + extra_label if self.explicit_group() != "" else "" + ret.append(Result(label=self.name() + extra_label, explicit_group=explicit_group, value=median, stddev=stddev, command=command, env=env_vars, stdout=result, unit=parse_unit_type(unit))) return ret def parse_output(self, output): @@ -131,8 +139,11 @@ def parse_output(self, output): try: label = data_row[0] mean = float(data_row[1]) + median = float(data_row[2]) + # compute benchmarks report stddev as % + stddev = mean * (float(data_row[3].strip('%')) / 100.0) unit = data_row[7] - results.append((label, mean, unit)) + results.append((label, median, stddev, unit)) except (ValueError, IndexError) as e: raise ValueError(f"Error parsing output: {e}") if len(results) == 0: @@ -151,6 +162,9 @@ def name(self): order = "in order" if self.ioq else "out of order" return f"api_overhead_benchmark_sycl SubmitKernel {order}" + def explicit_group(self): + return "SubmitKernel" + def bin_args(self) -> list[str]: return [ f"--Ioq={self.ioq}", @@ -171,6 +185,32 @@ def name(self): order = "in order" if self.ioq else "out of order" return f"api_overhead_benchmark_ur SubmitKernel {order}" + def explicit_group(self): + return "SubmitKernel" + + def bin_args(self) -> list[str]: + return [ + f"--Ioq={self.ioq}", + "--DiscardEvents=0", + "--MeasureCompletion=0", + "--iterations=100000", + "--Profiling=0", + "--NumKernels=10", + "--KernelExecTime=1" + ] + +class SubmitKernelL0(ComputeBenchmark): + def __init__(self, bench, ioq): + self.ioq = ioq + super().__init__(bench, "api_overhead_benchmark_l0", "SubmitKernel") + + def name(self): + order = "in order" if self.ioq else "out of order" + return f"api_overhead_benchmark_l0 SubmitKernel {order}" + + def explicit_group(self): + return "SubmitKernel" + def bin_args(self) -> list[str]: return [ f"--Ioq={self.ioq}", @@ -255,6 +295,10 @@ def __init__(self, bench, type, size, placement): def name(self): return f"memory_benchmark_sycl StreamMemory, placement {self.placement}, type {self.type}, size {self.size}" + # measurement is in GB/s + def lower_is_better(self): + return False + def bin_args(self) -> list[str]: return [ "--iterations=10000", @@ -282,22 +326,23 @@ def bin_args(self) -> list[str]: ] class MemcpyExecute(ComputeBenchmark): - def __init__(self, bench, numOpsPerThread, numThreads, allocSize, iterations, srcUSM, dstUSM): + def __init__(self, bench, numOpsPerThread, numThreads, allocSize, iterations, srcUSM, dstUSM, useEvent): self.numOpsPerThread = numOpsPerThread 
self.numThreads = numThreads self.allocSize = allocSize self.iterations = iterations self.srcUSM = srcUSM self.dstUSM = dstUSM + self.useEvents = useEvent super().__init__(bench, "multithread_benchmark_ur", "MemcpyExecute") def name(self): - return f"multithread_benchmark_ur MemcpyExecute opsPerThread:{self.numOpsPerThread}, numThreads:{self.numThreads}, allocSize:{self.allocSize} srcUSM:{self.srcUSM} dstUSM:{self.dstUSM}" + return f"multithread_benchmark_ur MemcpyExecute opsPerThread:{self.numOpsPerThread}, numThreads:{self.numThreads}, allocSize:{self.allocSize} srcUSM:{self.srcUSM} dstUSM:{self.dstUSM}" + (" without events" if not self.useEvents else "") def bin_args(self) -> list[str]: return [ "--Ioq=1", - "--UseEvents=1", + f"--UseEvents={self.useEvents}", "--MeasureCompletion=1", "--UseQueuePerThread=1", f"--AllocSize={self.allocSize}", diff --git a/scripts/benchmarks/benches/llamacpp.py b/scripts/benchmarks/benches/llamacpp.py index 50dd8d04c6..2dbdb5cbcf 100644 --- a/scripts/benchmarks/benches/llamacpp.py +++ b/scripts/benchmarks/benches/llamacpp.py @@ -6,85 +6,14 @@ import csv import io from pathlib import Path -import re -import shutil from utils.utils import download, git_clone from .base import Benchmark, Suite from .result import Result from utils.utils import run, create_build_path from .options import options +from .oneapi import get_oneapi import os -class OneAPI: - # random unique number for benchmark oneAPI installation - ONEAPI_BENCHMARK_INSTANCE_ID = 98765 - def __init__(self, directory): - self.oneapi_dir = os.path.join(directory, 'oneapi') - Path(self.oneapi_dir).mkdir(parents=True, exist_ok=True) - # delete if some option is set? - - # can we just hardcode these links? - self.install_package('dnnl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/87e117ab-039b-437d-9c80-dcd5c9e675d5/intel-onednn-2025.0.0.862_offline.sh') - self.install_package('mkl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh') - return - - def install_package(self, name, url): - package_path = os.path.join(self.oneapi_dir, name) - if Path(package_path).exists(): - print(f"{package_path} exists, skipping installing oneAPI package {name}...") - return - - package = download(self.oneapi_dir, url, f'package_{name}.sh') - try: - print(f"installing f{name}") - run(f"sh {package} -a -s --eula accept --install-dir {self.oneapi_dir} --instance f{self.ONEAPI_BENCHMARK_INSTANCE_ID}") - except: - print("oneAPI installation likely exists already") - return - print(f"f{name} installation complete") - - def package_dir(self, package, dir): - return os.path.join(self.oneapi_dir, package, 'latest', dir) - - def package_cmake(self, package): - package_lib = self.package_dir(package, 'lib') - return os.path.join(package_lib, 'cmake', package) - - def mkl_lib(self): - return self.package_dir('mkl', 'lib') - - def mkl_include(self): - return self.package_dir('mkl', 'include') - - def mkl_cmake(self): - return self.package_cmake('mkl') - - def dnn_lib(self): - return self.package_dir('dnnl', 'lib') - - def dnn_include(self): - return self.package_dir('dnnl', 'include') - - def dnn_cmake(self): - return self.package_cmake('dnnl') - - def tbb_lib(self): - return self.package_dir('tbb', 'lib') - - def tbb_cmake(self): - return self.package_cmake('tbb') - - def compiler_lib(self): - return self.package_dir('compiler', 'lib') - - def ld_libraries(self): - return [ - self.compiler_lib(), - self.mkl_lib(), - self.tbb_lib(), 
- self.dnn_lib() - ] - class LlamaCppBench(Suite): def __init__(self, directory): if options.sycl is None: @@ -103,7 +32,7 @@ def setup(self): self.model = download(self.models_dir, "https://huggingface.co/microsoft/Phi-3-mini-4k-instruct-gguf/resolve/main/Phi-3-mini-4k-instruct-q4.gguf", "Phi-3-mini-4k-instruct-q4.gguf") - self.oneapi = OneAPI(self.directory) + self.oneapi = get_oneapi() self.build_path = create_build_path(self.directory, 'llamacpp-build') @@ -147,9 +76,6 @@ def name(self): def lower_is_better(self): return False - def ignore_iterations(self): - return True - def run(self, env_vars) -> list[Result]: command = [ f"{self.benchmark_bin}", diff --git a/scripts/benchmarks/benches/oneapi.py b/scripts/benchmarks/benches/oneapi.py new file mode 100644 index 0000000000..414c4aa64a --- /dev/null +++ b/scripts/benchmarks/benches/oneapi.py @@ -0,0 +1,86 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. +# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +from pathlib import Path +from utils.utils import download, run +from .options import options +import os + +class OneAPI: + # random unique number for benchmark oneAPI installation + ONEAPI_BENCHMARK_INSTANCE_ID = 98765 + def __init__(self): + self.oneapi_dir = os.path.join(options.workdir, 'oneapi') + Path(self.oneapi_dir).mkdir(parents=True, exist_ok=True) + # delete if some option is set? + + # can we just hardcode these links? + self.install_package('dnnl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/87e117ab-039b-437d-9c80-dcd5c9e675d5/intel-onednn-2025.0.0.862_offline.sh') + self.install_package('mkl', 'https://registrationcenter-download.intel.com/akdlm/IRC_NAS/79153e0f-74d7-45af-b8c2-258941adf58a/intel-onemkl-2025.0.0.940_offline.sh') + return + + def install_package(self, name, url): + package_path = os.path.join(self.oneapi_dir, name) + if Path(package_path).exists(): + print(f"{package_path} exists, skipping installing oneAPI package {name}...") + return + + package = download(self.oneapi_dir, url, f'package_{name}.sh') + try: + print(f"installing f{name}") + run(f"sh {package} -a -s --eula accept --install-dir {self.oneapi_dir} --instance f{self.ONEAPI_BENCHMARK_INSTANCE_ID}") + except: + print("oneAPI installation likely exists already") + return + print(f"f{name} installation complete") + + def package_dir(self, package, dir): + return os.path.join(self.oneapi_dir, package, 'latest', dir) + + def package_cmake(self, package): + package_lib = self.package_dir(package, 'lib') + return os.path.join(package_lib, 'cmake', package) + + def mkl_lib(self): + return self.package_dir('mkl', 'lib') + + def mkl_include(self): + return self.package_dir('mkl', 'include') + + def mkl_cmake(self): + return self.package_cmake('mkl') + + def dnn_lib(self): + return self.package_dir('dnnl', 'lib') + + def dnn_include(self): + return self.package_dir('dnnl', 'include') + + def dnn_cmake(self): + return self.package_cmake('dnnl') + + def tbb_lib(self): + return self.package_dir('tbb', 'lib') + + def tbb_cmake(self): + return self.package_cmake('tbb') + + def compiler_lib(self): + return self.package_dir('compiler', 'lib') + + def ld_libraries(self): + return [ + self.compiler_lib(), + self.mkl_lib(), + self.tbb_lib(), + self.dnn_lib() + ] + +oneapi_instance = None + +def get_oneapi() -> OneAPI: # oneAPI singleton + if not hasattr(get_oneapi, "instance"): + get_oneapi.instance = OneAPI() + return 
get_oneapi.instance diff --git a/scripts/benchmarks/benches/options.py b/scripts/benchmarks/benches/options.py index 5997cdedb8..f793c1fa36 100644 --- a/scripts/benchmarks/benches/options.py +++ b/scripts/benchmarks/benches/options.py @@ -8,19 +8,24 @@ class Compare(Enum): @dataclass class Options: + workdir: str = None sycl: str = None ur: str = None + umf: str = None ur_adapter: str = None rebuild: bool = True benchmark_cwd: str = "INVALID" timeout: float = 600 - iterations: int = 5 + iterations: int = 3 verbose: bool = False compare: Compare = Compare.LATEST compare_max: int = 10 # average/median over how many results output_html: bool = False output_markdown: bool = True dry_run: bool = False + # these two should probably be merged into one setting + stddev_threshold: float = 0.02 + epsilon: float = 0.02 options = Options() diff --git a/scripts/benchmarks/benches/result.py b/scripts/benchmarks/benches/result.py index 7d40040607..c975fa792d 100644 --- a/scripts/benchmarks/benches/result.py +++ b/scripts/benchmarks/benches/result.py @@ -18,7 +18,11 @@ class Result: stdout: str passed: bool = True unit: str = "" - # values should not be set by the benchmark + explicit_group: str = "" + # stddev can be optionally set by the benchmark, + # if not set, it will be calculated automatically. + stddev: float = 0.0 + # values below should not be set by the benchmark name: str = "" lower_is_better: bool = True git_hash: str = '' diff --git a/scripts/benchmarks/benches/test.py b/scripts/benchmarks/benches/test.py index 802688f032..efe789f678 100644 --- a/scripts/benchmarks/benches/test.py +++ b/scripts/benchmarks/benches/test.py @@ -20,30 +20,31 @@ def setup(self): def benchmarks(self) -> list[Benchmark]: bench_configs = [ - ("Memory Bandwidth", 2000, 200), - ("Latency", 100, 20), - ("Throughput", 1500, 150), - ("FLOPS", 3000, 300), - ("Cache Miss Rate", 250, 25), + ("Memory Bandwidth", 2000, 200, "Foo Group"), + ("Latency", 100, 20, "Bar Group"), + ("Throughput", 1500, 150, "Foo Group"), + ("FLOPS", 3000, 300, "Foo Group"), + ("Cache Miss Rate", 250, 25, "Bar Group"), ] result = [] - for base_name, base_value, base_diff in bench_configs: + for base_name, base_value, base_diff, group in bench_configs: for variant in range(6): value_multiplier = 1.0 + (variant * 0.2) name = f"{base_name} {variant+1}" value = base_value * value_multiplier diff = base_diff * value_multiplier - result.append(TestBench(name, value, diff)) + result.append(TestBench(name, value, diff, group)) return result class TestBench(Benchmark): - def __init__(self, name, value, diff): + def __init__(self, name, value, diff, group = ''): self.bname = name self.value = value self.diff = diff + self.group = group super().__init__("") def name(self): @@ -58,7 +59,7 @@ def setup(self): def run(self, env_vars) -> list[Result]: random_value = self.value + random.uniform(-1 * (self.diff), self.diff) return [ - Result(label=self.name(), value=random_value, command="", env={"A": "B"}, stdout="no output", unit="ms") + Result(label=self.name(), explicit_group=self.group, value=random_value, command="", env={"A": "B"}, stdout="no output", unit="ms") ] def teardown(self): diff --git a/scripts/benchmarks/benches/umf.py b/scripts/benchmarks/benches/umf.py new file mode 100644 index 0000000000..7725943271 --- /dev/null +++ b/scripts/benchmarks/benches/umf.py @@ -0,0 +1,172 @@ +# Copyright (C) 2024 Intel Corporation +# Part of the Unified-Runtime Project, under the Apache License v2.0 with LLVM Exceptions. 
+# See LICENSE.TXT +# SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception + +import random +from utils.utils import git_clone +from .base import Benchmark, Suite +from .result import Result +from utils.utils import run, create_build_path +from .options import options +import os +import csv +import io + +def isUMFAvailable(): + return options.umf is not None + +class UMFSuite(Suite): + def __init__(self, directory): + self.directory = directory + if not isUMFAvailable(): + print("UMF not provided. Related benchmarks will not run") + + def setup(self): + if not isUMFAvailable(): + return [] + self.built = True + + def benchmarks(self) -> list[Benchmark]: + if not isUMFAvailable(): + return + + benches = [ + GBench(self), + ] + + return benches + +class ComputeUMFBenchmark(Benchmark): + def __init__(self, bench, name): + self.bench = bench + self.bench_name = name + + self.col_name = None + self.col_iterations = None + self.col_real_time = None + self.col_cpu_time = None + self.col_time_unit = None + + self.col_statistics_time = None + + super().__init__(bench.directory) + + def bin_args(self) -> list[str]: + return [] + + def extra_env_vars(self) -> dict: + return {} + + def setup(self): + if not isUMFAvailable(): + print("UMF prefix path not provided") + return + + self.benchmark_bin = os.path.join(options.umf, 'benchmark', self.bench_name) + + def run(self, env_vars) -> list[Result]: + command = [ + f"{self.benchmark_bin}", + ] + + command += self.bin_args() + env_vars.update(self.extra_env_vars()) + + result = self.run_bench(command, env_vars) + parsed = self.parse_output(result) + results = [] + for r in parsed: + (config, pool, mean) = r + label = f"{config} {pool}" + results.append(Result(label=label, value=mean, command=command, env=env_vars, stdout=result, unit="ns", explicit_group=config)) + return results + + # if different time units - convert TODO safety check for time units + def parse_output(self, output): + csv_file = io.StringIO(output) + reader = csv.reader(csv_file) + next(reader, None) + data_row = next(reader, None) + if data_row is None: + raise ValueError("Benchmark output does not contain data.") + try: + label = data_row[0] + mean = float(data_row[1]) + return (label, mean) + except (ValueError, IndexError) as e: + raise ValueError(f"Error parsing output: {e}") + + + + # Implementation with self.col_* indices could lead to the division by None + def get_mean(self, datarow): + raise NotImplementedError() + + def teardown(self): + return + +class GBench(ComputeUMFBenchmark): + def __init__(self, bench): + super().__init__(bench, "umf-benchmark") + + self.col_name = 0 + self.col_iterations = 1 + self.col_real_time = 2 + self.col_cpu_time = 3 + self.col_time_unit = 4 + + self.idx_pool = 0 + self.idx_config = 1 + self.name_separator = '/' + + self.col_statistics_time = self.col_real_time + + def name(self): + return self.bench_name + + # --benchmark_format describes stdout output + # --benchmark_out= and --benchmark_out_format= + # describe output to a file + def bin_args(self): + return ["--benchmark_format=csv"] + + # the default unit + # might be changed globally with --benchmark_time_unit={ns|us|ms|s} + # the change affects only benchmark where time unit has not been set + # explicitly + def unit(self): + return "ns" + + def get_pool_and_config(self, full_name): + list_split = full_name.split(self.name_separator, 1) + if len(list_split) != 2: + raise ValueError("Incorrect benchmark name format: ", full_name) + + return list_split[self.idx_pool], 
list_split[self.idx_config] + + def get_mean(self, datarow): + return float(datarow[self.col_statistics_time]) + + def parse_output(self, output): + csv_file = io.StringIO(output) + reader = csv.reader(csv_file) + + data_row = next(reader, None) + if data_row is None: + raise ValueError("Benchmark output does not contain data.") + + results = [] + for row in reader: + try: + full_name = row[self.col_name] + pool, config = self.get_pool_and_config(full_name) + mean = self.get_mean(row) + results.append((config, pool, mean)) + except KeyError as e: + raise ValueError(f"Error parsing output: {e}") + + return results + + + \ No newline at end of file diff --git a/scripts/benchmarks/benches/velocity.py b/scripts/benchmarks/benches/velocity.py index 605cf03fd4..705421d963 100644 --- a/scripts/benchmarks/benches/velocity.py +++ b/scripts/benchmarks/benches/velocity.py @@ -10,6 +10,9 @@ from .result import Result from utils.utils import run, create_build_path from .options import options +from .oneapi import get_oneapi +import shutil + import os class VelocityBench(Suite): @@ -35,7 +38,10 @@ def benchmarks(self) -> list[Benchmark]: CudaSift(self), Easywave(self), QuickSilver(self), - SobelFilter(self) + SobelFilter(self), + DLCifar(self), + DLMnist(self), + SVM(self) ] class VelocityBase(Benchmark): @@ -50,6 +56,12 @@ def __init__(self, name: str, bin_name: str, vb: VelocityBench, unit: str): def download_deps(self): return + def extra_cmake_args(self) -> list[str]: + return [] + + def ld_libraries(self) -> list[str]: + return [] + def setup(self): self.download_deps() self.benchmark_bin = os.path.join(self.directory, self.bench_name, self.bin_name) @@ -62,8 +74,10 @@ def setup(self): f"-S {self.code_path}", f"-DCMAKE_BUILD_TYPE=Release" ] + configure_command += self.extra_cmake_args() + run(configure_command, {'CC': 'clang', 'CXX':'clang++'}, add_sycl=True) - run(f"cmake --build {build_path} -j", add_sycl=True) + run(f"cmake --build {build_path} -j", add_sycl=True, ld_library=self.ld_libraries()) def bin_args(self) -> list[str]: return [] @@ -82,7 +96,7 @@ def run(self, env_vars) -> list[Result]: ] command += self.bin_args() - result = self.run_bench(command, env_vars) + result = self.run_bench(command, env_vars, ld_library=self.ld_libraries()) return [ Result(label=self.name(), value=self.parse_output(result), command=command, env=env_vars, stdout=result, unit=self.unit) ] @@ -136,7 +150,6 @@ def __init__(self, vb: VelocityBench): def download_deps(self): self.download("sobel_filter", "https://github.com/oneapi-src/Velocity-Bench/raw/main/sobel_filter/res/sobel_filter_data.tgz?download=", "sobel_filter_data.tgz", untar=True) - return def name(self): return "Velocity-Bench Sobel Filter" @@ -228,7 +241,6 @@ def get_last_elapsed_time(self, log_file_path) -> float: def parse_output(self, stdout: str) -> float: return self.get_last_elapsed_time(os.path.join(options.benchmark_cwd, "easywave.log")) - class CudaSift(VelocityBase): def __init__(self, vb: VelocityBench): super().__init__("cudaSift", "cudaSift", vb, "ms") @@ -248,3 +260,103 @@ def parse_output(self, stdout: str) -> float: return float(match.group(1)) else: raise ValueError("Failed to parse benchmark output.") + +class DLCifar(VelocityBase): + def __init__(self, vb: VelocityBench): + self.oneapi = get_oneapi() + super().__init__("dl-cifar", "dl-cifar_sycl", vb, "s") + + def ld_libraries(self): + return self.oneapi.ld_libraries() + + def download_deps(self): + # TODO: dl-cifar hardcodes the path to this dataset as 
"../../datasets/cifar-10-binary"... + self.download("datasets", "https://www.cs.toronto.edu/~kriz/cifar-10-binary.tar.gz", "cifar-10-binary.tar.gz", untar=True, skip_data_dir=True) + return + + def extra_cmake_args(self): + return [ + f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}" + ] + + def name(self): + return "Velocity-Bench dl-cifar" + + def parse_output(self, stdout: str) -> float: + match = re.search(r'dl-cifar - total time for whole calculation: (\d+\.\d+) s', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("Failed to parse benchmark output.") + +class DLMnist(VelocityBase): + def __init__(self, vb: VelocityBench): + self.oneapi = get_oneapi() + super().__init__("dl-mnist", "dl-mnist-sycl", vb, "s") + + def ld_libraries(self): + return self.oneapi.ld_libraries() + + def download_deps(self): + # TODO: dl-mnist hardcodes the path to this dataset as "../../datasets/"... + self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/train-images-idx3-ubyte.gz", "train-images.idx3-ubyte.gz", unzip=True, skip_data_dir=True) + self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/train-labels-idx1-ubyte.gz", "train-labels.idx1-ubyte.gz", unzip=True, skip_data_dir=True) + self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/t10k-images-idx3-ubyte.gz", "t10k-images.idx3-ubyte.gz", unzip=True, skip_data_dir=True) + self.download("datasets", "https://raw.githubusercontent.com/fgnt/mnist/master/t10k-labels-idx1-ubyte.gz", "t10k-labels.idx1-ubyte.gz", unzip=True, skip_data_dir=True) + + def extra_cmake_args(self): + return [ + f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}" + ] + + def name(self): + return "Velocity-Bench dl-mnist" + + def bin_args(self): + return [ + "-conv_algo", "ONEDNN_AUTO" + ] + + # TODO: This shouldn't be required. + # The application crashes with a segfault without it. 
+ def extra_env_vars(self): + return { + "NEOReadDebugKeys":"1", + "DisableScratchPages":"0", + } + + def parse_output(self, stdout: str) -> float: + match = re.search(r'dl-mnist - total time for whole calculation: (\d+\.\d+) s', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("Failed to parse benchmark output.") + +class SVM(VelocityBase): + def __init__(self, vb: VelocityBench): + self.oneapi = get_oneapi() + super().__init__("svm", "svm_sycl", vb, "s") + + def ld_libraries(self): + return self.oneapi.ld_libraries() + + def extra_cmake_args(self): + return [ + f"-DCMAKE_CXX_FLAGS=-O3 -fsycl -ffast-math -I{self.oneapi.dnn_include()} -I{self.oneapi.mkl_include()} -L{self.oneapi.dnn_lib()} -L{self.oneapi.mkl_lib()}" + ] + + def name(self): + return "Velocity-Bench svm" + + def bin_args(self): + return [ + f"{self.code_path}/a9a", + f"{self.code_path}/a.m", + ] + + def parse_output(self, stdout: str) -> float: + match = re.search(r'Total elapsed time : (\d+\.\d+) s', stdout) + if match: + return float(match.group(1)) + else: + raise ValueError("Failed to parse benchmark output.") diff --git a/scripts/benchmarks/main.py b/scripts/benchmarks/main.py index c83825c9e5..e692c80972 100755 --- a/scripts/benchmarks/main.py +++ b/scripts/benchmarks/main.py @@ -14,22 +14,114 @@ from output_markdown import generate_markdown from output_html import generate_html from history import BenchmarkHistory +from benches.umf import * from utils.utils import prepare_workdir; import argparse import re +import statistics # Update this if you are changing the layout of the results files INTERNAL_WORKDIR_VERSION = '2.0' +def run_iterations(benchmark: Benchmark, env_vars, iters: int, results: dict[str, list[Result]]): + for iter in range(iters): + print(f"running {benchmark.name()}, iteration {iter}... ", end='', flush=True) + bench_results = benchmark.run(env_vars) + if bench_results is None: + print(f"did not finish (OK for sycl-bench).") + break + + for bench_result in bench_results: + # TODO: report failures in markdown/html ? 
+ if not bench_result.passed: + print(f"complete ({bench_result.label}: verification FAILED)") + continue + + print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).") + + bench_result.name = bench_result.label + bench_result.lower_is_better = benchmark.lower_is_better() + + if bench_result.label not in results: + results[bench_result.label] = [] + + results[bench_result.label].append(bench_result) + +# https://www.statology.org/modified-z-score/ +def modified_z_score(values: list[float]) -> list[float]: + median = statistics.median(values) + mad = statistics.median([abs(v - median) for v in values]) + if mad == 0: + return [0] * len(values) + return [(0.6745 * (v - median)) / mad for v in values] + +def remove_outliers(results: dict[str, list[Result]], threshold: float = 3.5) -> dict[str, list[Result]]: + new_results = {} + for key, rlist in results.items(): + # don't eliminate outliers on first pass + if len(rlist) <= options.iterations: + new_results[key] = rlist + continue + + values = [r.value for r in rlist] + z_scores = modified_z_score(values) + filtered_rlist = [r for r, z in zip(rlist, z_scores) if abs(z) <= threshold] + + if not filtered_rlist: + new_results[key] = rlist + else: + new_results[key] = filtered_rlist + + return new_results + +def process_results(results: dict[str, list[Result]]) -> tuple[bool, list[Result]]: + processed: list[Result] = [] + # technically, we can detect whether result is below or above threshold per + # individual result. However, we can't repeat benchmark runs with that + # granularity. So we just reject all results and try again. + valid_results = True # above stddev threshold + + for label, rlist in remove_outliers(results).items(): + if (len(rlist) == 0): + continue + + if len(rlist) == 1: + processed.append(rlist[0]) + continue + + values = [r.value for r in rlist] + + mean_value = statistics.mean(values) + stddev = statistics.stdev(values) + + threshold = options.stddev_threshold * mean_value + + if stddev > threshold: + print(f"stddev {stddev} above the threshold {threshold} for {label}") + valid_results = False + + rlist.sort(key=lambda res: res.value) + median_index = len(rlist) // 2 + median_result = rlist[median_index] + + # only override the stddev if not already set + if median_result.stddev == 0.0: + median_result.stddev = stddev + + processed.append(median_result) + + return valid_results, processed + def main(directory, additional_env_vars, save_name, compare_names, filter): prepare_workdir(directory, INTERNAL_WORKDIR_VERSION) suites = [ - ComputeBench(directory), - VelocityBench(directory), - SyclBench(directory), - LlamaCppBench(directory), + UMFSuite(directory), + # ComputeBench(directory), + # VelocityBench(directory), + # SyclBench(directory), + # LlamaCppBench(directory), #TestSuite() ] if not options.dry_run else [] @@ -65,36 +157,14 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): for benchmark in benchmarks: try: merged_env_vars = {**additional_env_vars} - iteration_results = [] - iterations = options.iterations if not benchmark.ignore_iterations() else 1 - for iter in range(iterations): - print(f"running {benchmark.name()}, iteration {iter}... 
", end='', flush=True) - bench_results = benchmark.run(merged_env_vars) - if bench_results is not None: - for bench_result in bench_results: - if bench_result.passed: - print(f"complete ({bench_result.label}: {bench_result.value:.3f} {bench_result.unit}).") - else: - print(f"complete ({bench_result.label}: verification FAILED)") - iteration_results.append(bench_result) - else: - print(f"did not finish (OK for sycl-bench).") + intermediate_results: dict[str, list[Result]] = {} + processed: list[Result] = [] + for _ in range(5): + run_iterations(benchmark, merged_env_vars, options.iterations, intermediate_results) + valid, processed = process_results(intermediate_results) + if valid: break - - if len(iteration_results) == 0: - continue - - for label in set([result.label for result in iteration_results]): - label_results = [result for result in iteration_results if result.label == label and result.passed == True] - if len(label_results) > 0: - label_results.sort(key=lambda res: res.value) - median_index = len(label_results) // 2 - median_result = label_results[median_index] - - median_result.name = label - median_result.lower_is_better = benchmark.lower_is_better() - - results.append(median_result) + results += processed except Exception as e: if options.exit_on_failure: raise e @@ -115,6 +185,9 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): # should this be configurable? history.load(1000) + # remove duplicates. this can happen if e.g., --compare baseline is specified manually. + compare_names = list(dict.fromkeys(compare_names)) + for name in compare_names: compare_result = history.get_compare(name) if compare_result: @@ -135,7 +208,8 @@ def main(directory, additional_env_vars, save_name, compare_names, filter): # Otherwise we might be comparing the results to themselves. 
if not options.dry_run: history.save(saved_name, results, save_name is not None) - compare_names.append(saved_name) + if saved_name not in compare_names: + compare_names.append(saved_name) if options.output_html: html_content = generate_html(history.runs, 'oneapi-src/unified-runtime', compare_names) @@ -159,19 +233,21 @@ def validate_and_parse_env_args(env_args): parser.add_argument('benchmark_directory', type=str, help='Working directory to setup benchmarks.') parser.add_argument('--sycl', type=str, help='Root directory of the SYCL compiler.', default=None) parser.add_argument('--ur', type=str, help='UR install prefix path', default=None) + parser.add_argument('--umf', type=str, help='UMF install prefix path', default=None) parser.add_argument('--adapter', type=str, help='Options to build the Unified Runtime as part of the benchmark', default="level_zero") parser.add_argument("--no-rebuild", help='Rebuild the benchmarks from scratch.', action="store_true") parser.add_argument("--env", type=str, help='Use env variable for a benchmark run.', action="append", default=[]) parser.add_argument("--save", type=str, help='Save the results for comparison under a specified name.') parser.add_argument("--compare", type=str, help='Compare results against previously saved data.', action="append", default=["baseline"]) - parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=5) - parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=600) + parser.add_argument("--iterations", type=int, help='Number of times to run each benchmark to select a median value.', default=options.iterations) + parser.add_argument("--stddev-threshold", type=float, help='If stddev % is above this threshold, rerun all iterations', default=options.stddev_threshold) + parser.add_argument("--timeout", type=int, help='Timeout for individual benchmarks in seconds.', default=options.timeout) parser.add_argument("--filter", type=str, help='Regex pattern to filter benchmarks by name.', default=None) - parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=0.005) + parser.add_argument("--epsilon", type=float, help='Threshold to consider change of performance significant', default=options.epsilon) parser.add_argument("--verbose", help='Print output of all the commands.', action="store_true") parser.add_argument("--exit-on-failure", help='Exit on first failure.', action="store_true") parser.add_argument("--compare-type", type=str, choices=[e.value for e in Compare], help='Compare results against previously saved data.', default=Compare.LATEST.value) - parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=10) + parser.add_argument("--compare-max", type=int, help='How many results to read for comparisions', default=options.compare_max) parser.add_argument("--output-html", help='Create HTML output', action="store_true", default=False) parser.add_argument("--output-markdown", help='Create Markdown output', action="store_true", default=True) parser.add_argument("--dry-run", help='Do not run any actual benchmarks', action="store_true", default=False) @@ -179,6 +255,7 @@ def validate_and_parse_env_args(env_args): args = parser.parse_args() additional_env_vars = validate_and_parse_env_args(args.env) + options.workdir = args.benchmark_directory options.verbose = args.verbose options.rebuild = not args.no_rebuild 
options.sycl = args.sycl @@ -186,6 +263,7 @@ def validate_and_parse_env_args(env_args): options.timeout = args.timeout options.epsilon = args.epsilon options.ur = args.ur + options.umf = args.umf options.ur_adapter = args.adapter options.exit_on_failure = args.exit_on_failure options.compare = Compare(args.compare_type) diff --git a/scripts/benchmarks/output_html.py b/scripts/benchmarks/output_html.py index 4a04252797..bc9b4ffe64 100644 --- a/scripts/benchmarks/output_html.py +++ b/scripts/benchmarks/output_html.py @@ -9,8 +9,8 @@ from collections import defaultdict from dataclasses import dataclass import matplotlib.dates as mdates -import numpy as np from benches.result import BenchmarkRun, Result +import numpy as np @dataclass class BenchmarkMetadata: @@ -24,232 +24,44 @@ class BenchmarkSeries: runs: list[BenchmarkRun] @dataclass -class LatestResults: - benchmark_label: str - run_values: dict[str, float] - - @classmethod - def from_dict(cls, label: str, values: dict[str, float]) -> 'LatestResults': - return cls(benchmark_label=label, run_values=values) - -def get_latest_results(benchmarks: list[BenchmarkSeries]) -> dict[str, LatestResults]: - latest_results: dict[str, LatestResults] = {} - for benchmark in benchmarks: - run_values = { - run.name: max(run.results, key=lambda x: x.date).value - for run in benchmark.runs - } - latest_results[benchmark.label] = LatestResults.from_dict(benchmark.label, run_values) - return latest_results - -def prepare_normalized_data(latest_results: dict[str, LatestResults], - benchmarks: list[BenchmarkSeries], - group_benchmarks: list[str], - non_baseline_runs: list[str], - baseline_name: str) -> list[list[float]]: - normalized_data = [] - benchmark_map = {b.label: b for b in benchmarks} - - for run_name in non_baseline_runs: - run_data: list[float] = [] - for benchmark_label in group_benchmarks: - benchmark_data = latest_results[benchmark_label].run_values - if run_name not in benchmark_data or baseline_name not in benchmark_data: - run_data.append(None) - continue - - baseline_value = benchmark_data[baseline_name] - current_value = benchmark_data[run_name] - - normalized_value = ((baseline_value / current_value) if benchmark_map[benchmark_label].metadata.lower_is_better - else (current_value / baseline_value)) * 100 - run_data.append(normalized_value) - normalized_data.append(run_data) - return normalized_data - -def format_benchmark_label(label: str) -> list[str]: - words = re.split(' |_', label) - lines = [] - current_line = [] - - # max line length 30 - for word in words: - if len(' '.join(current_line + [word])) > 30: - lines.append(' '.join(current_line)) - current_line = [word] - else: - current_line.append(word) - - if current_line: - lines.append(' '.join(current_line)) - - return lines - -def create_bar_plot(ax: plt.Axes, - normalized_data: list[list[float]], - group_benchmarks: list[str], - non_baseline_runs: list[str], - latest_results: dict[str, LatestResults], - benchmarks: list[BenchmarkSeries], - baseline_name: str) -> float: - x = np.arange(len(group_benchmarks)) - width = 0.8 / len(non_baseline_runs) - max_height = 0 - benchmark_map = {b.label: b for b in benchmarks} - - for i, (run_name, run_data) in enumerate(zip(non_baseline_runs, normalized_data)): - offset = width * i - width * (len(non_baseline_runs) - 1) / 2 - positions = x + offset - valid_data = [v if v is not None else 0 for v in run_data] - rects = ax.bar(positions, valid_data, width, label=run_name) - - for rect, value, benchmark_label in zip(rects, run_data, 
group_benchmarks): - if value is not None: - height = rect.get_height() - if height > max_height: - max_height = height - - ax.text(rect.get_x() + rect.get_width()/2., height + 2, - f'{value:.1f}%', - ha='center', va='bottom') - - benchmark_data = latest_results[benchmark_label].run_values - baseline_value = benchmark_data[baseline_name] - current_value = benchmark_data[run_name] - unit = benchmark_map[benchmark_label].metadata.unit - - tooltip_labels = [ - f"Run: {run_name}\n" - f"Value: {current_value:.2f} {unit}\n" - f"Normalized to ({baseline_name}): {baseline_value:.2f} {unit}\n" - f"Normalized: {value:.1f}%" - ] - tooltip = mpld3.plugins.LineHTMLTooltip(rect, tooltip_labels, css='.mpld3-tooltip{background:white;padding:8px;border:1px solid #ddd;border-radius:4px;font-family:monospace;white-space:pre;}') - mpld3.plugins.connect(ax.figure, tooltip) - - return max_height - -def add_chart_elements(ax: plt.Axes, - group_benchmarks: list[str], - group_name: str, - max_height: float) -> None: - top_padding = max_height * 0.2 - ax.set_ylim(0, max_height + top_padding) - ax.set_ylabel('Performance relative to baseline (%)') - ax.set_title(f'Performance Comparison (Normalized to Baseline) - {group_name} Group') - ax.set_xticks([]) - - for idx, label in enumerate(group_benchmarks): - split_labels = format_benchmark_label(label) - for i, sublabel in enumerate(split_labels): - y_pos = max_height + (top_padding * 0.5) + 2 - (i * top_padding * 0.15) - ax.text(idx, y_pos, sublabel, - ha='center', - style='italic', - color='#666666') - - ax.grid(True, axis='y', alpha=0.2) - ax.legend(bbox_to_anchor=(1, 1), loc='upper left') - -def split_large_groups(benchmark_groups): - miscellaneous = [] - new_groups = defaultdict(list) - - split_happened = False - for group, labels in benchmark_groups.items(): - if len(labels) == 1: - miscellaneous.extend(labels) - elif len(labels) > 5: - split_happened = True - mid = len(labels) // 2 - new_groups[group] = labels[:mid] - new_groups[group + '_'] = labels[mid:] - else: - new_groups[group] = labels - - if miscellaneous: - new_groups['Miscellaneous'] = miscellaneous - - if split_happened: - return split_large_groups(new_groups) - else: - return new_groups - -def group_benchmark_labels(benchmark_labels): - benchmark_groups = defaultdict(list) - for label in benchmark_labels: - group = re.match(r'^[^_\s]+', label)[0] - benchmark_groups[group].append(label) - return split_large_groups(benchmark_groups) - -def create_normalized_bar_chart(benchmarks: list[BenchmarkSeries], baseline_name: str) -> list[str]: - latest_results = get_latest_results(benchmarks) - - run_names = sorted(list(set( - name for result in latest_results.values() - for name in result.run_values.keys() - ))) - - if baseline_name not in run_names: - return [] - - benchmark_labels = [b.label for b in benchmarks] - - benchmark_groups = group_benchmark_labels(benchmark_labels) - - html_charts = [] - - for group_name, group_benchmarks in benchmark_groups.items(): - plt.close('all') - non_baseline_runs = [n for n in run_names if n != baseline_name] - - if len(non_baseline_runs) == 0: - continue - - normalized_data = prepare_normalized_data( - latest_results, benchmarks, group_benchmarks, - non_baseline_runs, baseline_name - ) - - fig, ax = plt.subplots(figsize=(10, 6)) - max_height = create_bar_plot( - ax, normalized_data, group_benchmarks, non_baseline_runs, - latest_results, benchmarks, baseline_name - ) - add_chart_elements(ax, group_benchmarks, group_name, max_height) - - plt.tight_layout() - 
html_charts.append(mpld3.fig_to_html(fig)) - plt.close(fig) +class BenchmarkChart: + label: str + html: str - return html_charts +def tooltip_css() -> str: + return '.mpld3-tooltip{background:white;padding:8px;border:1px solid #ddd;border-radius:4px;font-family:monospace;white-space:pre;}' -def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str) -> str: +def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str) -> list[BenchmarkChart]: plt.close('all') num_benchmarks = len(benchmarks) if num_benchmarks == 0: - return + return [] - fig, axes = plt.subplots(num_benchmarks, 1, figsize=(10, max(4 * num_benchmarks, 30))) + html_charts = [] - if num_benchmarks == 1: - axes = [axes] + for _, benchmark in enumerate(benchmarks): + fig, ax = plt.subplots(figsize=(10, 4)) - for idx, benchmark in enumerate(benchmarks): - ax = axes[idx] + all_values = [] + all_stddevs = [] for run in benchmark.runs: sorted_points = sorted(run.results, key=lambda x: x.date) dates = [point.date for point in sorted_points] values = [point.value for point in sorted_points] + stddevs = [point.stddev for point in sorted_points] + + all_values.extend(values) + all_stddevs.extend(stddevs) - ax.plot_date(dates, values, '-', label=run.name, alpha=0.5) + ax.errorbar(dates, values, yerr=stddevs, fmt='-', label=run.name, alpha=0.5) scatter = ax.scatter(dates, values, picker=True) tooltip_labels = [ f"Date: {point.date.strftime('%Y-%m-%d %H:%M:%S')}\n" - f"Value: {point.value:.2f}\n" + f"Value: {point.value:.2f} {benchmark.metadata.unit}\n" + f"Stddev: {point.stddev:.2f} {benchmark.metadata.unit}\n" f"Git Hash: {point.git_hash}" for point in sorted_points ] @@ -258,10 +70,17 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str for point in sorted_points] tooltip = mpld3.plugins.PointHTMLTooltip(scatter, tooltip_labels, - css='.mpld3-tooltip{background:white;padding:8px;border:1px solid #ddd;border-radius:4px;font-family:monospace;white-space:pre;}', + css=tooltip_css(), targets=targets) mpld3.plugins.connect(fig, tooltip) + # This is so that the stddev doesn't fill the entire y axis on the chart + if all_values and all_stddevs: + max_value = max(all_values) + min_value = min(all_values) + max_stddev = max(all_stddevs) + ax.set_ylim(min_value - 3 * max_stddev, max_value + 3 * max_stddev) + ax.set_title(benchmark.label, pad=20) performance_indicator = "lower is better" if benchmark.metadata.lower_is_better else "higher is better" ax.text(0.5, 1.05, f"({performance_indicator})", @@ -277,13 +96,118 @@ def create_time_series_chart(benchmarks: list[BenchmarkSeries], github_repo: str ax.grid(True, alpha=0.2) ax.legend(bbox_to_anchor=(1, 1), loc='upper left') ax.xaxis.set_major_formatter(mdates.ConciseDateFormatter('%Y-%m-%d %H:%M:%S')) - ax.xaxis.set_major_locator(mdates.AutoDateLocator()) - plt.tight_layout() - html = mpld3.fig_to_html(fig) + plt.tight_layout() + html_charts.append(BenchmarkChart(html=mpld3.fig_to_html(fig), label=benchmark.label)) + plt.close(fig) + + return html_charts + +@dataclass +class ExplicitGroup: + name: str + nnames: int + metadata: BenchmarkMetadata + runs: dict[str, dict[str, Result]] + +def create_explicit_groups(benchmark_runs: list[BenchmarkRun], compare_names: list[str]) -> list[ExplicitGroup]: + groups = {} + counted = {} + + for run in benchmark_runs: + if run.name in compare_names: + for res in run.results: + if res.explicit_group != '': + if res.explicit_group not in groups: + counted[res.explicit_group] = 1 + + 
groups[res.explicit_group] = ExplicitGroup(name=res.explicit_group, nnames=len(compare_names), + metadata=BenchmarkMetadata(unit=res.unit, lower_is_better=res.lower_is_better), + runs={}) + else: + counted[res.explicit_group] = counted[res.explicit_group] + 1 + + group = groups[res.explicit_group] + if res.label not in group.runs: + group.runs[res.label] = {name: None for name in compare_names} + + if group.runs[res.label][run.name] is None: + group.runs[res.label][run.name] = res - plt.close(fig) - return html + for key, val in counted.items(): + if val == 1: + groups.pop(key) + + return list(groups.values()) + +def create_grouped_bar_charts(groups: list[ExplicitGroup]) -> list[BenchmarkChart]: + plt.close('all') + + html_charts = [] + + for group in groups: + fig, ax = plt.subplots(figsize=(10, 6)) + + x = np.arange(group.nnames) + x_labels = [] + width = 0.8 / len(group.runs) + + max_height = 0 + + for i, (run_name, run_results) in enumerate(group.runs.items()): + offset = width * i + + positions = x + offset + x_labels = run_results.keys() + valid_data = [r.value if r is not None else 0 for r in run_results.values()] + rects = ax.bar(positions, valid_data, width, label=run_name) + # This is a hack to disable all bar_label. Setting labels to empty doesn't work. + # We create our own labels below for each bar, this works better in mpld3. + ax.bar_label(rects, fmt='') + + for rect, run, res in zip(rects, run_results.keys(), run_results.values()): + height = rect.get_height() + if height > max_height: + max_height = height + + ax.text(rect.get_x() + rect.get_width()/2., height + 2, + f'{res.value:.1f}', + ha='center', va='bottom', fontsize=9) + + tooltip_labels = [ + f"Run: {run}\n" + f"Label: {res.label}\n" + f"Value: {res.value:.2f} {res.unit}\n" + ] + tooltip = mpld3.plugins.LineHTMLTooltip(rect, tooltip_labels, css=tooltip_css()) + mpld3.plugins.connect(ax.figure, tooltip) + + ax.set_xticks([]) + ax.grid(True, axis='y', alpha=0.2) + ax.set_ylabel(f"Value ({group.metadata.unit})") + ax.legend(loc='upper left') + ax.set_title(group.name, pad=20) + performance_indicator = "lower is better" if group.metadata.lower_is_better else "higher is better" + ax.text(0.5, 1.03, f"({performance_indicator})", + ha='center', + transform=ax.transAxes, + style='italic', + fontsize=7, + color='#666666') + + for idx, label in enumerate(x_labels): + # this is a hack to get labels to show above the legend + # we normalize the idx to transAxes transform and offset it a little. + x_norm = (idx + 0.3 - ax.get_xlim()[0]) / (ax.get_xlim()[1] - ax.get_xlim()[0]) + ax.text(x_norm, 1.00, label, + transform=ax.transAxes, + color='#666666') + + plt.tight_layout() + html_charts.append(BenchmarkChart(label=group.name, html=mpld3.fig_to_html(fig))) + plt.close(fig) + + return html_charts def process_benchmark_data(benchmark_runs: list[BenchmarkRun], compare_names: list[str]) -> list[BenchmarkSeries]: benchmark_metadata: dict[str, BenchmarkMetadata] = {} @@ -319,12 +243,15 @@ def process_benchmark_data(benchmark_runs: list[BenchmarkRun], compare_names: li return benchmark_series def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_names: list[str]) -> str: - baseline_name = compare_names[0] benchmarks = process_benchmark_data(benchmark_runs, compare_names) - comparison_html_charts = create_normalized_bar_chart(benchmarks, baseline_name) - timeseries_html = create_time_series_chart(benchmarks, github_repo) - comparison_charts_html = '\n'.join(f'
<div class="chart">{chart}</div>
' for chart in comparison_html_charts) + timeseries = create_time_series_chart(benchmarks, github_repo) + timeseries_charts_html = '\n'.join(f'
<div class="chart" data-label="{ts.label}">{ts.html}</div>
' for ts in timeseries) + + explicit_groups = create_explicit_groups(benchmark_runs, compare_names) + + bar_charts = create_grouped_bar_charts(explicit_groups) + bar_charts_html = '\n'.join(f'
<div class="chart" data-label="{bc.label}">{bc.html}</div>
' for bc in bar_charts) html_template = f""" @@ -375,22 +302,106 @@ def generate_html(benchmark_runs: list[BenchmarkRun], github_repo: str, compare_ margin-bottom: 16px; }} }} + .filter-container {{ + text-align: center; + margin-bottom: 24px; + }} + .filter-container input {{ + padding: 8px; + font-size: 16px; + border: 1px solid #ccc; + border-radius: 4px; + width: 400px; + max-width: 100%; + }} + details {{ + margin-bottom: 24px; + }} + summary {{ + font-size: 18px; + font-weight: 500; + cursor: pointer; + padding: 12px; + background: #e9ecef; + border-radius: 8px; + user-select: none; + }} + summary:hover {{ + background: #dee2e6; + }} +

             <h1>Benchmark Results</h1>
-            <h2>Latest Results Comparison</h2>
-            <div class="charts">
-                {comparison_charts_html}
-            </div>
-            <h2>Historical Results</h2>
-            <div class="charts">
-                {timeseries_html}
-            </div>
+            <div class="filter-container">
+                <input type="text" id="bench-filter" placeholder="Filter benchmarks by name...">
+            </div>
+            <details>
+                <summary>Historical Results</summary>
+                <div class="charts">
+                    {timeseries_charts_html}
+                </div>
+            </details>
+            <details>
+                <summary>Comparisons</summary>
+                <div class="charts">
+                    {bar_charts_html}
+                </div>
+            </details>
         </div>
     </body>
     </html>
""" - return html_template diff --git a/scripts/benchmarks/utils/utils.py b/scripts/benchmarks/utils/utils.py index d077184e5c..0bb954fab2 100644 --- a/scripts/benchmarks/utils/utils.py +++ b/scripts/benchmarks/utils/utils.py @@ -3,6 +3,7 @@ # See LICENSE.TXT # SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +import gzip import os import shutil import subprocess @@ -58,7 +59,7 @@ def git_clone(dir, name, repo, commit): return repo_path def prepare_bench_cwd(dir): - # we need 2 deep to workaround a problem with a fixed relative path in cudaSift + # we need 2 deep to workaround a problem with a fixed relative paths in some velocity benchmarks options.benchmark_cwd = os.path.join(dir, 'bcwd', 'bcwd') if os.path.exists(options.benchmark_cwd): shutil.rmtree(options.benchmark_cwd) @@ -97,7 +98,7 @@ def create_build_path(directory, name): return build_path -def download(dir, url, file, untar = False): +def download(dir, url, file, untar = False, unzip = False): data_file = os.path.join(dir, file) if not Path(data_file).exists(): print(f"{data_file} does not exist, downloading") @@ -106,6 +107,10 @@ def download(dir, url, file, untar = False): file = tarfile.open(data_file) file.extractall(dir) file.close() + if unzip: + [stripped_gz, _] = os.path.splitext(data_file) + with gzip.open(data_file, 'rb') as f_in, open(stripped_gz, 'wb') as f_out: + shutil.copyfileobj(f_in, f_out) else: print(f"{data_file} exists, skipping...") return data_file